{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 6.622338113863537, "eval_steps": 500, "global_step": 1907, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003476749239461104, "grad_norm": 28.668624877929688, "learning_rate": 0.0, "loss": 10.9346, "step": 1 }, { "epoch": 0.006953498478922208, "grad_norm": 29.20085334777832, "learning_rate": 5.235602094240838e-07, "loss": 10.9266, "step": 2 }, { "epoch": 0.010430247718383311, "grad_norm": 27.985437393188477, "learning_rate": 1.0471204188481676e-06, "loss": 10.8227, "step": 3 }, { "epoch": 0.013906996957844416, "grad_norm": 26.395610809326172, "learning_rate": 1.5706806282722513e-06, "loss": 10.6153, "step": 4 }, { "epoch": 0.017383746197305518, "grad_norm": 21.0169677734375, "learning_rate": 2.094240837696335e-06, "loss": 10.3783, "step": 5 }, { "epoch": 0.020860495436766623, "grad_norm": 17.197437286376953, "learning_rate": 2.617801047120419e-06, "loss": 10.0927, "step": 6 }, { "epoch": 0.024337244676227728, "grad_norm": 14.133214950561523, "learning_rate": 3.1413612565445026e-06, "loss": 9.8757, "step": 7 }, { "epoch": 0.027813993915688832, "grad_norm": 12.546614646911621, "learning_rate": 3.664921465968586e-06, "loss": 9.5995, "step": 8 }, { "epoch": 0.03129074315514994, "grad_norm": 10.003512382507324, "learning_rate": 4.18848167539267e-06, "loss": 9.4723, "step": 9 }, { "epoch": 0.034767492394611035, "grad_norm": 9.371498107910156, "learning_rate": 4.712041884816754e-06, "loss": 9.2018, "step": 10 }, { "epoch": 0.03824424163407214, "grad_norm": 7.8304948806762695, "learning_rate": 5.235602094240838e-06, "loss": 9.0917, "step": 11 }, { "epoch": 0.041720990873533245, "grad_norm": 7.299487113952637, "learning_rate": 5.759162303664922e-06, "loss": 8.9398, "step": 12 }, { "epoch": 0.04519774011299435, "grad_norm": 6.750319480895996, "learning_rate": 6.282722513089005e-06, "loss": 8.8567, "step": 13 }, { "epoch": 0.048674489352455455, "grad_norm": 6.2069993019104, "learning_rate": 6.8062827225130895e-06, "loss": 8.6588, "step": 14 }, { "epoch": 0.05215123859191656, "grad_norm": 5.454948425292969, "learning_rate": 7.329842931937172e-06, "loss": 8.5728, "step": 15 }, { "epoch": 0.055627987831377665, "grad_norm": 5.076164722442627, "learning_rate": 7.853403141361257e-06, "loss": 8.5335, "step": 16 }, { "epoch": 0.05910473707083876, "grad_norm": 5.935672283172607, "learning_rate": 8.37696335078534e-06, "loss": 8.3915, "step": 17 }, { "epoch": 0.06258148631029987, "grad_norm": 6.291903972625732, "learning_rate": 8.900523560209424e-06, "loss": 8.3351, "step": 18 }, { "epoch": 0.06605823554976098, "grad_norm": 4.611499786376953, "learning_rate": 9.424083769633508e-06, "loss": 8.1489, "step": 19 }, { "epoch": 0.06953498478922207, "grad_norm": 4.213597297668457, "learning_rate": 9.947643979057591e-06, "loss": 8.1105, "step": 20 }, { "epoch": 0.07301173402868318, "grad_norm": 7.032574653625488, "learning_rate": 1.0471204188481676e-05, "loss": 7.9326, "step": 21 }, { "epoch": 0.07648848326814428, "grad_norm": 3.5775418281555176, "learning_rate": 1.099476439790576e-05, "loss": 8.0387, "step": 22 }, { "epoch": 0.07996523250760539, "grad_norm": 3.3227384090423584, "learning_rate": 1.1518324607329843e-05, "loss": 8.0855, "step": 23 }, { "epoch": 0.08344198174706649, "grad_norm": 3.577180862426758, "learning_rate": 1.2041884816753927e-05, "loss": 7.8638, "step": 24 }, { "epoch": 0.0869187309865276, "grad_norm": 3.077481508255005, "learning_rate": 1.256544502617801e-05, "loss": 7.7775, "step": 25 }, { "epoch": 0.0903954802259887, "grad_norm": 2.4729714393615723, "learning_rate": 1.3089005235602096e-05, "loss": 7.7653, "step": 26 }, { "epoch": 0.0938722294654498, "grad_norm": 2.7105133533477783, "learning_rate": 1.3612565445026179e-05, "loss": 7.8085, "step": 27 }, { "epoch": 0.09734897870491091, "grad_norm": 2.2809700965881348, "learning_rate": 1.4136125654450264e-05, "loss": 7.6766, "step": 28 }, { "epoch": 0.10082572794437202, "grad_norm": 2.30652117729187, "learning_rate": 1.4659685863874344e-05, "loss": 7.703, "step": 29 }, { "epoch": 0.10430247718383312, "grad_norm": 2.233243703842163, "learning_rate": 1.518324607329843e-05, "loss": 7.7391, "step": 30 }, { "epoch": 0.10777922642329422, "grad_norm": 2.0807292461395264, "learning_rate": 1.5706806282722515e-05, "loss": 7.5976, "step": 31 }, { "epoch": 0.11125597566275533, "grad_norm": 2.4727425575256348, "learning_rate": 1.6230366492146596e-05, "loss": 7.5206, "step": 32 }, { "epoch": 0.11473272490221642, "grad_norm": 3.1418378353118896, "learning_rate": 1.675392670157068e-05, "loss": 7.6335, "step": 33 }, { "epoch": 0.11820947414167753, "grad_norm": 3.3428750038146973, "learning_rate": 1.7277486910994763e-05, "loss": 7.5816, "step": 34 }, { "epoch": 0.12168622338113863, "grad_norm": 2.7240543365478516, "learning_rate": 1.780104712041885e-05, "loss": 7.4649, "step": 35 }, { "epoch": 0.12516297262059975, "grad_norm": 2.6777167320251465, "learning_rate": 1.8324607329842934e-05, "loss": 7.2798, "step": 36 }, { "epoch": 0.12863972186006084, "grad_norm": 2.6221325397491455, "learning_rate": 1.8848167539267016e-05, "loss": 7.2434, "step": 37 }, { "epoch": 0.13211647109952196, "grad_norm": 3.304597854614258, "learning_rate": 1.93717277486911e-05, "loss": 7.3748, "step": 38 }, { "epoch": 0.13559322033898305, "grad_norm": 2.2426793575286865, "learning_rate": 1.9895287958115183e-05, "loss": 7.2165, "step": 39 }, { "epoch": 0.13906996957844414, "grad_norm": 2.3125767707824707, "learning_rate": 2.0418848167539268e-05, "loss": 7.0828, "step": 40 }, { "epoch": 0.14254671881790526, "grad_norm": 2.921846389770508, "learning_rate": 2.0942408376963353e-05, "loss": 7.242, "step": 41 }, { "epoch": 0.14602346805736635, "grad_norm": 2.0602433681488037, "learning_rate": 2.1465968586387435e-05, "loss": 7.1855, "step": 42 }, { "epoch": 0.14950021729682747, "grad_norm": 2.180553436279297, "learning_rate": 2.198952879581152e-05, "loss": 7.0872, "step": 43 }, { "epoch": 0.15297696653628856, "grad_norm": 1.9306440353393555, "learning_rate": 2.25130890052356e-05, "loss": 7.1991, "step": 44 }, { "epoch": 0.15645371577574968, "grad_norm": 2.243671417236328, "learning_rate": 2.3036649214659687e-05, "loss": 7.087, "step": 45 }, { "epoch": 0.15993046501521077, "grad_norm": 1.9487829208374023, "learning_rate": 2.3560209424083772e-05, "loss": 6.9366, "step": 46 }, { "epoch": 0.1634072142546719, "grad_norm": 2.5406157970428467, "learning_rate": 2.4083769633507854e-05, "loss": 7.1055, "step": 47 }, { "epoch": 0.16688396349413298, "grad_norm": 3.9518744945526123, "learning_rate": 2.460732984293194e-05, "loss": 6.8859, "step": 48 }, { "epoch": 0.1703607127335941, "grad_norm": 1.8939940929412842, "learning_rate": 2.513089005235602e-05, "loss": 6.9724, "step": 49 }, { "epoch": 0.1738374619730552, "grad_norm": 2.785097360610962, "learning_rate": 2.5654450261780106e-05, "loss": 6.8924, "step": 50 }, { "epoch": 0.1773142112125163, "grad_norm": 3.876585006713867, "learning_rate": 2.617801047120419e-05, "loss": 6.7097, "step": 51 }, { "epoch": 0.1807909604519774, "grad_norm": 1.672093391418457, "learning_rate": 2.6701570680628273e-05, "loss": 6.8596, "step": 52 }, { "epoch": 0.1842677096914385, "grad_norm": 3.9122424125671387, "learning_rate": 2.7225130890052358e-05, "loss": 6.745, "step": 53 }, { "epoch": 0.1877444589308996, "grad_norm": 2.6010124683380127, "learning_rate": 2.7748691099476443e-05, "loss": 6.7213, "step": 54 }, { "epoch": 0.1912212081703607, "grad_norm": 3.1334500312805176, "learning_rate": 2.827225130890053e-05, "loss": 6.5959, "step": 55 }, { "epoch": 0.19469795740982182, "grad_norm": 2.5764386653900146, "learning_rate": 2.879581151832461e-05, "loss": 6.7594, "step": 56 }, { "epoch": 0.1981747066492829, "grad_norm": 2.5421552658081055, "learning_rate": 2.931937172774869e-05, "loss": 6.5791, "step": 57 }, { "epoch": 0.20165145588874403, "grad_norm": 2.9680330753326416, "learning_rate": 2.9842931937172774e-05, "loss": 6.5585, "step": 58 }, { "epoch": 0.20512820512820512, "grad_norm": 2.292030096054077, "learning_rate": 3.036649214659686e-05, "loss": 6.4137, "step": 59 }, { "epoch": 0.20860495436766624, "grad_norm": 3.2336649894714355, "learning_rate": 3.0890052356020944e-05, "loss": 6.542, "step": 60 }, { "epoch": 0.21208170360712733, "grad_norm": 2.3037948608398438, "learning_rate": 3.141361256544503e-05, "loss": 6.4735, "step": 61 }, { "epoch": 0.21555845284658845, "grad_norm": 2.364863157272339, "learning_rate": 3.1937172774869115e-05, "loss": 6.4433, "step": 62 }, { "epoch": 0.21903520208604954, "grad_norm": 1.9736865758895874, "learning_rate": 3.246073298429319e-05, "loss": 6.2436, "step": 63 }, { "epoch": 0.22251195132551066, "grad_norm": 2.201551914215088, "learning_rate": 3.298429319371728e-05, "loss": 6.4195, "step": 64 }, { "epoch": 0.22598870056497175, "grad_norm": 2.3755462169647217, "learning_rate": 3.350785340314136e-05, "loss": 6.3625, "step": 65 }, { "epoch": 0.22946544980443284, "grad_norm": 3.011631727218628, "learning_rate": 3.403141361256545e-05, "loss": 6.3454, "step": 66 }, { "epoch": 0.23294219904389396, "grad_norm": 2.120392084121704, "learning_rate": 3.455497382198953e-05, "loss": 6.2755, "step": 67 }, { "epoch": 0.23641894828335505, "grad_norm": 2.4527335166931152, "learning_rate": 3.507853403141361e-05, "loss": 6.3017, "step": 68 }, { "epoch": 0.23989569752281617, "grad_norm": 2.6498172283172607, "learning_rate": 3.56020942408377e-05, "loss": 6.2205, "step": 69 }, { "epoch": 0.24337244676227726, "grad_norm": 1.6044883728027344, "learning_rate": 3.612565445026178e-05, "loss": 6.1502, "step": 70 }, { "epoch": 0.24684919600173838, "grad_norm": 4.1391143798828125, "learning_rate": 3.664921465968587e-05, "loss": 6.1017, "step": 71 }, { "epoch": 0.2503259452411995, "grad_norm": 2.0595903396606445, "learning_rate": 3.717277486910995e-05, "loss": 6.1089, "step": 72 }, { "epoch": 0.25380269448066056, "grad_norm": 3.4644057750701904, "learning_rate": 3.769633507853403e-05, "loss": 6.0465, "step": 73 }, { "epoch": 0.2572794437201217, "grad_norm": 2.4427716732025146, "learning_rate": 3.8219895287958116e-05, "loss": 6.0676, "step": 74 }, { "epoch": 0.2607561929595828, "grad_norm": 3.2828521728515625, "learning_rate": 3.87434554973822e-05, "loss": 6.1147, "step": 75 }, { "epoch": 0.2642329421990439, "grad_norm": 2.433551788330078, "learning_rate": 3.926701570680629e-05, "loss": 6.0704, "step": 76 }, { "epoch": 0.267709691438505, "grad_norm": 1.9476454257965088, "learning_rate": 3.9790575916230365e-05, "loss": 5.9814, "step": 77 }, { "epoch": 0.2711864406779661, "grad_norm": 2.0444705486297607, "learning_rate": 4.031413612565445e-05, "loss": 5.8327, "step": 78 }, { "epoch": 0.2746631899174272, "grad_norm": 2.3648533821105957, "learning_rate": 4.0837696335078535e-05, "loss": 5.9608, "step": 79 }, { "epoch": 0.2781399391568883, "grad_norm": 2.4138662815093994, "learning_rate": 4.136125654450262e-05, "loss": 5.9469, "step": 80 }, { "epoch": 0.2816166883963494, "grad_norm": 2.0390286445617676, "learning_rate": 4.1884816753926706e-05, "loss": 5.8486, "step": 81 }, { "epoch": 0.2850934376358105, "grad_norm": 2.036783218383789, "learning_rate": 4.240837696335079e-05, "loss": 5.8894, "step": 82 }, { "epoch": 0.28857018687527164, "grad_norm": 2.660581111907959, "learning_rate": 4.293193717277487e-05, "loss": 5.8379, "step": 83 }, { "epoch": 0.2920469361147327, "grad_norm": 2.0146775245666504, "learning_rate": 4.3455497382198955e-05, "loss": 5.7308, "step": 84 }, { "epoch": 0.2955236853541938, "grad_norm": 2.3022539615631104, "learning_rate": 4.397905759162304e-05, "loss": 5.816, "step": 85 }, { "epoch": 0.29900043459365494, "grad_norm": 2.7125766277313232, "learning_rate": 4.4502617801047125e-05, "loss": 5.6184, "step": 86 }, { "epoch": 0.30247718383311606, "grad_norm": 2.6640212535858154, "learning_rate": 4.50261780104712e-05, "loss": 5.8341, "step": 87 }, { "epoch": 0.3059539330725771, "grad_norm": 2.317542314529419, "learning_rate": 4.554973821989529e-05, "loss": 5.7043, "step": 88 }, { "epoch": 0.30943068231203824, "grad_norm": 1.7317227125167847, "learning_rate": 4.6073298429319374e-05, "loss": 5.7464, "step": 89 }, { "epoch": 0.31290743155149936, "grad_norm": 2.3583664894104004, "learning_rate": 4.659685863874346e-05, "loss": 5.5552, "step": 90 }, { "epoch": 0.3163841807909605, "grad_norm": 2.680361270904541, "learning_rate": 4.7120418848167544e-05, "loss": 5.6926, "step": 91 }, { "epoch": 0.31986093003042154, "grad_norm": 1.4900928735733032, "learning_rate": 4.764397905759162e-05, "loss": 5.6321, "step": 92 }, { "epoch": 0.32333767926988266, "grad_norm": 2.226301431655884, "learning_rate": 4.816753926701571e-05, "loss": 5.4741, "step": 93 }, { "epoch": 0.3268144285093438, "grad_norm": 2.516113519668579, "learning_rate": 4.869109947643979e-05, "loss": 5.687, "step": 94 }, { "epoch": 0.33029117774880484, "grad_norm": 1.9042409658432007, "learning_rate": 4.921465968586388e-05, "loss": 5.5558, "step": 95 }, { "epoch": 0.33376792698826596, "grad_norm": 1.791879653930664, "learning_rate": 4.973821989528796e-05, "loss": 5.4747, "step": 96 }, { "epoch": 0.3372446762277271, "grad_norm": 1.4365205764770508, "learning_rate": 5.026178010471204e-05, "loss": 5.5098, "step": 97 }, { "epoch": 0.3407214254671882, "grad_norm": 2.371295690536499, "learning_rate": 5.0785340314136134e-05, "loss": 5.448, "step": 98 }, { "epoch": 0.34419817470664926, "grad_norm": 2.3430371284484863, "learning_rate": 5.130890052356021e-05, "loss": 5.5506, "step": 99 }, { "epoch": 0.3476749239461104, "grad_norm": 1.4759836196899414, "learning_rate": 5.18324607329843e-05, "loss": 5.4155, "step": 100 }, { "epoch": 0.3511516731855715, "grad_norm": 2.003601551055908, "learning_rate": 5.235602094240838e-05, "loss": 5.5077, "step": 101 }, { "epoch": 0.3546284224250326, "grad_norm": 1.7206441164016724, "learning_rate": 5.287958115183246e-05, "loss": 5.4507, "step": 102 }, { "epoch": 0.3581051716644937, "grad_norm": 2.2505338191986084, "learning_rate": 5.3403141361256546e-05, "loss": 5.3292, "step": 103 }, { "epoch": 0.3615819209039548, "grad_norm": 2.510773181915283, "learning_rate": 5.3926701570680624e-05, "loss": 5.4533, "step": 104 }, { "epoch": 0.3650586701434159, "grad_norm": 1.8072589635849, "learning_rate": 5.4450261780104716e-05, "loss": 5.355, "step": 105 }, { "epoch": 0.368535419382877, "grad_norm": 2.089144706726074, "learning_rate": 5.4973821989528795e-05, "loss": 5.5014, "step": 106 }, { "epoch": 0.3720121686223381, "grad_norm": 1.5899293422698975, "learning_rate": 5.5497382198952887e-05, "loss": 5.376, "step": 107 }, { "epoch": 0.3754889178617992, "grad_norm": 1.6699646711349487, "learning_rate": 5.6020942408376965e-05, "loss": 5.3309, "step": 108 }, { "epoch": 0.37896566710126034, "grad_norm": 1.2876421213150024, "learning_rate": 5.654450261780106e-05, "loss": 5.3524, "step": 109 }, { "epoch": 0.3824424163407214, "grad_norm": 2.535942316055298, "learning_rate": 5.7068062827225135e-05, "loss": 5.3004, "step": 110 }, { "epoch": 0.3859191655801825, "grad_norm": 1.8234626054763794, "learning_rate": 5.759162303664922e-05, "loss": 5.2188, "step": 111 }, { "epoch": 0.38939591481964364, "grad_norm": 2.6738812923431396, "learning_rate": 5.81151832460733e-05, "loss": 5.3345, "step": 112 }, { "epoch": 0.39287266405910476, "grad_norm": 2.4427976608276367, "learning_rate": 5.863874345549738e-05, "loss": 5.2417, "step": 113 }, { "epoch": 0.3963494132985658, "grad_norm": 2.158632516860962, "learning_rate": 5.916230366492147e-05, "loss": 5.2943, "step": 114 }, { "epoch": 0.39982616253802694, "grad_norm": 2.0583152770996094, "learning_rate": 5.968586387434555e-05, "loss": 5.2959, "step": 115 }, { "epoch": 0.40330291177748806, "grad_norm": 1.8146238327026367, "learning_rate": 6.020942408376964e-05, "loss": 5.3123, "step": 116 }, { "epoch": 0.4067796610169492, "grad_norm": 1.9138127565383911, "learning_rate": 6.073298429319372e-05, "loss": 5.1541, "step": 117 }, { "epoch": 0.41025641025641024, "grad_norm": 2.2583582401275635, "learning_rate": 6.125654450261781e-05, "loss": 5.0541, "step": 118 }, { "epoch": 0.41373315949587136, "grad_norm": 1.597258448600769, "learning_rate": 6.178010471204189e-05, "loss": 5.199, "step": 119 }, { "epoch": 0.4172099087353325, "grad_norm": 2.2339518070220947, "learning_rate": 6.230366492146598e-05, "loss": 5.1763, "step": 120 }, { "epoch": 0.42068665797479354, "grad_norm": 2.5288898944854736, "learning_rate": 6.282722513089006e-05, "loss": 5.2478, "step": 121 }, { "epoch": 0.42416340721425466, "grad_norm": 1.9757497310638428, "learning_rate": 6.335078534031414e-05, "loss": 5.0547, "step": 122 }, { "epoch": 0.4276401564537158, "grad_norm": 2.4785637855529785, "learning_rate": 6.387434554973823e-05, "loss": 5.0945, "step": 123 }, { "epoch": 0.4311169056931769, "grad_norm": 2.1948068141937256, "learning_rate": 6.439790575916231e-05, "loss": 5.119, "step": 124 }, { "epoch": 0.43459365493263796, "grad_norm": 1.3128104209899902, "learning_rate": 6.492146596858639e-05, "loss": 5.0568, "step": 125 }, { "epoch": 0.4380704041720991, "grad_norm": 2.0916574001312256, "learning_rate": 6.544502617801048e-05, "loss": 5.197, "step": 126 }, { "epoch": 0.4415471534115602, "grad_norm": 1.8343582153320312, "learning_rate": 6.596858638743456e-05, "loss": 5.1277, "step": 127 }, { "epoch": 0.4450239026510213, "grad_norm": 2.083747386932373, "learning_rate": 6.649214659685863e-05, "loss": 5.1871, "step": 128 }, { "epoch": 0.4485006518904824, "grad_norm": 2.1757895946502686, "learning_rate": 6.701570680628273e-05, "loss": 5.0276, "step": 129 }, { "epoch": 0.4519774011299435, "grad_norm": 2.2232532501220703, "learning_rate": 6.75392670157068e-05, "loss": 4.9795, "step": 130 }, { "epoch": 0.4554541503694046, "grad_norm": 1.853911280632019, "learning_rate": 6.80628272251309e-05, "loss": 5.0363, "step": 131 }, { "epoch": 0.4589308996088657, "grad_norm": 2.0266318321228027, "learning_rate": 6.858638743455498e-05, "loss": 5.2211, "step": 132 }, { "epoch": 0.4624076488483268, "grad_norm": 2.087162971496582, "learning_rate": 6.910994764397905e-05, "loss": 4.8979, "step": 133 }, { "epoch": 0.4658843980877879, "grad_norm": 1.4458317756652832, "learning_rate": 6.963350785340315e-05, "loss": 5.0776, "step": 134 }, { "epoch": 0.46936114732724904, "grad_norm": 2.3262505531311035, "learning_rate": 7.015706806282722e-05, "loss": 5.0269, "step": 135 }, { "epoch": 0.4728378965667101, "grad_norm": 2.3064498901367188, "learning_rate": 7.068062827225132e-05, "loss": 4.8792, "step": 136 }, { "epoch": 0.4763146458061712, "grad_norm": 1.705779790878296, "learning_rate": 7.12041884816754e-05, "loss": 4.9817, "step": 137 }, { "epoch": 0.47979139504563234, "grad_norm": 1.8515121936798096, "learning_rate": 7.172774869109949e-05, "loss": 5.106, "step": 138 }, { "epoch": 0.48326814428509346, "grad_norm": 1.824192762374878, "learning_rate": 7.225130890052356e-05, "loss": 4.9356, "step": 139 }, { "epoch": 0.4867448935245545, "grad_norm": 1.516313076019287, "learning_rate": 7.277486910994766e-05, "loss": 5.0519, "step": 140 }, { "epoch": 0.49022164276401564, "grad_norm": 1.6829155683517456, "learning_rate": 7.329842931937174e-05, "loss": 4.8605, "step": 141 }, { "epoch": 0.49369839200347676, "grad_norm": 1.4541552066802979, "learning_rate": 7.382198952879581e-05, "loss": 4.94, "step": 142 }, { "epoch": 0.4971751412429379, "grad_norm": 2.4809017181396484, "learning_rate": 7.43455497382199e-05, "loss": 4.959, "step": 143 }, { "epoch": 0.500651890482399, "grad_norm": 2.6929731369018555, "learning_rate": 7.486910994764398e-05, "loss": 5.0495, "step": 144 }, { "epoch": 0.5041286397218601, "grad_norm": 1.3168176412582397, "learning_rate": 7.539267015706806e-05, "loss": 4.8998, "step": 145 }, { "epoch": 0.5076053889613211, "grad_norm": 1.5721124410629272, "learning_rate": 7.591623036649214e-05, "loss": 4.8676, "step": 146 }, { "epoch": 0.5110821382007823, "grad_norm": 2.32256817817688, "learning_rate": 7.643979057591623e-05, "loss": 5.0139, "step": 147 }, { "epoch": 0.5145588874402434, "grad_norm": 2.1673014163970947, "learning_rate": 7.696335078534031e-05, "loss": 4.942, "step": 148 }, { "epoch": 0.5180356366797044, "grad_norm": 1.5646917819976807, "learning_rate": 7.74869109947644e-05, "loss": 4.7135, "step": 149 }, { "epoch": 0.5215123859191656, "grad_norm": 1.453816294670105, "learning_rate": 7.801047120418848e-05, "loss": 4.7374, "step": 150 }, { "epoch": 0.5249891351586267, "grad_norm": 1.178645372390747, "learning_rate": 7.853403141361257e-05, "loss": 4.7643, "step": 151 }, { "epoch": 0.5284658843980878, "grad_norm": 2.270442247390747, "learning_rate": 7.905759162303665e-05, "loss": 4.8833, "step": 152 }, { "epoch": 0.5319426336375489, "grad_norm": 1.6220506429672241, "learning_rate": 7.958115183246073e-05, "loss": 4.8259, "step": 153 }, { "epoch": 0.53541938287701, "grad_norm": 1.7205973863601685, "learning_rate": 8.010471204188482e-05, "loss": 4.8368, "step": 154 }, { "epoch": 0.5388961321164711, "grad_norm": 2.033658504486084, "learning_rate": 8.06282722513089e-05, "loss": 4.7614, "step": 155 }, { "epoch": 0.5423728813559322, "grad_norm": 2.0730934143066406, "learning_rate": 8.115183246073299e-05, "loss": 4.7387, "step": 156 }, { "epoch": 0.5458496305953933, "grad_norm": 1.7333288192749023, "learning_rate": 8.167539267015707e-05, "loss": 4.8901, "step": 157 }, { "epoch": 0.5493263798348544, "grad_norm": 1.4700742959976196, "learning_rate": 8.219895287958116e-05, "loss": 4.9365, "step": 158 }, { "epoch": 0.5528031290743155, "grad_norm": 1.8137691020965576, "learning_rate": 8.272251308900524e-05, "loss": 4.785, "step": 159 }, { "epoch": 0.5562798783137766, "grad_norm": 1.1198782920837402, "learning_rate": 8.324607329842933e-05, "loss": 4.7751, "step": 160 }, { "epoch": 0.5597566275532377, "grad_norm": 1.5133748054504395, "learning_rate": 8.376963350785341e-05, "loss": 4.7732, "step": 161 }, { "epoch": 0.5632333767926988, "grad_norm": 1.357731819152832, "learning_rate": 8.429319371727749e-05, "loss": 4.7859, "step": 162 }, { "epoch": 0.56671012603216, "grad_norm": 1.9853086471557617, "learning_rate": 8.481675392670158e-05, "loss": 4.8198, "step": 163 }, { "epoch": 0.570186875271621, "grad_norm": 2.5406196117401123, "learning_rate": 8.534031413612566e-05, "loss": 4.8113, "step": 164 }, { "epoch": 0.5736636245110821, "grad_norm": 2.0581886768341064, "learning_rate": 8.586387434554974e-05, "loss": 4.8693, "step": 165 }, { "epoch": 0.5771403737505433, "grad_norm": 1.1141613721847534, "learning_rate": 8.638743455497382e-05, "loss": 4.7263, "step": 166 }, { "epoch": 0.5806171229900043, "grad_norm": 1.7544695138931274, "learning_rate": 8.691099476439791e-05, "loss": 4.7621, "step": 167 }, { "epoch": 0.5840938722294654, "grad_norm": 2.428950548171997, "learning_rate": 8.743455497382199e-05, "loss": 4.7001, "step": 168 }, { "epoch": 0.5875706214689266, "grad_norm": 2.087353467941284, "learning_rate": 8.795811518324608e-05, "loss": 4.7774, "step": 169 }, { "epoch": 0.5910473707083876, "grad_norm": 1.6976741552352905, "learning_rate": 8.848167539267016e-05, "loss": 4.7362, "step": 170 }, { "epoch": 0.5945241199478487, "grad_norm": 1.7949515581130981, "learning_rate": 8.900523560209425e-05, "loss": 4.8683, "step": 171 }, { "epoch": 0.5980008691873099, "grad_norm": 1.0558027029037476, "learning_rate": 8.952879581151833e-05, "loss": 4.6915, "step": 172 }, { "epoch": 0.6014776184267709, "grad_norm": 2.1845736503601074, "learning_rate": 9.00523560209424e-05, "loss": 4.6825, "step": 173 }, { "epoch": 0.6049543676662321, "grad_norm": 1.852575659751892, "learning_rate": 9.05759162303665e-05, "loss": 4.5881, "step": 174 }, { "epoch": 0.6084311169056932, "grad_norm": 2.295853853225708, "learning_rate": 9.109947643979058e-05, "loss": 4.8337, "step": 175 }, { "epoch": 0.6119078661451542, "grad_norm": 1.8543741703033447, "learning_rate": 9.162303664921467e-05, "loss": 4.7888, "step": 176 }, { "epoch": 0.6153846153846154, "grad_norm": 1.4840337038040161, "learning_rate": 9.214659685863875e-05, "loss": 4.7088, "step": 177 }, { "epoch": 0.6188613646240765, "grad_norm": 1.1520745754241943, "learning_rate": 9.267015706806284e-05, "loss": 4.7188, "step": 178 }, { "epoch": 0.6223381138635375, "grad_norm": 1.4230411052703857, "learning_rate": 9.319371727748692e-05, "loss": 4.7505, "step": 179 }, { "epoch": 0.6258148631029987, "grad_norm": 1.6348016262054443, "learning_rate": 9.371727748691101e-05, "loss": 4.7266, "step": 180 }, { "epoch": 0.6292916123424598, "grad_norm": 1.824994444847107, "learning_rate": 9.424083769633509e-05, "loss": 4.7228, "step": 181 }, { "epoch": 0.632768361581921, "grad_norm": 1.3238110542297363, "learning_rate": 9.476439790575917e-05, "loss": 4.74, "step": 182 }, { "epoch": 0.636245110821382, "grad_norm": 1.4533603191375732, "learning_rate": 9.528795811518324e-05, "loss": 4.639, "step": 183 }, { "epoch": 0.6397218600608431, "grad_norm": 1.7972502708435059, "learning_rate": 9.581151832460732e-05, "loss": 4.6973, "step": 184 }, { "epoch": 0.6431986093003043, "grad_norm": 1.4870437383651733, "learning_rate": 9.633507853403142e-05, "loss": 4.6286, "step": 185 }, { "epoch": 0.6466753585397653, "grad_norm": 1.4717066287994385, "learning_rate": 9.68586387434555e-05, "loss": 4.5181, "step": 186 }, { "epoch": 0.6501521077792264, "grad_norm": 1.4903441667556763, "learning_rate": 9.738219895287959e-05, "loss": 4.6885, "step": 187 }, { "epoch": 0.6536288570186876, "grad_norm": 2.0264744758605957, "learning_rate": 9.790575916230366e-05, "loss": 4.7168, "step": 188 }, { "epoch": 0.6571056062581486, "grad_norm": 1.5279369354248047, "learning_rate": 9.842931937172776e-05, "loss": 4.6545, "step": 189 }, { "epoch": 0.6605823554976097, "grad_norm": 1.2937990427017212, "learning_rate": 9.895287958115183e-05, "loss": 4.648, "step": 190 }, { "epoch": 0.6640591047370709, "grad_norm": 0.9555952548980713, "learning_rate": 9.947643979057593e-05, "loss": 4.5756, "step": 191 }, { "epoch": 0.6675358539765319, "grad_norm": 1.3998730182647705, "learning_rate": 0.0001, "loss": 4.538, "step": 192 }, { "epoch": 0.6710126032159931, "grad_norm": 0.8394646048545837, "learning_rate": 9.999991620751589e-05, "loss": 4.5968, "step": 193 }, { "epoch": 0.6744893524554542, "grad_norm": 1.3991713523864746, "learning_rate": 9.999966483034437e-05, "loss": 4.6261, "step": 194 }, { "epoch": 0.6779661016949152, "grad_norm": 1.7438290119171143, "learning_rate": 9.999924586932803e-05, "loss": 4.5051, "step": 195 }, { "epoch": 0.6814428509343764, "grad_norm": 1.0810681581497192, "learning_rate": 9.999865932587107e-05, "loss": 4.5634, "step": 196 }, { "epoch": 0.6849196001738375, "grad_norm": 1.470165729522705, "learning_rate": 9.99979052019394e-05, "loss": 4.7187, "step": 197 }, { "epoch": 0.6883963494132985, "grad_norm": 1.3457802534103394, "learning_rate": 9.999698350006063e-05, "loss": 4.5977, "step": 198 }, { "epoch": 0.6918730986527597, "grad_norm": 1.7207729816436768, "learning_rate": 9.999589422332404e-05, "loss": 4.5589, "step": 199 }, { "epoch": 0.6953498478922208, "grad_norm": 1.0946606397628784, "learning_rate": 9.999463737538053e-05, "loss": 4.3829, "step": 200 }, { "epoch": 0.6988265971316818, "grad_norm": 1.4563919305801392, "learning_rate": 9.99932129604427e-05, "loss": 4.5988, "step": 201 }, { "epoch": 0.702303346371143, "grad_norm": 1.7498416900634766, "learning_rate": 9.999162098328474e-05, "loss": 4.6138, "step": 202 }, { "epoch": 0.7057800956106041, "grad_norm": 1.4779905080795288, "learning_rate": 9.998986144924251e-05, "loss": 4.6133, "step": 203 }, { "epoch": 0.7092568448500652, "grad_norm": 0.8581809401512146, "learning_rate": 9.99879343642134e-05, "loss": 4.5436, "step": 204 }, { "epoch": 0.7127335940895263, "grad_norm": 1.22317373752594, "learning_rate": 9.998583973465646e-05, "loss": 4.5361, "step": 205 }, { "epoch": 0.7162103433289874, "grad_norm": 1.2488147020339966, "learning_rate": 9.998357756759222e-05, "loss": 4.5957, "step": 206 }, { "epoch": 0.7196870925684485, "grad_norm": 1.6058237552642822, "learning_rate": 9.998114787060282e-05, "loss": 4.562, "step": 207 }, { "epoch": 0.7231638418079096, "grad_norm": 1.423363208770752, "learning_rate": 9.997855065183184e-05, "loss": 4.6001, "step": 208 }, { "epoch": 0.7266405910473707, "grad_norm": 0.9326280951499939, "learning_rate": 9.99757859199844e-05, "loss": 4.4833, "step": 209 }, { "epoch": 0.7301173402868318, "grad_norm": 1.4092398881912231, "learning_rate": 9.997285368432703e-05, "loss": 4.447, "step": 210 }, { "epoch": 0.7335940895262929, "grad_norm": 0.8156947493553162, "learning_rate": 9.996975395468772e-05, "loss": 4.5549, "step": 211 }, { "epoch": 0.737070838765754, "grad_norm": 1.4543663263320923, "learning_rate": 9.996648674145583e-05, "loss": 4.4712, "step": 212 }, { "epoch": 0.7405475880052151, "grad_norm": 1.1543498039245605, "learning_rate": 9.996305205558207e-05, "loss": 4.7086, "step": 213 }, { "epoch": 0.7440243372446762, "grad_norm": 1.3337583541870117, "learning_rate": 9.995944990857849e-05, "loss": 4.5306, "step": 214 }, { "epoch": 0.7475010864841374, "grad_norm": 1.3848955631256104, "learning_rate": 9.995568031251838e-05, "loss": 4.4306, "step": 215 }, { "epoch": 0.7509778357235984, "grad_norm": 1.0516972541809082, "learning_rate": 9.995174328003631e-05, "loss": 4.5513, "step": 216 }, { "epoch": 0.7544545849630595, "grad_norm": 1.375748634338379, "learning_rate": 9.9947638824328e-05, "loss": 4.5299, "step": 217 }, { "epoch": 0.7579313342025207, "grad_norm": 0.9336188435554504, "learning_rate": 9.99433669591504e-05, "loss": 4.5093, "step": 218 }, { "epoch": 0.7614080834419817, "grad_norm": 1.1167926788330078, "learning_rate": 9.99389276988215e-05, "loss": 4.5166, "step": 219 }, { "epoch": 0.7648848326814428, "grad_norm": 1.5452927350997925, "learning_rate": 9.993432105822034e-05, "loss": 4.5704, "step": 220 }, { "epoch": 0.768361581920904, "grad_norm": 0.8405366539955139, "learning_rate": 9.9929547052787e-05, "loss": 4.4324, "step": 221 }, { "epoch": 0.771838331160365, "grad_norm": 0.9402779936790466, "learning_rate": 9.992460569852256e-05, "loss": 4.5248, "step": 222 }, { "epoch": 0.7753150803998262, "grad_norm": 1.0658060312271118, "learning_rate": 9.991949701198889e-05, "loss": 4.4155, "step": 223 }, { "epoch": 0.7787918296392873, "grad_norm": 1.407897710800171, "learning_rate": 9.99142210103088e-05, "loss": 4.517, "step": 224 }, { "epoch": 0.7822685788787483, "grad_norm": 1.3789775371551514, "learning_rate": 9.990877771116589e-05, "loss": 4.5166, "step": 225 }, { "epoch": 0.7857453281182095, "grad_norm": 1.715079665184021, "learning_rate": 9.99031671328044e-05, "loss": 4.4957, "step": 226 }, { "epoch": 0.7892220773576706, "grad_norm": 0.7953473329544067, "learning_rate": 9.989738929402934e-05, "loss": 4.4265, "step": 227 }, { "epoch": 0.7926988265971316, "grad_norm": 1.3501601219177246, "learning_rate": 9.98914442142063e-05, "loss": 4.4762, "step": 228 }, { "epoch": 0.7961755758365928, "grad_norm": 1.1037392616271973, "learning_rate": 9.988533191326137e-05, "loss": 4.553, "step": 229 }, { "epoch": 0.7996523250760539, "grad_norm": 0.9815847873687744, "learning_rate": 9.987905241168117e-05, "loss": 4.4133, "step": 230 }, { "epoch": 0.803129074315515, "grad_norm": 0.9624298214912415, "learning_rate": 9.987260573051269e-05, "loss": 4.3117, "step": 231 }, { "epoch": 0.8066058235549761, "grad_norm": 0.8100147843360901, "learning_rate": 9.986599189136325e-05, "loss": 4.4257, "step": 232 }, { "epoch": 0.8100825727944372, "grad_norm": 0.8231883645057678, "learning_rate": 9.98592109164005e-05, "loss": 4.3901, "step": 233 }, { "epoch": 0.8135593220338984, "grad_norm": 0.7882739901542664, "learning_rate": 9.985226282835216e-05, "loss": 4.4451, "step": 234 }, { "epoch": 0.8170360712733594, "grad_norm": 1.0251091718673706, "learning_rate": 9.984514765050619e-05, "loss": 4.4834, "step": 235 }, { "epoch": 0.8205128205128205, "grad_norm": 1.483324408531189, "learning_rate": 9.983786540671051e-05, "loss": 4.3826, "step": 236 }, { "epoch": 0.8239895697522817, "grad_norm": 1.0795848369598389, "learning_rate": 9.983041612137301e-05, "loss": 4.4195, "step": 237 }, { "epoch": 0.8274663189917427, "grad_norm": 0.9076191186904907, "learning_rate": 9.982279981946143e-05, "loss": 4.3718, "step": 238 }, { "epoch": 0.8309430682312038, "grad_norm": 0.7669966816902161, "learning_rate": 9.981501652650337e-05, "loss": 4.3789, "step": 239 }, { "epoch": 0.834419817470665, "grad_norm": 0.9206002354621887, "learning_rate": 9.980706626858607e-05, "loss": 4.4972, "step": 240 }, { "epoch": 0.837896566710126, "grad_norm": 0.9446786046028137, "learning_rate": 9.979894907235639e-05, "loss": 4.4284, "step": 241 }, { "epoch": 0.8413733159495871, "grad_norm": 0.898154616355896, "learning_rate": 9.979066496502074e-05, "loss": 4.3296, "step": 242 }, { "epoch": 0.8448500651890483, "grad_norm": 1.006205677986145, "learning_rate": 9.978221397434496e-05, "loss": 4.3849, "step": 243 }, { "epoch": 0.8483268144285093, "grad_norm": 1.512513279914856, "learning_rate": 9.977359612865423e-05, "loss": 4.4939, "step": 244 }, { "epoch": 0.8518035636679705, "grad_norm": 0.9920049905776978, "learning_rate": 9.976481145683299e-05, "loss": 4.3769, "step": 245 }, { "epoch": 0.8552803129074316, "grad_norm": 0.7966930866241455, "learning_rate": 9.97558599883248e-05, "loss": 4.4977, "step": 246 }, { "epoch": 0.8587570621468926, "grad_norm": 0.715207040309906, "learning_rate": 9.974674175313228e-05, "loss": 4.4068, "step": 247 }, { "epoch": 0.8622338113863538, "grad_norm": 0.9936574697494507, "learning_rate": 9.973745678181705e-05, "loss": 4.477, "step": 248 }, { "epoch": 0.8657105606258149, "grad_norm": 1.4594191312789917, "learning_rate": 9.972800510549951e-05, "loss": 4.4045, "step": 249 }, { "epoch": 0.8691873098652759, "grad_norm": 0.7639303207397461, "learning_rate": 9.971838675585888e-05, "loss": 4.3301, "step": 250 }, { "epoch": 0.8726640591047371, "grad_norm": 0.901237964630127, "learning_rate": 9.970860176513291e-05, "loss": 4.5048, "step": 251 }, { "epoch": 0.8761408083441982, "grad_norm": 1.1843230724334717, "learning_rate": 9.9698650166118e-05, "loss": 4.4119, "step": 252 }, { "epoch": 0.8796175575836592, "grad_norm": 0.8304505944252014, "learning_rate": 9.96885319921689e-05, "loss": 4.3257, "step": 253 }, { "epoch": 0.8830943068231204, "grad_norm": 0.7715577483177185, "learning_rate": 9.96782472771987e-05, "loss": 4.4614, "step": 254 }, { "epoch": 0.8865710560625815, "grad_norm": 0.9905951619148254, "learning_rate": 9.966779605567866e-05, "loss": 4.389, "step": 255 }, { "epoch": 0.8900478053020426, "grad_norm": 0.8776286244392395, "learning_rate": 9.965717836263812e-05, "loss": 4.4466, "step": 256 }, { "epoch": 0.8935245545415037, "grad_norm": 0.7492311596870422, "learning_rate": 9.964639423366442e-05, "loss": 4.4444, "step": 257 }, { "epoch": 0.8970013037809648, "grad_norm": 0.7437993288040161, "learning_rate": 9.96354437049027e-05, "loss": 4.4429, "step": 258 }, { "epoch": 0.9004780530204259, "grad_norm": 0.9150131344795227, "learning_rate": 9.962432681305586e-05, "loss": 4.2684, "step": 259 }, { "epoch": 0.903954802259887, "grad_norm": 0.9309634566307068, "learning_rate": 9.961304359538437e-05, "loss": 4.4381, "step": 260 }, { "epoch": 0.9074315514993481, "grad_norm": 1.0173296928405762, "learning_rate": 9.960159408970616e-05, "loss": 4.3821, "step": 261 }, { "epoch": 0.9109083007388092, "grad_norm": 1.237082839012146, "learning_rate": 9.958997833439657e-05, "loss": 4.3013, "step": 262 }, { "epoch": 0.9143850499782703, "grad_norm": 0.8809208869934082, "learning_rate": 9.95781963683881e-05, "loss": 4.4633, "step": 263 }, { "epoch": 0.9178617992177314, "grad_norm": 0.8925305008888245, "learning_rate": 9.956624823117036e-05, "loss": 4.3338, "step": 264 }, { "epoch": 0.9213385484571925, "grad_norm": 1.145957589149475, "learning_rate": 9.955413396278989e-05, "loss": 4.363, "step": 265 }, { "epoch": 0.9248152976966536, "grad_norm": 0.9999008178710938, "learning_rate": 9.954185360385013e-05, "loss": 4.3985, "step": 266 }, { "epoch": 0.9282920469361148, "grad_norm": 1.0845799446105957, "learning_rate": 9.952940719551112e-05, "loss": 4.4413, "step": 267 }, { "epoch": 0.9317687961755758, "grad_norm": 1.0020147562026978, "learning_rate": 9.951679477948947e-05, "loss": 4.2964, "step": 268 }, { "epoch": 0.9352455454150369, "grad_norm": 0.8358871340751648, "learning_rate": 9.95040163980582e-05, "loss": 4.4035, "step": 269 }, { "epoch": 0.9387222946544981, "grad_norm": 0.9009070992469788, "learning_rate": 9.949107209404665e-05, "loss": 4.51, "step": 270 }, { "epoch": 0.9421990438939591, "grad_norm": 1.1610163450241089, "learning_rate": 9.947796191084017e-05, "loss": 4.4029, "step": 271 }, { "epoch": 0.9456757931334202, "grad_norm": 0.9122158288955688, "learning_rate": 9.946468589238021e-05, "loss": 4.2494, "step": 272 }, { "epoch": 0.9491525423728814, "grad_norm": 0.908501148223877, "learning_rate": 9.945124408316398e-05, "loss": 4.3541, "step": 273 }, { "epoch": 0.9526292916123424, "grad_norm": 0.9475510120391846, "learning_rate": 9.943763652824436e-05, "loss": 4.3241, "step": 274 }, { "epoch": 0.9561060408518036, "grad_norm": 0.8620980978012085, "learning_rate": 9.942386327322978e-05, "loss": 4.2955, "step": 275 }, { "epoch": 0.9595827900912647, "grad_norm": 0.8473606109619141, "learning_rate": 9.940992436428409e-05, "loss": 4.2834, "step": 276 }, { "epoch": 0.9630595393307257, "grad_norm": 0.7120651006698608, "learning_rate": 9.93958198481263e-05, "loss": 4.4614, "step": 277 }, { "epoch": 0.9665362885701869, "grad_norm": 0.613216757774353, "learning_rate": 9.938154977203049e-05, "loss": 4.3594, "step": 278 }, { "epoch": 0.970013037809648, "grad_norm": 0.7402132153511047, "learning_rate": 9.93671141838257e-05, "loss": 4.377, "step": 279 }, { "epoch": 0.973489787049109, "grad_norm": 0.6736766695976257, "learning_rate": 9.935251313189564e-05, "loss": 4.2954, "step": 280 }, { "epoch": 0.9769665362885702, "grad_norm": 0.6440555453300476, "learning_rate": 9.93377466651787e-05, "loss": 4.342, "step": 281 }, { "epoch": 0.9804432855280313, "grad_norm": 0.6008772850036621, "learning_rate": 9.932281483316758e-05, "loss": 4.2842, "step": 282 }, { "epoch": 0.9839200347674923, "grad_norm": 0.6302841901779175, "learning_rate": 9.930771768590933e-05, "loss": 4.347, "step": 283 }, { "epoch": 0.9873967840069535, "grad_norm": 0.6728438138961792, "learning_rate": 9.929245527400503e-05, "loss": 4.403, "step": 284 }, { "epoch": 0.9908735332464146, "grad_norm": 0.8032251596450806, "learning_rate": 9.927702764860973e-05, "loss": 4.3079, "step": 285 }, { "epoch": 0.9943502824858758, "grad_norm": 0.8664717674255371, "learning_rate": 9.926143486143214e-05, "loss": 4.3445, "step": 286 }, { "epoch": 0.9978270317253368, "grad_norm": 0.9623958468437195, "learning_rate": 9.924567696473464e-05, "loss": 4.2848, "step": 287 }, { "epoch": 1.0, "grad_norm": 0.8602651357650757, "learning_rate": 9.922975401133293e-05, "loss": 4.4036, "step": 288 }, { "epoch": 1.0034767492394612, "grad_norm": 1.0667355060577393, "learning_rate": 9.921366605459597e-05, "loss": 4.1641, "step": 289 }, { "epoch": 1.0069534984789221, "grad_norm": 1.0437101125717163, "learning_rate": 9.919741314844577e-05, "loss": 4.2776, "step": 290 }, { "epoch": 1.0104302477183833, "grad_norm": 1.0120586156845093, "learning_rate": 9.918099534735718e-05, "loss": 4.2515, "step": 291 }, { "epoch": 1.0139069969578445, "grad_norm": 0.9842764735221863, "learning_rate": 9.916441270635772e-05, "loss": 4.2831, "step": 292 }, { "epoch": 1.0173837461973054, "grad_norm": 0.8590813279151917, "learning_rate": 9.914766528102744e-05, "loss": 4.2932, "step": 293 }, { "epoch": 1.0208604954367666, "grad_norm": 0.8668161034584045, "learning_rate": 9.913075312749866e-05, "loss": 4.1978, "step": 294 }, { "epoch": 1.0243372446762278, "grad_norm": 0.6615017652511597, "learning_rate": 9.911367630245582e-05, "loss": 4.3328, "step": 295 }, { "epoch": 1.0278139939156887, "grad_norm": 0.802143931388855, "learning_rate": 9.909643486313533e-05, "loss": 4.2053, "step": 296 }, { "epoch": 1.03129074315515, "grad_norm": 0.9250998497009277, "learning_rate": 9.907902886732532e-05, "loss": 4.2751, "step": 297 }, { "epoch": 1.034767492394611, "grad_norm": 1.1024829149246216, "learning_rate": 9.90614583733654e-05, "loss": 4.3187, "step": 298 }, { "epoch": 1.038244241634072, "grad_norm": 0.8108821511268616, "learning_rate": 9.904372344014665e-05, "loss": 4.234, "step": 299 }, { "epoch": 1.0417209908735332, "grad_norm": 0.7548679709434509, "learning_rate": 9.90258241271112e-05, "loss": 4.2868, "step": 300 }, { "epoch": 1.0451977401129944, "grad_norm": 0.7505620718002319, "learning_rate": 9.900776049425215e-05, "loss": 4.3266, "step": 301 }, { "epoch": 1.0486744893524556, "grad_norm": 0.7533482909202576, "learning_rate": 9.898953260211338e-05, "loss": 4.3681, "step": 302 }, { "epoch": 1.0521512385919165, "grad_norm": 0.7146593928337097, "learning_rate": 9.897114051178934e-05, "loss": 4.3203, "step": 303 }, { "epoch": 1.0556279878313777, "grad_norm": 0.7534311413764954, "learning_rate": 9.895258428492475e-05, "loss": 4.3486, "step": 304 }, { "epoch": 1.0591047370708389, "grad_norm": 0.7221617102622986, "learning_rate": 9.89338639837145e-05, "loss": 4.3302, "step": 305 }, { "epoch": 1.0625814863102998, "grad_norm": 0.7446046471595764, "learning_rate": 9.891497967090344e-05, "loss": 4.2775, "step": 306 }, { "epoch": 1.066058235549761, "grad_norm": 0.9568740725517273, "learning_rate": 9.889593140978608e-05, "loss": 4.3143, "step": 307 }, { "epoch": 1.0695349847892222, "grad_norm": 1.138326644897461, "learning_rate": 9.887671926420648e-05, "loss": 4.2791, "step": 308 }, { "epoch": 1.073011734028683, "grad_norm": 1.1122509241104126, "learning_rate": 9.885734329855798e-05, "loss": 4.4454, "step": 309 }, { "epoch": 1.0764884832681443, "grad_norm": 0.7699280977249146, "learning_rate": 9.883780357778299e-05, "loss": 4.2428, "step": 310 }, { "epoch": 1.0799652325076055, "grad_norm": 0.7796168923377991, "learning_rate": 9.881810016737276e-05, "loss": 4.2749, "step": 311 }, { "epoch": 1.0834419817470664, "grad_norm": 0.9707072377204895, "learning_rate": 9.879823313336722e-05, "loss": 4.2417, "step": 312 }, { "epoch": 1.0869187309865276, "grad_norm": 1.184442162513733, "learning_rate": 9.877820254235471e-05, "loss": 4.1928, "step": 313 }, { "epoch": 1.0903954802259888, "grad_norm": 0.8373008966445923, "learning_rate": 9.87580084614717e-05, "loss": 4.3421, "step": 314 }, { "epoch": 1.0938722294654497, "grad_norm": 0.7426689863204956, "learning_rate": 9.873765095840271e-05, "loss": 4.1962, "step": 315 }, { "epoch": 1.0973489787049109, "grad_norm": 0.7142062187194824, "learning_rate": 9.871713010137997e-05, "loss": 4.2231, "step": 316 }, { "epoch": 1.100825727944372, "grad_norm": 0.6791448593139648, "learning_rate": 9.869644595918323e-05, "loss": 4.305, "step": 317 }, { "epoch": 1.104302477183833, "grad_norm": 0.7155593633651733, "learning_rate": 9.86755986011395e-05, "loss": 4.1596, "step": 318 }, { "epoch": 1.1077792264232942, "grad_norm": 0.6412737965583801, "learning_rate": 9.865458809712286e-05, "loss": 4.239, "step": 319 }, { "epoch": 1.1112559756627554, "grad_norm": 0.6325717568397522, "learning_rate": 9.86334145175542e-05, "loss": 4.1725, "step": 320 }, { "epoch": 1.1147327249022165, "grad_norm": 0.5991365909576416, "learning_rate": 9.861207793340101e-05, "loss": 4.3368, "step": 321 }, { "epoch": 1.1182094741416775, "grad_norm": 0.7135671973228455, "learning_rate": 9.859057841617709e-05, "loss": 4.3057, "step": 322 }, { "epoch": 1.1216862233811387, "grad_norm": 0.8155140280723572, "learning_rate": 9.856891603794235e-05, "loss": 4.3181, "step": 323 }, { "epoch": 1.1251629726205998, "grad_norm": 0.8652500510215759, "learning_rate": 9.85470908713026e-05, "loss": 4.1882, "step": 324 }, { "epoch": 1.1286397218600608, "grad_norm": 0.7795478105545044, "learning_rate": 9.852510298940922e-05, "loss": 4.2282, "step": 325 }, { "epoch": 1.132116471099522, "grad_norm": 0.8294603228569031, "learning_rate": 9.850295246595898e-05, "loss": 4.2696, "step": 326 }, { "epoch": 1.1355932203389831, "grad_norm": 1.0049405097961426, "learning_rate": 9.848063937519376e-05, "loss": 4.2633, "step": 327 }, { "epoch": 1.139069969578444, "grad_norm": 1.0607136487960815, "learning_rate": 9.845816379190036e-05, "loss": 4.1424, "step": 328 }, { "epoch": 1.1425467188179053, "grad_norm": 0.9381546974182129, "learning_rate": 9.843552579141016e-05, "loss": 4.2441, "step": 329 }, { "epoch": 1.1460234680573664, "grad_norm": 0.8802816271781921, "learning_rate": 9.841272544959892e-05, "loss": 4.3125, "step": 330 }, { "epoch": 1.1495002172968274, "grad_norm": 0.8450530767440796, "learning_rate": 9.838976284288657e-05, "loss": 4.2173, "step": 331 }, { "epoch": 1.1529769665362886, "grad_norm": 0.8956658244132996, "learning_rate": 9.836663804823683e-05, "loss": 4.1585, "step": 332 }, { "epoch": 1.1564537157757497, "grad_norm": 0.8815373182296753, "learning_rate": 9.834335114315708e-05, "loss": 4.1268, "step": 333 }, { "epoch": 1.1599304650152107, "grad_norm": 0.785399317741394, "learning_rate": 9.831990220569801e-05, "loss": 4.264, "step": 334 }, { "epoch": 1.1634072142546719, "grad_norm": 0.7312754988670349, "learning_rate": 9.829629131445342e-05, "loss": 4.1669, "step": 335 }, { "epoch": 1.166883963494133, "grad_norm": 0.7686638832092285, "learning_rate": 9.827251854855991e-05, "loss": 4.1686, "step": 336 }, { "epoch": 1.170360712733594, "grad_norm": 0.7417734265327454, "learning_rate": 9.824858398769665e-05, "loss": 4.2047, "step": 337 }, { "epoch": 1.1738374619730552, "grad_norm": 0.8165032863616943, "learning_rate": 9.82244877120851e-05, "loss": 4.2661, "step": 338 }, { "epoch": 1.1773142112125163, "grad_norm": 0.9953269958496094, "learning_rate": 9.820022980248871e-05, "loss": 4.143, "step": 339 }, { "epoch": 1.1807909604519775, "grad_norm": 1.0205968618392944, "learning_rate": 9.817581034021272e-05, "loss": 4.146, "step": 340 }, { "epoch": 1.1842677096914385, "grad_norm": 0.8952431082725525, "learning_rate": 9.815122940710382e-05, "loss": 4.2152, "step": 341 }, { "epoch": 1.1877444589308996, "grad_norm": 0.799082338809967, "learning_rate": 9.81264870855499e-05, "loss": 4.2976, "step": 342 }, { "epoch": 1.1912212081703606, "grad_norm": 0.8718714714050293, "learning_rate": 9.81015834584798e-05, "loss": 4.2623, "step": 343 }, { "epoch": 1.1946979574098218, "grad_norm": 0.8109663724899292, "learning_rate": 9.807651860936297e-05, "loss": 4.0596, "step": 344 }, { "epoch": 1.198174706649283, "grad_norm": 0.742445707321167, "learning_rate": 9.805129262220927e-05, "loss": 4.165, "step": 345 }, { "epoch": 1.2016514558887441, "grad_norm": 0.7259525060653687, "learning_rate": 9.802590558156862e-05, "loss": 4.2433, "step": 346 }, { "epoch": 1.205128205128205, "grad_norm": 0.5326026678085327, "learning_rate": 9.800035757253074e-05, "loss": 4.1949, "step": 347 }, { "epoch": 1.2086049543676662, "grad_norm": 0.6962956190109253, "learning_rate": 9.797464868072488e-05, "loss": 4.2642, "step": 348 }, { "epoch": 1.2120817036071274, "grad_norm": 0.6962010860443115, "learning_rate": 9.794877899231951e-05, "loss": 4.2872, "step": 349 }, { "epoch": 1.2155584528465884, "grad_norm": 0.596272885799408, "learning_rate": 9.792274859402205e-05, "loss": 4.2611, "step": 350 }, { "epoch": 1.2190352020860495, "grad_norm": 0.6690101027488708, "learning_rate": 9.789655757307858e-05, "loss": 4.3164, "step": 351 }, { "epoch": 1.2225119513255107, "grad_norm": 0.64564049243927, "learning_rate": 9.787020601727352e-05, "loss": 4.2659, "step": 352 }, { "epoch": 1.2259887005649717, "grad_norm": 0.5660618543624878, "learning_rate": 9.784369401492937e-05, "loss": 4.2104, "step": 353 }, { "epoch": 1.2294654498044328, "grad_norm": 0.5513273477554321, "learning_rate": 9.781702165490639e-05, "loss": 4.1671, "step": 354 }, { "epoch": 1.232942199043894, "grad_norm": 0.617933452129364, "learning_rate": 9.779018902660228e-05, "loss": 4.2193, "step": 355 }, { "epoch": 1.236418948283355, "grad_norm": 0.632296621799469, "learning_rate": 9.776319621995201e-05, "loss": 4.1993, "step": 356 }, { "epoch": 1.2398956975228161, "grad_norm": 0.6899546384811401, "learning_rate": 9.773604332542729e-05, "loss": 4.1942, "step": 357 }, { "epoch": 1.2433724467622773, "grad_norm": 0.9000681638717651, "learning_rate": 9.770873043403648e-05, "loss": 4.2009, "step": 358 }, { "epoch": 1.2468491960017385, "grad_norm": 1.3722553253173828, "learning_rate": 9.76812576373242e-05, "loss": 4.1257, "step": 359 }, { "epoch": 1.2503259452411994, "grad_norm": 0.8985775709152222, "learning_rate": 9.765362502737097e-05, "loss": 4.1981, "step": 360 }, { "epoch": 1.2538026944806606, "grad_norm": 0.7751098275184631, "learning_rate": 9.762583269679303e-05, "loss": 4.1548, "step": 361 }, { "epoch": 1.2572794437201216, "grad_norm": 0.7167885899543762, "learning_rate": 9.759788073874189e-05, "loss": 4.1174, "step": 362 }, { "epoch": 1.2607561929595827, "grad_norm": 0.9465803503990173, "learning_rate": 9.756976924690412e-05, "loss": 4.1763, "step": 363 }, { "epoch": 1.264232942199044, "grad_norm": 1.0560104846954346, "learning_rate": 9.754149831550098e-05, "loss": 4.1163, "step": 364 }, { "epoch": 1.267709691438505, "grad_norm": 0.811275839805603, "learning_rate": 9.751306803928815e-05, "loss": 4.1351, "step": 365 }, { "epoch": 1.271186440677966, "grad_norm": 0.7459397912025452, "learning_rate": 9.748447851355535e-05, "loss": 4.1354, "step": 366 }, { "epoch": 1.2746631899174272, "grad_norm": 0.6689170002937317, "learning_rate": 9.745572983412607e-05, "loss": 4.2022, "step": 367 }, { "epoch": 1.2781399391568882, "grad_norm": 0.6770418882369995, "learning_rate": 9.742682209735727e-05, "loss": 4.1978, "step": 368 }, { "epoch": 1.2816166883963493, "grad_norm": 0.7698088884353638, "learning_rate": 9.7397755400139e-05, "loss": 4.1294, "step": 369 }, { "epoch": 1.2850934376358105, "grad_norm": 0.8307857513427734, "learning_rate": 9.736852983989404e-05, "loss": 4.0905, "step": 370 }, { "epoch": 1.2885701868752717, "grad_norm": 0.8605286478996277, "learning_rate": 9.733914551457771e-05, "loss": 4.2347, "step": 371 }, { "epoch": 1.2920469361147326, "grad_norm": 0.8637606501579285, "learning_rate": 9.730960252267743e-05, "loss": 4.1814, "step": 372 }, { "epoch": 1.2955236853541938, "grad_norm": 0.9002863764762878, "learning_rate": 9.727990096321244e-05, "loss": 4.1763, "step": 373 }, { "epoch": 1.299000434593655, "grad_norm": 0.7790195345878601, "learning_rate": 9.725004093573342e-05, "loss": 4.1895, "step": 374 }, { "epoch": 1.302477183833116, "grad_norm": 0.8514074087142944, "learning_rate": 9.722002254032224e-05, "loss": 4.1396, "step": 375 }, { "epoch": 1.3059539330725771, "grad_norm": 0.9697980284690857, "learning_rate": 9.718984587759148e-05, "loss": 4.1228, "step": 376 }, { "epoch": 1.3094306823120383, "grad_norm": 0.9058026671409607, "learning_rate": 9.715951104868428e-05, "loss": 4.1695, "step": 377 }, { "epoch": 1.3129074315514995, "grad_norm": 0.8548423051834106, "learning_rate": 9.712901815527386e-05, "loss": 4.17, "step": 378 }, { "epoch": 1.3163841807909604, "grad_norm": 0.9168500304222107, "learning_rate": 9.709836729956325e-05, "loss": 4.1448, "step": 379 }, { "epoch": 1.3198609300304216, "grad_norm": 1.040820837020874, "learning_rate": 9.706755858428486e-05, "loss": 4.2922, "step": 380 }, { "epoch": 1.3233376792698825, "grad_norm": 0.9632564187049866, "learning_rate": 9.703659211270028e-05, "loss": 4.0698, "step": 381 }, { "epoch": 1.3268144285093437, "grad_norm": 1.0353721380233765, "learning_rate": 9.70054679885998e-05, "loss": 4.0974, "step": 382 }, { "epoch": 1.330291177748805, "grad_norm": 0.9106934070587158, "learning_rate": 9.69741863163021e-05, "loss": 4.1968, "step": 383 }, { "epoch": 1.333767926988266, "grad_norm": 0.7971479892730713, "learning_rate": 9.694274720065399e-05, "loss": 4.1139, "step": 384 }, { "epoch": 1.337244676227727, "grad_norm": 0.8078503608703613, "learning_rate": 9.691115074702992e-05, "loss": 4.1033, "step": 385 }, { "epoch": 1.3407214254671882, "grad_norm": 0.8231106996536255, "learning_rate": 9.687939706133168e-05, "loss": 4.2063, "step": 386 }, { "epoch": 1.3441981747066492, "grad_norm": 0.7890482544898987, "learning_rate": 9.68474862499881e-05, "loss": 4.0187, "step": 387 }, { "epoch": 1.3476749239461103, "grad_norm": 0.658843994140625, "learning_rate": 9.681541841995461e-05, "loss": 4.114, "step": 388 }, { "epoch": 1.3511516731855715, "grad_norm": 0.7294294238090515, "learning_rate": 9.678319367871293e-05, "loss": 4.198, "step": 389 }, { "epoch": 1.3546284224250327, "grad_norm": 0.9159700274467468, "learning_rate": 9.675081213427076e-05, "loss": 4.1494, "step": 390 }, { "epoch": 1.3581051716644936, "grad_norm": 0.964610755443573, "learning_rate": 9.671827389516122e-05, "loss": 4.2178, "step": 391 }, { "epoch": 1.3615819209039548, "grad_norm": 0.8890171647071838, "learning_rate": 9.668557907044276e-05, "loss": 4.2487, "step": 392 }, { "epoch": 1.365058670143416, "grad_norm": 0.999011754989624, "learning_rate": 9.66527277696986e-05, "loss": 4.222, "step": 393 }, { "epoch": 1.368535419382877, "grad_norm": 1.05777108669281, "learning_rate": 9.661972010303641e-05, "loss": 4.191, "step": 394 }, { "epoch": 1.372012168622338, "grad_norm": 0.6993465423583984, "learning_rate": 9.658655618108796e-05, "loss": 4.0272, "step": 395 }, { "epoch": 1.3754889178617993, "grad_norm": 0.8366750478744507, "learning_rate": 9.655323611500875e-05, "loss": 4.1595, "step": 396 }, { "epoch": 1.3789656671012605, "grad_norm": 0.8456549048423767, "learning_rate": 9.651976001647765e-05, "loss": 4.237, "step": 397 }, { "epoch": 1.3824424163407214, "grad_norm": 0.791394054889679, "learning_rate": 9.648612799769644e-05, "loss": 4.2289, "step": 398 }, { "epoch": 1.3859191655801826, "grad_norm": 0.9533299803733826, "learning_rate": 9.645234017138956e-05, "loss": 4.1586, "step": 399 }, { "epoch": 1.3893959148196435, "grad_norm": 1.044310450553894, "learning_rate": 9.641839665080363e-05, "loss": 4.2023, "step": 400 }, { "epoch": 1.3928726640591047, "grad_norm": 0.920463502407074, "learning_rate": 9.638429754970715e-05, "loss": 4.1882, "step": 401 }, { "epoch": 1.3963494132985659, "grad_norm": 0.9414947032928467, "learning_rate": 9.635004298239004e-05, "loss": 4.0434, "step": 402 }, { "epoch": 1.399826162538027, "grad_norm": 0.8273729085922241, "learning_rate": 9.63156330636633e-05, "loss": 4.1685, "step": 403 }, { "epoch": 1.403302911777488, "grad_norm": 0.6224656105041504, "learning_rate": 9.628106790885865e-05, "loss": 4.1801, "step": 404 }, { "epoch": 1.4067796610169492, "grad_norm": 0.7260700464248657, "learning_rate": 9.62463476338281e-05, "loss": 4.164, "step": 405 }, { "epoch": 1.4102564102564101, "grad_norm": 0.7788506150245667, "learning_rate": 9.621147235494356e-05, "loss": 4.1687, "step": 406 }, { "epoch": 1.4137331594958713, "grad_norm": 0.9212743043899536, "learning_rate": 9.617644218909649e-05, "loss": 4.094, "step": 407 }, { "epoch": 1.4172099087353325, "grad_norm": 0.9231887459754944, "learning_rate": 9.614125725369747e-05, "loss": 4.221, "step": 408 }, { "epoch": 1.4206866579747937, "grad_norm": 0.8516282439231873, "learning_rate": 9.610591766667583e-05, "loss": 4.0776, "step": 409 }, { "epoch": 1.4241634072142546, "grad_norm": 0.6721898913383484, "learning_rate": 9.607042354647924e-05, "loss": 4.0561, "step": 410 }, { "epoch": 1.4276401564537158, "grad_norm": 0.9020458459854126, "learning_rate": 9.60347750120733e-05, "loss": 4.0563, "step": 411 }, { "epoch": 1.431116905693177, "grad_norm": 0.8082847595214844, "learning_rate": 9.599897218294122e-05, "loss": 4.0433, "step": 412 }, { "epoch": 1.434593654932638, "grad_norm": 0.89698725938797, "learning_rate": 9.596301517908328e-05, "loss": 4.1292, "step": 413 }, { "epoch": 1.438070404172099, "grad_norm": 1.1492091417312622, "learning_rate": 9.592690412101658e-05, "loss": 4.1224, "step": 414 }, { "epoch": 1.4415471534115603, "grad_norm": 0.8929200172424316, "learning_rate": 9.589063912977451e-05, "loss": 4.0852, "step": 415 }, { "epoch": 1.4450239026510214, "grad_norm": 0.8456454873085022, "learning_rate": 9.585422032690643e-05, "loss": 4.1604, "step": 416 }, { "epoch": 1.4485006518904824, "grad_norm": 0.9664533734321594, "learning_rate": 9.581764783447719e-05, "loss": 4.234, "step": 417 }, { "epoch": 1.4519774011299436, "grad_norm": 0.8303775787353516, "learning_rate": 9.578092177506683e-05, "loss": 4.1508, "step": 418 }, { "epoch": 1.4554541503694045, "grad_norm": 0.735723614692688, "learning_rate": 9.574404227177005e-05, "loss": 4.0714, "step": 419 }, { "epoch": 1.4589308996088657, "grad_norm": 0.7835253477096558, "learning_rate": 9.570700944819584e-05, "loss": 4.0993, "step": 420 }, { "epoch": 1.4624076488483269, "grad_norm": 0.8943220376968384, "learning_rate": 9.566982342846709e-05, "loss": 4.0499, "step": 421 }, { "epoch": 1.465884398087788, "grad_norm": 1.0396778583526611, "learning_rate": 9.563248433722019e-05, "loss": 4.0751, "step": 422 }, { "epoch": 1.469361147327249, "grad_norm": 0.9445440173149109, "learning_rate": 9.559499229960451e-05, "loss": 4.0111, "step": 423 }, { "epoch": 1.4728378965667102, "grad_norm": 0.8301072716712952, "learning_rate": 9.55573474412821e-05, "loss": 4.045, "step": 424 }, { "epoch": 1.476314645806171, "grad_norm": 0.9725110530853271, "learning_rate": 9.551954988842724e-05, "loss": 4.0069, "step": 425 }, { "epoch": 1.4797913950456323, "grad_norm": 1.132828712463379, "learning_rate": 9.548159976772592e-05, "loss": 4.125, "step": 426 }, { "epoch": 1.4832681442850935, "grad_norm": 0.7702617645263672, "learning_rate": 9.544349720637556e-05, "loss": 4.0776, "step": 427 }, { "epoch": 1.4867448935245546, "grad_norm": 0.7960605025291443, "learning_rate": 9.540524233208448e-05, "loss": 4.0346, "step": 428 }, { "epoch": 1.4902216427640156, "grad_norm": 0.9969260096549988, "learning_rate": 9.536683527307153e-05, "loss": 4.1162, "step": 429 }, { "epoch": 1.4936983920034768, "grad_norm": 1.1111648082733154, "learning_rate": 9.53282761580656e-05, "loss": 4.0933, "step": 430 }, { "epoch": 1.497175141242938, "grad_norm": 1.198155164718628, "learning_rate": 9.528956511630529e-05, "loss": 4.0783, "step": 431 }, { "epoch": 1.5006518904823989, "grad_norm": 1.0453709363937378, "learning_rate": 9.525070227753834e-05, "loss": 4.0607, "step": 432 }, { "epoch": 1.50412863972186, "grad_norm": 0.9988518953323364, "learning_rate": 9.521168777202132e-05, "loss": 4.112, "step": 433 }, { "epoch": 1.5076053889613212, "grad_norm": 0.918400764465332, "learning_rate": 9.517252173051911e-05, "loss": 4.0134, "step": 434 }, { "epoch": 1.5110821382007824, "grad_norm": 0.869310200214386, "learning_rate": 9.513320428430452e-05, "loss": 4.0576, "step": 435 }, { "epoch": 1.5145588874402434, "grad_norm": 0.9518806338310242, "learning_rate": 9.509373556515782e-05, "loss": 4.1244, "step": 436 }, { "epoch": 1.5180356366797043, "grad_norm": 0.8721877336502075, "learning_rate": 9.505411570536626e-05, "loss": 4.0608, "step": 437 }, { "epoch": 1.5215123859191655, "grad_norm": 0.7652842998504639, "learning_rate": 9.50143448377237e-05, "loss": 4.0603, "step": 438 }, { "epoch": 1.5249891351586267, "grad_norm": 0.8885753750801086, "learning_rate": 9.497442309553016e-05, "loss": 3.8367, "step": 439 }, { "epoch": 1.5284658843980878, "grad_norm": 0.6828482151031494, "learning_rate": 9.493435061259131e-05, "loss": 4.1155, "step": 440 }, { "epoch": 1.531942633637549, "grad_norm": 0.729836642742157, "learning_rate": 9.489412752321805e-05, "loss": 4.0957, "step": 441 }, { "epoch": 1.53541938287701, "grad_norm": 0.9394379258155823, "learning_rate": 9.485375396222609e-05, "loss": 3.9957, "step": 442 }, { "epoch": 1.5388961321164711, "grad_norm": 0.9025709629058838, "learning_rate": 9.481323006493547e-05, "loss": 4.0381, "step": 443 }, { "epoch": 1.542372881355932, "grad_norm": 0.861587405204773, "learning_rate": 9.477255596717012e-05, "loss": 4.2398, "step": 444 }, { "epoch": 1.5458496305953933, "grad_norm": 0.8114928007125854, "learning_rate": 9.473173180525737e-05, "loss": 4.096, "step": 445 }, { "epoch": 1.5493263798348544, "grad_norm": 0.8760333061218262, "learning_rate": 9.469075771602755e-05, "loss": 4.1301, "step": 446 }, { "epoch": 1.5528031290743156, "grad_norm": 1.0566339492797852, "learning_rate": 9.464963383681349e-05, "loss": 3.9805, "step": 447 }, { "epoch": 1.5562798783137766, "grad_norm": 0.9962750673294067, "learning_rate": 9.460836030545007e-05, "loss": 3.9729, "step": 448 }, { "epoch": 1.5597566275532377, "grad_norm": 0.9361708164215088, "learning_rate": 9.456693726027375e-05, "loss": 4.0141, "step": 449 }, { "epoch": 1.5632333767926987, "grad_norm": 0.9272918105125427, "learning_rate": 9.452536484012212e-05, "loss": 4.079, "step": 450 }, { "epoch": 1.5667101260321599, "grad_norm": 1.0941708087921143, "learning_rate": 9.448364318433345e-05, "loss": 4.0688, "step": 451 }, { "epoch": 1.570186875271621, "grad_norm": 0.9107572436332703, "learning_rate": 9.444177243274618e-05, "loss": 3.9463, "step": 452 }, { "epoch": 1.5736636245110822, "grad_norm": 0.786167562007904, "learning_rate": 9.439975272569848e-05, "loss": 4.0317, "step": 453 }, { "epoch": 1.5771403737505434, "grad_norm": 0.7826558947563171, "learning_rate": 9.435758420402778e-05, "loss": 4.0186, "step": 454 }, { "epoch": 1.5806171229900043, "grad_norm": 0.8052117824554443, "learning_rate": 9.431526700907027e-05, "loss": 4.0627, "step": 455 }, { "epoch": 1.5840938722294653, "grad_norm": 1.0061588287353516, "learning_rate": 9.42728012826605e-05, "loss": 3.9494, "step": 456 }, { "epoch": 1.5875706214689265, "grad_norm": 0.9913825392723083, "learning_rate": 9.423018716713079e-05, "loss": 4.0443, "step": 457 }, { "epoch": 1.5910473707083876, "grad_norm": 0.8176896572113037, "learning_rate": 9.418742480531085e-05, "loss": 3.9168, "step": 458 }, { "epoch": 1.5945241199478488, "grad_norm": 0.7156447768211365, "learning_rate": 9.414451434052727e-05, "loss": 4.0151, "step": 459 }, { "epoch": 1.59800086918731, "grad_norm": 0.7260638475418091, "learning_rate": 9.410145591660301e-05, "loss": 4.0644, "step": 460 }, { "epoch": 1.601477618426771, "grad_norm": 0.8086751103401184, "learning_rate": 9.405824967785698e-05, "loss": 3.9587, "step": 461 }, { "epoch": 1.6049543676662321, "grad_norm": 0.9354509115219116, "learning_rate": 9.401489576910349e-05, "loss": 3.9932, "step": 462 }, { "epoch": 1.608431116905693, "grad_norm": 1.1333049535751343, "learning_rate": 9.39713943356518e-05, "loss": 3.9713, "step": 463 }, { "epoch": 1.6119078661451542, "grad_norm": 0.9655582904815674, "learning_rate": 9.392774552330567e-05, "loss": 3.9768, "step": 464 }, { "epoch": 1.6153846153846154, "grad_norm": 1.089020013809204, "learning_rate": 9.388394947836279e-05, "loss": 3.8841, "step": 465 }, { "epoch": 1.6188613646240766, "grad_norm": 0.7879194021224976, "learning_rate": 9.38400063476143e-05, "loss": 3.9356, "step": 466 }, { "epoch": 1.6223381138635375, "grad_norm": 0.6482805013656616, "learning_rate": 9.37959162783444e-05, "loss": 3.9739, "step": 467 }, { "epoch": 1.6258148631029987, "grad_norm": 0.7414939403533936, "learning_rate": 9.375167941832973e-05, "loss": 3.9431, "step": 468 }, { "epoch": 1.6292916123424597, "grad_norm": 0.9199277758598328, "learning_rate": 9.370729591583894e-05, "loss": 3.949, "step": 469 }, { "epoch": 1.6327683615819208, "grad_norm": 1.0250579118728638, "learning_rate": 9.366276591963221e-05, "loss": 4.0201, "step": 470 }, { "epoch": 1.636245110821382, "grad_norm": 1.1997772455215454, "learning_rate": 9.361808957896067e-05, "loss": 4.0029, "step": 471 }, { "epoch": 1.6397218600608432, "grad_norm": 1.0056092739105225, "learning_rate": 9.357326704356602e-05, "loss": 3.9536, "step": 472 }, { "epoch": 1.6431986093003044, "grad_norm": 0.8187026977539062, "learning_rate": 9.35282984636799e-05, "loss": 4.0072, "step": 473 }, { "epoch": 1.6466753585397653, "grad_norm": 1.099008321762085, "learning_rate": 9.348318399002347e-05, "loss": 4.0514, "step": 474 }, { "epoch": 1.6501521077792263, "grad_norm": 1.1227092742919922, "learning_rate": 9.343792377380687e-05, "loss": 3.9256, "step": 475 }, { "epoch": 1.6536288570186874, "grad_norm": 0.8582590222358704, "learning_rate": 9.339251796672877e-05, "loss": 3.9037, "step": 476 }, { "epoch": 1.6571056062581486, "grad_norm": 1.0513243675231934, "learning_rate": 9.334696672097576e-05, "loss": 3.9695, "step": 477 }, { "epoch": 1.6605823554976098, "grad_norm": 1.0664559602737427, "learning_rate": 9.330127018922194e-05, "loss": 3.9251, "step": 478 }, { "epoch": 1.664059104737071, "grad_norm": 1.0497174263000488, "learning_rate": 9.325542852462833e-05, "loss": 4.0381, "step": 479 }, { "epoch": 1.667535853976532, "grad_norm": 0.9143519401550293, "learning_rate": 9.320944188084242e-05, "loss": 3.9366, "step": 480 }, { "epoch": 1.671012603215993, "grad_norm": 0.7945806384086609, "learning_rate": 9.31633104119976e-05, "loss": 4.0225, "step": 481 }, { "epoch": 1.674489352455454, "grad_norm": 0.8599205613136292, "learning_rate": 9.31170342727127e-05, "loss": 4.0028, "step": 482 }, { "epoch": 1.6779661016949152, "grad_norm": 1.0326347351074219, "learning_rate": 9.307061361809141e-05, "loss": 3.9804, "step": 483 }, { "epoch": 1.6814428509343764, "grad_norm": 1.0522825717926025, "learning_rate": 9.302404860372185e-05, "loss": 3.9047, "step": 484 }, { "epoch": 1.6849196001738376, "grad_norm": 0.9849698543548584, "learning_rate": 9.29773393856759e-05, "loss": 3.9828, "step": 485 }, { "epoch": 1.6883963494132985, "grad_norm": 1.1080557107925415, "learning_rate": 9.293048612050883e-05, "loss": 3.9129, "step": 486 }, { "epoch": 1.6918730986527597, "grad_norm": 0.9390277862548828, "learning_rate": 9.28834889652587e-05, "loss": 4.0346, "step": 487 }, { "epoch": 1.6953498478922207, "grad_norm": 0.8586708903312683, "learning_rate": 9.283634807744586e-05, "loss": 3.992, "step": 488 }, { "epoch": 1.6988265971316818, "grad_norm": 0.9357092976570129, "learning_rate": 9.278906361507238e-05, "loss": 3.9007, "step": 489 }, { "epoch": 1.702303346371143, "grad_norm": 1.0461989641189575, "learning_rate": 9.274163573662157e-05, "loss": 3.9684, "step": 490 }, { "epoch": 1.7057800956106042, "grad_norm": 1.0791900157928467, "learning_rate": 9.26940646010574e-05, "loss": 3.9989, "step": 491 }, { "epoch": 1.7092568448500653, "grad_norm": 0.7584978938102722, "learning_rate": 9.264635036782405e-05, "loss": 3.9932, "step": 492 }, { "epoch": 1.7127335940895263, "grad_norm": 0.8475772142410278, "learning_rate": 9.259849319684526e-05, "loss": 3.879, "step": 493 }, { "epoch": 1.7162103433289873, "grad_norm": 0.8825334310531616, "learning_rate": 9.255049324852388e-05, "loss": 3.8554, "step": 494 }, { "epoch": 1.7196870925684484, "grad_norm": 0.9635478258132935, "learning_rate": 9.250235068374133e-05, "loss": 3.882, "step": 495 }, { "epoch": 1.7231638418079096, "grad_norm": 1.0515531301498413, "learning_rate": 9.245406566385697e-05, "loss": 3.767, "step": 496 }, { "epoch": 1.7266405910473708, "grad_norm": 0.951738715171814, "learning_rate": 9.240563835070771e-05, "loss": 3.9327, "step": 497 }, { "epoch": 1.730117340286832, "grad_norm": 0.9645599126815796, "learning_rate": 9.235706890660733e-05, "loss": 3.9418, "step": 498 }, { "epoch": 1.733594089526293, "grad_norm": 0.9542970061302185, "learning_rate": 9.230835749434601e-05, "loss": 3.9136, "step": 499 }, { "epoch": 1.7370708387657539, "grad_norm": 0.9848579168319702, "learning_rate": 9.225950427718975e-05, "loss": 3.8674, "step": 500 }, { "epoch": 1.740547588005215, "grad_norm": 0.8875890374183655, "learning_rate": 9.221050941887984e-05, "loss": 3.8565, "step": 501 }, { "epoch": 1.7440243372446762, "grad_norm": 0.9016240835189819, "learning_rate": 9.216137308363233e-05, "loss": 3.9054, "step": 502 }, { "epoch": 1.7475010864841374, "grad_norm": 0.8858099579811096, "learning_rate": 9.211209543613746e-05, "loss": 3.9223, "step": 503 }, { "epoch": 1.7509778357235986, "grad_norm": 0.8951818943023682, "learning_rate": 9.206267664155907e-05, "loss": 3.8056, "step": 504 }, { "epoch": 1.7544545849630595, "grad_norm": 0.8185245394706726, "learning_rate": 9.201311686553408e-05, "loss": 3.9184, "step": 505 }, { "epoch": 1.7579313342025207, "grad_norm": 0.8379631042480469, "learning_rate": 9.1963416274172e-05, "loss": 3.9537, "step": 506 }, { "epoch": 1.7614080834419816, "grad_norm": 0.6970687508583069, "learning_rate": 9.191357503405425e-05, "loss": 3.8938, "step": 507 }, { "epoch": 1.7648848326814428, "grad_norm": 0.7571694254875183, "learning_rate": 9.186359331223369e-05, "loss": 3.8748, "step": 508 }, { "epoch": 1.768361581920904, "grad_norm": 0.916691780090332, "learning_rate": 9.181347127623403e-05, "loss": 3.8796, "step": 509 }, { "epoch": 1.7718383311603652, "grad_norm": 0.8490556478500366, "learning_rate": 9.176320909404924e-05, "loss": 3.8544, "step": 510 }, { "epoch": 1.7753150803998263, "grad_norm": 0.8541643619537354, "learning_rate": 9.171280693414307e-05, "loss": 3.9174, "step": 511 }, { "epoch": 1.7787918296392873, "grad_norm": 0.7272506356239319, "learning_rate": 9.166226496544839e-05, "loss": 3.7822, "step": 512 }, { "epoch": 1.7822685788787482, "grad_norm": 0.7909744381904602, "learning_rate": 9.161158335736671e-05, "loss": 3.9238, "step": 513 }, { "epoch": 1.7857453281182094, "grad_norm": 1.08568274974823, "learning_rate": 9.156076227976752e-05, "loss": 3.9444, "step": 514 }, { "epoch": 1.7892220773576706, "grad_norm": 1.0940698385238647, "learning_rate": 9.15098019029878e-05, "loss": 3.8727, "step": 515 }, { "epoch": 1.7926988265971318, "grad_norm": 0.9210303425788879, "learning_rate": 9.145870239783142e-05, "loss": 3.9063, "step": 516 }, { "epoch": 1.796175575836593, "grad_norm": 0.9154436588287354, "learning_rate": 9.140746393556854e-05, "loss": 3.8853, "step": 517 }, { "epoch": 1.7996523250760539, "grad_norm": 1.0353447198867798, "learning_rate": 9.135608668793511e-05, "loss": 3.841, "step": 518 }, { "epoch": 1.8031290743155148, "grad_norm": 0.8575829863548279, "learning_rate": 9.130457082713219e-05, "loss": 3.8811, "step": 519 }, { "epoch": 1.806605823554976, "grad_norm": 0.9011865854263306, "learning_rate": 9.125291652582549e-05, "loss": 3.7803, "step": 520 }, { "epoch": 1.8100825727944372, "grad_norm": 0.9533329606056213, "learning_rate": 9.120112395714463e-05, "loss": 3.8986, "step": 521 }, { "epoch": 1.8135593220338984, "grad_norm": 0.9975365996360779, "learning_rate": 9.114919329468282e-05, "loss": 3.9143, "step": 522 }, { "epoch": 1.8170360712733595, "grad_norm": 1.015561819076538, "learning_rate": 9.109712471249598e-05, "loss": 3.8863, "step": 523 }, { "epoch": 1.8205128205128205, "grad_norm": 1.0247142314910889, "learning_rate": 9.104491838510235e-05, "loss": 3.9139, "step": 524 }, { "epoch": 1.8239895697522817, "grad_norm": 1.0852620601654053, "learning_rate": 9.099257448748184e-05, "loss": 3.8964, "step": 525 }, { "epoch": 1.8274663189917426, "grad_norm": 0.9563137292861938, "learning_rate": 9.094009319507547e-05, "loss": 3.8767, "step": 526 }, { "epoch": 1.8309430682312038, "grad_norm": 1.0613676309585571, "learning_rate": 9.088747468378474e-05, "loss": 3.9272, "step": 527 }, { "epoch": 1.834419817470665, "grad_norm": 1.0097156763076782, "learning_rate": 9.083471912997108e-05, "loss": 3.9006, "step": 528 }, { "epoch": 1.8378965667101261, "grad_norm": 0.9422491192817688, "learning_rate": 9.078182671045527e-05, "loss": 3.7358, "step": 529 }, { "epoch": 1.841373315949587, "grad_norm": 0.9380639791488647, "learning_rate": 9.072879760251679e-05, "loss": 3.8661, "step": 530 }, { "epoch": 1.8448500651890483, "grad_norm": 0.9634526371955872, "learning_rate": 9.067563198389326e-05, "loss": 3.8993, "step": 531 }, { "epoch": 1.8483268144285092, "grad_norm": 0.9645963907241821, "learning_rate": 9.062233003277983e-05, "loss": 3.8783, "step": 532 }, { "epoch": 1.8518035636679704, "grad_norm": 0.9658483862876892, "learning_rate": 9.056889192782866e-05, "loss": 3.8708, "step": 533 }, { "epoch": 1.8552803129074316, "grad_norm": 0.8875890970230103, "learning_rate": 9.051531784814817e-05, "loss": 3.7292, "step": 534 }, { "epoch": 1.8587570621468927, "grad_norm": 0.9648452997207642, "learning_rate": 9.04616079733026e-05, "loss": 3.8904, "step": 535 }, { "epoch": 1.862233811386354, "grad_norm": 1.1329749822616577, "learning_rate": 9.040776248331129e-05, "loss": 3.8909, "step": 536 }, { "epoch": 1.8657105606258149, "grad_norm": 0.9306442141532898, "learning_rate": 9.035378155864813e-05, "loss": 3.8015, "step": 537 }, { "epoch": 1.8691873098652758, "grad_norm": 0.8368638753890991, "learning_rate": 9.029966538024097e-05, "loss": 3.7593, "step": 538 }, { "epoch": 1.872664059104737, "grad_norm": 0.7863436937332153, "learning_rate": 9.024541412947094e-05, "loss": 3.7565, "step": 539 }, { "epoch": 1.8761408083441982, "grad_norm": 0.8507801294326782, "learning_rate": 9.019102798817197e-05, "loss": 3.867, "step": 540 }, { "epoch": 1.8796175575836593, "grad_norm": 0.8136400580406189, "learning_rate": 9.013650713863e-05, "loss": 3.6624, "step": 541 }, { "epoch": 1.8830943068231205, "grad_norm": 0.8798525929450989, "learning_rate": 9.008185176358256e-05, "loss": 3.7241, "step": 542 }, { "epoch": 1.8865710560625815, "grad_norm": 0.7656006217002869, "learning_rate": 9.002706204621803e-05, "loss": 3.9443, "step": 543 }, { "epoch": 1.8900478053020426, "grad_norm": 1.18944251537323, "learning_rate": 8.997213817017507e-05, "loss": 3.861, "step": 544 }, { "epoch": 1.8935245545415036, "grad_norm": 1.3046687841415405, "learning_rate": 8.991708031954199e-05, "loss": 3.7927, "step": 545 }, { "epoch": 1.8970013037809648, "grad_norm": 0.7693729400634766, "learning_rate": 8.986188867885617e-05, "loss": 3.7631, "step": 546 }, { "epoch": 1.900478053020426, "grad_norm": 0.9569775462150574, "learning_rate": 8.980656343310338e-05, "loss": 3.8547, "step": 547 }, { "epoch": 1.9039548022598871, "grad_norm": 1.0345852375030518, "learning_rate": 8.975110476771724e-05, "loss": 3.8461, "step": 548 }, { "epoch": 1.907431551499348, "grad_norm": 0.9311661124229431, "learning_rate": 8.969551286857849e-05, "loss": 3.831, "step": 549 }, { "epoch": 1.9109083007388092, "grad_norm": 0.847986102104187, "learning_rate": 8.963978792201449e-05, "loss": 3.6819, "step": 550 }, { "epoch": 1.9143850499782702, "grad_norm": 0.7580497860908508, "learning_rate": 8.958393011479848e-05, "loss": 3.7759, "step": 551 }, { "epoch": 1.9178617992177314, "grad_norm": 0.7308617830276489, "learning_rate": 8.952793963414907e-05, "loss": 3.8164, "step": 552 }, { "epoch": 1.9213385484571925, "grad_norm": 0.7043769955635071, "learning_rate": 8.947181666772948e-05, "loss": 3.7235, "step": 553 }, { "epoch": 1.9248152976966537, "grad_norm": 0.6796336770057678, "learning_rate": 8.941556140364706e-05, "loss": 3.8156, "step": 554 }, { "epoch": 1.928292046936115, "grad_norm": 0.6973956227302551, "learning_rate": 8.935917403045251e-05, "loss": 3.7504, "step": 555 }, { "epoch": 1.9317687961755758, "grad_norm": 0.8313055038452148, "learning_rate": 8.930265473713938e-05, "loss": 3.8686, "step": 556 }, { "epoch": 1.9352455454150368, "grad_norm": 0.8577587604522705, "learning_rate": 8.924600371314334e-05, "loss": 3.7068, "step": 557 }, { "epoch": 1.938722294654498, "grad_norm": 0.7737483978271484, "learning_rate": 8.918922114834156e-05, "loss": 3.7297, "step": 558 }, { "epoch": 1.9421990438939591, "grad_norm": 0.826034665107727, "learning_rate": 8.913230723305218e-05, "loss": 3.8176, "step": 559 }, { "epoch": 1.9456757931334203, "grad_norm": 0.9927733540534973, "learning_rate": 8.90752621580335e-05, "loss": 3.8469, "step": 560 }, { "epoch": 1.9491525423728815, "grad_norm": 1.0524464845657349, "learning_rate": 8.901808611448348e-05, "loss": 3.8041, "step": 561 }, { "epoch": 1.9526292916123424, "grad_norm": 1.098197102546692, "learning_rate": 8.896077929403901e-05, "loss": 3.7239, "step": 562 }, { "epoch": 1.9561060408518036, "grad_norm": 0.9311512112617493, "learning_rate": 8.890334188877533e-05, "loss": 3.8397, "step": 563 }, { "epoch": 1.9595827900912646, "grad_norm": 0.8799173831939697, "learning_rate": 8.884577409120535e-05, "loss": 3.7943, "step": 564 }, { "epoch": 1.9630595393307257, "grad_norm": 0.9762241840362549, "learning_rate": 8.878807609427905e-05, "loss": 3.7369, "step": 565 }, { "epoch": 1.966536288570187, "grad_norm": 1.0802910327911377, "learning_rate": 8.873024809138272e-05, "loss": 3.7294, "step": 566 }, { "epoch": 1.970013037809648, "grad_norm": 0.9968630075454712, "learning_rate": 8.86722902763385e-05, "loss": 3.8131, "step": 567 }, { "epoch": 1.973489787049109, "grad_norm": 1.0812654495239258, "learning_rate": 8.861420284340352e-05, "loss": 3.7722, "step": 568 }, { "epoch": 1.9769665362885702, "grad_norm": 1.0790822505950928, "learning_rate": 8.855598598726939e-05, "loss": 3.7907, "step": 569 }, { "epoch": 1.9804432855280312, "grad_norm": 1.0286710262298584, "learning_rate": 8.849763990306152e-05, "loss": 3.8615, "step": 570 }, { "epoch": 1.9839200347674923, "grad_norm": 0.9916720390319824, "learning_rate": 8.843916478633845e-05, "loss": 3.8969, "step": 571 }, { "epoch": 1.9873967840069535, "grad_norm": 1.0431153774261475, "learning_rate": 8.838056083309118e-05, "loss": 3.8122, "step": 572 }, { "epoch": 1.9908735332464147, "grad_norm": 0.9479820728302002, "learning_rate": 8.832182823974256e-05, "loss": 3.7987, "step": 573 }, { "epoch": 1.9943502824858759, "grad_norm": 0.8419528603553772, "learning_rate": 8.826296720314657e-05, "loss": 3.7854, "step": 574 }, { "epoch": 1.9978270317253368, "grad_norm": 0.7449129223823547, "learning_rate": 8.820397792058772e-05, "loss": 3.7427, "step": 575 }, { "epoch": 2.0, "grad_norm": 0.7268674373626709, "learning_rate": 8.814486058978035e-05, "loss": 3.7999, "step": 576 }, { "epoch": 2.003476749239461, "grad_norm": 0.7994548082351685, "learning_rate": 8.808561540886796e-05, "loss": 3.737, "step": 577 }, { "epoch": 2.0069534984789223, "grad_norm": 0.8729116320610046, "learning_rate": 8.802624257642261e-05, "loss": 3.8124, "step": 578 }, { "epoch": 2.0104302477183835, "grad_norm": 0.9562597274780273, "learning_rate": 8.796674229144418e-05, "loss": 3.8352, "step": 579 }, { "epoch": 2.0139069969578443, "grad_norm": 1.0685930252075195, "learning_rate": 8.790711475335971e-05, "loss": 3.799, "step": 580 }, { "epoch": 2.0173837461973054, "grad_norm": 1.0459791421890259, "learning_rate": 8.784736016202282e-05, "loss": 3.8583, "step": 581 }, { "epoch": 2.0208604954367666, "grad_norm": 0.8897305130958557, "learning_rate": 8.778747871771292e-05, "loss": 3.6429, "step": 582 }, { "epoch": 2.0243372446762278, "grad_norm": 0.8162091374397278, "learning_rate": 8.77274706211346e-05, "loss": 3.6127, "step": 583 }, { "epoch": 2.027813993915689, "grad_norm": 0.7601345181465149, "learning_rate": 8.766733607341698e-05, "loss": 3.7352, "step": 584 }, { "epoch": 2.03129074315515, "grad_norm": 0.7575955986976624, "learning_rate": 8.760707527611297e-05, "loss": 3.7707, "step": 585 }, { "epoch": 2.034767492394611, "grad_norm": 0.6935109496116638, "learning_rate": 8.754668843119864e-05, "loss": 3.6845, "step": 586 }, { "epoch": 2.038244241634072, "grad_norm": 0.6233929395675659, "learning_rate": 8.748617574107257e-05, "loss": 3.6204, "step": 587 }, { "epoch": 2.041720990873533, "grad_norm": 0.6228452920913696, "learning_rate": 8.742553740855506e-05, "loss": 3.7569, "step": 588 }, { "epoch": 2.0451977401129944, "grad_norm": 0.6230937838554382, "learning_rate": 8.736477363688761e-05, "loss": 3.7207, "step": 589 }, { "epoch": 2.0486744893524556, "grad_norm": 0.6162528395652771, "learning_rate": 8.730388462973208e-05, "loss": 3.7242, "step": 590 }, { "epoch": 2.0521512385919167, "grad_norm": 0.7501924633979797, "learning_rate": 8.724287059117016e-05, "loss": 3.7486, "step": 591 }, { "epoch": 2.0556279878313775, "grad_norm": 0.8422186970710754, "learning_rate": 8.718173172570254e-05, "loss": 3.6304, "step": 592 }, { "epoch": 2.0591047370708386, "grad_norm": 0.8915469646453857, "learning_rate": 8.71204682382483e-05, "loss": 3.7905, "step": 593 }, { "epoch": 2.0625814863103, "grad_norm": 0.9807952642440796, "learning_rate": 8.705908033414425e-05, "loss": 3.643, "step": 594 }, { "epoch": 2.066058235549761, "grad_norm": 1.0013173818588257, "learning_rate": 8.69975682191442e-05, "loss": 3.6854, "step": 595 }, { "epoch": 2.069534984789222, "grad_norm": 0.9831700325012207, "learning_rate": 8.693593209941825e-05, "loss": 3.673, "step": 596 }, { "epoch": 2.0730117340286833, "grad_norm": 1.1712896823883057, "learning_rate": 8.687417218155213e-05, "loss": 3.8204, "step": 597 }, { "epoch": 2.076488483268144, "grad_norm": 0.9944608807563782, "learning_rate": 8.681228867254655e-05, "loss": 3.7916, "step": 598 }, { "epoch": 2.0799652325076052, "grad_norm": 1.0010840892791748, "learning_rate": 8.675028177981643e-05, "loss": 3.7294, "step": 599 }, { "epoch": 2.0834419817470664, "grad_norm": 0.9211255311965942, "learning_rate": 8.668815171119021e-05, "loss": 3.6087, "step": 600 }, { "epoch": 2.0869187309865276, "grad_norm": 0.869985818862915, "learning_rate": 8.66258986749092e-05, "loss": 3.5953, "step": 601 }, { "epoch": 2.0903954802259888, "grad_norm": 0.7657525539398193, "learning_rate": 8.656352287962686e-05, "loss": 3.6554, "step": 602 }, { "epoch": 2.09387222946545, "grad_norm": 0.7607572078704834, "learning_rate": 8.650102453440813e-05, "loss": 3.6477, "step": 603 }, { "epoch": 2.097348978704911, "grad_norm": 0.9117911458015442, "learning_rate": 8.643840384872866e-05, "loss": 3.6963, "step": 604 }, { "epoch": 2.100825727944372, "grad_norm": 0.8420771956443787, "learning_rate": 8.637566103247415e-05, "loss": 3.6472, "step": 605 }, { "epoch": 2.104302477183833, "grad_norm": 0.7396708726882935, "learning_rate": 8.631279629593966e-05, "loss": 3.7796, "step": 606 }, { "epoch": 2.107779226423294, "grad_norm": 0.6754468083381653, "learning_rate": 8.624980984982892e-05, "loss": 3.581, "step": 607 }, { "epoch": 2.1112559756627554, "grad_norm": 0.78558748960495, "learning_rate": 8.618670190525352e-05, "loss": 3.7445, "step": 608 }, { "epoch": 2.1147327249022165, "grad_norm": 0.7408106923103333, "learning_rate": 8.612347267373234e-05, "loss": 3.7551, "step": 609 }, { "epoch": 2.1182094741416777, "grad_norm": 0.7574490904808044, "learning_rate": 8.606012236719073e-05, "loss": 3.5841, "step": 610 }, { "epoch": 2.1216862233811384, "grad_norm": 0.8567992448806763, "learning_rate": 8.599665119795992e-05, "loss": 3.5432, "step": 611 }, { "epoch": 2.1251629726205996, "grad_norm": 1.0516282320022583, "learning_rate": 8.593305937877614e-05, "loss": 3.6579, "step": 612 }, { "epoch": 2.128639721860061, "grad_norm": 1.0702804327011108, "learning_rate": 8.586934712278006e-05, "loss": 3.7136, "step": 613 }, { "epoch": 2.132116471099522, "grad_norm": 0.9042352437973022, "learning_rate": 8.580551464351603e-05, "loss": 3.7588, "step": 614 }, { "epoch": 2.135593220338983, "grad_norm": 1.0957833528518677, "learning_rate": 8.574156215493132e-05, "loss": 3.7078, "step": 615 }, { "epoch": 2.1390699695784443, "grad_norm": 1.1452794075012207, "learning_rate": 8.567748987137544e-05, "loss": 3.6626, "step": 616 }, { "epoch": 2.142546718817905, "grad_norm": 0.8174344897270203, "learning_rate": 8.561329800759943e-05, "loss": 3.7657, "step": 617 }, { "epoch": 2.146023468057366, "grad_norm": 0.8348018527030945, "learning_rate": 8.554898677875509e-05, "loss": 3.7377, "step": 618 }, { "epoch": 2.1495002172968274, "grad_norm": 0.8676193952560425, "learning_rate": 8.548455640039437e-05, "loss": 3.6855, "step": 619 }, { "epoch": 2.1529769665362886, "grad_norm": 0.7565463185310364, "learning_rate": 8.542000708846852e-05, "loss": 3.6733, "step": 620 }, { "epoch": 2.1564537157757497, "grad_norm": 0.7726176977157593, "learning_rate": 8.535533905932738e-05, "loss": 3.6327, "step": 621 }, { "epoch": 2.159930465015211, "grad_norm": 0.7790519595146179, "learning_rate": 8.529055252971879e-05, "loss": 3.6361, "step": 622 }, { "epoch": 2.163407214254672, "grad_norm": 0.7205121517181396, "learning_rate": 8.522564771678771e-05, "loss": 3.6716, "step": 623 }, { "epoch": 2.166883963494133, "grad_norm": 0.7978485226631165, "learning_rate": 8.516062483807556e-05, "loss": 3.7891, "step": 624 }, { "epoch": 2.170360712733594, "grad_norm": 0.7888757586479187, "learning_rate": 8.509548411151948e-05, "loss": 3.6903, "step": 625 }, { "epoch": 2.173837461973055, "grad_norm": 0.7865068912506104, "learning_rate": 8.503022575545158e-05, "loss": 3.7538, "step": 626 }, { "epoch": 2.1773142112125163, "grad_norm": 0.943021833896637, "learning_rate": 8.49648499885983e-05, "loss": 3.6956, "step": 627 }, { "epoch": 2.1807909604519775, "grad_norm": 1.2752074003219604, "learning_rate": 8.489935703007949e-05, "loss": 3.719, "step": 628 }, { "epoch": 2.1842677096914387, "grad_norm": 0.9368734955787659, "learning_rate": 8.483374709940792e-05, "loss": 3.7057, "step": 629 }, { "epoch": 2.1877444589308994, "grad_norm": 0.8080997467041016, "learning_rate": 8.476802041648832e-05, "loss": 3.7596, "step": 630 }, { "epoch": 2.1912212081703606, "grad_norm": 0.8630561828613281, "learning_rate": 8.47021772016168e-05, "loss": 3.6446, "step": 631 }, { "epoch": 2.1946979574098218, "grad_norm": 0.975145161151886, "learning_rate": 8.463621767547998e-05, "loss": 3.7115, "step": 632 }, { "epoch": 2.198174706649283, "grad_norm": 0.9544527530670166, "learning_rate": 8.457014205915438e-05, "loss": 3.6173, "step": 633 }, { "epoch": 2.201651455888744, "grad_norm": 0.8249536752700806, "learning_rate": 8.450395057410561e-05, "loss": 3.606, "step": 634 }, { "epoch": 2.2051282051282053, "grad_norm": 0.852883517742157, "learning_rate": 8.443764344218761e-05, "loss": 3.6814, "step": 635 }, { "epoch": 2.208604954367666, "grad_norm": 0.8320687413215637, "learning_rate": 8.437122088564198e-05, "loss": 3.7175, "step": 636 }, { "epoch": 2.212081703607127, "grad_norm": 0.781334638595581, "learning_rate": 8.430468312709712e-05, "loss": 3.7437, "step": 637 }, { "epoch": 2.2155584528465884, "grad_norm": 0.7602312564849854, "learning_rate": 8.423803038956764e-05, "loss": 3.6857, "step": 638 }, { "epoch": 2.2190352020860495, "grad_norm": 0.7551150918006897, "learning_rate": 8.417126289645344e-05, "loss": 3.7341, "step": 639 }, { "epoch": 2.2225119513255107, "grad_norm": 0.8869550228118896, "learning_rate": 8.410438087153911e-05, "loss": 3.6262, "step": 640 }, { "epoch": 2.225988700564972, "grad_norm": 0.8678908348083496, "learning_rate": 8.403738453899308e-05, "loss": 3.5698, "step": 641 }, { "epoch": 2.229465449804433, "grad_norm": 0.7575172781944275, "learning_rate": 8.39702741233669e-05, "loss": 3.6571, "step": 642 }, { "epoch": 2.232942199043894, "grad_norm": 0.7576946020126343, "learning_rate": 8.390304984959454e-05, "loss": 3.7523, "step": 643 }, { "epoch": 2.236418948283355, "grad_norm": 0.7081326246261597, "learning_rate": 8.383571194299154e-05, "loss": 3.6405, "step": 644 }, { "epoch": 2.239895697522816, "grad_norm": 0.7227503657341003, "learning_rate": 8.376826062925432e-05, "loss": 3.5813, "step": 645 }, { "epoch": 2.2433724467622773, "grad_norm": 0.9157429933547974, "learning_rate": 8.370069613445939e-05, "loss": 3.7295, "step": 646 }, { "epoch": 2.2468491960017385, "grad_norm": 0.9113756418228149, "learning_rate": 8.363301868506264e-05, "loss": 3.6834, "step": 647 }, { "epoch": 2.2503259452411997, "grad_norm": 0.9403269290924072, "learning_rate": 8.356522850789852e-05, "loss": 3.7961, "step": 648 }, { "epoch": 2.2538026944806604, "grad_norm": 0.7835237979888916, "learning_rate": 8.349732583017934e-05, "loss": 3.5661, "step": 649 }, { "epoch": 2.2572794437201216, "grad_norm": 0.7705581188201904, "learning_rate": 8.342931087949446e-05, "loss": 3.658, "step": 650 }, { "epoch": 2.2607561929595827, "grad_norm": 0.9230582118034363, "learning_rate": 8.336118388380954e-05, "loss": 3.5736, "step": 651 }, { "epoch": 2.264232942199044, "grad_norm": 0.8842865824699402, "learning_rate": 8.329294507146579e-05, "loss": 3.7125, "step": 652 }, { "epoch": 2.267709691438505, "grad_norm": 0.8269637823104858, "learning_rate": 8.32245946711792e-05, "loss": 3.7351, "step": 653 }, { "epoch": 2.2711864406779663, "grad_norm": 0.8161711096763611, "learning_rate": 8.315613291203976e-05, "loss": 3.561, "step": 654 }, { "epoch": 2.274663189917427, "grad_norm": 0.8007653951644897, "learning_rate": 8.30875600235107e-05, "loss": 3.6928, "step": 655 }, { "epoch": 2.278139939156888, "grad_norm": 0.685224175453186, "learning_rate": 8.301887623542773e-05, "loss": 3.6402, "step": 656 }, { "epoch": 2.2816166883963493, "grad_norm": 0.6802821755409241, "learning_rate": 8.295008177799827e-05, "loss": 3.6212, "step": 657 }, { "epoch": 2.2850934376358105, "grad_norm": 0.6886381506919861, "learning_rate": 8.288117688180064e-05, "loss": 3.5536, "step": 658 }, { "epoch": 2.2885701868752717, "grad_norm": 0.7427237629890442, "learning_rate": 8.281216177778334e-05, "loss": 3.7049, "step": 659 }, { "epoch": 2.292046936114733, "grad_norm": 0.6893945932388306, "learning_rate": 8.274303669726426e-05, "loss": 3.6027, "step": 660 }, { "epoch": 2.295523685354194, "grad_norm": 0.7162789106369019, "learning_rate": 8.267380187192989e-05, "loss": 3.7657, "step": 661 }, { "epoch": 2.2990004345936548, "grad_norm": 0.7326735854148865, "learning_rate": 8.260445753383452e-05, "loss": 3.5549, "step": 662 }, { "epoch": 2.302477183833116, "grad_norm": 0.8043762445449829, "learning_rate": 8.253500391539956e-05, "loss": 3.5237, "step": 663 }, { "epoch": 2.305953933072577, "grad_norm": 0.9436041712760925, "learning_rate": 8.246544124941266e-05, "loss": 3.6999, "step": 664 }, { "epoch": 2.3094306823120383, "grad_norm": 0.9793234467506409, "learning_rate": 8.239576976902695e-05, "loss": 3.6953, "step": 665 }, { "epoch": 2.3129074315514995, "grad_norm": 0.8090691566467285, "learning_rate": 8.232598970776028e-05, "loss": 3.6129, "step": 666 }, { "epoch": 2.3163841807909606, "grad_norm": 0.8231872320175171, "learning_rate": 8.225610129949443e-05, "loss": 3.6311, "step": 667 }, { "epoch": 2.3198609300304214, "grad_norm": 0.7168385982513428, "learning_rate": 8.218610477847436e-05, "loss": 3.5907, "step": 668 }, { "epoch": 2.3233376792698825, "grad_norm": 0.7335355877876282, "learning_rate": 8.211600037930736e-05, "loss": 3.6044, "step": 669 }, { "epoch": 2.3268144285093437, "grad_norm": 0.8048617839813232, "learning_rate": 8.204578833696228e-05, "loss": 3.7008, "step": 670 }, { "epoch": 2.330291177748805, "grad_norm": 0.8575453758239746, "learning_rate": 8.197546888676878e-05, "loss": 3.6408, "step": 671 }, { "epoch": 2.333767926988266, "grad_norm": 0.7635408043861389, "learning_rate": 8.190504226441654e-05, "loss": 3.605, "step": 672 }, { "epoch": 2.3372446762277272, "grad_norm": 0.6884478330612183, "learning_rate": 8.183450870595441e-05, "loss": 3.6391, "step": 673 }, { "epoch": 2.340721425467188, "grad_norm": 0.9262790083885193, "learning_rate": 8.176386844778968e-05, "loss": 3.5877, "step": 674 }, { "epoch": 2.344198174706649, "grad_norm": 1.1123980283737183, "learning_rate": 8.169312172668726e-05, "loss": 3.5557, "step": 675 }, { "epoch": 2.3476749239461103, "grad_norm": 0.8961548805236816, "learning_rate": 8.162226877976887e-05, "loss": 3.6788, "step": 676 }, { "epoch": 2.3511516731855715, "grad_norm": 1.1101398468017578, "learning_rate": 8.15513098445123e-05, "loss": 3.653, "step": 677 }, { "epoch": 2.3546284224250327, "grad_norm": 1.0202254056930542, "learning_rate": 8.148024515875057e-05, "loss": 3.6128, "step": 678 }, { "epoch": 2.358105171664494, "grad_norm": 0.9809442758560181, "learning_rate": 8.140907496067114e-05, "loss": 3.6884, "step": 679 }, { "epoch": 2.361581920903955, "grad_norm": 0.998312771320343, "learning_rate": 8.133779948881514e-05, "loss": 3.6343, "step": 680 }, { "epoch": 2.3650586701434158, "grad_norm": 0.8048033118247986, "learning_rate": 8.126641898207648e-05, "loss": 3.6517, "step": 681 }, { "epoch": 2.368535419382877, "grad_norm": 0.7618369460105896, "learning_rate": 8.119493367970119e-05, "loss": 3.5172, "step": 682 }, { "epoch": 2.372012168622338, "grad_norm": 0.7011895775794983, "learning_rate": 8.112334382128651e-05, "loss": 3.5716, "step": 683 }, { "epoch": 2.3754889178617993, "grad_norm": 0.7433372735977173, "learning_rate": 8.105164964678009e-05, "loss": 3.7358, "step": 684 }, { "epoch": 2.3789656671012605, "grad_norm": 0.8242580890655518, "learning_rate": 8.09798513964793e-05, "loss": 3.5706, "step": 685 }, { "epoch": 2.382442416340721, "grad_norm": 0.7195044755935669, "learning_rate": 8.090794931103026e-05, "loss": 3.6252, "step": 686 }, { "epoch": 2.3859191655801824, "grad_norm": 0.7642951011657715, "learning_rate": 8.083594363142717e-05, "loss": 3.6058, "step": 687 }, { "epoch": 2.3893959148196435, "grad_norm": 0.799333393573761, "learning_rate": 8.076383459901137e-05, "loss": 3.5904, "step": 688 }, { "epoch": 2.3928726640591047, "grad_norm": 0.7877199053764343, "learning_rate": 8.06916224554707e-05, "loss": 3.5985, "step": 689 }, { "epoch": 2.396349413298566, "grad_norm": 0.8267778754234314, "learning_rate": 8.061930744283854e-05, "loss": 3.5293, "step": 690 }, { "epoch": 2.399826162538027, "grad_norm": 0.8435574173927307, "learning_rate": 8.054688980349309e-05, "loss": 3.614, "step": 691 }, { "epoch": 2.4033029117774882, "grad_norm": 0.7058742046356201, "learning_rate": 8.047436978015649e-05, "loss": 3.5749, "step": 692 }, { "epoch": 2.406779661016949, "grad_norm": 0.6880009770393372, "learning_rate": 8.040174761589406e-05, "loss": 3.6028, "step": 693 }, { "epoch": 2.41025641025641, "grad_norm": 0.7245696187019348, "learning_rate": 8.032902355411345e-05, "loss": 3.6305, "step": 694 }, { "epoch": 2.4137331594958713, "grad_norm": 0.7221509218215942, "learning_rate": 8.025619783856388e-05, "loss": 3.552, "step": 695 }, { "epoch": 2.4172099087353325, "grad_norm": 0.6806580424308777, "learning_rate": 8.018327071333521e-05, "loss": 3.497, "step": 696 }, { "epoch": 2.4206866579747937, "grad_norm": 0.6653738617897034, "learning_rate": 8.011024242285728e-05, "loss": 3.5242, "step": 697 }, { "epoch": 2.424163407214255, "grad_norm": 0.725482702255249, "learning_rate": 8.003711321189895e-05, "loss": 3.6399, "step": 698 }, { "epoch": 2.427640156453716, "grad_norm": 0.6843823194503784, "learning_rate": 7.996388332556735e-05, "loss": 3.5002, "step": 699 }, { "epoch": 2.4311169056931767, "grad_norm": 0.6126710772514343, "learning_rate": 7.989055300930704e-05, "loss": 3.5279, "step": 700 }, { "epoch": 2.434593654932638, "grad_norm": 0.6402905583381653, "learning_rate": 7.981712250889921e-05, "loss": 3.4861, "step": 701 }, { "epoch": 2.438070404172099, "grad_norm": 0.6130061745643616, "learning_rate": 7.97435920704608e-05, "loss": 3.5446, "step": 702 }, { "epoch": 2.4415471534115603, "grad_norm": 0.6037262678146362, "learning_rate": 7.966996194044376e-05, "loss": 3.5591, "step": 703 }, { "epoch": 2.4450239026510214, "grad_norm": 0.5912169218063354, "learning_rate": 7.959623236563411e-05, "loss": 3.5881, "step": 704 }, { "epoch": 2.448500651890482, "grad_norm": 0.6557775139808655, "learning_rate": 7.952240359315126e-05, "loss": 3.5602, "step": 705 }, { "epoch": 2.4519774011299433, "grad_norm": 0.7176311016082764, "learning_rate": 7.944847587044704e-05, "loss": 3.6946, "step": 706 }, { "epoch": 2.4554541503694045, "grad_norm": 0.9263601303100586, "learning_rate": 7.937444944530495e-05, "loss": 3.4952, "step": 707 }, { "epoch": 2.4589308996088657, "grad_norm": 0.8851249814033508, "learning_rate": 7.930032456583931e-05, "loss": 3.4797, "step": 708 }, { "epoch": 2.462407648848327, "grad_norm": 0.7957706451416016, "learning_rate": 7.922610148049445e-05, "loss": 3.4552, "step": 709 }, { "epoch": 2.465884398087788, "grad_norm": 0.8621044754981995, "learning_rate": 7.915178043804382e-05, "loss": 3.6836, "step": 710 }, { "epoch": 2.469361147327249, "grad_norm": 0.8820719718933105, "learning_rate": 7.907736168758921e-05, "loss": 3.4278, "step": 711 }, { "epoch": 2.47283789656671, "grad_norm": 0.9527013301849365, "learning_rate": 7.900284547855991e-05, "loss": 3.4505, "step": 712 }, { "epoch": 2.476314645806171, "grad_norm": 0.8297730088233948, "learning_rate": 7.892823206071185e-05, "loss": 3.5746, "step": 713 }, { "epoch": 2.4797913950456323, "grad_norm": 0.8282458186149597, "learning_rate": 7.885352168412676e-05, "loss": 3.566, "step": 714 }, { "epoch": 2.4832681442850935, "grad_norm": 0.8243257403373718, "learning_rate": 7.877871459921138e-05, "loss": 3.4891, "step": 715 }, { "epoch": 2.4867448935245546, "grad_norm": 0.8400633335113525, "learning_rate": 7.870381105669657e-05, "loss": 3.4364, "step": 716 }, { "epoch": 2.490221642764016, "grad_norm": 0.9697174429893494, "learning_rate": 7.862881130763646e-05, "loss": 3.58, "step": 717 }, { "epoch": 2.493698392003477, "grad_norm": 0.882383406162262, "learning_rate": 7.85537156034077e-05, "loss": 3.5832, "step": 718 }, { "epoch": 2.4971751412429377, "grad_norm": 0.7008349299430847, "learning_rate": 7.847852419570846e-05, "loss": 3.4555, "step": 719 }, { "epoch": 2.500651890482399, "grad_norm": 0.8088206648826599, "learning_rate": 7.840323733655778e-05, "loss": 3.5888, "step": 720 }, { "epoch": 2.50412863972186, "grad_norm": 0.8056133985519409, "learning_rate": 7.832785527829458e-05, "loss": 3.6404, "step": 721 }, { "epoch": 2.5076053889613212, "grad_norm": 0.9462586045265198, "learning_rate": 7.825237827357683e-05, "loss": 3.5969, "step": 722 }, { "epoch": 2.5110821382007824, "grad_norm": 0.8435570597648621, "learning_rate": 7.817680657538078e-05, "loss": 3.5405, "step": 723 }, { "epoch": 2.514558887440243, "grad_norm": 0.7596110105514526, "learning_rate": 7.8101140437e-05, "loss": 3.5774, "step": 724 }, { "epoch": 2.5180356366797043, "grad_norm": 0.8581807017326355, "learning_rate": 7.80253801120447e-05, "loss": 3.4993, "step": 725 }, { "epoch": 2.5215123859191655, "grad_norm": 0.7680181860923767, "learning_rate": 7.794952585444068e-05, "loss": 3.5667, "step": 726 }, { "epoch": 2.5249891351586267, "grad_norm": 0.8518640398979187, "learning_rate": 7.78735779184286e-05, "loss": 3.5648, "step": 727 }, { "epoch": 2.528465884398088, "grad_norm": 0.8862175345420837, "learning_rate": 7.779753655856312e-05, "loss": 3.608, "step": 728 }, { "epoch": 2.531942633637549, "grad_norm": 0.8444491028785706, "learning_rate": 7.772140202971204e-05, "loss": 3.5332, "step": 729 }, { "epoch": 2.53541938287701, "grad_norm": 0.8340655565261841, "learning_rate": 7.764517458705536e-05, "loss": 3.4764, "step": 730 }, { "epoch": 2.538896132116471, "grad_norm": 0.9282291531562805, "learning_rate": 7.756885448608459e-05, "loss": 3.5358, "step": 731 }, { "epoch": 2.542372881355932, "grad_norm": 0.7459742426872253, "learning_rate": 7.749244198260175e-05, "loss": 3.5173, "step": 732 }, { "epoch": 2.5458496305953933, "grad_norm": 0.6265541315078735, "learning_rate": 7.74159373327186e-05, "loss": 3.5435, "step": 733 }, { "epoch": 2.5493263798348544, "grad_norm": 0.6342918276786804, "learning_rate": 7.733934079285569e-05, "loss": 3.5435, "step": 734 }, { "epoch": 2.5528031290743156, "grad_norm": 0.6661461591720581, "learning_rate": 7.726265261974162e-05, "loss": 3.4791, "step": 735 }, { "epoch": 2.5562798783137763, "grad_norm": 0.6519825458526611, "learning_rate": 7.718587307041208e-05, "loss": 3.6155, "step": 736 }, { "epoch": 2.559756627553238, "grad_norm": 0.6564475297927856, "learning_rate": 7.710900240220904e-05, "loss": 3.6105, "step": 737 }, { "epoch": 2.5632333767926987, "grad_norm": 0.6510676741600037, "learning_rate": 7.703204087277988e-05, "loss": 3.5056, "step": 738 }, { "epoch": 2.56671012603216, "grad_norm": 0.7257498502731323, "learning_rate": 7.695498874007649e-05, "loss": 3.4504, "step": 739 }, { "epoch": 2.570186875271621, "grad_norm": 0.7453542351722717, "learning_rate": 7.687784626235448e-05, "loss": 3.5899, "step": 740 }, { "epoch": 2.573663624511082, "grad_norm": 0.855290949344635, "learning_rate": 7.680061369817222e-05, "loss": 3.5492, "step": 741 }, { "epoch": 2.5771403737505434, "grad_norm": 0.9114766120910645, "learning_rate": 7.672329130639005e-05, "loss": 3.5393, "step": 742 }, { "epoch": 2.580617122990004, "grad_norm": 0.7029814720153809, "learning_rate": 7.66458793461694e-05, "loss": 3.5573, "step": 743 }, { "epoch": 2.5840938722294653, "grad_norm": 0.6988779306411743, "learning_rate": 7.656837807697187e-05, "loss": 3.596, "step": 744 }, { "epoch": 2.5875706214689265, "grad_norm": 0.7252441048622131, "learning_rate": 7.64907877585584e-05, "loss": 3.479, "step": 745 }, { "epoch": 2.5910473707083876, "grad_norm": 0.7226544618606567, "learning_rate": 7.641310865098845e-05, "loss": 3.6214, "step": 746 }, { "epoch": 2.594524119947849, "grad_norm": 0.6503444314002991, "learning_rate": 7.633534101461902e-05, "loss": 3.5305, "step": 747 }, { "epoch": 2.59800086918731, "grad_norm": 0.635787844657898, "learning_rate": 7.625748511010382e-05, "loss": 3.5219, "step": 748 }, { "epoch": 2.601477618426771, "grad_norm": 0.6334975957870483, "learning_rate": 7.617954119839247e-05, "loss": 3.468, "step": 749 }, { "epoch": 2.604954367666232, "grad_norm": 0.692579448223114, "learning_rate": 7.610150954072952e-05, "loss": 3.4644, "step": 750 }, { "epoch": 2.608431116905693, "grad_norm": 0.8107365369796753, "learning_rate": 7.602339039865362e-05, "loss": 3.488, "step": 751 }, { "epoch": 2.6119078661451542, "grad_norm": 0.8618629574775696, "learning_rate": 7.594518403399667e-05, "loss": 3.5615, "step": 752 }, { "epoch": 2.6153846153846154, "grad_norm": 0.8016732931137085, "learning_rate": 7.586689070888284e-05, "loss": 3.4891, "step": 753 }, { "epoch": 2.6188613646240766, "grad_norm": 0.9589606523513794, "learning_rate": 7.578851068572788e-05, "loss": 3.5743, "step": 754 }, { "epoch": 2.6223381138635373, "grad_norm": 1.0258772373199463, "learning_rate": 7.571004422723805e-05, "loss": 3.5625, "step": 755 }, { "epoch": 2.625814863102999, "grad_norm": 0.7846320271492004, "learning_rate": 7.563149159640929e-05, "loss": 3.5014, "step": 756 }, { "epoch": 2.6292916123424597, "grad_norm": 0.7241854667663574, "learning_rate": 7.555285305652644e-05, "loss": 3.5707, "step": 757 }, { "epoch": 2.632768361581921, "grad_norm": 0.6381678581237793, "learning_rate": 7.547412887116223e-05, "loss": 3.5553, "step": 758 }, { "epoch": 2.636245110821382, "grad_norm": 0.6428322196006775, "learning_rate": 7.539531930417648e-05, "loss": 3.4585, "step": 759 }, { "epoch": 2.639721860060843, "grad_norm": 0.6515122056007385, "learning_rate": 7.531642461971516e-05, "loss": 3.4973, "step": 760 }, { "epoch": 2.6431986093003044, "grad_norm": 0.7195065021514893, "learning_rate": 7.523744508220949e-05, "loss": 3.5204, "step": 761 }, { "epoch": 2.646675358539765, "grad_norm": 0.7389284372329712, "learning_rate": 7.51583809563752e-05, "loss": 3.5345, "step": 762 }, { "epoch": 2.6501521077792263, "grad_norm": 0.8561949133872986, "learning_rate": 7.507923250721145e-05, "loss": 3.5781, "step": 763 }, { "epoch": 2.6536288570186874, "grad_norm": 0.9598309993743896, "learning_rate": 7.500000000000001e-05, "loss": 3.5838, "step": 764 }, { "epoch": 2.6571056062581486, "grad_norm": 1.006478190422058, "learning_rate": 7.492068370030447e-05, "loss": 3.6576, "step": 765 }, { "epoch": 2.66058235549761, "grad_norm": 0.9836744666099548, "learning_rate": 7.48412838739692e-05, "loss": 3.649, "step": 766 }, { "epoch": 2.664059104737071, "grad_norm": 0.8797217011451721, "learning_rate": 7.476180078711854e-05, "loss": 3.4689, "step": 767 }, { "epoch": 2.667535853976532, "grad_norm": 0.8832845091819763, "learning_rate": 7.468223470615593e-05, "loss": 3.3975, "step": 768 }, { "epoch": 2.671012603215993, "grad_norm": 0.7454182505607605, "learning_rate": 7.460258589776292e-05, "loss": 3.5938, "step": 769 }, { "epoch": 2.674489352455454, "grad_norm": 0.6505441069602966, "learning_rate": 7.452285462889841e-05, "loss": 3.5975, "step": 770 }, { "epoch": 2.6779661016949152, "grad_norm": 0.6446083188056946, "learning_rate": 7.44430411667976e-05, "loss": 3.4146, "step": 771 }, { "epoch": 2.6814428509343764, "grad_norm": 0.6646334528923035, "learning_rate": 7.436314577897126e-05, "loss": 3.5033, "step": 772 }, { "epoch": 2.6849196001738376, "grad_norm": 0.7433136105537415, "learning_rate": 7.428316873320466e-05, "loss": 3.3934, "step": 773 }, { "epoch": 2.6883963494132983, "grad_norm": 0.7214708924293518, "learning_rate": 7.420311029755688e-05, "loss": 3.5329, "step": 774 }, { "epoch": 2.69187309865276, "grad_norm": 0.7187039852142334, "learning_rate": 7.412297074035967e-05, "loss": 3.6085, "step": 775 }, { "epoch": 2.6953498478922207, "grad_norm": 0.6889802813529968, "learning_rate": 7.404275033021676e-05, "loss": 3.5609, "step": 776 }, { "epoch": 2.698826597131682, "grad_norm": 0.7113670706748962, "learning_rate": 7.396244933600285e-05, "loss": 3.5038, "step": 777 }, { "epoch": 2.702303346371143, "grad_norm": 0.7699861526489258, "learning_rate": 7.388206802686272e-05, "loss": 3.4915, "step": 778 }, { "epoch": 2.705780095610604, "grad_norm": 0.874978244304657, "learning_rate": 7.380160667221035e-05, "loss": 3.5332, "step": 779 }, { "epoch": 2.7092568448500653, "grad_norm": 0.8308786153793335, "learning_rate": 7.372106554172802e-05, "loss": 3.5566, "step": 780 }, { "epoch": 2.712733594089526, "grad_norm": 0.8035147786140442, "learning_rate": 7.364044490536539e-05, "loss": 3.4245, "step": 781 }, { "epoch": 2.7162103433289873, "grad_norm": 0.6532664895057678, "learning_rate": 7.355974503333859e-05, "loss": 3.5687, "step": 782 }, { "epoch": 2.7196870925684484, "grad_norm": 0.7014971375465393, "learning_rate": 7.347896619612932e-05, "loss": 3.3345, "step": 783 }, { "epoch": 2.7231638418079096, "grad_norm": 0.8252018690109253, "learning_rate": 7.339810866448398e-05, "loss": 3.4766, "step": 784 }, { "epoch": 2.7266405910473708, "grad_norm": 0.8229169845581055, "learning_rate": 7.331717270941268e-05, "loss": 3.4474, "step": 785 }, { "epoch": 2.730117340286832, "grad_norm": 0.7657115459442139, "learning_rate": 7.323615860218843e-05, "loss": 3.5018, "step": 786 }, { "epoch": 2.733594089526293, "grad_norm": 0.7167194485664368, "learning_rate": 7.315506661434616e-05, "loss": 3.3943, "step": 787 }, { "epoch": 2.737070838765754, "grad_norm": 0.6008238196372986, "learning_rate": 7.307389701768182e-05, "loss": 3.4675, "step": 788 }, { "epoch": 2.740547588005215, "grad_norm": 0.63405841588974, "learning_rate": 7.299265008425151e-05, "loss": 3.4227, "step": 789 }, { "epoch": 2.744024337244676, "grad_norm": 0.6532236337661743, "learning_rate": 7.291132608637052e-05, "loss": 3.5426, "step": 790 }, { "epoch": 2.7475010864841374, "grad_norm": 0.6104541420936584, "learning_rate": 7.282992529661246e-05, "loss": 3.4405, "step": 791 }, { "epoch": 2.7509778357235986, "grad_norm": 0.620786726474762, "learning_rate": 7.274844798780826e-05, "loss": 3.5128, "step": 792 }, { "epoch": 2.7544545849630593, "grad_norm": 0.6040831804275513, "learning_rate": 7.266689443304541e-05, "loss": 3.3964, "step": 793 }, { "epoch": 2.757931334202521, "grad_norm": 0.6351047158241272, "learning_rate": 7.258526490566687e-05, "loss": 3.4846, "step": 794 }, { "epoch": 2.7614080834419816, "grad_norm": 0.6179344654083252, "learning_rate": 7.25035596792703e-05, "loss": 3.3667, "step": 795 }, { "epoch": 2.764884832681443, "grad_norm": 0.5391456484794617, "learning_rate": 7.242177902770707e-05, "loss": 3.5516, "step": 796 }, { "epoch": 2.768361581920904, "grad_norm": 0.5362287759780884, "learning_rate": 7.233992322508129e-05, "loss": 3.5142, "step": 797 }, { "epoch": 2.771838331160365, "grad_norm": 0.5863167643547058, "learning_rate": 7.225799254574904e-05, "loss": 3.468, "step": 798 }, { "epoch": 2.7753150803998263, "grad_norm": 0.6511790156364441, "learning_rate": 7.217598726431734e-05, "loss": 3.38, "step": 799 }, { "epoch": 2.778791829639287, "grad_norm": 0.7753508687019348, "learning_rate": 7.209390765564318e-05, "loss": 3.4388, "step": 800 }, { "epoch": 2.7822685788787482, "grad_norm": 0.9056048393249512, "learning_rate": 7.201175399483278e-05, "loss": 3.4029, "step": 801 }, { "epoch": 2.7857453281182094, "grad_norm": 0.8542281985282898, "learning_rate": 7.192952655724049e-05, "loss": 3.504, "step": 802 }, { "epoch": 2.7892220773576706, "grad_norm": 0.7020969986915588, "learning_rate": 7.184722561846798e-05, "loss": 3.479, "step": 803 }, { "epoch": 2.7926988265971318, "grad_norm": 0.7977145314216614, "learning_rate": 7.176485145436325e-05, "loss": 3.41, "step": 804 }, { "epoch": 2.796175575836593, "grad_norm": 0.8588235378265381, "learning_rate": 7.168240434101971e-05, "loss": 3.5017, "step": 805 }, { "epoch": 2.799652325076054, "grad_norm": 0.8466975092887878, "learning_rate": 7.159988455477534e-05, "loss": 3.5023, "step": 806 }, { "epoch": 2.803129074315515, "grad_norm": 0.8365435600280762, "learning_rate": 7.151729237221162e-05, "loss": 3.5185, "step": 807 }, { "epoch": 2.806605823554976, "grad_norm": 0.8156039118766785, "learning_rate": 7.143462807015271e-05, "loss": 3.4056, "step": 808 }, { "epoch": 2.810082572794437, "grad_norm": 0.8369770050048828, "learning_rate": 7.135189192566453e-05, "loss": 3.4314, "step": 809 }, { "epoch": 2.8135593220338984, "grad_norm": 0.7135697603225708, "learning_rate": 7.126908421605375e-05, "loss": 3.4166, "step": 810 }, { "epoch": 2.8170360712733595, "grad_norm": 0.6435958743095398, "learning_rate": 7.11862052188669e-05, "loss": 3.469, "step": 811 }, { "epoch": 2.8205128205128203, "grad_norm": 0.6611959934234619, "learning_rate": 7.110325521188949e-05, "loss": 3.3635, "step": 812 }, { "epoch": 2.823989569752282, "grad_norm": 0.6469593644142151, "learning_rate": 7.102023447314501e-05, "loss": 3.4362, "step": 813 }, { "epoch": 2.8274663189917426, "grad_norm": 0.6080538630485535, "learning_rate": 7.093714328089399e-05, "loss": 3.4378, "step": 814 }, { "epoch": 2.830943068231204, "grad_norm": 0.6681721210479736, "learning_rate": 7.085398191363313e-05, "loss": 3.4019, "step": 815 }, { "epoch": 2.834419817470665, "grad_norm": 0.6522132158279419, "learning_rate": 7.077075065009433e-05, "loss": 3.4162, "step": 816 }, { "epoch": 2.837896566710126, "grad_norm": 0.5956054329872131, "learning_rate": 7.068744976924378e-05, "loss": 3.3781, "step": 817 }, { "epoch": 2.8413733159495873, "grad_norm": 0.5895470976829529, "learning_rate": 7.060407955028097e-05, "loss": 3.4813, "step": 818 }, { "epoch": 2.844850065189048, "grad_norm": 0.6135871410369873, "learning_rate": 7.052064027263786e-05, "loss": 3.4464, "step": 819 }, { "epoch": 2.848326814428509, "grad_norm": 0.5782895088195801, "learning_rate": 7.043713221597774e-05, "loss": 3.3993, "step": 820 }, { "epoch": 2.8518035636679704, "grad_norm": 0.6497248411178589, "learning_rate": 7.035355566019458e-05, "loss": 3.4595, "step": 821 }, { "epoch": 2.8552803129074316, "grad_norm": 0.7294395565986633, "learning_rate": 7.026991088541184e-05, "loss": 3.3974, "step": 822 }, { "epoch": 2.8587570621468927, "grad_norm": 0.6991782188415527, "learning_rate": 7.018619817198165e-05, "loss": 3.4243, "step": 823 }, { "epoch": 2.862233811386354, "grad_norm": 0.688694417476654, "learning_rate": 7.01024178004839e-05, "loss": 3.4343, "step": 824 }, { "epoch": 2.865710560625815, "grad_norm": 0.8485518097877502, "learning_rate": 7.001857005172515e-05, "loss": 3.4753, "step": 825 }, { "epoch": 2.869187309865276, "grad_norm": 0.7970876097679138, "learning_rate": 6.99346552067379e-05, "loss": 3.3739, "step": 826 }, { "epoch": 2.872664059104737, "grad_norm": 0.8355128765106201, "learning_rate": 6.985067354677946e-05, "loss": 3.3663, "step": 827 }, { "epoch": 2.876140808344198, "grad_norm": 1.0474745035171509, "learning_rate": 6.976662535333107e-05, "loss": 3.4602, "step": 828 }, { "epoch": 2.8796175575836593, "grad_norm": 0.971641480922699, "learning_rate": 6.968251090809708e-05, "loss": 3.5213, "step": 829 }, { "epoch": 2.8830943068231205, "grad_norm": 0.85914146900177, "learning_rate": 6.959833049300377e-05, "loss": 3.4002, "step": 830 }, { "epoch": 2.8865710560625812, "grad_norm": 0.7117258310317993, "learning_rate": 6.951408439019858e-05, "loss": 3.3931, "step": 831 }, { "epoch": 2.890047805302043, "grad_norm": 0.7882718443870544, "learning_rate": 6.942977288204915e-05, "loss": 3.4796, "step": 832 }, { "epoch": 2.8935245545415036, "grad_norm": 0.8237490057945251, "learning_rate": 6.93453962511423e-05, "loss": 3.511, "step": 833 }, { "epoch": 2.8970013037809648, "grad_norm": 0.8855555057525635, "learning_rate": 6.926095478028311e-05, "loss": 3.4921, "step": 834 }, { "epoch": 2.900478053020426, "grad_norm": 0.8086044788360596, "learning_rate": 6.917644875249404e-05, "loss": 3.3488, "step": 835 }, { "epoch": 2.903954802259887, "grad_norm": 0.672223687171936, "learning_rate": 6.909187845101387e-05, "loss": 3.4005, "step": 836 }, { "epoch": 2.9074315514993483, "grad_norm": 0.7050768136978149, "learning_rate": 6.900724415929681e-05, "loss": 3.436, "step": 837 }, { "epoch": 2.910908300738809, "grad_norm": 0.6785340905189514, "learning_rate": 6.892254616101159e-05, "loss": 3.4632, "step": 838 }, { "epoch": 2.91438504997827, "grad_norm": 0.6947048902511597, "learning_rate": 6.883778474004041e-05, "loss": 3.3555, "step": 839 }, { "epoch": 2.9178617992177314, "grad_norm": 0.712297797203064, "learning_rate": 6.87529601804781e-05, "loss": 3.4413, "step": 840 }, { "epoch": 2.9213385484571925, "grad_norm": 0.6110817193984985, "learning_rate": 6.866807276663106e-05, "loss": 3.3946, "step": 841 }, { "epoch": 2.9248152976966537, "grad_norm": 0.5554404258728027, "learning_rate": 6.858312278301637e-05, "loss": 3.3931, "step": 842 }, { "epoch": 2.928292046936115, "grad_norm": 0.5974950194358826, "learning_rate": 6.849811051436088e-05, "loss": 3.4516, "step": 843 }, { "epoch": 2.931768796175576, "grad_norm": 0.6106230020523071, "learning_rate": 6.841303624560012e-05, "loss": 3.4381, "step": 844 }, { "epoch": 2.935245545415037, "grad_norm": 0.6150991320610046, "learning_rate": 6.832790026187748e-05, "loss": 3.3654, "step": 845 }, { "epoch": 2.938722294654498, "grad_norm": 0.6058139204978943, "learning_rate": 6.824270284854319e-05, "loss": 3.4406, "step": 846 }, { "epoch": 2.942199043893959, "grad_norm": 0.6507914066314697, "learning_rate": 6.815744429115331e-05, "loss": 3.4364, "step": 847 }, { "epoch": 2.9456757931334203, "grad_norm": 0.7351534962654114, "learning_rate": 6.807212487546897e-05, "loss": 3.4768, "step": 848 }, { "epoch": 2.9491525423728815, "grad_norm": 0.7599115371704102, "learning_rate": 6.798674488745515e-05, "loss": 3.3948, "step": 849 }, { "epoch": 2.952629291612342, "grad_norm": 0.6528266668319702, "learning_rate": 6.790130461327993e-05, "loss": 3.4272, "step": 850 }, { "epoch": 2.956106040851804, "grad_norm": 0.5727442502975464, "learning_rate": 6.78158043393134e-05, "loss": 3.324, "step": 851 }, { "epoch": 2.9595827900912646, "grad_norm": 0.5675135254859924, "learning_rate": 6.773024435212678e-05, "loss": 3.3768, "step": 852 }, { "epoch": 2.9630595393307257, "grad_norm": 0.5391719937324524, "learning_rate": 6.764462493849144e-05, "loss": 3.3768, "step": 853 }, { "epoch": 2.966536288570187, "grad_norm": 0.5126081705093384, "learning_rate": 6.755894638537791e-05, "loss": 3.3735, "step": 854 }, { "epoch": 2.970013037809648, "grad_norm": 0.5525986552238464, "learning_rate": 6.747320897995493e-05, "loss": 3.3825, "step": 855 }, { "epoch": 2.9734897870491093, "grad_norm": 0.5459985733032227, "learning_rate": 6.73874130095885e-05, "loss": 3.3285, "step": 856 }, { "epoch": 2.97696653628857, "grad_norm": 0.5992332696914673, "learning_rate": 6.730155876184094e-05, "loss": 3.3521, "step": 857 }, { "epoch": 2.980443285528031, "grad_norm": 0.6372593641281128, "learning_rate": 6.721564652446986e-05, "loss": 3.3832, "step": 858 }, { "epoch": 2.9839200347674923, "grad_norm": 0.740464448928833, "learning_rate": 6.712967658542729e-05, "loss": 3.4085, "step": 859 }, { "epoch": 2.9873967840069535, "grad_norm": 0.8248367309570312, "learning_rate": 6.704364923285857e-05, "loss": 3.4706, "step": 860 }, { "epoch": 2.9908735332464147, "grad_norm": 0.9461883306503296, "learning_rate": 6.695756475510156e-05, "loss": 3.3043, "step": 861 }, { "epoch": 2.994350282485876, "grad_norm": 0.8672154545783997, "learning_rate": 6.687142344068553e-05, "loss": 3.4494, "step": 862 }, { "epoch": 2.997827031725337, "grad_norm": 0.7892129421234131, "learning_rate": 6.678522557833024e-05, "loss": 3.2984, "step": 863 }, { "epoch": 3.0, "grad_norm": 0.7941856384277344, "learning_rate": 6.669897145694507e-05, "loss": 3.4018, "step": 864 }, { "epoch": 3.003476749239461, "grad_norm": 0.7466159462928772, "learning_rate": 6.661266136562788e-05, "loss": 3.3642, "step": 865 }, { "epoch": 3.0069534984789223, "grad_norm": 0.8197594881057739, "learning_rate": 6.652629559366414e-05, "loss": 3.4616, "step": 866 }, { "epoch": 3.0104302477183835, "grad_norm": 0.8821737766265869, "learning_rate": 6.643987443052595e-05, "loss": 3.4096, "step": 867 }, { "epoch": 3.0139069969578443, "grad_norm": 1.0090149641036987, "learning_rate": 6.635339816587109e-05, "loss": 3.313, "step": 868 }, { "epoch": 3.0173837461973054, "grad_norm": 0.958720862865448, "learning_rate": 6.626686708954198e-05, "loss": 3.4145, "step": 869 }, { "epoch": 3.0208604954367666, "grad_norm": 0.8551411628723145, "learning_rate": 6.618028149156479e-05, "loss": 3.4031, "step": 870 }, { "epoch": 3.0243372446762278, "grad_norm": 0.8152096271514893, "learning_rate": 6.609364166214837e-05, "loss": 3.4753, "step": 871 }, { "epoch": 3.027813993915689, "grad_norm": 0.797737717628479, "learning_rate": 6.600694789168344e-05, "loss": 3.4012, "step": 872 }, { "epoch": 3.03129074315515, "grad_norm": 0.8055285215377808, "learning_rate": 6.592020047074144e-05, "loss": 3.3403, "step": 873 }, { "epoch": 3.034767492394611, "grad_norm": 0.6998369693756104, "learning_rate": 6.583339969007363e-05, "loss": 3.3336, "step": 874 }, { "epoch": 3.038244241634072, "grad_norm": 0.6863573789596558, "learning_rate": 6.574654584061013e-05, "loss": 3.369, "step": 875 }, { "epoch": 3.041720990873533, "grad_norm": 0.6229199767112732, "learning_rate": 6.565963921345895e-05, "loss": 3.2858, "step": 876 }, { "epoch": 3.0451977401129944, "grad_norm": 0.7069191336631775, "learning_rate": 6.557268009990496e-05, "loss": 3.4242, "step": 877 }, { "epoch": 3.0486744893524556, "grad_norm": 0.6492975354194641, "learning_rate": 6.548566879140897e-05, "loss": 3.4076, "step": 878 }, { "epoch": 3.0521512385919167, "grad_norm": 0.5807247161865234, "learning_rate": 6.539860557960674e-05, "loss": 3.3027, "step": 879 }, { "epoch": 3.0556279878313775, "grad_norm": 0.6085408926010132, "learning_rate": 6.531149075630796e-05, "loss": 3.3768, "step": 880 }, { "epoch": 3.0591047370708386, "grad_norm": 0.6492026448249817, "learning_rate": 6.522432461349536e-05, "loss": 3.3567, "step": 881 }, { "epoch": 3.0625814863103, "grad_norm": 0.6504300832748413, "learning_rate": 6.51371074433236e-05, "loss": 3.3631, "step": 882 }, { "epoch": 3.066058235549761, "grad_norm": 0.616385281085968, "learning_rate": 6.504983953811845e-05, "loss": 3.399, "step": 883 }, { "epoch": 3.069534984789222, "grad_norm": 0.6602182984352112, "learning_rate": 6.49625211903757e-05, "loss": 3.2783, "step": 884 }, { "epoch": 3.0730117340286833, "grad_norm": 0.6816926002502441, "learning_rate": 6.487515269276016e-05, "loss": 3.4135, "step": 885 }, { "epoch": 3.076488483268144, "grad_norm": 0.6374818682670593, "learning_rate": 6.478773433810477e-05, "loss": 3.4138, "step": 886 }, { "epoch": 3.0799652325076052, "grad_norm": 0.6355860829353333, "learning_rate": 6.470026641940963e-05, "loss": 3.3589, "step": 887 }, { "epoch": 3.0834419817470664, "grad_norm": 0.6020347476005554, "learning_rate": 6.461274922984086e-05, "loss": 3.3701, "step": 888 }, { "epoch": 3.0869187309865276, "grad_norm": 0.5898982882499695, "learning_rate": 6.45251830627298e-05, "loss": 3.332, "step": 889 }, { "epoch": 3.0903954802259888, "grad_norm": 0.6016554236412048, "learning_rate": 6.443756821157186e-05, "loss": 3.3707, "step": 890 }, { "epoch": 3.09387222946545, "grad_norm": 0.59771728515625, "learning_rate": 6.434990497002573e-05, "loss": 3.462, "step": 891 }, { "epoch": 3.097348978704911, "grad_norm": 0.5475634336471558, "learning_rate": 6.426219363191224e-05, "loss": 3.3056, "step": 892 }, { "epoch": 3.100825727944372, "grad_norm": 0.5900417566299438, "learning_rate": 6.417443449121339e-05, "loss": 3.2803, "step": 893 }, { "epoch": 3.104302477183833, "grad_norm": 0.5932677388191223, "learning_rate": 6.408662784207149e-05, "loss": 3.4113, "step": 894 }, { "epoch": 3.107779226423294, "grad_norm": 0.6010504961013794, "learning_rate": 6.3998773978788e-05, "loss": 3.4699, "step": 895 }, { "epoch": 3.1112559756627554, "grad_norm": 0.7265802025794983, "learning_rate": 6.391087319582264e-05, "loss": 3.3484, "step": 896 }, { "epoch": 3.1147327249022165, "grad_norm": 0.8029538989067078, "learning_rate": 6.382292578779243e-05, "loss": 3.4066, "step": 897 }, { "epoch": 3.1182094741416777, "grad_norm": 0.7880899310112, "learning_rate": 6.373493204947065e-05, "loss": 3.4581, "step": 898 }, { "epoch": 3.1216862233811384, "grad_norm": 0.9686315655708313, "learning_rate": 6.364689227578583e-05, "loss": 3.3693, "step": 899 }, { "epoch": 3.1251629726205996, "grad_norm": 0.9728556871414185, "learning_rate": 6.355880676182086e-05, "loss": 3.3544, "step": 900 }, { "epoch": 3.128639721860061, "grad_norm": 0.700410783290863, "learning_rate": 6.347067580281186e-05, "loss": 3.384, "step": 901 }, { "epoch": 3.132116471099522, "grad_norm": 0.8260563611984253, "learning_rate": 6.338249969414734e-05, "loss": 3.375, "step": 902 }, { "epoch": 3.135593220338983, "grad_norm": 0.8206276297569275, "learning_rate": 6.32942787313671e-05, "loss": 3.2862, "step": 903 }, { "epoch": 3.1390699695784443, "grad_norm": 0.6374201774597168, "learning_rate": 6.320601321016128e-05, "loss": 3.3537, "step": 904 }, { "epoch": 3.142546718817905, "grad_norm": 0.7070665955543518, "learning_rate": 6.311770342636937e-05, "loss": 3.3998, "step": 905 }, { "epoch": 3.146023468057366, "grad_norm": 0.6780283451080322, "learning_rate": 6.302934967597922e-05, "loss": 3.4153, "step": 906 }, { "epoch": 3.1495002172968274, "grad_norm": 0.6037342548370361, "learning_rate": 6.294095225512603e-05, "loss": 3.3119, "step": 907 }, { "epoch": 3.1529769665362886, "grad_norm": 0.617067813873291, "learning_rate": 6.28525114600914e-05, "loss": 3.415, "step": 908 }, { "epoch": 3.1564537157757497, "grad_norm": 0.5938243269920349, "learning_rate": 6.276402758730229e-05, "loss": 3.3607, "step": 909 }, { "epoch": 3.159930465015211, "grad_norm": 0.5539820194244385, "learning_rate": 6.267550093333e-05, "loss": 3.3559, "step": 910 }, { "epoch": 3.163407214254672, "grad_norm": 0.6201304197311401, "learning_rate": 6.25869317948893e-05, "loss": 3.3305, "step": 911 }, { "epoch": 3.166883963494133, "grad_norm": 0.6455796957015991, "learning_rate": 6.24983204688373e-05, "loss": 3.3515, "step": 912 }, { "epoch": 3.170360712733594, "grad_norm": 0.6351650953292847, "learning_rate": 6.240966725217249e-05, "loss": 3.3312, "step": 913 }, { "epoch": 3.173837461973055, "grad_norm": 0.6822057366371155, "learning_rate": 6.232097244203388e-05, "loss": 3.3026, "step": 914 }, { "epoch": 3.1773142112125163, "grad_norm": 0.700149655342102, "learning_rate": 6.223223633569973e-05, "loss": 3.3879, "step": 915 }, { "epoch": 3.1807909604519775, "grad_norm": 0.5904785990715027, "learning_rate": 6.214345923058686e-05, "loss": 3.3544, "step": 916 }, { "epoch": 3.1842677096914387, "grad_norm": 0.6163371205329895, "learning_rate": 6.205464142424938e-05, "loss": 3.2991, "step": 917 }, { "epoch": 3.1877444589308994, "grad_norm": 0.6407819390296936, "learning_rate": 6.19657832143779e-05, "loss": 3.3577, "step": 918 }, { "epoch": 3.1912212081703606, "grad_norm": 0.7109596133232117, "learning_rate": 6.187688489879842e-05, "loss": 3.359, "step": 919 }, { "epoch": 3.1946979574098218, "grad_norm": 0.6707236766815186, "learning_rate": 6.178794677547137e-05, "loss": 3.3949, "step": 920 }, { "epoch": 3.198174706649283, "grad_norm": 0.6592925190925598, "learning_rate": 6.169896914249059e-05, "loss": 3.3861, "step": 921 }, { "epoch": 3.201651455888744, "grad_norm": 0.6427909731864929, "learning_rate": 6.160995229808239e-05, "loss": 3.4066, "step": 922 }, { "epoch": 3.2051282051282053, "grad_norm": 0.6185514330863953, "learning_rate": 6.152089654060444e-05, "loss": 3.305, "step": 923 }, { "epoch": 3.208604954367666, "grad_norm": 0.6078172326087952, "learning_rate": 6.143180216854487e-05, "loss": 3.2733, "step": 924 }, { "epoch": 3.212081703607127, "grad_norm": 0.6162821650505066, "learning_rate": 6.134266948052126e-05, "loss": 3.4089, "step": 925 }, { "epoch": 3.2155584528465884, "grad_norm": 0.6186605095863342, "learning_rate": 6.125349877527952e-05, "loss": 3.3432, "step": 926 }, { "epoch": 3.2190352020860495, "grad_norm": 0.582805335521698, "learning_rate": 6.116429035169309e-05, "loss": 3.2337, "step": 927 }, { "epoch": 3.2225119513255107, "grad_norm": 0.6280192136764526, "learning_rate": 6.107504450876181e-05, "loss": 3.4249, "step": 928 }, { "epoch": 3.225988700564972, "grad_norm": 0.6952533721923828, "learning_rate": 6.098576154561087e-05, "loss": 3.397, "step": 929 }, { "epoch": 3.229465449804433, "grad_norm": 0.7494910955429077, "learning_rate": 6.089644176148992e-05, "loss": 3.3243, "step": 930 }, { "epoch": 3.232942199043894, "grad_norm": 0.8510701656341553, "learning_rate": 6.080708545577206e-05, "loss": 3.3274, "step": 931 }, { "epoch": 3.236418948283355, "grad_norm": 0.8542797565460205, "learning_rate": 6.0717692927952744e-05, "loss": 3.3442, "step": 932 }, { "epoch": 3.239895697522816, "grad_norm": 0.8013666868209839, "learning_rate": 6.062826447764883e-05, "loss": 3.4637, "step": 933 }, { "epoch": 3.2433724467622773, "grad_norm": 0.6489382982254028, "learning_rate": 6.053880040459764e-05, "loss": 3.2955, "step": 934 }, { "epoch": 3.2468491960017385, "grad_norm": 0.7227512001991272, "learning_rate": 6.044930100865582e-05, "loss": 3.3125, "step": 935 }, { "epoch": 3.2503259452411997, "grad_norm": 0.6559109091758728, "learning_rate": 6.035976658979846e-05, "loss": 3.3617, "step": 936 }, { "epoch": 3.2538026944806604, "grad_norm": 0.5902848839759827, "learning_rate": 6.027019744811799e-05, "loss": 3.3577, "step": 937 }, { "epoch": 3.2572794437201216, "grad_norm": 0.6053394079208374, "learning_rate": 6.0180593883823266e-05, "loss": 3.3575, "step": 938 }, { "epoch": 3.2607561929595827, "grad_norm": 0.6064934730529785, "learning_rate": 6.009095619723849e-05, "loss": 3.2599, "step": 939 }, { "epoch": 3.264232942199044, "grad_norm": 0.5894815325737, "learning_rate": 6.0001284688802226e-05, "loss": 3.3916, "step": 940 }, { "epoch": 3.267709691438505, "grad_norm": 0.6426341533660889, "learning_rate": 5.991157965906643e-05, "loss": 3.2913, "step": 941 }, { "epoch": 3.2711864406779663, "grad_norm": 0.6230353713035583, "learning_rate": 5.982184140869539e-05, "loss": 3.2609, "step": 942 }, { "epoch": 3.274663189917427, "grad_norm": 0.604451060295105, "learning_rate": 5.973207023846475e-05, "loss": 3.3861, "step": 943 }, { "epoch": 3.278139939156888, "grad_norm": 0.5737584829330444, "learning_rate": 5.964226644926045e-05, "loss": 3.3206, "step": 944 }, { "epoch": 3.2816166883963493, "grad_norm": 0.563567042350769, "learning_rate": 5.9552430342077845e-05, "loss": 3.3096, "step": 945 }, { "epoch": 3.2850934376358105, "grad_norm": 0.6597940325737, "learning_rate": 5.946256221802051e-05, "loss": 3.4015, "step": 946 }, { "epoch": 3.2885701868752717, "grad_norm": 0.771218478679657, "learning_rate": 5.937266237829941e-05, "loss": 3.4045, "step": 947 }, { "epoch": 3.292046936114733, "grad_norm": 0.8483478426933289, "learning_rate": 5.928273112423177e-05, "loss": 3.2997, "step": 948 }, { "epoch": 3.295523685354194, "grad_norm": 0.7908669114112854, "learning_rate": 5.9192768757240115e-05, "loss": 3.3535, "step": 949 }, { "epoch": 3.2990004345936548, "grad_norm": 0.831207811832428, "learning_rate": 5.9102775578851275e-05, "loss": 3.3441, "step": 950 }, { "epoch": 3.302477183833116, "grad_norm": 0.8150995373725891, "learning_rate": 5.90127518906953e-05, "loss": 3.2245, "step": 951 }, { "epoch": 3.305953933072577, "grad_norm": 0.5736242532730103, "learning_rate": 5.892269799450453e-05, "loss": 3.3601, "step": 952 }, { "epoch": 3.3094306823120383, "grad_norm": 0.724216639995575, "learning_rate": 5.883261419211257e-05, "loss": 3.3684, "step": 953 }, { "epoch": 3.3129074315514995, "grad_norm": 0.6450987458229065, "learning_rate": 5.874250078545323e-05, "loss": 3.3208, "step": 954 }, { "epoch": 3.3163841807909606, "grad_norm": 0.630285382270813, "learning_rate": 5.8652358076559554e-05, "loss": 3.2841, "step": 955 }, { "epoch": 3.3198609300304214, "grad_norm": 0.6654447913169861, "learning_rate": 5.856218636756281e-05, "loss": 3.3725, "step": 956 }, { "epoch": 3.3233376792698825, "grad_norm": 0.6245713829994202, "learning_rate": 5.847198596069148e-05, "loss": 3.2424, "step": 957 }, { "epoch": 3.3268144285093437, "grad_norm": 0.6425074338912964, "learning_rate": 5.838175715827016e-05, "loss": 3.3608, "step": 958 }, { "epoch": 3.330291177748805, "grad_norm": 0.635083019733429, "learning_rate": 5.829150026271871e-05, "loss": 3.408, "step": 959 }, { "epoch": 3.333767926988266, "grad_norm": 0.6142615675926208, "learning_rate": 5.820121557655109e-05, "loss": 3.2893, "step": 960 }, { "epoch": 3.3372446762277272, "grad_norm": 0.62328040599823, "learning_rate": 5.811090340237445e-05, "loss": 3.2408, "step": 961 }, { "epoch": 3.340721425467188, "grad_norm": 0.6543041467666626, "learning_rate": 5.8020564042888015e-05, "loss": 3.3329, "step": 962 }, { "epoch": 3.344198174706649, "grad_norm": 0.6720776557922363, "learning_rate": 5.793019780088217e-05, "loss": 3.2764, "step": 963 }, { "epoch": 3.3476749239461103, "grad_norm": 0.6297585368156433, "learning_rate": 5.783980497923742e-05, "loss": 3.3776, "step": 964 }, { "epoch": 3.3511516731855715, "grad_norm": 0.5663807988166809, "learning_rate": 5.774938588092327e-05, "loss": 3.2923, "step": 965 }, { "epoch": 3.3546284224250327, "grad_norm": 0.5683589577674866, "learning_rate": 5.7658940808997394e-05, "loss": 3.2942, "step": 966 }, { "epoch": 3.358105171664494, "grad_norm": 0.6379316449165344, "learning_rate": 5.7568470066604485e-05, "loss": 3.3255, "step": 967 }, { "epoch": 3.361581920903955, "grad_norm": 0.7071390151977539, "learning_rate": 5.747797395697525e-05, "loss": 3.2915, "step": 968 }, { "epoch": 3.3650586701434158, "grad_norm": 0.7530797719955444, "learning_rate": 5.738745278342546e-05, "loss": 3.2985, "step": 969 }, { "epoch": 3.368535419382877, "grad_norm": 0.7314615845680237, "learning_rate": 5.729690684935487e-05, "loss": 3.2902, "step": 970 }, { "epoch": 3.372012168622338, "grad_norm": 0.6785851120948792, "learning_rate": 5.7206336458246234e-05, "loss": 3.1867, "step": 971 }, { "epoch": 3.3754889178617993, "grad_norm": 0.6161269545555115, "learning_rate": 5.7115741913664264e-05, "loss": 3.3526, "step": 972 }, { "epoch": 3.3789656671012605, "grad_norm": 0.6322996020317078, "learning_rate": 5.702512351925464e-05, "loss": 3.2621, "step": 973 }, { "epoch": 3.382442416340721, "grad_norm": 0.6312293410301208, "learning_rate": 5.693448157874298e-05, "loss": 3.3925, "step": 974 }, { "epoch": 3.3859191655801824, "grad_norm": 0.6690333485603333, "learning_rate": 5.6843816395933825e-05, "loss": 3.3599, "step": 975 }, { "epoch": 3.3893959148196435, "grad_norm": 0.6046136021614075, "learning_rate": 5.675312827470959e-05, "loss": 3.2502, "step": 976 }, { "epoch": 3.3928726640591047, "grad_norm": 0.5736027359962463, "learning_rate": 5.666241751902962e-05, "loss": 3.2605, "step": 977 }, { "epoch": 3.396349413298566, "grad_norm": 0.6438172459602356, "learning_rate": 5.6571684432929085e-05, "loss": 3.4194, "step": 978 }, { "epoch": 3.399826162538027, "grad_norm": 0.6276637315750122, "learning_rate": 5.648092932051801e-05, "loss": 3.4356, "step": 979 }, { "epoch": 3.4033029117774882, "grad_norm": 0.6946492195129395, "learning_rate": 5.6390152485980244e-05, "loss": 3.3409, "step": 980 }, { "epoch": 3.406779661016949, "grad_norm": 0.7016040682792664, "learning_rate": 5.6299354233572445e-05, "loss": 3.2799, "step": 981 }, { "epoch": 3.41025641025641, "grad_norm": 0.6583088636398315, "learning_rate": 5.6208534867623067e-05, "loss": 3.3199, "step": 982 }, { "epoch": 3.4137331594958713, "grad_norm": 0.6809118390083313, "learning_rate": 5.611769469253132e-05, "loss": 3.3738, "step": 983 }, { "epoch": 3.4172099087353325, "grad_norm": 0.7044417858123779, "learning_rate": 5.602683401276615e-05, "loss": 3.2786, "step": 984 }, { "epoch": 3.4206866579747937, "grad_norm": 0.7142994999885559, "learning_rate": 5.593595313286526e-05, "loss": 3.2612, "step": 985 }, { "epoch": 3.424163407214255, "grad_norm": 0.5846092104911804, "learning_rate": 5.584505235743403e-05, "loss": 3.2446, "step": 986 }, { "epoch": 3.427640156453716, "grad_norm": 0.6685866117477417, "learning_rate": 5.575413199114452e-05, "loss": 3.3602, "step": 987 }, { "epoch": 3.4311169056931767, "grad_norm": 0.6180519461631775, "learning_rate": 5.566319233873446e-05, "loss": 3.1912, "step": 988 }, { "epoch": 3.434593654932638, "grad_norm": 0.6132622361183167, "learning_rate": 5.557223370500626e-05, "loss": 3.3666, "step": 989 }, { "epoch": 3.438070404172099, "grad_norm": 0.6167900562286377, "learning_rate": 5.548125639482586e-05, "loss": 3.3479, "step": 990 }, { "epoch": 3.4415471534115603, "grad_norm": 0.6413175463676453, "learning_rate": 5.539026071312191e-05, "loss": 3.2576, "step": 991 }, { "epoch": 3.4450239026510214, "grad_norm": 0.6170135736465454, "learning_rate": 5.529924696488456e-05, "loss": 3.3199, "step": 992 }, { "epoch": 3.448500651890482, "grad_norm": 0.5534811019897461, "learning_rate": 5.52082154551645e-05, "loss": 3.1903, "step": 993 }, { "epoch": 3.4519774011299433, "grad_norm": 0.608566403388977, "learning_rate": 5.5117166489072014e-05, "loss": 3.319, "step": 994 }, { "epoch": 3.4554541503694045, "grad_norm": 0.6388020515441895, "learning_rate": 5.502610037177586e-05, "loss": 3.2828, "step": 995 }, { "epoch": 3.4589308996088657, "grad_norm": 0.6199042201042175, "learning_rate": 5.4935017408502274e-05, "loss": 3.3288, "step": 996 }, { "epoch": 3.462407648848327, "grad_norm": 0.6209288835525513, "learning_rate": 5.4843917904533994e-05, "loss": 3.277, "step": 997 }, { "epoch": 3.465884398087788, "grad_norm": 0.657084584236145, "learning_rate": 5.475280216520913e-05, "loss": 3.3167, "step": 998 }, { "epoch": 3.469361147327249, "grad_norm": 0.6538966298103333, "learning_rate": 5.466167049592029e-05, "loss": 3.1976, "step": 999 }, { "epoch": 3.47283789656671, "grad_norm": 0.7147399187088013, "learning_rate": 5.4570523202113396e-05, "loss": 3.3552, "step": 1000 }, { "epoch": 3.476314645806171, "grad_norm": 0.7168034315109253, "learning_rate": 5.44793605892868e-05, "loss": 3.3, "step": 1001 }, { "epoch": 3.4797913950456323, "grad_norm": 0.6470374464988708, "learning_rate": 5.438818296299015e-05, "loss": 3.3219, "step": 1002 }, { "epoch": 3.4832681442850935, "grad_norm": 0.666338324546814, "learning_rate": 5.4296990628823455e-05, "loss": 3.2503, "step": 1003 }, { "epoch": 3.4867448935245546, "grad_norm": 0.6241434216499329, "learning_rate": 5.420578389243599e-05, "loss": 3.1832, "step": 1004 }, { "epoch": 3.490221642764016, "grad_norm": 0.6188483238220215, "learning_rate": 5.4114563059525346e-05, "loss": 3.256, "step": 1005 }, { "epoch": 3.493698392003477, "grad_norm": 0.6511822938919067, "learning_rate": 5.402332843583631e-05, "loss": 3.3172, "step": 1006 }, { "epoch": 3.4971751412429377, "grad_norm": 0.5653273463249207, "learning_rate": 5.3932080327159886e-05, "loss": 3.2706, "step": 1007 }, { "epoch": 3.500651890482399, "grad_norm": 0.5717982649803162, "learning_rate": 5.384081903933235e-05, "loss": 3.2577, "step": 1008 }, { "epoch": 3.50412863972186, "grad_norm": 0.5827626585960388, "learning_rate": 5.374954487823407e-05, "loss": 3.3031, "step": 1009 }, { "epoch": 3.5076053889613212, "grad_norm": 0.611819326877594, "learning_rate": 5.365825814978861e-05, "loss": 3.2244, "step": 1010 }, { "epoch": 3.5110821382007824, "grad_norm": 0.6256561875343323, "learning_rate": 5.3566959159961615e-05, "loss": 3.2336, "step": 1011 }, { "epoch": 3.514558887440243, "grad_norm": 0.6403440237045288, "learning_rate": 5.3475648214759896e-05, "loss": 3.3431, "step": 1012 }, { "epoch": 3.5180356366797043, "grad_norm": 0.612866997718811, "learning_rate": 5.3384325620230245e-05, "loss": 3.2884, "step": 1013 }, { "epoch": 3.5215123859191655, "grad_norm": 0.5799668431282043, "learning_rate": 5.3292991682458574e-05, "loss": 3.3096, "step": 1014 }, { "epoch": 3.5249891351586267, "grad_norm": 0.5905560255050659, "learning_rate": 5.3201646707568764e-05, "loss": 3.274, "step": 1015 }, { "epoch": 3.528465884398088, "grad_norm": 0.5543707609176636, "learning_rate": 5.311029100172172e-05, "loss": 3.1989, "step": 1016 }, { "epoch": 3.531942633637549, "grad_norm": 0.5833373069763184, "learning_rate": 5.3018924871114305e-05, "loss": 3.2519, "step": 1017 }, { "epoch": 3.53541938287701, "grad_norm": 0.5746926069259644, "learning_rate": 5.292754862197831e-05, "loss": 3.1616, "step": 1018 }, { "epoch": 3.538896132116471, "grad_norm": 0.5569521188735962, "learning_rate": 5.2836162560579486e-05, "loss": 3.2796, "step": 1019 }, { "epoch": 3.542372881355932, "grad_norm": 0.5777444839477539, "learning_rate": 5.274476699321638e-05, "loss": 3.2595, "step": 1020 }, { "epoch": 3.5458496305953933, "grad_norm": 0.5533985495567322, "learning_rate": 5.265336222621949e-05, "loss": 3.2107, "step": 1021 }, { "epoch": 3.5493263798348544, "grad_norm": 0.5138996839523315, "learning_rate": 5.2561948565950126e-05, "loss": 3.2121, "step": 1022 }, { "epoch": 3.5528031290743156, "grad_norm": 0.5798884630203247, "learning_rate": 5.2470526318799365e-05, "loss": 3.2697, "step": 1023 }, { "epoch": 3.5562798783137763, "grad_norm": 0.6103897094726562, "learning_rate": 5.2379095791187124e-05, "loss": 3.2417, "step": 1024 }, { "epoch": 3.559756627553238, "grad_norm": 0.5791857242584229, "learning_rate": 5.228765728956102e-05, "loss": 3.2789, "step": 1025 }, { "epoch": 3.5632333767926987, "grad_norm": 0.5706868767738342, "learning_rate": 5.2196211120395444e-05, "loss": 3.3693, "step": 1026 }, { "epoch": 3.56671012603216, "grad_norm": 0.5966957807540894, "learning_rate": 5.2104757590190445e-05, "loss": 3.2444, "step": 1027 }, { "epoch": 3.570186875271621, "grad_norm": 0.5554029941558838, "learning_rate": 5.201329700547076e-05, "loss": 3.2575, "step": 1028 }, { "epoch": 3.573663624511082, "grad_norm": 0.5967298150062561, "learning_rate": 5.1921829672784786e-05, "loss": 3.2913, "step": 1029 }, { "epoch": 3.5771403737505434, "grad_norm": 0.6098992824554443, "learning_rate": 5.183035589870353e-05, "loss": 3.2266, "step": 1030 }, { "epoch": 3.580617122990004, "grad_norm": 0.5845767259597778, "learning_rate": 5.173887598981956e-05, "loss": 3.2771, "step": 1031 }, { "epoch": 3.5840938722294653, "grad_norm": 0.6617316603660583, "learning_rate": 5.164739025274604e-05, "loss": 3.2556, "step": 1032 }, { "epoch": 3.5875706214689265, "grad_norm": 0.7001234292984009, "learning_rate": 5.155589899411567e-05, "loss": 3.264, "step": 1033 }, { "epoch": 3.5910473707083876, "grad_norm": 0.7605653405189514, "learning_rate": 5.146440252057961e-05, "loss": 3.2912, "step": 1034 }, { "epoch": 3.594524119947849, "grad_norm": 0.74252849817276, "learning_rate": 5.137290113880656e-05, "loss": 3.2395, "step": 1035 }, { "epoch": 3.59800086918731, "grad_norm": 0.7059817314147949, "learning_rate": 5.128139515548164e-05, "loss": 3.2552, "step": 1036 }, { "epoch": 3.601477618426771, "grad_norm": 0.7125615477561951, "learning_rate": 5.1189884877305375e-05, "loss": 3.2621, "step": 1037 }, { "epoch": 3.604954367666232, "grad_norm": 0.7657261490821838, "learning_rate": 5.109837061099274e-05, "loss": 3.1871, "step": 1038 }, { "epoch": 3.608431116905693, "grad_norm": 0.6542237401008606, "learning_rate": 5.100685266327202e-05, "loss": 3.1583, "step": 1039 }, { "epoch": 3.6119078661451542, "grad_norm": 0.6304188370704651, "learning_rate": 5.091533134088388e-05, "loss": 3.2659, "step": 1040 }, { "epoch": 3.6153846153846154, "grad_norm": 0.6933708190917969, "learning_rate": 5.0823806950580254e-05, "loss": 3.2046, "step": 1041 }, { "epoch": 3.6188613646240766, "grad_norm": 0.6795551776885986, "learning_rate": 5.073227979912339e-05, "loss": 3.3441, "step": 1042 }, { "epoch": 3.6223381138635373, "grad_norm": 0.6919432282447815, "learning_rate": 5.064075019328479e-05, "loss": 3.2962, "step": 1043 }, { "epoch": 3.625814863102999, "grad_norm": 0.7967680096626282, "learning_rate": 5.054921843984418e-05, "loss": 3.2839, "step": 1044 }, { "epoch": 3.6292916123424597, "grad_norm": 0.7113544940948486, "learning_rate": 5.045768484558847e-05, "loss": 3.2422, "step": 1045 }, { "epoch": 3.632768361581921, "grad_norm": 0.6717776656150818, "learning_rate": 5.036614971731076e-05, "loss": 3.256, "step": 1046 }, { "epoch": 3.636245110821382, "grad_norm": 0.5887851119041443, "learning_rate": 5.027461336180929e-05, "loss": 3.1419, "step": 1047 }, { "epoch": 3.639721860060843, "grad_norm": 0.6715282201766968, "learning_rate": 5.018307608588636e-05, "loss": 3.2874, "step": 1048 }, { "epoch": 3.6431986093003044, "grad_norm": 0.6706719994544983, "learning_rate": 5.0091538196347445e-05, "loss": 3.2945, "step": 1049 }, { "epoch": 3.646675358539765, "grad_norm": 0.6065131425857544, "learning_rate": 5e-05, "loss": 3.2286, "step": 1050 }, { "epoch": 3.6501521077792263, "grad_norm": 0.5673298239707947, "learning_rate": 4.9908461803652566e-05, "loss": 3.332, "step": 1051 }, { "epoch": 3.6536288570186874, "grad_norm": 0.5908064842224121, "learning_rate": 4.981692391411366e-05, "loss": 3.3474, "step": 1052 }, { "epoch": 3.6571056062581486, "grad_norm": 0.5967066884040833, "learning_rate": 4.972538663819073e-05, "loss": 3.248, "step": 1053 }, { "epoch": 3.66058235549761, "grad_norm": 0.6299230456352234, "learning_rate": 4.9633850282689246e-05, "loss": 3.1357, "step": 1054 }, { "epoch": 3.664059104737071, "grad_norm": 0.5750209093093872, "learning_rate": 4.954231515441153e-05, "loss": 3.1899, "step": 1055 }, { "epoch": 3.667535853976532, "grad_norm": 0.6004741787910461, "learning_rate": 4.9450781560155816e-05, "loss": 3.3146, "step": 1056 }, { "epoch": 3.671012603215993, "grad_norm": 0.6603873372077942, "learning_rate": 4.935924980671522e-05, "loss": 3.3052, "step": 1057 }, { "epoch": 3.674489352455454, "grad_norm": 0.5748945474624634, "learning_rate": 4.926772020087663e-05, "loss": 3.294, "step": 1058 }, { "epoch": 3.6779661016949152, "grad_norm": 0.56917804479599, "learning_rate": 4.917619304941977e-05, "loss": 3.2473, "step": 1059 }, { "epoch": 3.6814428509343764, "grad_norm": 0.6211404204368591, "learning_rate": 4.9084668659116154e-05, "loss": 3.3157, "step": 1060 }, { "epoch": 3.6849196001738376, "grad_norm": 0.6026207804679871, "learning_rate": 4.899314733672799e-05, "loss": 3.2696, "step": 1061 }, { "epoch": 3.6883963494132983, "grad_norm": 0.5868690609931946, "learning_rate": 4.890162938900727e-05, "loss": 3.1322, "step": 1062 }, { "epoch": 3.69187309865276, "grad_norm": 0.5615720748901367, "learning_rate": 4.881011512269463e-05, "loss": 3.3041, "step": 1063 }, { "epoch": 3.6953498478922207, "grad_norm": 0.5574554800987244, "learning_rate": 4.871860484451838e-05, "loss": 3.2903, "step": 1064 }, { "epoch": 3.698826597131682, "grad_norm": 0.589513897895813, "learning_rate": 4.862709886119344e-05, "loss": 3.1515, "step": 1065 }, { "epoch": 3.702303346371143, "grad_norm": 0.6018418073654175, "learning_rate": 4.8535597479420406e-05, "loss": 3.2614, "step": 1066 }, { "epoch": 3.705780095610604, "grad_norm": 0.5487574934959412, "learning_rate": 4.844410100588435e-05, "loss": 3.2011, "step": 1067 }, { "epoch": 3.7092568448500653, "grad_norm": 0.5497720241546631, "learning_rate": 4.835260974725397e-05, "loss": 3.2824, "step": 1068 }, { "epoch": 3.712733594089526, "grad_norm": 0.5350008606910706, "learning_rate": 4.8261124010180445e-05, "loss": 3.2405, "step": 1069 }, { "epoch": 3.7162103433289873, "grad_norm": 0.5541216135025024, "learning_rate": 4.8169644101296474e-05, "loss": 3.2287, "step": 1070 }, { "epoch": 3.7196870925684484, "grad_norm": 0.6203867197036743, "learning_rate": 4.807817032721522e-05, "loss": 3.2801, "step": 1071 }, { "epoch": 3.7231638418079096, "grad_norm": 0.6372770667076111, "learning_rate": 4.798670299452926e-05, "loss": 3.2477, "step": 1072 }, { "epoch": 3.7266405910473708, "grad_norm": 0.6761902570724487, "learning_rate": 4.789524240980958e-05, "loss": 3.2624, "step": 1073 }, { "epoch": 3.730117340286832, "grad_norm": 0.732210636138916, "learning_rate": 4.780378887960458e-05, "loss": 3.2492, "step": 1074 }, { "epoch": 3.733594089526293, "grad_norm": 0.6838912963867188, "learning_rate": 4.7712342710438987e-05, "loss": 3.1799, "step": 1075 }, { "epoch": 3.737070838765754, "grad_norm": 0.6677350401878357, "learning_rate": 4.762090420881289e-05, "loss": 3.1473, "step": 1076 }, { "epoch": 3.740547588005215, "grad_norm": 0.6039561033248901, "learning_rate": 4.7529473681200646e-05, "loss": 3.178, "step": 1077 }, { "epoch": 3.744024337244676, "grad_norm": 0.6518564224243164, "learning_rate": 4.743805143404989e-05, "loss": 3.2581, "step": 1078 }, { "epoch": 3.7475010864841374, "grad_norm": 0.6599782705307007, "learning_rate": 4.734663777378052e-05, "loss": 3.2275, "step": 1079 }, { "epoch": 3.7509778357235986, "grad_norm": 0.5603999495506287, "learning_rate": 4.725523300678363e-05, "loss": 3.2184, "step": 1080 }, { "epoch": 3.7544545849630593, "grad_norm": 0.6059252619743347, "learning_rate": 4.7163837439420525e-05, "loss": 3.272, "step": 1081 }, { "epoch": 3.757931334202521, "grad_norm": 0.5787419080734253, "learning_rate": 4.707245137802169e-05, "loss": 3.1984, "step": 1082 }, { "epoch": 3.7614080834419816, "grad_norm": 0.5447966456413269, "learning_rate": 4.6981075128885693e-05, "loss": 3.1944, "step": 1083 }, { "epoch": 3.764884832681443, "grad_norm": 0.5587812066078186, "learning_rate": 4.6889708998278284e-05, "loss": 3.2819, "step": 1084 }, { "epoch": 3.768361581920904, "grad_norm": 0.5818442106246948, "learning_rate": 4.6798353292431254e-05, "loss": 3.2023, "step": 1085 }, { "epoch": 3.771838331160365, "grad_norm": 0.6101146340370178, "learning_rate": 4.6707008317541444e-05, "loss": 3.1397, "step": 1086 }, { "epoch": 3.7753150803998263, "grad_norm": 0.5633519291877747, "learning_rate": 4.661567437976977e-05, "loss": 3.196, "step": 1087 }, { "epoch": 3.778791829639287, "grad_norm": 0.5950900912284851, "learning_rate": 4.652435178524013e-05, "loss": 3.2732, "step": 1088 }, { "epoch": 3.7822685788787482, "grad_norm": 0.6070186495780945, "learning_rate": 4.643304084003839e-05, "loss": 3.233, "step": 1089 }, { "epoch": 3.7857453281182094, "grad_norm": 0.622246503829956, "learning_rate": 4.6341741850211404e-05, "loss": 3.2402, "step": 1090 }, { "epoch": 3.7892220773576706, "grad_norm": 0.628646731376648, "learning_rate": 4.6250455121765944e-05, "loss": 3.2109, "step": 1091 }, { "epoch": 3.7926988265971318, "grad_norm": 0.5549827218055725, "learning_rate": 4.615918096066766e-05, "loss": 3.254, "step": 1092 }, { "epoch": 3.796175575836593, "grad_norm": 0.5434677600860596, "learning_rate": 4.606791967284012e-05, "loss": 3.2507, "step": 1093 }, { "epoch": 3.799652325076054, "grad_norm": 0.5489455461502075, "learning_rate": 4.597667156416371e-05, "loss": 3.26, "step": 1094 }, { "epoch": 3.803129074315515, "grad_norm": 0.5459377765655518, "learning_rate": 4.588543694047466e-05, "loss": 3.2457, "step": 1095 }, { "epoch": 3.806605823554976, "grad_norm": 0.5723990201950073, "learning_rate": 4.5794216107564e-05, "loss": 3.2643, "step": 1096 }, { "epoch": 3.810082572794437, "grad_norm": 0.5779864192008972, "learning_rate": 4.570300937117655e-05, "loss": 3.1926, "step": 1097 }, { "epoch": 3.8135593220338984, "grad_norm": 0.5621626973152161, "learning_rate": 4.561181703700986e-05, "loss": 3.2111, "step": 1098 }, { "epoch": 3.8170360712733595, "grad_norm": 0.5523443818092346, "learning_rate": 4.552063941071323e-05, "loss": 3.2244, "step": 1099 }, { "epoch": 3.8205128205128203, "grad_norm": 0.5601244568824768, "learning_rate": 4.542947679788662e-05, "loss": 3.1726, "step": 1100 }, { "epoch": 3.823989569752282, "grad_norm": 0.5333982110023499, "learning_rate": 4.533832950407973e-05, "loss": 3.2602, "step": 1101 }, { "epoch": 3.8274663189917426, "grad_norm": 0.5482881665229797, "learning_rate": 4.5247197834790876e-05, "loss": 3.2696, "step": 1102 }, { "epoch": 3.830943068231204, "grad_norm": 0.5877856016159058, "learning_rate": 4.515608209546602e-05, "loss": 3.2463, "step": 1103 }, { "epoch": 3.834419817470665, "grad_norm": 0.6199784874916077, "learning_rate": 4.506498259149774e-05, "loss": 3.1951, "step": 1104 }, { "epoch": 3.837896566710126, "grad_norm": 0.5966984629631042, "learning_rate": 4.4973899628224154e-05, "loss": 3.2796, "step": 1105 }, { "epoch": 3.8413733159495873, "grad_norm": 0.5492768287658691, "learning_rate": 4.488283351092799e-05, "loss": 3.2153, "step": 1106 }, { "epoch": 3.844850065189048, "grad_norm": 0.5626404881477356, "learning_rate": 4.4791784544835515e-05, "loss": 3.0994, "step": 1107 }, { "epoch": 3.848326814428509, "grad_norm": 0.5543808937072754, "learning_rate": 4.4700753035115454e-05, "loss": 3.2721, "step": 1108 }, { "epoch": 3.8518035636679704, "grad_norm": 0.6047621369361877, "learning_rate": 4.460973928687809e-05, "loss": 3.2098, "step": 1109 }, { "epoch": 3.8552803129074316, "grad_norm": 0.6789091229438782, "learning_rate": 4.4518743605174136e-05, "loss": 3.2294, "step": 1110 }, { "epoch": 3.8587570621468927, "grad_norm": 0.5913859009742737, "learning_rate": 4.442776629499375e-05, "loss": 3.1556, "step": 1111 }, { "epoch": 3.862233811386354, "grad_norm": 0.5494495034217834, "learning_rate": 4.433680766126554e-05, "loss": 3.2562, "step": 1112 }, { "epoch": 3.865710560625815, "grad_norm": 0.6001352667808533, "learning_rate": 4.424586800885551e-05, "loss": 3.2289, "step": 1113 }, { "epoch": 3.869187309865276, "grad_norm": 0.6379590630531311, "learning_rate": 4.415494764256599e-05, "loss": 3.2218, "step": 1114 }, { "epoch": 3.872664059104737, "grad_norm": 0.5929582118988037, "learning_rate": 4.4064046867134756e-05, "loss": 3.2409, "step": 1115 }, { "epoch": 3.876140808344198, "grad_norm": 0.567794680595398, "learning_rate": 4.397316598723385e-05, "loss": 3.1523, "step": 1116 }, { "epoch": 3.8796175575836593, "grad_norm": 0.6330842971801758, "learning_rate": 4.388230530746869e-05, "loss": 3.1354, "step": 1117 }, { "epoch": 3.8830943068231205, "grad_norm": 0.5656293034553528, "learning_rate": 4.379146513237695e-05, "loss": 3.197, "step": 1118 }, { "epoch": 3.8865710560625812, "grad_norm": 0.5936200618743896, "learning_rate": 4.370064576642757e-05, "loss": 3.2288, "step": 1119 }, { "epoch": 3.890047805302043, "grad_norm": 0.6053287982940674, "learning_rate": 4.360984751401977e-05, "loss": 3.134, "step": 1120 }, { "epoch": 3.8935245545415036, "grad_norm": 0.5832516551017761, "learning_rate": 4.351907067948201e-05, "loss": 3.1563, "step": 1121 }, { "epoch": 3.8970013037809648, "grad_norm": 0.5615696907043457, "learning_rate": 4.3428315567070926e-05, "loss": 3.2662, "step": 1122 }, { "epoch": 3.900478053020426, "grad_norm": 0.557080090045929, "learning_rate": 4.333758248097039e-05, "loss": 3.2424, "step": 1123 }, { "epoch": 3.903954802259887, "grad_norm": 0.6137921810150146, "learning_rate": 4.324687172529041e-05, "loss": 3.2753, "step": 1124 }, { "epoch": 3.9074315514993483, "grad_norm": 0.6269393563270569, "learning_rate": 4.315618360406618e-05, "loss": 3.1655, "step": 1125 }, { "epoch": 3.910908300738809, "grad_norm": 0.6167050004005432, "learning_rate": 4.306551842125702e-05, "loss": 3.183, "step": 1126 }, { "epoch": 3.91438504997827, "grad_norm": 0.5596528649330139, "learning_rate": 4.297487648074538e-05, "loss": 3.212, "step": 1127 }, { "epoch": 3.9178617992177314, "grad_norm": 0.5810624957084656, "learning_rate": 4.288425808633575e-05, "loss": 3.2202, "step": 1128 }, { "epoch": 3.9213385484571925, "grad_norm": 0.5932186245918274, "learning_rate": 4.2793663541753784e-05, "loss": 3.1973, "step": 1129 }, { "epoch": 3.9248152976966537, "grad_norm": 0.5662857294082642, "learning_rate": 4.2703093150645144e-05, "loss": 3.2006, "step": 1130 }, { "epoch": 3.928292046936115, "grad_norm": 0.5792871117591858, "learning_rate": 4.261254721657454e-05, "loss": 3.2062, "step": 1131 }, { "epoch": 3.931768796175576, "grad_norm": 0.6331871151924133, "learning_rate": 4.252202604302476e-05, "loss": 3.2503, "step": 1132 }, { "epoch": 3.935245545415037, "grad_norm": 0.5286860466003418, "learning_rate": 4.2431529933395527e-05, "loss": 3.2181, "step": 1133 }, { "epoch": 3.938722294654498, "grad_norm": 0.5583444833755493, "learning_rate": 4.234105919100261e-05, "loss": 3.2073, "step": 1134 }, { "epoch": 3.942199043893959, "grad_norm": 0.5682042837142944, "learning_rate": 4.2250614119076735e-05, "loss": 3.2068, "step": 1135 }, { "epoch": 3.9456757931334203, "grad_norm": 0.6427503824234009, "learning_rate": 4.2160195020762597e-05, "loss": 3.2051, "step": 1136 }, { "epoch": 3.9491525423728815, "grad_norm": 0.5848159193992615, "learning_rate": 4.206980219911783e-05, "loss": 3.1387, "step": 1137 }, { "epoch": 3.952629291612342, "grad_norm": 0.5341483354568481, "learning_rate": 4.197943595711198e-05, "loss": 3.2449, "step": 1138 }, { "epoch": 3.956106040851804, "grad_norm": 0.5785036683082581, "learning_rate": 4.188909659762556e-05, "loss": 3.2745, "step": 1139 }, { "epoch": 3.9595827900912646, "grad_norm": 0.5817328095436096, "learning_rate": 4.179878442344892e-05, "loss": 3.1943, "step": 1140 }, { "epoch": 3.9630595393307257, "grad_norm": 0.6359755396842957, "learning_rate": 4.17084997372813e-05, "loss": 3.251, "step": 1141 }, { "epoch": 3.966536288570187, "grad_norm": 0.6350237727165222, "learning_rate": 4.1618242841729846e-05, "loss": 3.2164, "step": 1142 }, { "epoch": 3.970013037809648, "grad_norm": 0.6320176720619202, "learning_rate": 4.152801403930855e-05, "loss": 3.2481, "step": 1143 }, { "epoch": 3.9734897870491093, "grad_norm": 0.6210284233093262, "learning_rate": 4.14378136324372e-05, "loss": 3.1793, "step": 1144 }, { "epoch": 3.97696653628857, "grad_norm": 0.543938934803009, "learning_rate": 4.134764192344046e-05, "loss": 3.2093, "step": 1145 }, { "epoch": 3.980443285528031, "grad_norm": 0.5592504143714905, "learning_rate": 4.125749921454679e-05, "loss": 3.1314, "step": 1146 }, { "epoch": 3.9839200347674923, "grad_norm": 0.5563082695007324, "learning_rate": 4.116738580788744e-05, "loss": 3.1755, "step": 1147 }, { "epoch": 3.9873967840069535, "grad_norm": 0.5996997952461243, "learning_rate": 4.107730200549549e-05, "loss": 3.3564, "step": 1148 }, { "epoch": 3.9908735332464147, "grad_norm": 0.5965397357940674, "learning_rate": 4.0987248109304714e-05, "loss": 3.1906, "step": 1149 }, { "epoch": 3.994350282485876, "grad_norm": 0.5691776871681213, "learning_rate": 4.089722442114873e-05, "loss": 3.1736, "step": 1150 }, { "epoch": 3.997827031725337, "grad_norm": 0.5946555733680725, "learning_rate": 4.080723124275988e-05, "loss": 3.1895, "step": 1151 }, { "epoch": 4.0, "grad_norm": 0.5992119312286377, "learning_rate": 4.0717268875768225e-05, "loss": 3.2406, "step": 1152 }, { "epoch": 4.003476749239461, "grad_norm": 0.5725580453872681, "learning_rate": 4.062733762170059e-05, "loss": 3.2336, "step": 1153 }, { "epoch": 4.006953498478922, "grad_norm": 0.6200286746025085, "learning_rate": 4.0537437781979506e-05, "loss": 3.0999, "step": 1154 }, { "epoch": 4.010430247718383, "grad_norm": 0.6485391855239868, "learning_rate": 4.044756965792218e-05, "loss": 3.259, "step": 1155 }, { "epoch": 4.013906996957845, "grad_norm": 0.5790440440177917, "learning_rate": 4.0357733550739554e-05, "loss": 3.1542, "step": 1156 }, { "epoch": 4.017383746197305, "grad_norm": 0.6082708835601807, "learning_rate": 4.026792976153527e-05, "loss": 3.2343, "step": 1157 }, { "epoch": 4.020860495436767, "grad_norm": 0.6026447415351868, "learning_rate": 4.017815859130461e-05, "loss": 3.181, "step": 1158 }, { "epoch": 4.024337244676228, "grad_norm": 0.617872953414917, "learning_rate": 4.008842034093359e-05, "loss": 3.1653, "step": 1159 }, { "epoch": 4.0278139939156885, "grad_norm": 0.5940406918525696, "learning_rate": 3.9998715311197785e-05, "loss": 3.082, "step": 1160 }, { "epoch": 4.03129074315515, "grad_norm": 0.5831135511398315, "learning_rate": 3.990904380276153e-05, "loss": 3.153, "step": 1161 }, { "epoch": 4.034767492394611, "grad_norm": 0.5792697072029114, "learning_rate": 3.981940611617675e-05, "loss": 3.2169, "step": 1162 }, { "epoch": 4.0382442416340725, "grad_norm": 0.5567387342453003, "learning_rate": 3.972980255188201e-05, "loss": 3.1521, "step": 1163 }, { "epoch": 4.041720990873533, "grad_norm": 0.6137407422065735, "learning_rate": 3.9640233410201553e-05, "loss": 3.2615, "step": 1164 }, { "epoch": 4.045197740112994, "grad_norm": 0.6380611062049866, "learning_rate": 3.955069899134418e-05, "loss": 3.2734, "step": 1165 }, { "epoch": 4.0486744893524556, "grad_norm": 0.6026031374931335, "learning_rate": 3.9461199595402354e-05, "loss": 3.1017, "step": 1166 }, { "epoch": 4.052151238591916, "grad_norm": 0.5493614673614502, "learning_rate": 3.937173552235117e-05, "loss": 3.1587, "step": 1167 }, { "epoch": 4.055627987831378, "grad_norm": 0.5663195848464966, "learning_rate": 3.928230707204729e-05, "loss": 3.1702, "step": 1168 }, { "epoch": 4.059104737070839, "grad_norm": 0.5667363405227661, "learning_rate": 3.919291454422796e-05, "loss": 3.1958, "step": 1169 }, { "epoch": 4.0625814863103, "grad_norm": 0.5629976987838745, "learning_rate": 3.9103558238510086e-05, "loss": 3.2153, "step": 1170 }, { "epoch": 4.066058235549761, "grad_norm": 0.6252686381340027, "learning_rate": 3.901423845438916e-05, "loss": 3.2646, "step": 1171 }, { "epoch": 4.069534984789222, "grad_norm": 0.5598259568214417, "learning_rate": 3.8924955491238216e-05, "loss": 3.1532, "step": 1172 }, { "epoch": 4.073011734028683, "grad_norm": 0.5491876602172852, "learning_rate": 3.883570964830692e-05, "loss": 3.124, "step": 1173 }, { "epoch": 4.076488483268144, "grad_norm": 0.5444916486740112, "learning_rate": 3.874650122472049e-05, "loss": 3.1616, "step": 1174 }, { "epoch": 4.079965232507606, "grad_norm": 0.5169205069541931, "learning_rate": 3.865733051947876e-05, "loss": 3.1565, "step": 1175 }, { "epoch": 4.083441981747066, "grad_norm": 0.5438332557678223, "learning_rate": 3.856819783145514e-05, "loss": 3.2329, "step": 1176 }, { "epoch": 4.086918730986528, "grad_norm": 0.551806628704071, "learning_rate": 3.847910345939557e-05, "loss": 3.2382, "step": 1177 }, { "epoch": 4.090395480225989, "grad_norm": 0.5678349733352661, "learning_rate": 3.839004770191762e-05, "loss": 3.2062, "step": 1178 }, { "epoch": 4.0938722294654495, "grad_norm": 0.5699625611305237, "learning_rate": 3.83010308575094e-05, "loss": 3.1105, "step": 1179 }, { "epoch": 4.097348978704911, "grad_norm": 0.5540375709533691, "learning_rate": 3.821205322452863e-05, "loss": 3.1929, "step": 1180 }, { "epoch": 4.100825727944372, "grad_norm": 0.5406695008277893, "learning_rate": 3.812311510120159e-05, "loss": 3.0958, "step": 1181 }, { "epoch": 4.1043024771838335, "grad_norm": 0.5478038787841797, "learning_rate": 3.803421678562213e-05, "loss": 3.1762, "step": 1182 }, { "epoch": 4.107779226423294, "grad_norm": 0.6140181422233582, "learning_rate": 3.794535857575064e-05, "loss": 3.1347, "step": 1183 }, { "epoch": 4.111255975662755, "grad_norm": 0.578913152217865, "learning_rate": 3.785654076941317e-05, "loss": 3.1791, "step": 1184 }, { "epoch": 4.1147327249022165, "grad_norm": 0.6035884618759155, "learning_rate": 3.776776366430027e-05, "loss": 3.1587, "step": 1185 }, { "epoch": 4.118209474141677, "grad_norm": 0.5582138895988464, "learning_rate": 3.767902755796613e-05, "loss": 3.1542, "step": 1186 }, { "epoch": 4.121686223381139, "grad_norm": 0.526336669921875, "learning_rate": 3.759033274782751e-05, "loss": 3.1465, "step": 1187 }, { "epoch": 4.1251629726206, "grad_norm": 0.6234402060508728, "learning_rate": 3.750167953116272e-05, "loss": 3.1536, "step": 1188 }, { "epoch": 4.128639721860061, "grad_norm": 0.7032943964004517, "learning_rate": 3.741306820511072e-05, "loss": 3.1409, "step": 1189 }, { "epoch": 4.132116471099522, "grad_norm": 0.7070549726486206, "learning_rate": 3.7324499066670006e-05, "loss": 3.1695, "step": 1190 }, { "epoch": 4.135593220338983, "grad_norm": 0.6663886904716492, "learning_rate": 3.723597241269772e-05, "loss": 3.1933, "step": 1191 }, { "epoch": 4.139069969578444, "grad_norm": 0.6331878900527954, "learning_rate": 3.7147488539908596e-05, "loss": 3.1693, "step": 1192 }, { "epoch": 4.142546718817905, "grad_norm": 0.5591497421264648, "learning_rate": 3.705904774487396e-05, "loss": 3.267, "step": 1193 }, { "epoch": 4.146023468057367, "grad_norm": 0.6263229250907898, "learning_rate": 3.697065032402078e-05, "loss": 3.1842, "step": 1194 }, { "epoch": 4.149500217296827, "grad_norm": 0.6090732216835022, "learning_rate": 3.6882296573630634e-05, "loss": 3.1744, "step": 1195 }, { "epoch": 4.152976966536288, "grad_norm": 0.5834642648696899, "learning_rate": 3.6793986789838744e-05, "loss": 3.1195, "step": 1196 }, { "epoch": 4.15645371577575, "grad_norm": 0.5978863835334778, "learning_rate": 3.6705721268632915e-05, "loss": 3.1333, "step": 1197 }, { "epoch": 4.1599304650152105, "grad_norm": 0.5608931183815002, "learning_rate": 3.6617500305852674e-05, "loss": 3.0997, "step": 1198 }, { "epoch": 4.163407214254672, "grad_norm": 0.6076219081878662, "learning_rate": 3.6529324197188154e-05, "loss": 3.1598, "step": 1199 }, { "epoch": 4.166883963494133, "grad_norm": 0.571110725402832, "learning_rate": 3.644119323817915e-05, "loss": 3.1411, "step": 1200 }, { "epoch": 4.170360712733594, "grad_norm": 0.524634838104248, "learning_rate": 3.6353107724214175e-05, "loss": 3.1337, "step": 1201 }, { "epoch": 4.173837461973055, "grad_norm": 0.5356817245483398, "learning_rate": 3.6265067950529365e-05, "loss": 3.1192, "step": 1202 }, { "epoch": 4.177314211212516, "grad_norm": 0.5559493899345398, "learning_rate": 3.617707421220758e-05, "loss": 3.1269, "step": 1203 }, { "epoch": 4.1807909604519775, "grad_norm": 0.5397975444793701, "learning_rate": 3.608912680417737e-05, "loss": 3.1603, "step": 1204 }, { "epoch": 4.184267709691438, "grad_norm": 0.5938603281974792, "learning_rate": 3.600122602121202e-05, "loss": 3.13, "step": 1205 }, { "epoch": 4.1877444589309, "grad_norm": 0.5785700082778931, "learning_rate": 3.591337215792852e-05, "loss": 3.2027, "step": 1206 }, { "epoch": 4.191221208170361, "grad_norm": 0.5957548022270203, "learning_rate": 3.58255655087866e-05, "loss": 3.1809, "step": 1207 }, { "epoch": 4.194697957409822, "grad_norm": 0.5649887323379517, "learning_rate": 3.5737806368087774e-05, "loss": 3.2136, "step": 1208 }, { "epoch": 4.198174706649283, "grad_norm": 0.5808984637260437, "learning_rate": 3.5650095029974266e-05, "loss": 3.1725, "step": 1209 }, { "epoch": 4.201651455888744, "grad_norm": 0.5980812907218933, "learning_rate": 3.5562431788428156e-05, "loss": 3.1213, "step": 1210 }, { "epoch": 4.205128205128205, "grad_norm": 0.548862099647522, "learning_rate": 3.5474816937270225e-05, "loss": 3.1082, "step": 1211 }, { "epoch": 4.208604954367666, "grad_norm": 0.5771917104721069, "learning_rate": 3.538725077015915e-05, "loss": 3.2051, "step": 1212 }, { "epoch": 4.212081703607128, "grad_norm": 0.5784444808959961, "learning_rate": 3.529973358059038e-05, "loss": 3.0524, "step": 1213 }, { "epoch": 4.215558452846588, "grad_norm": 0.5533136129379272, "learning_rate": 3.521226566189523e-05, "loss": 3.1359, "step": 1214 }, { "epoch": 4.219035202086049, "grad_norm": 0.5978413224220276, "learning_rate": 3.512484730723986e-05, "loss": 3.1429, "step": 1215 }, { "epoch": 4.222511951325511, "grad_norm": 0.5688326358795166, "learning_rate": 3.503747880962431e-05, "loss": 3.1544, "step": 1216 }, { "epoch": 4.2259887005649714, "grad_norm": 0.5804445147514343, "learning_rate": 3.495016046188155e-05, "loss": 3.1167, "step": 1217 }, { "epoch": 4.229465449804433, "grad_norm": 0.5675270557403564, "learning_rate": 3.4862892556676395e-05, "loss": 3.1552, "step": 1218 }, { "epoch": 4.232942199043894, "grad_norm": 0.5292279720306396, "learning_rate": 3.4775675386504656e-05, "loss": 3.1221, "step": 1219 }, { "epoch": 4.236418948283355, "grad_norm": 0.553810179233551, "learning_rate": 3.468850924369203e-05, "loss": 3.2311, "step": 1220 }, { "epoch": 4.239895697522816, "grad_norm": 0.5489530563354492, "learning_rate": 3.460139442039326e-05, "loss": 3.1054, "step": 1221 }, { "epoch": 4.243372446762277, "grad_norm": 0.5100445747375488, "learning_rate": 3.4514331208591025e-05, "loss": 3.2079, "step": 1222 }, { "epoch": 4.2468491960017385, "grad_norm": 0.5427148938179016, "learning_rate": 3.4427319900095055e-05, "loss": 3.1394, "step": 1223 }, { "epoch": 4.250325945241199, "grad_norm": 0.5344714522361755, "learning_rate": 3.4340360786541064e-05, "loss": 3.1592, "step": 1224 }, { "epoch": 4.253802694480661, "grad_norm": 0.5320305824279785, "learning_rate": 3.425345415938988e-05, "loss": 3.193, "step": 1225 }, { "epoch": 4.257279443720122, "grad_norm": 0.5287283062934875, "learning_rate": 3.4166600309926387e-05, "loss": 3.2131, "step": 1226 }, { "epoch": 4.260756192959583, "grad_norm": 0.5557782649993896, "learning_rate": 3.407979952925857e-05, "loss": 3.24, "step": 1227 }, { "epoch": 4.264232942199044, "grad_norm": 0.5349240303039551, "learning_rate": 3.399305210831656e-05, "loss": 3.1917, "step": 1228 }, { "epoch": 4.267709691438505, "grad_norm": 0.5544419884681702, "learning_rate": 3.390635833785163e-05, "loss": 3.1339, "step": 1229 }, { "epoch": 4.271186440677966, "grad_norm": 0.5695310235023499, "learning_rate": 3.3819718508435226e-05, "loss": 3.1142, "step": 1230 }, { "epoch": 4.274663189917427, "grad_norm": 0.4966232180595398, "learning_rate": 3.3733132910458034e-05, "loss": 3.1916, "step": 1231 }, { "epoch": 4.278139939156889, "grad_norm": 0.5800032019615173, "learning_rate": 3.364660183412892e-05, "loss": 3.1645, "step": 1232 }, { "epoch": 4.281616688396349, "grad_norm": 0.5773885250091553, "learning_rate": 3.356012556947405e-05, "loss": 3.1542, "step": 1233 }, { "epoch": 4.28509343763581, "grad_norm": 0.5234330296516418, "learning_rate": 3.347370440633587e-05, "loss": 3.1371, "step": 1234 }, { "epoch": 4.288570186875272, "grad_norm": 0.5608299374580383, "learning_rate": 3.338733863437212e-05, "loss": 3.1268, "step": 1235 }, { "epoch": 4.292046936114732, "grad_norm": 0.5602731108665466, "learning_rate": 3.3301028543054935e-05, "loss": 3.1742, "step": 1236 }, { "epoch": 4.295523685354194, "grad_norm": 0.5070881247520447, "learning_rate": 3.3214774421669774e-05, "loss": 3.1491, "step": 1237 }, { "epoch": 4.299000434593655, "grad_norm": 0.6085598468780518, "learning_rate": 3.3128576559314504e-05, "loss": 3.1355, "step": 1238 }, { "epoch": 4.302477183833116, "grad_norm": 0.562229335308075, "learning_rate": 3.304243524489847e-05, "loss": 3.1475, "step": 1239 }, { "epoch": 4.305953933072577, "grad_norm": 0.50657719373703, "learning_rate": 3.295635076714144e-05, "loss": 3.1014, "step": 1240 }, { "epoch": 4.309430682312038, "grad_norm": 0.547308623790741, "learning_rate": 3.2870323414572726e-05, "loss": 3.1063, "step": 1241 }, { "epoch": 4.3129074315514995, "grad_norm": 0.538101315498352, "learning_rate": 3.278435347553014e-05, "loss": 3.2346, "step": 1242 }, { "epoch": 4.31638418079096, "grad_norm": 0.5632209777832031, "learning_rate": 3.2698441238159065e-05, "loss": 3.1009, "step": 1243 }, { "epoch": 4.319860930030422, "grad_norm": 0.538557231426239, "learning_rate": 3.261258699041152e-05, "loss": 3.1992, "step": 1244 }, { "epoch": 4.3233376792698825, "grad_norm": 0.5102593302726746, "learning_rate": 3.2526791020045086e-05, "loss": 3.0824, "step": 1245 }, { "epoch": 4.326814428509344, "grad_norm": 0.5571600794792175, "learning_rate": 3.24410536146221e-05, "loss": 3.2344, "step": 1246 }, { "epoch": 4.330291177748805, "grad_norm": 0.5731806755065918, "learning_rate": 3.235537506150856e-05, "loss": 3.1911, "step": 1247 }, { "epoch": 4.333767926988266, "grad_norm": 0.568469226360321, "learning_rate": 3.226975564787322e-05, "loss": 3.1631, "step": 1248 }, { "epoch": 4.337244676227727, "grad_norm": 0.6091863512992859, "learning_rate": 3.218419566068661e-05, "loss": 3.1156, "step": 1249 }, { "epoch": 4.340721425467188, "grad_norm": 0.5082367658615112, "learning_rate": 3.209869538672008e-05, "loss": 3.1481, "step": 1250 }, { "epoch": 4.34419817470665, "grad_norm": 0.531330943107605, "learning_rate": 3.201325511254487e-05, "loss": 3.2305, "step": 1251 }, { "epoch": 4.34767492394611, "grad_norm": 0.542111873626709, "learning_rate": 3.192787512453105e-05, "loss": 3.155, "step": 1252 }, { "epoch": 4.351151673185571, "grad_norm": 0.5078598260879517, "learning_rate": 3.18425557088467e-05, "loss": 3.1793, "step": 1253 }, { "epoch": 4.354628422425033, "grad_norm": 0.5410819053649902, "learning_rate": 3.175729715145684e-05, "loss": 3.2161, "step": 1254 }, { "epoch": 4.358105171664493, "grad_norm": 0.5382195711135864, "learning_rate": 3.167209973812253e-05, "loss": 3.1332, "step": 1255 }, { "epoch": 4.361581920903955, "grad_norm": 0.574900209903717, "learning_rate": 3.158696375439989e-05, "loss": 3.1937, "step": 1256 }, { "epoch": 4.365058670143416, "grad_norm": 0.527801513671875, "learning_rate": 3.1501889485639124e-05, "loss": 3.1353, "step": 1257 }, { "epoch": 4.368535419382877, "grad_norm": 0.5430712699890137, "learning_rate": 3.141687721698363e-05, "loss": 3.1929, "step": 1258 }, { "epoch": 4.372012168622338, "grad_norm": 0.5637326240539551, "learning_rate": 3.133192723336895e-05, "loss": 3.1734, "step": 1259 }, { "epoch": 4.375488917861799, "grad_norm": 0.5349778532981873, "learning_rate": 3.124703981952191e-05, "loss": 3.1223, "step": 1260 }, { "epoch": 4.3789656671012605, "grad_norm": 0.5977018475532532, "learning_rate": 3.1162215259959594e-05, "loss": 3.1931, "step": 1261 }, { "epoch": 4.382442416340721, "grad_norm": 0.5777693390846252, "learning_rate": 3.107745383898841e-05, "loss": 3.2519, "step": 1262 }, { "epoch": 4.385919165580183, "grad_norm": 0.5384812951087952, "learning_rate": 3.0992755840703195e-05, "loss": 3.1766, "step": 1263 }, { "epoch": 4.3893959148196435, "grad_norm": 0.5800558924674988, "learning_rate": 3.0908121548986136e-05, "loss": 3.1622, "step": 1264 }, { "epoch": 4.392872664059105, "grad_norm": 0.5624348521232605, "learning_rate": 3.0823551247505975e-05, "loss": 3.1381, "step": 1265 }, { "epoch": 4.396349413298566, "grad_norm": 0.5728154182434082, "learning_rate": 3.073904521971689e-05, "loss": 3.0738, "step": 1266 }, { "epoch": 4.399826162538027, "grad_norm": 0.6084502935409546, "learning_rate": 3.065460374885771e-05, "loss": 3.1587, "step": 1267 }, { "epoch": 4.403302911777488, "grad_norm": 0.5481749773025513, "learning_rate": 3.057022711795086e-05, "loss": 3.1831, "step": 1268 }, { "epoch": 4.406779661016949, "grad_norm": 0.5750850439071655, "learning_rate": 3.048591560980143e-05, "loss": 3.1694, "step": 1269 }, { "epoch": 4.410256410256411, "grad_norm": 0.5632287859916687, "learning_rate": 3.0401669506996256e-05, "loss": 3.1857, "step": 1270 }, { "epoch": 4.413733159495871, "grad_norm": 0.5208144783973694, "learning_rate": 3.0317489091902935e-05, "loss": 3.161, "step": 1271 }, { "epoch": 4.417209908735332, "grad_norm": 0.5646597146987915, "learning_rate": 3.0233374646668933e-05, "loss": 3.1561, "step": 1272 }, { "epoch": 4.420686657974794, "grad_norm": 0.6238527894020081, "learning_rate": 3.014932645322056e-05, "loss": 3.1294, "step": 1273 }, { "epoch": 4.424163407214254, "grad_norm": 0.5221900939941406, "learning_rate": 3.0065344793262112e-05, "loss": 3.1726, "step": 1274 }, { "epoch": 4.427640156453716, "grad_norm": 0.507268488407135, "learning_rate": 2.9981429948274848e-05, "loss": 3.1116, "step": 1275 }, { "epoch": 4.431116905693177, "grad_norm": 0.5262473225593567, "learning_rate": 2.9897582199516104e-05, "loss": 3.1248, "step": 1276 }, { "epoch": 4.434593654932638, "grad_norm": 0.5219045281410217, "learning_rate": 2.9813801828018344e-05, "loss": 3.1733, "step": 1277 }, { "epoch": 4.438070404172099, "grad_norm": 0.5666697025299072, "learning_rate": 2.973008911458816e-05, "loss": 3.1438, "step": 1278 }, { "epoch": 4.44154715341156, "grad_norm": 0.5166260004043579, "learning_rate": 2.9646444339805436e-05, "loss": 3.1529, "step": 1279 }, { "epoch": 4.445023902651021, "grad_norm": 0.5471430420875549, "learning_rate": 2.9562867784022262e-05, "loss": 3.1087, "step": 1280 }, { "epoch": 4.448500651890482, "grad_norm": 0.5563281774520874, "learning_rate": 2.9479359727362173e-05, "loss": 3.1607, "step": 1281 }, { "epoch": 4.451977401129944, "grad_norm": 0.5162192583084106, "learning_rate": 2.9395920449719027e-05, "loss": 3.0287, "step": 1282 }, { "epoch": 4.4554541503694045, "grad_norm": 0.5430468320846558, "learning_rate": 2.931255023075624e-05, "loss": 3.1928, "step": 1283 }, { "epoch": 4.458930899608866, "grad_norm": 0.5762031078338623, "learning_rate": 2.9229249349905684e-05, "loss": 3.1146, "step": 1284 }, { "epoch": 4.462407648848327, "grad_norm": 0.5716304183006287, "learning_rate": 2.91460180863669e-05, "loss": 3.1328, "step": 1285 }, { "epoch": 4.465884398087788, "grad_norm": 0.48609912395477295, "learning_rate": 2.9062856719106034e-05, "loss": 3.1918, "step": 1286 }, { "epoch": 4.469361147327249, "grad_norm": 0.5626135468482971, "learning_rate": 2.8979765526855002e-05, "loss": 3.0825, "step": 1287 }, { "epoch": 4.47283789656671, "grad_norm": 0.5432965755462646, "learning_rate": 2.8896744788110497e-05, "loss": 3.1992, "step": 1288 }, { "epoch": 4.4763146458061716, "grad_norm": 0.5227442979812622, "learning_rate": 2.881379478113311e-05, "loss": 3.1348, "step": 1289 }, { "epoch": 4.479791395045632, "grad_norm": 0.5127291679382324, "learning_rate": 2.873091578394626e-05, "loss": 3.1505, "step": 1290 }, { "epoch": 4.483268144285093, "grad_norm": 0.5297070145606995, "learning_rate": 2.8648108074335472e-05, "loss": 3.1592, "step": 1291 }, { "epoch": 4.486744893524555, "grad_norm": 0.5441714525222778, "learning_rate": 2.8565371929847284e-05, "loss": 3.1568, "step": 1292 }, { "epoch": 4.490221642764015, "grad_norm": 0.527181088924408, "learning_rate": 2.8482707627788406e-05, "loss": 3.0347, "step": 1293 }, { "epoch": 4.493698392003477, "grad_norm": 0.5165045261383057, "learning_rate": 2.840011544522467e-05, "loss": 3.0401, "step": 1294 }, { "epoch": 4.497175141242938, "grad_norm": 0.5333744287490845, "learning_rate": 2.831759565898029e-05, "loss": 3.1488, "step": 1295 }, { "epoch": 4.500651890482399, "grad_norm": 0.5299988985061646, "learning_rate": 2.8235148545636776e-05, "loss": 3.1192, "step": 1296 }, { "epoch": 4.50412863972186, "grad_norm": 0.5290223956108093, "learning_rate": 2.8152774381532033e-05, "loss": 3.1706, "step": 1297 }, { "epoch": 4.507605388961321, "grad_norm": 0.5143240690231323, "learning_rate": 2.8070473442759515e-05, "loss": 3.0781, "step": 1298 }, { "epoch": 4.511082138200782, "grad_norm": 0.5055028796195984, "learning_rate": 2.798824600516723e-05, "loss": 3.0796, "step": 1299 }, { "epoch": 4.514558887440243, "grad_norm": 0.5169261693954468, "learning_rate": 2.790609234435683e-05, "loss": 3.026, "step": 1300 }, { "epoch": 4.518035636679705, "grad_norm": 0.48205968737602234, "learning_rate": 2.7824012735682693e-05, "loss": 3.0564, "step": 1301 }, { "epoch": 4.5215123859191655, "grad_norm": 0.5130692720413208, "learning_rate": 2.774200745425096e-05, "loss": 3.0893, "step": 1302 }, { "epoch": 4.524989135158627, "grad_norm": 0.502295196056366, "learning_rate": 2.7660076774918708e-05, "loss": 3.0858, "step": 1303 }, { "epoch": 4.528465884398088, "grad_norm": 0.5414760708808899, "learning_rate": 2.757822097229294e-05, "loss": 3.1763, "step": 1304 }, { "epoch": 4.531942633637549, "grad_norm": 0.5080839991569519, "learning_rate": 2.749644032072969e-05, "loss": 3.1469, "step": 1305 }, { "epoch": 4.53541938287701, "grad_norm": 0.510778546333313, "learning_rate": 2.7414735094333137e-05, "loss": 3.1893, "step": 1306 }, { "epoch": 4.538896132116471, "grad_norm": 0.5154252648353577, "learning_rate": 2.7333105566954627e-05, "loss": 3.1355, "step": 1307 }, { "epoch": 4.5423728813559325, "grad_norm": 0.5087947249412537, "learning_rate": 2.7251552012191762e-05, "loss": 3.1925, "step": 1308 }, { "epoch": 4.545849630595393, "grad_norm": 0.5253137350082397, "learning_rate": 2.7170074703387565e-05, "loss": 3.2108, "step": 1309 }, { "epoch": 4.549326379834854, "grad_norm": 0.5336745381355286, "learning_rate": 2.708867391362948e-05, "loss": 3.1476, "step": 1310 }, { "epoch": 4.552803129074316, "grad_norm": 0.5403643846511841, "learning_rate": 2.700734991574849e-05, "loss": 3.1474, "step": 1311 }, { "epoch": 4.556279878313776, "grad_norm": 0.5262840986251831, "learning_rate": 2.6926102982318192e-05, "loss": 3.2003, "step": 1312 }, { "epoch": 4.559756627553238, "grad_norm": 0.5386162996292114, "learning_rate": 2.684493338565386e-05, "loss": 3.1021, "step": 1313 }, { "epoch": 4.563233376792699, "grad_norm": 0.4971773624420166, "learning_rate": 2.6763841397811573e-05, "loss": 3.1549, "step": 1314 }, { "epoch": 4.56671012603216, "grad_norm": 0.5384669303894043, "learning_rate": 2.668282729058732e-05, "loss": 3.1015, "step": 1315 }, { "epoch": 4.570186875271621, "grad_norm": 0.5245262384414673, "learning_rate": 2.6601891335516028e-05, "loss": 3.1012, "step": 1316 }, { "epoch": 4.573663624511082, "grad_norm": 0.5222594141960144, "learning_rate": 2.6521033803870692e-05, "loss": 3.1934, "step": 1317 }, { "epoch": 4.577140373750543, "grad_norm": 0.5642375349998474, "learning_rate": 2.6440254966661425e-05, "loss": 3.1681, "step": 1318 }, { "epoch": 4.580617122990004, "grad_norm": 0.5803114175796509, "learning_rate": 2.6359555094634615e-05, "loss": 3.152, "step": 1319 }, { "epoch": 4.584093872229466, "grad_norm": 0.5136197209358215, "learning_rate": 2.6278934458271997e-05, "loss": 3.1403, "step": 1320 }, { "epoch": 4.5875706214689265, "grad_norm": 0.5403902530670166, "learning_rate": 2.6198393327789662e-05, "loss": 3.1996, "step": 1321 }, { "epoch": 4.591047370708388, "grad_norm": 0.5205239653587341, "learning_rate": 2.6117931973137296e-05, "loss": 3.0724, "step": 1322 }, { "epoch": 4.594524119947849, "grad_norm": 0.5435472726821899, "learning_rate": 2.603755066399718e-05, "loss": 3.0977, "step": 1323 }, { "epoch": 4.5980008691873095, "grad_norm": 0.5279546976089478, "learning_rate": 2.5957249669783256e-05, "loss": 3.0092, "step": 1324 }, { "epoch": 4.601477618426771, "grad_norm": 0.5379184484481812, "learning_rate": 2.587702925964034e-05, "loss": 3.1167, "step": 1325 }, { "epoch": 4.604954367666232, "grad_norm": 0.5148362517356873, "learning_rate": 2.579688970244313e-05, "loss": 3.098, "step": 1326 }, { "epoch": 4.6084311169056935, "grad_norm": 0.5144993662834167, "learning_rate": 2.5716831266795326e-05, "loss": 3.0208, "step": 1327 }, { "epoch": 4.611907866145154, "grad_norm": 0.5288809537887573, "learning_rate": 2.563685422102876e-05, "loss": 3.1685, "step": 1328 }, { "epoch": 4.615384615384615, "grad_norm": 0.5119858980178833, "learning_rate": 2.5556958833202405e-05, "loss": 3.1729, "step": 1329 }, { "epoch": 4.618861364624077, "grad_norm": 0.5306943655014038, "learning_rate": 2.5477145371101597e-05, "loss": 3.113, "step": 1330 }, { "epoch": 4.622338113863537, "grad_norm": 0.4869849681854248, "learning_rate": 2.539741410223707e-05, "loss": 3.0586, "step": 1331 }, { "epoch": 4.625814863102999, "grad_norm": 0.5126588940620422, "learning_rate": 2.531776529384407e-05, "loss": 3.0891, "step": 1332 }, { "epoch": 4.62929161234246, "grad_norm": 0.48841309547424316, "learning_rate": 2.523819921288147e-05, "loss": 3.0713, "step": 1333 }, { "epoch": 4.632768361581921, "grad_norm": 0.5154165625572205, "learning_rate": 2.5158716126030836e-05, "loss": 3.1351, "step": 1334 }, { "epoch": 4.636245110821382, "grad_norm": 0.5262759327888489, "learning_rate": 2.507931629969556e-05, "loss": 3.0784, "step": 1335 }, { "epoch": 4.639721860060843, "grad_norm": 0.5570662021636963, "learning_rate": 2.500000000000001e-05, "loss": 3.1477, "step": 1336 }, { "epoch": 4.643198609300304, "grad_norm": 0.5133278369903564, "learning_rate": 2.4920767492788576e-05, "loss": 3.1487, "step": 1337 }, { "epoch": 4.646675358539765, "grad_norm": 0.5384029150009155, "learning_rate": 2.4841619043624807e-05, "loss": 3.0638, "step": 1338 }, { "epoch": 4.650152107779227, "grad_norm": 0.5107344388961792, "learning_rate": 2.4762554917790525e-05, "loss": 3.1279, "step": 1339 }, { "epoch": 4.6536288570186874, "grad_norm": 0.5281763672828674, "learning_rate": 2.4683575380284874e-05, "loss": 3.162, "step": 1340 }, { "epoch": 4.657105606258149, "grad_norm": 0.5439243912696838, "learning_rate": 2.4604680695823533e-05, "loss": 3.1004, "step": 1341 }, { "epoch": 4.66058235549761, "grad_norm": 0.49902161955833435, "learning_rate": 2.4525871128837773e-05, "loss": 3.0821, "step": 1342 }, { "epoch": 4.6640591047370705, "grad_norm": 0.5409182906150818, "learning_rate": 2.4447146943473565e-05, "loss": 3.1635, "step": 1343 }, { "epoch": 4.667535853976532, "grad_norm": 0.5150710940361023, "learning_rate": 2.436850840359073e-05, "loss": 3.0735, "step": 1344 }, { "epoch": 4.671012603215993, "grad_norm": 0.5056044459342957, "learning_rate": 2.4289955772761974e-05, "loss": 3.0758, "step": 1345 }, { "epoch": 4.6744893524554545, "grad_norm": 0.5287520289421082, "learning_rate": 2.4211489314272127e-05, "loss": 3.1805, "step": 1346 }, { "epoch": 4.677966101694915, "grad_norm": 0.5224068760871887, "learning_rate": 2.4133109291117156e-05, "loss": 3.0326, "step": 1347 }, { "epoch": 4.681442850934376, "grad_norm": 0.4915207624435425, "learning_rate": 2.4054815966003363e-05, "loss": 3.0884, "step": 1348 }, { "epoch": 4.684919600173838, "grad_norm": 0.542336642742157, "learning_rate": 2.3976609601346394e-05, "loss": 3.1397, "step": 1349 }, { "epoch": 4.688396349413298, "grad_norm": 0.5344177484512329, "learning_rate": 2.389849045927049e-05, "loss": 3.1556, "step": 1350 }, { "epoch": 4.69187309865276, "grad_norm": 0.49078139662742615, "learning_rate": 2.3820458801607548e-05, "loss": 3.1368, "step": 1351 }, { "epoch": 4.695349847892221, "grad_norm": 0.55536949634552, "learning_rate": 2.3742514889896196e-05, "loss": 3.1188, "step": 1352 }, { "epoch": 4.698826597131681, "grad_norm": 0.5023874640464783, "learning_rate": 2.3664658985381e-05, "loss": 3.1357, "step": 1353 }, { "epoch": 4.702303346371143, "grad_norm": 0.5190631151199341, "learning_rate": 2.3586891349011552e-05, "loss": 3.175, "step": 1354 }, { "epoch": 4.705780095610604, "grad_norm": 0.5370966196060181, "learning_rate": 2.3509212241441593e-05, "loss": 3.165, "step": 1355 }, { "epoch": 4.709256844850065, "grad_norm": 0.5101494193077087, "learning_rate": 2.3431621923028145e-05, "loss": 3.1552, "step": 1356 }, { "epoch": 4.712733594089526, "grad_norm": 0.5176619291305542, "learning_rate": 2.335412065383061e-05, "loss": 3.0863, "step": 1357 }, { "epoch": 4.716210343328988, "grad_norm": 0.5212742686271667, "learning_rate": 2.3276708693609943e-05, "loss": 3.1169, "step": 1358 }, { "epoch": 4.719687092568448, "grad_norm": 0.507392168045044, "learning_rate": 2.3199386301827775e-05, "loss": 3.1013, "step": 1359 }, { "epoch": 4.72316384180791, "grad_norm": 0.516706109046936, "learning_rate": 2.312215373764551e-05, "loss": 3.0495, "step": 1360 }, { "epoch": 4.726640591047371, "grad_norm": 0.49758976697921753, "learning_rate": 2.304501125992351e-05, "loss": 3.0798, "step": 1361 }, { "epoch": 4.7301173402868315, "grad_norm": 0.525264322757721, "learning_rate": 2.296795912722014e-05, "loss": 3.0509, "step": 1362 }, { "epoch": 4.733594089526293, "grad_norm": 0.5171617865562439, "learning_rate": 2.2890997597790968e-05, "loss": 3.1179, "step": 1363 }, { "epoch": 4.737070838765754, "grad_norm": 0.536133885383606, "learning_rate": 2.2814126929587933e-05, "loss": 3.0698, "step": 1364 }, { "epoch": 4.7405475880052155, "grad_norm": 0.5195457935333252, "learning_rate": 2.2737347380258394e-05, "loss": 3.0672, "step": 1365 }, { "epoch": 4.744024337244676, "grad_norm": 0.5235431790351868, "learning_rate": 2.266065920714432e-05, "loss": 3.1695, "step": 1366 }, { "epoch": 4.747501086484137, "grad_norm": 0.5295529365539551, "learning_rate": 2.258406266728143e-05, "loss": 3.1604, "step": 1367 }, { "epoch": 4.7509778357235986, "grad_norm": 0.5380683541297913, "learning_rate": 2.2507558017398263e-05, "loss": 3.0974, "step": 1368 }, { "epoch": 4.754454584963059, "grad_norm": 0.49247080087661743, "learning_rate": 2.243114551391542e-05, "loss": 3.1145, "step": 1369 }, { "epoch": 4.757931334202521, "grad_norm": 0.5573008060455322, "learning_rate": 2.2354825412944642e-05, "loss": 3.1544, "step": 1370 }, { "epoch": 4.761408083441982, "grad_norm": 0.5144769549369812, "learning_rate": 2.2278597970287966e-05, "loss": 3.1163, "step": 1371 }, { "epoch": 4.764884832681442, "grad_norm": 0.5623053908348083, "learning_rate": 2.2202463441436884e-05, "loss": 3.2186, "step": 1372 }, { "epoch": 4.768361581920904, "grad_norm": 0.5026108026504517, "learning_rate": 2.2126422081571403e-05, "loss": 3.1492, "step": 1373 }, { "epoch": 4.771838331160365, "grad_norm": 0.5406736731529236, "learning_rate": 2.2050474145559326e-05, "loss": 3.125, "step": 1374 }, { "epoch": 4.775315080399826, "grad_norm": 0.526690661907196, "learning_rate": 2.1974619887955294e-05, "loss": 3.1364, "step": 1375 }, { "epoch": 4.778791829639287, "grad_norm": 0.5063409209251404, "learning_rate": 2.1898859563e-05, "loss": 3.1636, "step": 1376 }, { "epoch": 4.782268578878749, "grad_norm": 0.5202750563621521, "learning_rate": 2.1823193424619238e-05, "loss": 3.1399, "step": 1377 }, { "epoch": 4.785745328118209, "grad_norm": 0.5034672617912292, "learning_rate": 2.174762172642319e-05, "loss": 3.2438, "step": 1378 }, { "epoch": 4.789222077357671, "grad_norm": 0.5040730834007263, "learning_rate": 2.1672144721705433e-05, "loss": 3.0598, "step": 1379 }, { "epoch": 4.792698826597132, "grad_norm": 0.47813880443573, "learning_rate": 2.1596762663442218e-05, "loss": 3.1059, "step": 1380 }, { "epoch": 4.7961755758365925, "grad_norm": 0.5351612567901611, "learning_rate": 2.1521475804291535e-05, "loss": 3.1185, "step": 1381 }, { "epoch": 4.799652325076054, "grad_norm": 0.5197668075561523, "learning_rate": 2.144628439659231e-05, "loss": 3.1053, "step": 1382 }, { "epoch": 4.803129074315515, "grad_norm": 0.49083802103996277, "learning_rate": 2.1371188692363552e-05, "loss": 3.093, "step": 1383 }, { "epoch": 4.8066058235549765, "grad_norm": 0.5041195750236511, "learning_rate": 2.1296188943303448e-05, "loss": 3.1273, "step": 1384 }, { "epoch": 4.810082572794437, "grad_norm": 0.5689459443092346, "learning_rate": 2.122128540078862e-05, "loss": 3.1277, "step": 1385 }, { "epoch": 4.813559322033898, "grad_norm": 0.48876744508743286, "learning_rate": 2.1146478315873238e-05, "loss": 3.0268, "step": 1386 }, { "epoch": 4.8170360712733595, "grad_norm": 0.5481560230255127, "learning_rate": 2.1071767939288144e-05, "loss": 3.1129, "step": 1387 }, { "epoch": 4.82051282051282, "grad_norm": 0.5860038995742798, "learning_rate": 2.09971545214401e-05, "loss": 3.1246, "step": 1388 }, { "epoch": 4.823989569752282, "grad_norm": 0.4835022985935211, "learning_rate": 2.0922638312410792e-05, "loss": 3.0547, "step": 1389 }, { "epoch": 4.827466318991743, "grad_norm": 0.5766292214393616, "learning_rate": 2.0848219561956206e-05, "loss": 3.2223, "step": 1390 }, { "epoch": 4.830943068231203, "grad_norm": 0.4890536963939667, "learning_rate": 2.077389851950557e-05, "loss": 3.0601, "step": 1391 }, { "epoch": 4.834419817470665, "grad_norm": 0.508464515209198, "learning_rate": 2.06996754341607e-05, "loss": 3.032, "step": 1392 }, { "epoch": 4.837896566710126, "grad_norm": 0.5292408466339111, "learning_rate": 2.062555055469506e-05, "loss": 3.1608, "step": 1393 }, { "epoch": 4.841373315949587, "grad_norm": 0.4770180583000183, "learning_rate": 2.0551524129552985e-05, "loss": 3.116, "step": 1394 }, { "epoch": 4.844850065189048, "grad_norm": 0.512302041053772, "learning_rate": 2.047759640684876e-05, "loss": 3.0814, "step": 1395 }, { "epoch": 4.84832681442851, "grad_norm": 0.48571115732192993, "learning_rate": 2.04037676343659e-05, "loss": 3.1722, "step": 1396 }, { "epoch": 4.85180356366797, "grad_norm": 0.48603254556655884, "learning_rate": 2.0330038059556256e-05, "loss": 3.12, "step": 1397 }, { "epoch": 4.855280312907432, "grad_norm": 0.49838802218437195, "learning_rate": 2.0256407929539202e-05, "loss": 3.1449, "step": 1398 }, { "epoch": 4.858757062146893, "grad_norm": 0.49490660429000854, "learning_rate": 2.0182877491100806e-05, "loss": 3.125, "step": 1399 }, { "epoch": 4.8622338113863535, "grad_norm": 0.4796282649040222, "learning_rate": 2.0109446990692964e-05, "loss": 3.0519, "step": 1400 }, { "epoch": 4.865710560625815, "grad_norm": 0.5214862823486328, "learning_rate": 2.0036116674432654e-05, "loss": 3.1385, "step": 1401 }, { "epoch": 4.869187309865276, "grad_norm": 0.4797852337360382, "learning_rate": 1.996288678810105e-05, "loss": 3.0939, "step": 1402 }, { "epoch": 4.872664059104737, "grad_norm": 0.5158775448799133, "learning_rate": 1.9889757577142737e-05, "loss": 3.1246, "step": 1403 }, { "epoch": 4.876140808344198, "grad_norm": 0.4937228858470917, "learning_rate": 1.9816729286664798e-05, "loss": 2.9946, "step": 1404 }, { "epoch": 4.879617557583659, "grad_norm": 0.491494357585907, "learning_rate": 1.9743802161436136e-05, "loss": 3.0483, "step": 1405 }, { "epoch": 4.8830943068231205, "grad_norm": 0.4826614558696747, "learning_rate": 1.967097644588657e-05, "loss": 3.0563, "step": 1406 }, { "epoch": 4.886571056062581, "grad_norm": 0.5007966160774231, "learning_rate": 1.959825238410596e-05, "loss": 3.1255, "step": 1407 }, { "epoch": 4.890047805302043, "grad_norm": 0.5254808664321899, "learning_rate": 1.9525630219843522e-05, "loss": 3.1027, "step": 1408 }, { "epoch": 4.893524554541504, "grad_norm": 0.5202295184135437, "learning_rate": 1.9453110196506913e-05, "loss": 3.0686, "step": 1409 }, { "epoch": 4.897001303780964, "grad_norm": 0.5098284482955933, "learning_rate": 1.938069255716145e-05, "loss": 3.2055, "step": 1410 }, { "epoch": 4.900478053020426, "grad_norm": 0.517220139503479, "learning_rate": 1.930837754452931e-05, "loss": 3.0823, "step": 1411 }, { "epoch": 4.903954802259887, "grad_norm": 0.5010183453559875, "learning_rate": 1.9236165400988638e-05, "loss": 3.0627, "step": 1412 }, { "epoch": 4.907431551499348, "grad_norm": 0.4953896105289459, "learning_rate": 1.9164056368572846e-05, "loss": 3.0968, "step": 1413 }, { "epoch": 4.910908300738809, "grad_norm": 0.4907744824886322, "learning_rate": 1.9092050688969738e-05, "loss": 2.934, "step": 1414 }, { "epoch": 4.914385049978271, "grad_norm": 0.5396456122398376, "learning_rate": 1.9020148603520694e-05, "loss": 3.1413, "step": 1415 }, { "epoch": 4.917861799217731, "grad_norm": 0.4796368479728699, "learning_rate": 1.894835035321991e-05, "loss": 3.0553, "step": 1416 }, { "epoch": 4.921338548457193, "grad_norm": 0.509888231754303, "learning_rate": 1.8876656178713525e-05, "loss": 3.0997, "step": 1417 }, { "epoch": 4.924815297696654, "grad_norm": 0.5054630041122437, "learning_rate": 1.8805066320298832e-05, "loss": 3.1347, "step": 1418 }, { "epoch": 4.9282920469361144, "grad_norm": 0.506965160369873, "learning_rate": 1.8733581017923533e-05, "loss": 3.1165, "step": 1419 }, { "epoch": 4.931768796175576, "grad_norm": 0.4983595311641693, "learning_rate": 1.8662200511184874e-05, "loss": 3.1969, "step": 1420 }, { "epoch": 4.935245545415037, "grad_norm": 0.48033374547958374, "learning_rate": 1.8590925039328855e-05, "loss": 3.0475, "step": 1421 }, { "epoch": 4.938722294654498, "grad_norm": 0.48913419246673584, "learning_rate": 1.8519754841249443e-05, "loss": 3.0912, "step": 1422 }, { "epoch": 4.942199043893959, "grad_norm": 0.4766146242618561, "learning_rate": 1.8448690155487715e-05, "loss": 3.0585, "step": 1423 }, { "epoch": 4.94567579313342, "grad_norm": 0.5143147706985474, "learning_rate": 1.837773122023114e-05, "loss": 3.1216, "step": 1424 }, { "epoch": 4.9491525423728815, "grad_norm": 0.5010018944740295, "learning_rate": 1.830687827331275e-05, "loss": 3.122, "step": 1425 }, { "epoch": 4.952629291612342, "grad_norm": 0.519191324710846, "learning_rate": 1.8236131552210317e-05, "loss": 3.1109, "step": 1426 }, { "epoch": 4.956106040851804, "grad_norm": 0.5037075877189636, "learning_rate": 1.8165491294045593e-05, "loss": 3.014, "step": 1427 }, { "epoch": 4.959582790091265, "grad_norm": 0.5274494290351868, "learning_rate": 1.8094957735583463e-05, "loss": 3.0914, "step": 1428 }, { "epoch": 4.963059539330725, "grad_norm": 0.5338578820228577, "learning_rate": 1.802453111323122e-05, "loss": 3.0104, "step": 1429 }, { "epoch": 4.966536288570187, "grad_norm": 0.5094507336616516, "learning_rate": 1.7954211663037728e-05, "loss": 3.1215, "step": 1430 }, { "epoch": 4.970013037809648, "grad_norm": 0.5530762076377869, "learning_rate": 1.788399962069266e-05, "loss": 3.1778, "step": 1431 }, { "epoch": 4.973489787049109, "grad_norm": 0.49168628454208374, "learning_rate": 1.781389522152565e-05, "loss": 3.1523, "step": 1432 }, { "epoch": 4.97696653628857, "grad_norm": 0.543717086315155, "learning_rate": 1.774389870050559e-05, "loss": 3.0974, "step": 1433 }, { "epoch": 4.980443285528032, "grad_norm": 0.4832988679409027, "learning_rate": 1.7674010292239744e-05, "loss": 3.0996, "step": 1434 }, { "epoch": 4.983920034767492, "grad_norm": 0.5092836022377014, "learning_rate": 1.760423023097307e-05, "loss": 3.0521, "step": 1435 }, { "epoch": 4.987396784006954, "grad_norm": 0.5006300210952759, "learning_rate": 1.7534558750587345e-05, "loss": 3.1263, "step": 1436 }, { "epoch": 4.990873533246415, "grad_norm": 0.4926410913467407, "learning_rate": 1.7464996084600433e-05, "loss": 3.0774, "step": 1437 }, { "epoch": 4.994350282485875, "grad_norm": 0.474171906709671, "learning_rate": 1.739554246616549e-05, "loss": 3.0128, "step": 1438 }, { "epoch": 4.997827031725337, "grad_norm": 0.4734145402908325, "learning_rate": 1.7326198128070133e-05, "loss": 2.9946, "step": 1439 }, { "epoch": 5.0, "grad_norm": 0.5680263042449951, "learning_rate": 1.725696330273575e-05, "loss": 3.0865, "step": 1440 }, { "epoch": 5.003476749239461, "grad_norm": 0.4952830970287323, "learning_rate": 1.7187838222216663e-05, "loss": 2.9644, "step": 1441 }, { "epoch": 5.006953498478922, "grad_norm": 0.4864019453525543, "learning_rate": 1.711882311819936e-05, "loss": 3.0979, "step": 1442 }, { "epoch": 5.010430247718383, "grad_norm": 0.506616473197937, "learning_rate": 1.7049918222001742e-05, "loss": 3.122, "step": 1443 }, { "epoch": 5.013906996957845, "grad_norm": 0.4952886700630188, "learning_rate": 1.6981123764572273e-05, "loss": 3.0794, "step": 1444 }, { "epoch": 5.017383746197305, "grad_norm": 0.5021927952766418, "learning_rate": 1.6912439976489313e-05, "loss": 3.1295, "step": 1445 }, { "epoch": 5.020860495436767, "grad_norm": 0.4874928891658783, "learning_rate": 1.684386708796025e-05, "loss": 3.1032, "step": 1446 }, { "epoch": 5.024337244676228, "grad_norm": 0.5078150033950806, "learning_rate": 1.6775405328820805e-05, "loss": 3.1201, "step": 1447 }, { "epoch": 5.0278139939156885, "grad_norm": 0.47642481327056885, "learning_rate": 1.670705492853421e-05, "loss": 3.0506, "step": 1448 }, { "epoch": 5.03129074315515, "grad_norm": 0.5237090587615967, "learning_rate": 1.6638816116190475e-05, "loss": 3.1616, "step": 1449 }, { "epoch": 5.034767492394611, "grad_norm": 0.5067710876464844, "learning_rate": 1.6570689120505555e-05, "loss": 3.1494, "step": 1450 }, { "epoch": 5.0382442416340725, "grad_norm": 0.5045792460441589, "learning_rate": 1.650267416982067e-05, "loss": 3.1457, "step": 1451 }, { "epoch": 5.041720990873533, "grad_norm": 0.5121103525161743, "learning_rate": 1.6434771492101485e-05, "loss": 3.0338, "step": 1452 }, { "epoch": 5.045197740112994, "grad_norm": 0.4789864420890808, "learning_rate": 1.6366981314937376e-05, "loss": 3.0468, "step": 1453 }, { "epoch": 5.0486744893524556, "grad_norm": 0.5022554993629456, "learning_rate": 1.6299303865540617e-05, "loss": 3.1252, "step": 1454 }, { "epoch": 5.052151238591916, "grad_norm": 0.49874892830848694, "learning_rate": 1.6231739370745703e-05, "loss": 3.0489, "step": 1455 }, { "epoch": 5.055627987831378, "grad_norm": 0.4912637770175934, "learning_rate": 1.6164288057008466e-05, "loss": 3.0752, "step": 1456 }, { "epoch": 5.059104737070839, "grad_norm": 0.4912840723991394, "learning_rate": 1.6096950150405454e-05, "loss": 3.1287, "step": 1457 }, { "epoch": 5.0625814863103, "grad_norm": 0.5014127492904663, "learning_rate": 1.6029725876633085e-05, "loss": 3.1283, "step": 1458 }, { "epoch": 5.066058235549761, "grad_norm": 0.4787100851535797, "learning_rate": 1.5962615461006924e-05, "loss": 3.1367, "step": 1459 }, { "epoch": 5.069534984789222, "grad_norm": 0.49083542823791504, "learning_rate": 1.589561912846089e-05, "loss": 3.081, "step": 1460 }, { "epoch": 5.073011734028683, "grad_norm": 0.48826974630355835, "learning_rate": 1.582873710354657e-05, "loss": 3.0724, "step": 1461 }, { "epoch": 5.076488483268144, "grad_norm": 0.4925864040851593, "learning_rate": 1.576196961043237e-05, "loss": 2.9944, "step": 1462 }, { "epoch": 5.079965232507606, "grad_norm": 0.4802667796611786, "learning_rate": 1.569531687290288e-05, "loss": 3.0584, "step": 1463 }, { "epoch": 5.083441981747066, "grad_norm": 0.5159469842910767, "learning_rate": 1.5628779114358034e-05, "loss": 3.079, "step": 1464 }, { "epoch": 5.086918730986528, "grad_norm": 0.4779518246650696, "learning_rate": 1.556235655781239e-05, "loss": 3.1105, "step": 1465 }, { "epoch": 5.090395480225989, "grad_norm": 0.5127723217010498, "learning_rate": 1.549604942589441e-05, "loss": 3.0305, "step": 1466 }, { "epoch": 5.0938722294654495, "grad_norm": 0.5168395638465881, "learning_rate": 1.5429857940845633e-05, "loss": 3.1509, "step": 1467 }, { "epoch": 5.097348978704911, "grad_norm": 0.49684715270996094, "learning_rate": 1.536378232452003e-05, "loss": 3.037, "step": 1468 }, { "epoch": 5.100825727944372, "grad_norm": 0.5140928626060486, "learning_rate": 1.529782279838321e-05, "loss": 3.0983, "step": 1469 }, { "epoch": 5.1043024771838335, "grad_norm": 0.5105652213096619, "learning_rate": 1.5231979583511668e-05, "loss": 3.0414, "step": 1470 }, { "epoch": 5.107779226423294, "grad_norm": 0.5075240135192871, "learning_rate": 1.5166252900592082e-05, "loss": 3.0819, "step": 1471 }, { "epoch": 5.111255975662755, "grad_norm": 0.5023607015609741, "learning_rate": 1.5100642969920503e-05, "loss": 3.0479, "step": 1472 }, { "epoch": 5.1147327249022165, "grad_norm": 0.519066333770752, "learning_rate": 1.5035150011401727e-05, "loss": 3.1059, "step": 1473 }, { "epoch": 5.118209474141677, "grad_norm": 0.5103733539581299, "learning_rate": 1.496977424454843e-05, "loss": 3.0913, "step": 1474 }, { "epoch": 5.121686223381139, "grad_norm": 0.49367067217826843, "learning_rate": 1.4904515888480536e-05, "loss": 3.1771, "step": 1475 }, { "epoch": 5.1251629726206, "grad_norm": 0.4967593252658844, "learning_rate": 1.4839375161924446e-05, "loss": 3.0717, "step": 1476 }, { "epoch": 5.128639721860061, "grad_norm": 0.483659952878952, "learning_rate": 1.4774352283212306e-05, "loss": 3.0219, "step": 1477 }, { "epoch": 5.132116471099522, "grad_norm": 0.47200506925582886, "learning_rate": 1.4709447470281217e-05, "loss": 3.0199, "step": 1478 }, { "epoch": 5.135593220338983, "grad_norm": 0.49920839071273804, "learning_rate": 1.4644660940672627e-05, "loss": 3.1066, "step": 1479 }, { "epoch": 5.139069969578444, "grad_norm": 0.4712933599948883, "learning_rate": 1.4579992911531498e-05, "loss": 3.0786, "step": 1480 }, { "epoch": 5.142546718817905, "grad_norm": 0.4965963661670685, "learning_rate": 1.4515443599605627e-05, "loss": 3.0316, "step": 1481 }, { "epoch": 5.146023468057367, "grad_norm": 0.46099987626075745, "learning_rate": 1.4451013221244913e-05, "loss": 3.0089, "step": 1482 }, { "epoch": 5.149500217296827, "grad_norm": 0.49877581000328064, "learning_rate": 1.4386701992400586e-05, "loss": 3.0595, "step": 1483 }, { "epoch": 5.152976966536288, "grad_norm": 0.47550636529922485, "learning_rate": 1.4322510128624567e-05, "loss": 2.9965, "step": 1484 }, { "epoch": 5.15645371577575, "grad_norm": 0.4906829595565796, "learning_rate": 1.4258437845068685e-05, "loss": 3.0932, "step": 1485 }, { "epoch": 5.1599304650152105, "grad_norm": 0.47613778710365295, "learning_rate": 1.4194485356483977e-05, "loss": 3.0283, "step": 1486 }, { "epoch": 5.163407214254672, "grad_norm": 0.4815172851085663, "learning_rate": 1.4130652877219947e-05, "loss": 3.0631, "step": 1487 }, { "epoch": 5.166883963494133, "grad_norm": 0.47476691007614136, "learning_rate": 1.406694062122389e-05, "loss": 3.0189, "step": 1488 }, { "epoch": 5.170360712733594, "grad_norm": 0.4923785626888275, "learning_rate": 1.4003348802040112e-05, "loss": 3.0331, "step": 1489 }, { "epoch": 5.173837461973055, "grad_norm": 0.4678713381290436, "learning_rate": 1.3939877632809278e-05, "loss": 2.9611, "step": 1490 }, { "epoch": 5.177314211212516, "grad_norm": 0.48306557536125183, "learning_rate": 1.387652732626768e-05, "loss": 3.0772, "step": 1491 }, { "epoch": 5.1807909604519775, "grad_norm": 0.47807446122169495, "learning_rate": 1.3813298094746491e-05, "loss": 2.9963, "step": 1492 }, { "epoch": 5.184267709691438, "grad_norm": 0.46229419112205505, "learning_rate": 1.3750190150171105e-05, "loss": 3.0278, "step": 1493 }, { "epoch": 5.1877444589309, "grad_norm": 0.4713948965072632, "learning_rate": 1.3687203704060342e-05, "loss": 3.1379, "step": 1494 }, { "epoch": 5.191221208170361, "grad_norm": 0.47037366032600403, "learning_rate": 1.3624338967525857e-05, "loss": 3.1018, "step": 1495 }, { "epoch": 5.194697957409822, "grad_norm": 0.45954567193984985, "learning_rate": 1.356159615127135e-05, "loss": 3.0508, "step": 1496 }, { "epoch": 5.198174706649283, "grad_norm": 0.47411566972732544, "learning_rate": 1.3498975465591867e-05, "loss": 3.0462, "step": 1497 }, { "epoch": 5.201651455888744, "grad_norm": 0.4694608747959137, "learning_rate": 1.3436477120373143e-05, "loss": 3.0887, "step": 1498 }, { "epoch": 5.205128205128205, "grad_norm": 0.4654156565666199, "learning_rate": 1.337410132509081e-05, "loss": 3.0725, "step": 1499 }, { "epoch": 5.208604954367666, "grad_norm": 0.45913633704185486, "learning_rate": 1.3311848288809813e-05, "loss": 3.026, "step": 1500 }, { "epoch": 5.212081703607128, "grad_norm": 0.45800894498825073, "learning_rate": 1.3249718220183583e-05, "loss": 3.0755, "step": 1501 }, { "epoch": 5.215558452846588, "grad_norm": 0.46444812417030334, "learning_rate": 1.3187711327453445e-05, "loss": 3.0823, "step": 1502 }, { "epoch": 5.219035202086049, "grad_norm": 0.47551393508911133, "learning_rate": 1.3125827818447862e-05, "loss": 3.0347, "step": 1503 }, { "epoch": 5.222511951325511, "grad_norm": 0.49685007333755493, "learning_rate": 1.3064067900581773e-05, "loss": 3.1281, "step": 1504 }, { "epoch": 5.2259887005649714, "grad_norm": 0.47061434388160706, "learning_rate": 1.3002431780855817e-05, "loss": 3.1229, "step": 1505 }, { "epoch": 5.229465449804433, "grad_norm": 0.5090036392211914, "learning_rate": 1.2940919665855761e-05, "loss": 2.9923, "step": 1506 }, { "epoch": 5.232942199043894, "grad_norm": 0.45235443115234375, "learning_rate": 1.2879531761751713e-05, "loss": 3.0002, "step": 1507 }, { "epoch": 5.236418948283355, "grad_norm": 0.486585408449173, "learning_rate": 1.2818268274297474e-05, "loss": 3.1673, "step": 1508 }, { "epoch": 5.239895697522816, "grad_norm": 0.4809410274028778, "learning_rate": 1.2757129408829843e-05, "loss": 3.0792, "step": 1509 }, { "epoch": 5.243372446762277, "grad_norm": 0.4928673803806305, "learning_rate": 1.2696115370267925e-05, "loss": 3.1247, "step": 1510 }, { "epoch": 5.2468491960017385, "grad_norm": 0.4768332540988922, "learning_rate": 1.2635226363112401e-05, "loss": 3.072, "step": 1511 }, { "epoch": 5.250325945241199, "grad_norm": 0.4630432724952698, "learning_rate": 1.257446259144494e-05, "loss": 3.0369, "step": 1512 }, { "epoch": 5.253802694480661, "grad_norm": 0.47798049449920654, "learning_rate": 1.2513824258927438e-05, "loss": 3.131, "step": 1513 }, { "epoch": 5.257279443720122, "grad_norm": 0.4752330482006073, "learning_rate": 1.2453311568801367e-05, "loss": 3.1487, "step": 1514 }, { "epoch": 5.260756192959583, "grad_norm": 0.4717276990413666, "learning_rate": 1.2392924723887039e-05, "loss": 3.066, "step": 1515 }, { "epoch": 5.264232942199044, "grad_norm": 0.47000083327293396, "learning_rate": 1.2332663926583038e-05, "loss": 2.9598, "step": 1516 }, { "epoch": 5.267709691438505, "grad_norm": 0.46743789315223694, "learning_rate": 1.227252937886541e-05, "loss": 3.0595, "step": 1517 }, { "epoch": 5.271186440677966, "grad_norm": 0.4703062176704407, "learning_rate": 1.2212521282287092e-05, "loss": 3.0263, "step": 1518 }, { "epoch": 5.274663189917427, "grad_norm": 0.496021568775177, "learning_rate": 1.2152639837977187e-05, "loss": 3.013, "step": 1519 }, { "epoch": 5.278139939156889, "grad_norm": 0.48973992466926575, "learning_rate": 1.209288524664029e-05, "loss": 3.1375, "step": 1520 }, { "epoch": 5.281616688396349, "grad_norm": 0.49357324838638306, "learning_rate": 1.2033257708555845e-05, "loss": 3.0935, "step": 1521 }, { "epoch": 5.28509343763581, "grad_norm": 0.4617580771446228, "learning_rate": 1.1973757423577404e-05, "loss": 2.9214, "step": 1522 }, { "epoch": 5.288570186875272, "grad_norm": 0.4951948821544647, "learning_rate": 1.1914384591132044e-05, "loss": 3.0443, "step": 1523 }, { "epoch": 5.292046936114732, "grad_norm": 0.4785456657409668, "learning_rate": 1.1855139410219657e-05, "loss": 3.0451, "step": 1524 }, { "epoch": 5.295523685354194, "grad_norm": 0.47750359773635864, "learning_rate": 1.1796022079412272e-05, "loss": 3.1302, "step": 1525 }, { "epoch": 5.299000434593655, "grad_norm": 0.4931430518627167, "learning_rate": 1.1737032796853432e-05, "loss": 3.1065, "step": 1526 }, { "epoch": 5.302477183833116, "grad_norm": 0.48678264021873474, "learning_rate": 1.1678171760257439e-05, "loss": 3.0954, "step": 1527 }, { "epoch": 5.305953933072577, "grad_norm": 0.47306203842163086, "learning_rate": 1.161943916690883e-05, "loss": 3.0279, "step": 1528 }, { "epoch": 5.309430682312038, "grad_norm": 0.4727276861667633, "learning_rate": 1.1560835213661564e-05, "loss": 3.1249, "step": 1529 }, { "epoch": 5.3129074315514995, "grad_norm": 0.45371294021606445, "learning_rate": 1.1502360096938487e-05, "loss": 2.9934, "step": 1530 }, { "epoch": 5.31638418079096, "grad_norm": 0.48115792870521545, "learning_rate": 1.144401401273062e-05, "loss": 3.0486, "step": 1531 }, { "epoch": 5.319860930030422, "grad_norm": 0.4729406237602234, "learning_rate": 1.1385797156596506e-05, "loss": 3.0835, "step": 1532 }, { "epoch": 5.3233376792698825, "grad_norm": 0.46308913826942444, "learning_rate": 1.1327709723661517e-05, "loss": 3.0495, "step": 1533 }, { "epoch": 5.326814428509344, "grad_norm": 0.4657433032989502, "learning_rate": 1.1269751908617277e-05, "loss": 2.9911, "step": 1534 }, { "epoch": 5.330291177748805, "grad_norm": 0.45987650752067566, "learning_rate": 1.1211923905720956e-05, "loss": 3.0637, "step": 1535 }, { "epoch": 5.333767926988266, "grad_norm": 0.46726325154304504, "learning_rate": 1.1154225908794642e-05, "loss": 3.0673, "step": 1536 }, { "epoch": 5.337244676227727, "grad_norm": 0.4622207581996918, "learning_rate": 1.1096658111224684e-05, "loss": 3.0466, "step": 1537 }, { "epoch": 5.340721425467188, "grad_norm": 0.48004594445228577, "learning_rate": 1.1039220705961001e-05, "loss": 3.1712, "step": 1538 }, { "epoch": 5.34419817470665, "grad_norm": 0.4778420925140381, "learning_rate": 1.0981913885516527e-05, "loss": 3.1482, "step": 1539 }, { "epoch": 5.34767492394611, "grad_norm": 0.446816086769104, "learning_rate": 1.0924737841966498e-05, "loss": 3.0631, "step": 1540 }, { "epoch": 5.351151673185571, "grad_norm": 0.465069979429245, "learning_rate": 1.0867692766947812e-05, "loss": 3.0714, "step": 1541 }, { "epoch": 5.354628422425033, "grad_norm": 0.4674012064933777, "learning_rate": 1.0810778851658438e-05, "loss": 3.1051, "step": 1542 }, { "epoch": 5.358105171664493, "grad_norm": 0.46494433283805847, "learning_rate": 1.0753996286856688e-05, "loss": 3.0391, "step": 1543 }, { "epoch": 5.361581920903955, "grad_norm": 0.46729180216789246, "learning_rate": 1.0697345262860636e-05, "loss": 3.1156, "step": 1544 }, { "epoch": 5.365058670143416, "grad_norm": 0.4721660614013672, "learning_rate": 1.0640825969547496e-05, "loss": 3.1373, "step": 1545 }, { "epoch": 5.368535419382877, "grad_norm": 0.47204047441482544, "learning_rate": 1.0584438596352952e-05, "loss": 3.0871, "step": 1546 }, { "epoch": 5.372012168622338, "grad_norm": 0.47127583622932434, "learning_rate": 1.0528183332270519e-05, "loss": 3.1441, "step": 1547 }, { "epoch": 5.375488917861799, "grad_norm": 0.4934067130088806, "learning_rate": 1.047206036585095e-05, "loss": 3.0754, "step": 1548 }, { "epoch": 5.3789656671012605, "grad_norm": 0.45597216486930847, "learning_rate": 1.041606988520153e-05, "loss": 3.1062, "step": 1549 }, { "epoch": 5.382442416340721, "grad_norm": 0.46915626525878906, "learning_rate": 1.0360212077985521e-05, "loss": 3.0976, "step": 1550 }, { "epoch": 5.385919165580183, "grad_norm": 0.4795849919319153, "learning_rate": 1.030448713142151e-05, "loss": 3.0746, "step": 1551 }, { "epoch": 5.3893959148196435, "grad_norm": 0.4348246157169342, "learning_rate": 1.0248895232282762e-05, "loss": 3.0168, "step": 1552 }, { "epoch": 5.392872664059105, "grad_norm": 0.5004274845123291, "learning_rate": 1.0193436566896625e-05, "loss": 3.0659, "step": 1553 }, { "epoch": 5.396349413298566, "grad_norm": 0.4510630667209625, "learning_rate": 1.013811132114384e-05, "loss": 3.0664, "step": 1554 }, { "epoch": 5.399826162538027, "grad_norm": 0.47762078046798706, "learning_rate": 1.0082919680458014e-05, "loss": 3.1583, "step": 1555 }, { "epoch": 5.403302911777488, "grad_norm": 0.4656456410884857, "learning_rate": 1.0027861829824952e-05, "loss": 3.0065, "step": 1556 }, { "epoch": 5.406779661016949, "grad_norm": 0.45920252799987793, "learning_rate": 9.972937953781986e-06, "loss": 3.1666, "step": 1557 }, { "epoch": 5.410256410256411, "grad_norm": 0.4843366742134094, "learning_rate": 9.918148236417452e-06, "loss": 3.0652, "step": 1558 }, { "epoch": 5.413733159495871, "grad_norm": 0.46141600608825684, "learning_rate": 9.863492861370004e-06, "loss": 3.0977, "step": 1559 }, { "epoch": 5.417209908735332, "grad_norm": 0.45874103903770447, "learning_rate": 9.808972011828055e-06, "loss": 2.9799, "step": 1560 }, { "epoch": 5.420686657974794, "grad_norm": 0.46027660369873047, "learning_rate": 9.754585870529059e-06, "loss": 3.1338, "step": 1561 }, { "epoch": 5.424163407214254, "grad_norm": 0.4515167772769928, "learning_rate": 9.700334619759038e-06, "loss": 3.0767, "step": 1562 }, { "epoch": 5.427640156453716, "grad_norm": 0.4566735327243805, "learning_rate": 9.646218441351867e-06, "loss": 3.0172, "step": 1563 }, { "epoch": 5.431116905693177, "grad_norm": 0.45395126938819885, "learning_rate": 9.592237516688708e-06, "loss": 3.0072, "step": 1564 }, { "epoch": 5.434593654932638, "grad_norm": 0.4817012548446655, "learning_rate": 9.538392026697408e-06, "loss": 3.1232, "step": 1565 }, { "epoch": 5.438070404172099, "grad_norm": 0.4628289043903351, "learning_rate": 9.484682151851832e-06, "loss": 3.0401, "step": 1566 }, { "epoch": 5.44154715341156, "grad_norm": 0.4673340320587158, "learning_rate": 9.431108072171346e-06, "loss": 2.9883, "step": 1567 }, { "epoch": 5.445023902651021, "grad_norm": 0.45545145869255066, "learning_rate": 9.377669967220165e-06, "loss": 3.0543, "step": 1568 }, { "epoch": 5.448500651890482, "grad_norm": 0.45707741379737854, "learning_rate": 9.324368016106761e-06, "loss": 3.1049, "step": 1569 }, { "epoch": 5.451977401129944, "grad_norm": 0.4631718099117279, "learning_rate": 9.271202397483215e-06, "loss": 3.1157, "step": 1570 }, { "epoch": 5.4554541503694045, "grad_norm": 0.46336543560028076, "learning_rate": 9.218173289544735e-06, "loss": 2.9833, "step": 1571 }, { "epoch": 5.458930899608866, "grad_norm": 0.4526154398918152, "learning_rate": 9.16528087002892e-06, "loss": 3.0316, "step": 1572 }, { "epoch": 5.462407648848327, "grad_norm": 0.44709017872810364, "learning_rate": 9.11252531621527e-06, "loss": 3.0742, "step": 1573 }, { "epoch": 5.465884398087788, "grad_norm": 0.4863232374191284, "learning_rate": 9.05990680492454e-06, "loss": 3.1241, "step": 1574 }, { "epoch": 5.469361147327249, "grad_norm": 0.4572135806083679, "learning_rate": 9.007425512518164e-06, "loss": 3.0628, "step": 1575 }, { "epoch": 5.47283789656671, "grad_norm": 0.4494728147983551, "learning_rate": 8.955081614897664e-06, "loss": 3.0376, "step": 1576 }, { "epoch": 5.4763146458061716, "grad_norm": 0.4699227809906006, "learning_rate": 8.902875287504025e-06, "loss": 3.0442, "step": 1577 }, { "epoch": 5.479791395045632, "grad_norm": 0.4395771324634552, "learning_rate": 8.850806705317183e-06, "loss": 2.9835, "step": 1578 }, { "epoch": 5.483268144285093, "grad_norm": 0.4605022370815277, "learning_rate": 8.798876042855358e-06, "loss": 3.1216, "step": 1579 }, { "epoch": 5.486744893524555, "grad_norm": 0.47916972637176514, "learning_rate": 8.747083474174527e-06, "loss": 3.1716, "step": 1580 }, { "epoch": 5.490221642764015, "grad_norm": 0.4568222761154175, "learning_rate": 8.69542917286782e-06, "loss": 3.1073, "step": 1581 }, { "epoch": 5.493698392003477, "grad_norm": 0.4488259255886078, "learning_rate": 8.6439133120649e-06, "loss": 3.0121, "step": 1582 }, { "epoch": 5.497175141242938, "grad_norm": 0.4373837411403656, "learning_rate": 8.592536064431467e-06, "loss": 3.0243, "step": 1583 }, { "epoch": 5.500651890482399, "grad_norm": 0.46101701259613037, "learning_rate": 8.541297602168591e-06, "loss": 3.099, "step": 1584 }, { "epoch": 5.50412863972186, "grad_norm": 0.4696287214756012, "learning_rate": 8.490198097012203e-06, "loss": 3.1134, "step": 1585 }, { "epoch": 5.507605388961321, "grad_norm": 0.43790748715400696, "learning_rate": 8.439237720232484e-06, "loss": 3.0106, "step": 1586 }, { "epoch": 5.511082138200782, "grad_norm": 0.4761092960834503, "learning_rate": 8.3884166426333e-06, "loss": 3.0973, "step": 1587 }, { "epoch": 5.514558887440243, "grad_norm": 0.46510714292526245, "learning_rate": 8.337735034551613e-06, "loss": 3.1247, "step": 1588 }, { "epoch": 5.518035636679705, "grad_norm": 0.4549061059951782, "learning_rate": 8.287193065856935e-06, "loss": 3.0587, "step": 1589 }, { "epoch": 5.5215123859191655, "grad_norm": 0.46748998761177063, "learning_rate": 8.236790905950765e-06, "loss": 3.1052, "step": 1590 }, { "epoch": 5.524989135158627, "grad_norm": 0.4555809497833252, "learning_rate": 8.186528723765979e-06, "loss": 3.0366, "step": 1591 }, { "epoch": 5.528465884398088, "grad_norm": 0.45778852701187134, "learning_rate": 8.136406687766318e-06, "loss": 3.0882, "step": 1592 }, { "epoch": 5.531942633637549, "grad_norm": 0.4485376477241516, "learning_rate": 8.086424965945755e-06, "loss": 3.0317, "step": 1593 }, { "epoch": 5.53541938287701, "grad_norm": 0.44758340716362, "learning_rate": 8.036583725828001e-06, "loss": 3.0241, "step": 1594 }, { "epoch": 5.538896132116471, "grad_norm": 0.4677172303199768, "learning_rate": 7.986883134465916e-06, "loss": 3.1337, "step": 1595 }, { "epoch": 5.5423728813559325, "grad_norm": 0.4411722421646118, "learning_rate": 7.937323358440935e-06, "loss": 3.1009, "step": 1596 }, { "epoch": 5.545849630595393, "grad_norm": 0.45588281750679016, "learning_rate": 7.887904563862547e-06, "loss": 3.1043, "step": 1597 }, { "epoch": 5.549326379834854, "grad_norm": 0.4676235616207123, "learning_rate": 7.838626916367675e-06, "loss": 3.1354, "step": 1598 }, { "epoch": 5.552803129074316, "grad_norm": 0.4364144504070282, "learning_rate": 7.789490581120169e-06, "loss": 3.051, "step": 1599 }, { "epoch": 5.556279878313776, "grad_norm": 0.457533061504364, "learning_rate": 7.740495722810271e-06, "loss": 3.1305, "step": 1600 }, { "epoch": 5.559756627553238, "grad_norm": 0.4606762230396271, "learning_rate": 7.691642505654007e-06, "loss": 3.0427, "step": 1601 }, { "epoch": 5.563233376792699, "grad_norm": 0.4465121328830719, "learning_rate": 7.642931093392674e-06, "loss": 3.1152, "step": 1602 }, { "epoch": 5.56671012603216, "grad_norm": 0.4565368592739105, "learning_rate": 7.594361649292303e-06, "loss": 3.0521, "step": 1603 }, { "epoch": 5.570186875271621, "grad_norm": 0.44448092579841614, "learning_rate": 7.545934336143034e-06, "loss": 3.0368, "step": 1604 }, { "epoch": 5.573663624511082, "grad_norm": 0.44336286187171936, "learning_rate": 7.497649316258687e-06, "loss": 3.0708, "step": 1605 }, { "epoch": 5.577140373750543, "grad_norm": 0.4594724178314209, "learning_rate": 7.449506751476121e-06, "loss": 3.0141, "step": 1606 }, { "epoch": 5.580617122990004, "grad_norm": 0.45023006200790405, "learning_rate": 7.40150680315474e-06, "loss": 3.1725, "step": 1607 }, { "epoch": 5.584093872229466, "grad_norm": 0.46296626329421997, "learning_rate": 7.353649632175957e-06, "loss": 3.046, "step": 1608 }, { "epoch": 5.5875706214689265, "grad_norm": 0.4497106969356537, "learning_rate": 7.305935398942598e-06, "loss": 2.9753, "step": 1609 }, { "epoch": 5.591047370708388, "grad_norm": 0.46122488379478455, "learning_rate": 7.258364263378437e-06, "loss": 3.0618, "step": 1610 }, { "epoch": 5.594524119947849, "grad_norm": 0.45882707834243774, "learning_rate": 7.21093638492763e-06, "loss": 3.0114, "step": 1611 }, { "epoch": 5.5980008691873095, "grad_norm": 0.4593908488750458, "learning_rate": 7.163651922554149e-06, "loss": 3.192, "step": 1612 }, { "epoch": 5.601477618426771, "grad_norm": 0.4521577060222626, "learning_rate": 7.1165110347413025e-06, "loss": 3.0416, "step": 1613 }, { "epoch": 5.604954367666232, "grad_norm": 0.46317487955093384, "learning_rate": 7.06951387949118e-06, "loss": 3.1004, "step": 1614 }, { "epoch": 5.6084311169056935, "grad_norm": 0.4537200927734375, "learning_rate": 7.022660614324122e-06, "loss": 3.0639, "step": 1615 }, { "epoch": 5.611907866145154, "grad_norm": 0.4527956545352936, "learning_rate": 6.975951396278168e-06, "loss": 3.0711, "step": 1616 }, { "epoch": 5.615384615384615, "grad_norm": 0.46440109610557556, "learning_rate": 6.9293863819085865e-06, "loss": 3.095, "step": 1617 }, { "epoch": 5.618861364624077, "grad_norm": 0.4648788571357727, "learning_rate": 6.882965727287305e-06, "loss": 3.0182, "step": 1618 }, { "epoch": 5.622338113863537, "grad_norm": 0.4295171797275543, "learning_rate": 6.836689588002399e-06, "loss": 2.9744, "step": 1619 }, { "epoch": 5.625814863102999, "grad_norm": 0.464074045419693, "learning_rate": 6.790558119157597e-06, "loss": 3.0175, "step": 1620 }, { "epoch": 5.62929161234246, "grad_norm": 0.4391389787197113, "learning_rate": 6.74457147537168e-06, "loss": 3.0392, "step": 1621 }, { "epoch": 5.632768361581921, "grad_norm": 0.4579889476299286, "learning_rate": 6.698729810778065e-06, "loss": 3.0613, "step": 1622 }, { "epoch": 5.636245110821382, "grad_norm": 0.46115800738334656, "learning_rate": 6.65303327902424e-06, "loss": 3.044, "step": 1623 }, { "epoch": 5.639721860060843, "grad_norm": 0.4502374529838562, "learning_rate": 6.607482033271229e-06, "loss": 3.0842, "step": 1624 }, { "epoch": 5.643198609300304, "grad_norm": 0.44308748841285706, "learning_rate": 6.562076226193136e-06, "loss": 3.1463, "step": 1625 }, { "epoch": 5.646675358539765, "grad_norm": 0.44475382566452026, "learning_rate": 6.516816009976556e-06, "loss": 3.1117, "step": 1626 }, { "epoch": 5.650152107779227, "grad_norm": 0.4457295536994934, "learning_rate": 6.471701536320118e-06, "loss": 3.0655, "step": 1627 }, { "epoch": 5.6536288570186874, "grad_norm": 0.43922004103660583, "learning_rate": 6.4267329564339895e-06, "loss": 3.0753, "step": 1628 }, { "epoch": 5.657105606258149, "grad_norm": 0.4469055235385895, "learning_rate": 6.381910421039328e-06, "loss": 3.2001, "step": 1629 }, { "epoch": 5.66058235549761, "grad_norm": 0.4434399902820587, "learning_rate": 6.337234080367794e-06, "loss": 3.1005, "step": 1630 }, { "epoch": 5.6640591047370705, "grad_norm": 0.4269639253616333, "learning_rate": 6.292704084161067e-06, "loss": 3.0321, "step": 1631 }, { "epoch": 5.667535853976532, "grad_norm": 0.44678542017936707, "learning_rate": 6.248320581670281e-06, "loss": 3.0467, "step": 1632 }, { "epoch": 5.671012603215993, "grad_norm": 0.4449600577354431, "learning_rate": 6.204083721655607e-06, "loss": 3.0347, "step": 1633 }, { "epoch": 5.6744893524554545, "grad_norm": 0.43621253967285156, "learning_rate": 6.159993652385698e-06, "loss": 3.0156, "step": 1634 }, { "epoch": 5.677966101694915, "grad_norm": 0.4372401833534241, "learning_rate": 6.116050521637218e-06, "loss": 2.9871, "step": 1635 }, { "epoch": 5.681442850934376, "grad_norm": 0.449531614780426, "learning_rate": 6.072254476694328e-06, "loss": 3.1147, "step": 1636 }, { "epoch": 5.684919600173838, "grad_norm": 0.4365013539791107, "learning_rate": 6.028605664348191e-06, "loss": 3.0371, "step": 1637 }, { "epoch": 5.688396349413298, "grad_norm": 0.45457354187965393, "learning_rate": 5.985104230896516e-06, "loss": 3.0452, "step": 1638 }, { "epoch": 5.69187309865276, "grad_norm": 0.4303637444972992, "learning_rate": 5.941750322143036e-06, "loss": 3.0588, "step": 1639 }, { "epoch": 5.695349847892221, "grad_norm": 0.42853423953056335, "learning_rate": 5.898544083397e-06, "loss": 3.1129, "step": 1640 }, { "epoch": 5.698826597131681, "grad_norm": 0.4471043348312378, "learning_rate": 5.8554856594727425e-06, "loss": 3.0419, "step": 1641 }, { "epoch": 5.702303346371143, "grad_norm": 0.44851046800613403, "learning_rate": 5.812575194689163e-06, "loss": 3.0814, "step": 1642 }, { "epoch": 5.705780095610604, "grad_norm": 0.42725709080696106, "learning_rate": 5.76981283286922e-06, "loss": 3.0032, "step": 1643 }, { "epoch": 5.709256844850065, "grad_norm": 0.43283796310424805, "learning_rate": 5.727198717339511e-06, "loss": 3.05, "step": 1644 }, { "epoch": 5.712733594089526, "grad_norm": 0.44458362460136414, "learning_rate": 5.684732990929725e-06, "loss": 3.0892, "step": 1645 }, { "epoch": 5.716210343328988, "grad_norm": 0.4499422609806061, "learning_rate": 5.642415795972228e-06, "loss": 3.1489, "step": 1646 }, { "epoch": 5.719687092568448, "grad_norm": 0.4384379982948303, "learning_rate": 5.600247274301535e-06, "loss": 3.0991, "step": 1647 }, { "epoch": 5.72316384180791, "grad_norm": 0.4302300214767456, "learning_rate": 5.558227567253832e-06, "loss": 3.0915, "step": 1648 }, { "epoch": 5.726640591047371, "grad_norm": 0.4346764385700226, "learning_rate": 5.5163568156665565e-06, "loss": 3.0154, "step": 1649 }, { "epoch": 5.7301173402868315, "grad_norm": 0.429609477519989, "learning_rate": 5.474635159877883e-06, "loss": 2.9877, "step": 1650 }, { "epoch": 5.733594089526293, "grad_norm": 0.45559728145599365, "learning_rate": 5.433062739726258e-06, "loss": 3.1049, "step": 1651 }, { "epoch": 5.737070838765754, "grad_norm": 0.4608874022960663, "learning_rate": 5.391639694549943e-06, "loss": 3.1147, "step": 1652 }, { "epoch": 5.7405475880052155, "grad_norm": 0.4482730031013489, "learning_rate": 5.350366163186521e-06, "loss": 3.069, "step": 1653 }, { "epoch": 5.744024337244676, "grad_norm": 0.4392632246017456, "learning_rate": 5.30924228397246e-06, "loss": 3.1065, "step": 1654 }, { "epoch": 5.747501086484137, "grad_norm": 0.43618863821029663, "learning_rate": 5.2682681947426375e-06, "loss": 3.0498, "step": 1655 }, { "epoch": 5.7509778357235986, "grad_norm": 0.4522383213043213, "learning_rate": 5.227444032829887e-06, "loss": 3.1129, "step": 1656 }, { "epoch": 5.754454584963059, "grad_norm": 0.4380725920200348, "learning_rate": 5.186769935064534e-06, "loss": 3.0673, "step": 1657 }, { "epoch": 5.757931334202521, "grad_norm": 0.4552653729915619, "learning_rate": 5.146246037773922e-06, "loss": 3.121, "step": 1658 }, { "epoch": 5.761408083441982, "grad_norm": 0.44829854369163513, "learning_rate": 5.105872476781964e-06, "loss": 3.0677, "step": 1659 }, { "epoch": 5.764884832681442, "grad_norm": 0.4196740984916687, "learning_rate": 5.065649387408705e-06, "loss": 2.9564, "step": 1660 }, { "epoch": 5.768361581920904, "grad_norm": 0.44454118609428406, "learning_rate": 5.025576904469842e-06, "loss": 2.9863, "step": 1661 }, { "epoch": 5.771838331160365, "grad_norm": 0.4422823190689087, "learning_rate": 4.985655162276298e-06, "loss": 3.0987, "step": 1662 }, { "epoch": 5.775315080399826, "grad_norm": 0.44028976559638977, "learning_rate": 4.945884294633757e-06, "loss": 3.0641, "step": 1663 }, { "epoch": 5.778791829639287, "grad_norm": 0.43805432319641113, "learning_rate": 4.906264434842195e-06, "loss": 3.1735, "step": 1664 }, { "epoch": 5.782268578878749, "grad_norm": 0.4369741380214691, "learning_rate": 4.866795715695477e-06, "loss": 2.983, "step": 1665 }, { "epoch": 5.785745328118209, "grad_norm": 0.42888718843460083, "learning_rate": 4.827478269480895e-06, "loss": 2.9566, "step": 1666 }, { "epoch": 5.789222077357671, "grad_norm": 0.44136032462120056, "learning_rate": 4.788312227978686e-06, "loss": 3.143, "step": 1667 }, { "epoch": 5.792698826597132, "grad_norm": 0.42934727668762207, "learning_rate": 4.74929772246166e-06, "loss": 3.1335, "step": 1668 }, { "epoch": 5.7961755758365925, "grad_norm": 0.44165438413619995, "learning_rate": 4.710434883694715e-06, "loss": 3.0607, "step": 1669 }, { "epoch": 5.799652325076054, "grad_norm": 0.4275215268135071, "learning_rate": 4.6717238419344e-06, "loss": 3.052, "step": 1670 }, { "epoch": 5.803129074315515, "grad_norm": 0.4319266676902771, "learning_rate": 4.6331647269284795e-06, "loss": 3.0009, "step": 1671 }, { "epoch": 5.8066058235549765, "grad_norm": 0.4423021972179413, "learning_rate": 4.594757667915523e-06, "loss": 3.0179, "step": 1672 }, { "epoch": 5.810082572794437, "grad_norm": 0.4336705207824707, "learning_rate": 4.556502793624445e-06, "loss": 3.0279, "step": 1673 }, { "epoch": 5.813559322033898, "grad_norm": 0.41388699412345886, "learning_rate": 4.5184002322740785e-06, "loss": 3.015, "step": 1674 }, { "epoch": 5.8170360712733595, "grad_norm": 0.43204250931739807, "learning_rate": 4.48045011157277e-06, "loss": 3.0567, "step": 1675 }, { "epoch": 5.82051282051282, "grad_norm": 0.4401572048664093, "learning_rate": 4.442652558717897e-06, "loss": 3.0272, "step": 1676 }, { "epoch": 5.823989569752282, "grad_norm": 0.41776955127716064, "learning_rate": 4.405007700395497e-06, "loss": 2.9544, "step": 1677 }, { "epoch": 5.827466318991743, "grad_norm": 0.4356550872325897, "learning_rate": 4.3675156627798196e-06, "loss": 3.0096, "step": 1678 }, { "epoch": 5.830943068231203, "grad_norm": 0.43083855509757996, "learning_rate": 4.330176571532907e-06, "loss": 3.0231, "step": 1679 }, { "epoch": 5.834419817470665, "grad_norm": 0.43351972103118896, "learning_rate": 4.292990551804171e-06, "loss": 3.0642, "step": 1680 }, { "epoch": 5.837896566710126, "grad_norm": 0.4334842562675476, "learning_rate": 4.255957728229965e-06, "loss": 3.0491, "step": 1681 }, { "epoch": 5.841373315949587, "grad_norm": 0.4342375695705414, "learning_rate": 4.219078224933176e-06, "loss": 3.0382, "step": 1682 }, { "epoch": 5.844850065189048, "grad_norm": 0.42653897404670715, "learning_rate": 4.182352165522807e-06, "loss": 3.0741, "step": 1683 }, { "epoch": 5.84832681442851, "grad_norm": 0.43444690108299255, "learning_rate": 4.145779673093581e-06, "loss": 3.0743, "step": 1684 }, { "epoch": 5.85180356366797, "grad_norm": 0.42880359292030334, "learning_rate": 4.109360870225493e-06, "loss": 3.0114, "step": 1685 }, { "epoch": 5.855280312907432, "grad_norm": 0.4228828251361847, "learning_rate": 4.0730958789834295e-06, "loss": 3.103, "step": 1686 }, { "epoch": 5.858757062146893, "grad_norm": 0.4425991475582123, "learning_rate": 4.036984820916723e-06, "loss": 3.0108, "step": 1687 }, { "epoch": 5.8622338113863535, "grad_norm": 0.43377602100372314, "learning_rate": 4.001027817058789e-06, "loss": 3.1284, "step": 1688 }, { "epoch": 5.865710560625815, "grad_norm": 0.4408301115036011, "learning_rate": 3.965224987926702e-06, "loss": 3.0967, "step": 1689 }, { "epoch": 5.869187309865276, "grad_norm": 0.43531477451324463, "learning_rate": 3.9295764535207724e-06, "loss": 3.0079, "step": 1690 }, { "epoch": 5.872664059104737, "grad_norm": 0.43204811215400696, "learning_rate": 3.894082333324184e-06, "loss": 3.01, "step": 1691 }, { "epoch": 5.876140808344198, "grad_norm": 0.43953630328178406, "learning_rate": 3.858742746302535e-06, "loss": 3.1421, "step": 1692 }, { "epoch": 5.879617557583659, "grad_norm": 0.4389553964138031, "learning_rate": 3.823557810903517e-06, "loss": 3.1524, "step": 1693 }, { "epoch": 5.8830943068231205, "grad_norm": 0.43646112084388733, "learning_rate": 3.78852764505645e-06, "loss": 3.1744, "step": 1694 }, { "epoch": 5.886571056062581, "grad_norm": 0.4238009452819824, "learning_rate": 3.7536523661719112e-06, "loss": 3.0273, "step": 1695 }, { "epoch": 5.890047805302043, "grad_norm": 0.42250022292137146, "learning_rate": 3.7189320911413526e-06, "loss": 2.9794, "step": 1696 }, { "epoch": 5.893524554541504, "grad_norm": 0.41812658309936523, "learning_rate": 3.684366936336714e-06, "loss": 3.0897, "step": 1697 }, { "epoch": 5.897001303780964, "grad_norm": 0.4232540428638458, "learning_rate": 3.6499570176099785e-06, "loss": 3.0451, "step": 1698 }, { "epoch": 5.900478053020426, "grad_norm": 0.4142310917377472, "learning_rate": 3.615702450292857e-06, "loss": 2.9745, "step": 1699 }, { "epoch": 5.903954802259887, "grad_norm": 0.4126403331756592, "learning_rate": 3.581603349196372e-06, "loss": 2.9889, "step": 1700 }, { "epoch": 5.907431551499348, "grad_norm": 0.4263981282711029, "learning_rate": 3.5476598286104447e-06, "loss": 3.0619, "step": 1701 }, { "epoch": 5.910908300738809, "grad_norm": 0.41806501150131226, "learning_rate": 3.5138720023035696e-06, "loss": 3.0235, "step": 1702 }, { "epoch": 5.914385049978271, "grad_norm": 0.41938862204551697, "learning_rate": 3.4802399835223653e-06, "loss": 3.0578, "step": 1703 }, { "epoch": 5.917861799217731, "grad_norm": 0.4120302200317383, "learning_rate": 3.4467638849912497e-06, "loss": 2.9969, "step": 1704 }, { "epoch": 5.921338548457193, "grad_norm": 0.4265589714050293, "learning_rate": 3.413443818912049e-06, "loss": 3.0355, "step": 1705 }, { "epoch": 5.924815297696654, "grad_norm": 0.4226160943508148, "learning_rate": 3.3802798969636008e-06, "loss": 3.0972, "step": 1706 }, { "epoch": 5.9282920469361144, "grad_norm": 0.41874009370803833, "learning_rate": 3.3472722303014124e-06, "loss": 3.0843, "step": 1707 }, { "epoch": 5.931768796175576, "grad_norm": 0.42675143480300903, "learning_rate": 3.3144209295572494e-06, "loss": 3.0468, "step": 1708 }, { "epoch": 5.935245545415037, "grad_norm": 0.42002323269844055, "learning_rate": 3.2817261048387893e-06, "loss": 3.1044, "step": 1709 }, { "epoch": 5.938722294654498, "grad_norm": 0.41737180948257446, "learning_rate": 3.249187865729264e-06, "loss": 3.0161, "step": 1710 }, { "epoch": 5.942199043893959, "grad_norm": 0.42569711804389954, "learning_rate": 3.216806321287069e-06, "loss": 3.1494, "step": 1711 }, { "epoch": 5.94567579313342, "grad_norm": 0.4135091006755829, "learning_rate": 3.1845815800454036e-06, "loss": 3.0645, "step": 1712 }, { "epoch": 5.9491525423728815, "grad_norm": 0.4207760691642761, "learning_rate": 3.1525137500119207e-06, "loss": 3.0256, "step": 1713 }, { "epoch": 5.952629291612342, "grad_norm": 0.42668992280960083, "learning_rate": 3.1206029386683324e-06, "loss": 3.0805, "step": 1714 }, { "epoch": 5.956106040851804, "grad_norm": 0.42762625217437744, "learning_rate": 3.0888492529700885e-06, "loss": 3.0698, "step": 1715 }, { "epoch": 5.959582790091265, "grad_norm": 0.4181903600692749, "learning_rate": 3.0572527993460053e-06, "loss": 3.0316, "step": 1716 }, { "epoch": 5.963059539330725, "grad_norm": 0.4258002042770386, "learning_rate": 3.0258136836978866e-06, "loss": 3.0404, "step": 1717 }, { "epoch": 5.966536288570187, "grad_norm": 0.41894468665122986, "learning_rate": 2.994532011400214e-06, "loss": 3.0116, "step": 1718 }, { "epoch": 5.970013037809648, "grad_norm": 0.41693276166915894, "learning_rate": 2.963407887299724e-06, "loss": 3.0115, "step": 1719 }, { "epoch": 5.973489787049109, "grad_norm": 0.4395310878753662, "learning_rate": 2.932441415715137e-06, "loss": 3.1532, "step": 1720 }, { "epoch": 5.97696653628857, "grad_norm": 0.4162778854370117, "learning_rate": 2.901632700436757e-06, "loss": 3.0968, "step": 1721 }, { "epoch": 5.980443285528032, "grad_norm": 0.41690173745155334, "learning_rate": 2.8709818447261337e-06, "loss": 3.034, "step": 1722 }, { "epoch": 5.983920034767492, "grad_norm": 0.42720499634742737, "learning_rate": 2.8404889513157176e-06, "loss": 3.0948, "step": 1723 }, { "epoch": 5.987396784006954, "grad_norm": 0.4119645655155182, "learning_rate": 2.81015412240852e-06, "loss": 2.9471, "step": 1724 }, { "epoch": 5.990873533246415, "grad_norm": 0.42118823528289795, "learning_rate": 2.7799774596777794e-06, "loss": 3.0288, "step": 1725 }, { "epoch": 5.994350282485875, "grad_norm": 0.40349310636520386, "learning_rate": 2.7499590642665774e-06, "loss": 2.9575, "step": 1726 }, { "epoch": 5.997827031725337, "grad_norm": 0.4119188189506531, "learning_rate": 2.7200990367875656e-06, "loss": 2.9967, "step": 1727 }, { "epoch": 6.0, "grad_norm": 0.5107985138893127, "learning_rate": 2.6903974773225702e-06, "loss": 2.9496, "step": 1728 }, { "epoch": 6.003476749239461, "grad_norm": 0.42430585622787476, "learning_rate": 2.660854485422298e-06, "loss": 3.0021, "step": 1729 }, { "epoch": 6.006953498478922, "grad_norm": 0.42097654938697815, "learning_rate": 2.6314701601059756e-06, "loss": 3.0434, "step": 1730 }, { "epoch": 6.010430247718383, "grad_norm": 0.43055519461631775, "learning_rate": 2.60224459986102e-06, "loss": 3.1388, "step": 1731 }, { "epoch": 6.013906996957845, "grad_norm": 0.42370346188545227, "learning_rate": 2.573177902642726e-06, "loss": 3.0623, "step": 1732 }, { "epoch": 6.017383746197305, "grad_norm": 0.41282978653907776, "learning_rate": 2.5442701658739186e-06, "loss": 3.0301, "step": 1733 }, { "epoch": 6.020860495436767, "grad_norm": 0.42440617084503174, "learning_rate": 2.515521486444655e-06, "loss": 3.0945, "step": 1734 }, { "epoch": 6.024337244676228, "grad_norm": 0.42208147048950195, "learning_rate": 2.4869319607118636e-06, "loss": 3.138, "step": 1735 }, { "epoch": 6.0278139939156885, "grad_norm": 0.40505877137184143, "learning_rate": 2.4585016844990316e-06, "loss": 2.9389, "step": 1736 }, { "epoch": 6.03129074315515, "grad_norm": 0.4153197109699249, "learning_rate": 2.430230753095891e-06, "loss": 3.027, "step": 1737 }, { "epoch": 6.034767492394611, "grad_norm": 0.41399866342544556, "learning_rate": 2.4021192612581178e-06, "loss": 2.9704, "step": 1738 }, { "epoch": 6.0382442416340725, "grad_norm": 0.42374345660209656, "learning_rate": 2.3741673032069756e-06, "loss": 3.0041, "step": 1739 }, { "epoch": 6.041720990873533, "grad_norm": 0.41399243474006653, "learning_rate": 2.3463749726290286e-06, "loss": 2.9925, "step": 1740 }, { "epoch": 6.045197740112994, "grad_norm": 0.4206804633140564, "learning_rate": 2.318742362675813e-06, "loss": 3.001, "step": 1741 }, { "epoch": 6.0486744893524556, "grad_norm": 0.4350409209728241, "learning_rate": 2.291269565963522e-06, "loss": 3.1501, "step": 1742 }, { "epoch": 6.052151238591916, "grad_norm": 0.4238588809967041, "learning_rate": 2.2639566745727205e-06, "loss": 3.108, "step": 1743 }, { "epoch": 6.055627987831378, "grad_norm": 0.42708900570869446, "learning_rate": 2.2368037800480023e-06, "loss": 3.071, "step": 1744 }, { "epoch": 6.059104737070839, "grad_norm": 0.43548697233200073, "learning_rate": 2.2098109733977136e-06, "loss": 3.0807, "step": 1745 }, { "epoch": 6.0625814863103, "grad_norm": 0.4319799542427063, "learning_rate": 2.1829783450936283e-06, "loss": 3.1164, "step": 1746 }, { "epoch": 6.066058235549761, "grad_norm": 0.41852834820747375, "learning_rate": 2.1563059850706337e-06, "loss": 3.0957, "step": 1747 }, { "epoch": 6.069534984789222, "grad_norm": 0.4240981340408325, "learning_rate": 2.12979398272648e-06, "loss": 2.9745, "step": 1748 }, { "epoch": 6.073011734028683, "grad_norm": 0.41493934392929077, "learning_rate": 2.1034424269214257e-06, "loss": 3.0467, "step": 1749 }, { "epoch": 6.076488483268144, "grad_norm": 0.411040335893631, "learning_rate": 2.0772514059779535e-06, "loss": 2.9896, "step": 1750 }, { "epoch": 6.079965232507606, "grad_norm": 0.41265007853507996, "learning_rate": 2.0512210076804982e-06, "loss": 2.9951, "step": 1751 }, { "epoch": 6.083441981747066, "grad_norm": 0.4220527410507202, "learning_rate": 2.0253513192751373e-06, "loss": 3.0334, "step": 1752 }, { "epoch": 6.086918730986528, "grad_norm": 0.4182776212692261, "learning_rate": 1.9996424274692725e-06, "loss": 3.0428, "step": 1753 }, { "epoch": 6.090395480225989, "grad_norm": 0.4076248109340668, "learning_rate": 1.974094418431388e-06, "loss": 2.9442, "step": 1754 }, { "epoch": 6.0938722294654495, "grad_norm": 0.40764060616493225, "learning_rate": 1.9487073777907316e-06, "loss": 2.9983, "step": 1755 }, { "epoch": 6.097348978704911, "grad_norm": 0.41706383228302, "learning_rate": 1.9234813906370276e-06, "loss": 3.0464, "step": 1756 }, { "epoch": 6.100825727944372, "grad_norm": 0.41518938541412354, "learning_rate": 1.8984165415202094e-06, "loss": 3.0598, "step": 1757 }, { "epoch": 6.1043024771838335, "grad_norm": 0.41128143668174744, "learning_rate": 1.873512914450104e-06, "loss": 2.9316, "step": 1758 }, { "epoch": 6.107779226423294, "grad_norm": 0.4145541191101074, "learning_rate": 1.8487705928961874e-06, "loss": 3.1041, "step": 1759 }, { "epoch": 6.111255975662755, "grad_norm": 0.40822362899780273, "learning_rate": 1.8241896597872842e-06, "loss": 3.0146, "step": 1760 }, { "epoch": 6.1147327249022165, "grad_norm": 0.4129459857940674, "learning_rate": 1.7997701975112912e-06, "loss": 3.0685, "step": 1761 }, { "epoch": 6.118209474141677, "grad_norm": 0.4087814390659332, "learning_rate": 1.77551228791491e-06, "loss": 2.9484, "step": 1762 }, { "epoch": 6.121686223381139, "grad_norm": 0.41240808367729187, "learning_rate": 1.7514160123033584e-06, "loss": 3.0681, "step": 1763 }, { "epoch": 6.1251629726206, "grad_norm": 0.4155349135398865, "learning_rate": 1.7274814514400994e-06, "loss": 3.1166, "step": 1764 }, { "epoch": 6.128639721860061, "grad_norm": 0.41594305634498596, "learning_rate": 1.70370868554659e-06, "loss": 3.0405, "step": 1765 }, { "epoch": 6.132116471099522, "grad_norm": 0.4166364371776581, "learning_rate": 1.6800977943019936e-06, "loss": 3.0783, "step": 1766 }, { "epoch": 6.135593220338983, "grad_norm": 0.42120352387428284, "learning_rate": 1.6566488568429294e-06, "loss": 3.0457, "step": 1767 }, { "epoch": 6.139069969578444, "grad_norm": 0.40205660462379456, "learning_rate": 1.6333619517631793e-06, "loss": 3.0141, "step": 1768 }, { "epoch": 6.142546718817905, "grad_norm": 0.40919235348701477, "learning_rate": 1.6102371571134423e-06, "loss": 3.0343, "step": 1769 }, { "epoch": 6.146023468057367, "grad_norm": 0.4099332392215729, "learning_rate": 1.5872745504010799e-06, "loss": 2.9358, "step": 1770 }, { "epoch": 6.149500217296827, "grad_norm": 0.4156320095062256, "learning_rate": 1.5644742085898556e-06, "loss": 3.0225, "step": 1771 }, { "epoch": 6.152976966536288, "grad_norm": 0.41079193353652954, "learning_rate": 1.5418362080996507e-06, "loss": 3.0517, "step": 1772 }, { "epoch": 6.15645371577575, "grad_norm": 0.4225104749202728, "learning_rate": 1.5193606248062486e-06, "loss": 3.0728, "step": 1773 }, { "epoch": 6.1599304650152105, "grad_norm": 0.41226664185523987, "learning_rate": 1.497047534041035e-06, "loss": 3.0909, "step": 1774 }, { "epoch": 6.163407214254672, "grad_norm": 0.4080653488636017, "learning_rate": 1.4748970105907866e-06, "loss": 3.0225, "step": 1775 }, { "epoch": 6.166883963494133, "grad_norm": 0.42255541682243347, "learning_rate": 1.4529091286973995e-06, "loss": 3.0809, "step": 1776 }, { "epoch": 6.170360712733594, "grad_norm": 0.4306301474571228, "learning_rate": 1.4310839620576444e-06, "loss": 3.0776, "step": 1777 }, { "epoch": 6.173837461973055, "grad_norm": 0.4123653769493103, "learning_rate": 1.4094215838229176e-06, "loss": 2.9917, "step": 1778 }, { "epoch": 6.177314211212516, "grad_norm": 0.40510258078575134, "learning_rate": 1.3879220665989955e-06, "loss": 2.9276, "step": 1779 }, { "epoch": 6.1807909604519775, "grad_norm": 0.41338902711868286, "learning_rate": 1.3665854824458036e-06, "loss": 2.9885, "step": 1780 }, { "epoch": 6.184267709691438, "grad_norm": 0.40919315814971924, "learning_rate": 1.3454119028771528e-06, "loss": 2.9884, "step": 1781 }, { "epoch": 6.1877444589309, "grad_norm": 0.40917351841926575, "learning_rate": 1.3244013988605086e-06, "loss": 3.0144, "step": 1782 }, { "epoch": 6.191221208170361, "grad_norm": 0.4239337146282196, "learning_rate": 1.303554040816779e-06, "loss": 3.168, "step": 1783 }, { "epoch": 6.194697957409822, "grad_norm": 0.42139217257499695, "learning_rate": 1.282869898620026e-06, "loss": 3.0753, "step": 1784 }, { "epoch": 6.198174706649283, "grad_norm": 0.42025497555732727, "learning_rate": 1.2623490415972938e-06, "loss": 3.0688, "step": 1785 }, { "epoch": 6.201651455888744, "grad_norm": 0.4139435589313507, "learning_rate": 1.2419915385283088e-06, "loss": 3.0563, "step": 1786 }, { "epoch": 6.205128205128205, "grad_norm": 0.3968459367752075, "learning_rate": 1.2217974576453073e-06, "loss": 2.9678, "step": 1787 }, { "epoch": 6.208604954367666, "grad_norm": 0.4100431203842163, "learning_rate": 1.2017668666327753e-06, "loss": 3.0309, "step": 1788 }, { "epoch": 6.212081703607128, "grad_norm": 0.4222622811794281, "learning_rate": 1.1818998326272369e-06, "loss": 3.1016, "step": 1789 }, { "epoch": 6.215558452846588, "grad_norm": 0.42679232358932495, "learning_rate": 1.1621964222170213e-06, "loss": 3.0438, "step": 1790 }, { "epoch": 6.219035202086049, "grad_norm": 0.4114995002746582, "learning_rate": 1.1426567014420297e-06, "loss": 3.0503, "step": 1791 }, { "epoch": 6.222511951325511, "grad_norm": 0.41597941517829895, "learning_rate": 1.1232807357935248e-06, "loss": 2.9836, "step": 1792 }, { "epoch": 6.2259887005649714, "grad_norm": 0.4132709205150604, "learning_rate": 1.1040685902139304e-06, "loss": 3.0925, "step": 1793 }, { "epoch": 6.229465449804433, "grad_norm": 0.4016041159629822, "learning_rate": 1.08502032909657e-06, "loss": 3.0071, "step": 1794 }, { "epoch": 6.232942199043894, "grad_norm": 0.4292777478694916, "learning_rate": 1.0661360162855016e-06, "loss": 3.1368, "step": 1795 }, { "epoch": 6.236418948283355, "grad_norm": 0.4259623885154724, "learning_rate": 1.0474157150752672e-06, "loss": 3.0294, "step": 1796 }, { "epoch": 6.239895697522816, "grad_norm": 0.4248141646385193, "learning_rate": 1.0288594882106707e-06, "loss": 3.0409, "step": 1797 }, { "epoch": 6.243372446762277, "grad_norm": 0.41500625014305115, "learning_rate": 1.0104673978866164e-06, "loss": 3.069, "step": 1798 }, { "epoch": 6.2468491960017385, "grad_norm": 0.40690165758132935, "learning_rate": 9.922395057478607e-07, "loss": 3.0169, "step": 1799 }, { "epoch": 6.250325945241199, "grad_norm": 0.40856775641441345, "learning_rate": 9.741758728888218e-07, "loss": 3.0363, "step": 1800 }, { "epoch": 6.253802694480661, "grad_norm": 0.4182389974594116, "learning_rate": 9.562765598533641e-07, "loss": 3.0165, "step": 1801 }, { "epoch": 6.257279443720122, "grad_norm": 0.42129838466644287, "learning_rate": 9.385416266345982e-07, "loss": 3.0921, "step": 1802 }, { "epoch": 6.260756192959583, "grad_norm": 0.4121788740158081, "learning_rate": 9.209711326746918e-07, "loss": 3.0918, "step": 1803 }, { "epoch": 6.264232942199044, "grad_norm": 0.40901103615760803, "learning_rate": 9.035651368646648e-07, "loss": 3.1085, "step": 1804 }, { "epoch": 6.267709691438505, "grad_norm": 0.3984892666339874, "learning_rate": 8.86323697544178e-07, "loss": 3.0205, "step": 1805 }, { "epoch": 6.271186440677966, "grad_norm": 0.41698625683784485, "learning_rate": 8.692468725013448e-07, "loss": 3.0734, "step": 1806 }, { "epoch": 6.274663189917427, "grad_norm": 0.4178268611431122, "learning_rate": 8.523347189725639e-07, "loss": 3.0949, "step": 1807 }, { "epoch": 6.278139939156889, "grad_norm": 0.41233837604522705, "learning_rate": 8.355872936422759e-07, "loss": 2.9744, "step": 1808 }, { "epoch": 6.281616688396349, "grad_norm": 0.4059881567955017, "learning_rate": 8.190046526428242e-07, "loss": 2.9941, "step": 1809 }, { "epoch": 6.28509343763581, "grad_norm": 0.4065244495868683, "learning_rate": 8.025868515542268e-07, "loss": 3.0267, "step": 1810 }, { "epoch": 6.288570186875272, "grad_norm": 0.41115114092826843, "learning_rate": 7.863339454040275e-07, "loss": 3.05, "step": 1811 }, { "epoch": 6.292046936114732, "grad_norm": 0.4080888628959656, "learning_rate": 7.702459886670788e-07, "loss": 3.0628, "step": 1812 }, { "epoch": 6.295523685354194, "grad_norm": 0.41215333342552185, "learning_rate": 7.543230352653751e-07, "loss": 3.079, "step": 1813 }, { "epoch": 6.299000434593655, "grad_norm": 0.4129772484302521, "learning_rate": 7.385651385678649e-07, "loss": 3.0567, "step": 1814 }, { "epoch": 6.302477183833116, "grad_norm": 0.40636974573135376, "learning_rate": 7.229723513902831e-07, "loss": 3.0737, "step": 1815 }, { "epoch": 6.305953933072577, "grad_norm": 0.4125308692455292, "learning_rate": 7.07544725994963e-07, "loss": 2.9995, "step": 1816 }, { "epoch": 6.309430682312038, "grad_norm": 0.4017834961414337, "learning_rate": 6.922823140906753e-07, "loss": 3.0389, "step": 1817 }, { "epoch": 6.3129074315514995, "grad_norm": 0.39485737681388855, "learning_rate": 6.771851668324225e-07, "loss": 2.9166, "step": 1818 }, { "epoch": 6.31638418079096, "grad_norm": 0.41091981530189514, "learning_rate": 6.622533348213167e-07, "loss": 3.1168, "step": 1819 }, { "epoch": 6.319860930030422, "grad_norm": 0.40876245498657227, "learning_rate": 6.474868681043578e-07, "loss": 3.0703, "step": 1820 }, { "epoch": 6.3233376792698825, "grad_norm": 0.41209086775779724, "learning_rate": 6.328858161743112e-07, "loss": 3.079, "step": 1821 }, { "epoch": 6.326814428509344, "grad_norm": 0.4071592390537262, "learning_rate": 6.184502279695137e-07, "loss": 3.1334, "step": 1822 }, { "epoch": 6.330291177748805, "grad_norm": 0.3960627317428589, "learning_rate": 6.041801518737122e-07, "loss": 2.9825, "step": 1823 }, { "epoch": 6.333767926988266, "grad_norm": 0.4103507995605469, "learning_rate": 5.900756357159143e-07, "loss": 3.0592, "step": 1824 }, { "epoch": 6.337244676227727, "grad_norm": 0.4149017930030823, "learning_rate": 5.761367267702155e-07, "loss": 3.1168, "step": 1825 }, { "epoch": 6.340721425467188, "grad_norm": 0.4141378104686737, "learning_rate": 5.623634717556503e-07, "loss": 3.099, "step": 1826 }, { "epoch": 6.34419817470665, "grad_norm": 0.40302687883377075, "learning_rate": 5.487559168360301e-07, "loss": 3.0205, "step": 1827 }, { "epoch": 6.34767492394611, "grad_norm": 0.4126074016094208, "learning_rate": 5.353141076197887e-07, "loss": 3.0381, "step": 1828 }, { "epoch": 6.351151673185571, "grad_norm": 0.41772815585136414, "learning_rate": 5.220380891598265e-07, "loss": 3.0821, "step": 1829 }, { "epoch": 6.354628422425033, "grad_norm": 0.408920019865036, "learning_rate": 5.089279059533658e-07, "loss": 3.0344, "step": 1830 }, { "epoch": 6.358105171664493, "grad_norm": 0.39784538745880127, "learning_rate": 4.959836019417963e-07, "loss": 3.0118, "step": 1831 }, { "epoch": 6.361581920903955, "grad_norm": 0.41000446677207947, "learning_rate": 4.832052205105464e-07, "loss": 3.0489, "step": 1832 }, { "epoch": 6.365058670143416, "grad_norm": 0.40835651755332947, "learning_rate": 4.705928044888952e-07, "loss": 3.0752, "step": 1833 }, { "epoch": 6.368535419382877, "grad_norm": 0.40791556239128113, "learning_rate": 4.581463961498722e-07, "loss": 3.0884, "step": 1834 }, { "epoch": 6.372012168622338, "grad_norm": 0.408925861120224, "learning_rate": 4.45866037210102e-07, "loss": 3.0482, "step": 1835 }, { "epoch": 6.375488917861799, "grad_norm": 0.4026236832141876, "learning_rate": 4.337517688296544e-07, "loss": 2.9968, "step": 1836 }, { "epoch": 6.3789656671012605, "grad_norm": 0.4146400988101959, "learning_rate": 4.218036316119112e-07, "loss": 3.1706, "step": 1837 }, { "epoch": 6.382442416340721, "grad_norm": 0.4027198553085327, "learning_rate": 4.100216656034328e-07, "loss": 3.0222, "step": 1838 }, { "epoch": 6.385919165580183, "grad_norm": 0.40519019961357117, "learning_rate": 3.98405910293842e-07, "loss": 3.0531, "step": 1839 }, { "epoch": 6.3893959148196435, "grad_norm": 0.40851891040802, "learning_rate": 3.86956404615646e-07, "loss": 3.0943, "step": 1840 }, { "epoch": 6.392872664059105, "grad_norm": 0.40011292695999146, "learning_rate": 3.7567318694414765e-07, "loss": 3.013, "step": 1841 }, { "epoch": 6.396349413298566, "grad_norm": 0.40723660588264465, "learning_rate": 3.6455629509730136e-07, "loss": 3.091, "step": 1842 }, { "epoch": 6.399826162538027, "grad_norm": 0.41157960891723633, "learning_rate": 3.536057663355852e-07, "loss": 3.1858, "step": 1843 }, { "epoch": 6.403302911777488, "grad_norm": 0.4096364974975586, "learning_rate": 3.4282163736188424e-07, "loss": 3.0255, "step": 1844 }, { "epoch": 6.406779661016949, "grad_norm": 0.40242981910705566, "learning_rate": 3.3220394432135205e-07, "loss": 3.0467, "step": 1845 }, { "epoch": 6.410256410256411, "grad_norm": 0.4040038585662842, "learning_rate": 3.2175272280131064e-07, "loss": 3.0921, "step": 1846 }, { "epoch": 6.413733159495871, "grad_norm": 0.3992011547088623, "learning_rate": 3.114680078311005e-07, "loss": 3.0465, "step": 1847 }, { "epoch": 6.417209908735332, "grad_norm": 0.4034344255924225, "learning_rate": 3.013498338820031e-07, "loss": 3.02, "step": 1848 }, { "epoch": 6.420686657974794, "grad_norm": 0.4033249020576477, "learning_rate": 2.913982348670907e-07, "loss": 3.0312, "step": 1849 }, { "epoch": 6.424163407214254, "grad_norm": 0.4111652076244354, "learning_rate": 2.816132441411379e-07, "loss": 3.1184, "step": 1850 }, { "epoch": 6.427640156453716, "grad_norm": 0.41480228304862976, "learning_rate": 2.71994894500488e-07, "loss": 3.0247, "step": 1851 }, { "epoch": 6.431116905693177, "grad_norm": 0.40189048647880554, "learning_rate": 2.6254321818295345e-07, "loss": 3.129, "step": 1852 }, { "epoch": 6.434593654932638, "grad_norm": 0.40309739112854004, "learning_rate": 2.532582468677214e-07, "loss": 3.0843, "step": 1853 }, { "epoch": 6.438070404172099, "grad_norm": 0.4054506719112396, "learning_rate": 2.441400116752146e-07, "loss": 2.9962, "step": 1854 }, { "epoch": 6.44154715341156, "grad_norm": 0.40184342861175537, "learning_rate": 2.3518854316701977e-07, "loss": 3.0369, "step": 1855 }, { "epoch": 6.445023902651021, "grad_norm": 0.41236573457717896, "learning_rate": 2.2640387134577058e-07, "loss": 3.121, "step": 1856 }, { "epoch": 6.448500651890482, "grad_norm": 0.4023594856262207, "learning_rate": 2.1778602565504237e-07, "loss": 3.07, "step": 1857 }, { "epoch": 6.451977401129944, "grad_norm": 0.4037077724933624, "learning_rate": 2.0933503497926888e-07, "loss": 3.0607, "step": 1858 }, { "epoch": 6.4554541503694045, "grad_norm": 0.4005909264087677, "learning_rate": 2.010509276436201e-07, "loss": 3.0266, "step": 1859 }, { "epoch": 6.458930899608866, "grad_norm": 0.40143847465515137, "learning_rate": 1.9293373141394122e-07, "loss": 3.0043, "step": 1860 }, { "epoch": 6.462407648848327, "grad_norm": 0.38799387216567993, "learning_rate": 1.8498347349663602e-07, "loss": 2.8786, "step": 1861 }, { "epoch": 6.465884398087788, "grad_norm": 0.40057605504989624, "learning_rate": 1.7720018053857257e-07, "loss": 3.0905, "step": 1862 }, { "epoch": 6.469361147327249, "grad_norm": 0.406303733587265, "learning_rate": 1.6958387862701098e-07, "loss": 3.1033, "step": 1863 }, { "epoch": 6.47283789656671, "grad_norm": 0.40010327100753784, "learning_rate": 1.6213459328950352e-07, "loss": 3.0289, "step": 1864 }, { "epoch": 6.4763146458061716, "grad_norm": 0.3997277021408081, "learning_rate": 1.54852349493817e-07, "loss": 3.0767, "step": 1865 }, { "epoch": 6.479791395045632, "grad_norm": 0.40357694029808044, "learning_rate": 1.4773717164784373e-07, "loss": 3.0858, "step": 1866 }, { "epoch": 6.483268144285093, "grad_norm": 0.4033971130847931, "learning_rate": 1.4078908359952403e-07, "loss": 3.0353, "step": 1867 }, { "epoch": 6.486744893524555, "grad_norm": 0.40543147921562195, "learning_rate": 1.3400810863675174e-07, "loss": 3.0127, "step": 1868 }, { "epoch": 6.490221642764015, "grad_norm": 0.40934666991233826, "learning_rate": 1.2739426948732424e-07, "loss": 3.132, "step": 1869 }, { "epoch": 6.493698392003477, "grad_norm": 0.4018721878528595, "learning_rate": 1.2094758831883712e-07, "loss": 3.0282, "step": 1870 }, { "epoch": 6.497175141242938, "grad_norm": 0.40862470865249634, "learning_rate": 1.146680867386285e-07, "loss": 3.1336, "step": 1871 }, { "epoch": 6.500651890482399, "grad_norm": 0.4113106429576874, "learning_rate": 1.0855578579370695e-07, "loss": 3.0698, "step": 1872 }, { "epoch": 6.50412863972186, "grad_norm": 0.39900723099708557, "learning_rate": 1.0261070597065713e-07, "loss": 3.025, "step": 1873 }, { "epoch": 6.507605388961321, "grad_norm": 0.39849308133125305, "learning_rate": 9.683286719560647e-08, "loss": 3.042, "step": 1874 }, { "epoch": 6.511082138200782, "grad_norm": 0.39917635917663574, "learning_rate": 9.12222888341252e-08, "loss": 3.0235, "step": 1875 }, { "epoch": 6.514558887440243, "grad_norm": 0.40099823474884033, "learning_rate": 8.577898969119869e-08, "loss": 3.0162, "step": 1876 }, { "epoch": 6.518035636679705, "grad_norm": 0.40142250061035156, "learning_rate": 8.050298801111633e-08, "loss": 2.98, "step": 1877 }, { "epoch": 6.5215123859191655, "grad_norm": 0.40689295530319214, "learning_rate": 7.539430147745496e-08, "loss": 3.1645, "step": 1878 }, { "epoch": 6.524989135158627, "grad_norm": 0.39953750371932983, "learning_rate": 7.045294721299556e-08, "loss": 3.0468, "step": 1879 }, { "epoch": 6.528465884398088, "grad_norm": 0.4039493501186371, "learning_rate": 6.567894177967326e-08, "loss": 3.109, "step": 1880 }, { "epoch": 6.531942633637549, "grad_norm": 0.4100818336009979, "learning_rate": 6.107230117851636e-08, "loss": 3.0905, "step": 1881 }, { "epoch": 6.53541938287701, "grad_norm": 0.4003710448741913, "learning_rate": 5.663304084960186e-08, "loss": 3.0059, "step": 1882 }, { "epoch": 6.538896132116471, "grad_norm": 0.4005975127220154, "learning_rate": 5.236117567199439e-08, "loss": 2.9985, "step": 1883 }, { "epoch": 6.5423728813559325, "grad_norm": 0.39996832609176636, "learning_rate": 4.825671996370185e-08, "loss": 2.993, "step": 1884 }, { "epoch": 6.545849630595393, "grad_norm": 0.4038967490196228, "learning_rate": 4.431968748162541e-08, "loss": 3.0491, "step": 1885 }, { "epoch": 6.549326379834854, "grad_norm": 0.39866870641708374, "learning_rate": 4.055009142152067e-08, "loss": 2.9085, "step": 1886 }, { "epoch": 6.552803129074316, "grad_norm": 0.40169060230255127, "learning_rate": 3.6947944417925483e-08, "loss": 2.9973, "step": 1887 }, { "epoch": 6.556279878313776, "grad_norm": 0.41364067792892456, "learning_rate": 3.351325854417109e-08, "loss": 3.095, "step": 1888 }, { "epoch": 6.559756627553238, "grad_norm": 0.40573641657829285, "learning_rate": 3.0246045312282144e-08, "loss": 3.0339, "step": 1889 }, { "epoch": 6.563233376792699, "grad_norm": 0.39951014518737793, "learning_rate": 2.7146315672971212e-08, "loss": 3.0338, "step": 1890 }, { "epoch": 6.56671012603216, "grad_norm": 0.4012530744075775, "learning_rate": 2.4214080015610986e-08, "loss": 3.0469, "step": 1891 }, { "epoch": 6.570186875271621, "grad_norm": 0.4066547453403473, "learning_rate": 2.1449348168167682e-08, "loss": 3.0924, "step": 1892 }, { "epoch": 6.573663624511082, "grad_norm": 0.4027867913246155, "learning_rate": 1.8852129397189942e-08, "loss": 3.008, "step": 1893 }, { "epoch": 6.577140373750543, "grad_norm": 0.4034714102745056, "learning_rate": 1.6422432407781075e-08, "loss": 3.1072, "step": 1894 }, { "epoch": 6.580617122990004, "grad_norm": 0.3969672620296478, "learning_rate": 1.4160265343549083e-08, "loss": 3.0111, "step": 1895 }, { "epoch": 6.584093872229466, "grad_norm": 0.4007243514060974, "learning_rate": 1.2065635786595586e-08, "loss": 3.0336, "step": 1896 }, { "epoch": 6.5875706214689265, "grad_norm": 0.40860840678215027, "learning_rate": 1.0138550757493592e-08, "loss": 3.1238, "step": 1897 }, { "epoch": 6.591047370708388, "grad_norm": 0.40205299854278564, "learning_rate": 8.379016715254207e-09, "loss": 3.0705, "step": 1898 }, { "epoch": 6.594524119947849, "grad_norm": 0.39915701746940613, "learning_rate": 6.78703955730442e-09, "loss": 2.989, "step": 1899 }, { "epoch": 6.5980008691873095, "grad_norm": 0.3982495367527008, "learning_rate": 5.362624619470458e-09, "loss": 3.0144, "step": 1900 }, { "epoch": 6.601477618426771, "grad_norm": 0.40340107679367065, "learning_rate": 4.105776675966677e-09, "loss": 3.0345, "step": 1901 }, { "epoch": 6.604954367666232, "grad_norm": 0.3958084285259247, "learning_rate": 3.0164999393678117e-09, "loss": 3.0071, "step": 1902 }, { "epoch": 6.6084311169056935, "grad_norm": 0.3997930586338043, "learning_rate": 2.0947980606034203e-09, "loss": 3.0614, "step": 1903 }, { "epoch": 6.611907866145154, "grad_norm": 0.4008050560951233, "learning_rate": 1.3406741289412329e-09, "loss": 3.0648, "step": 1904 }, { "epoch": 6.615384615384615, "grad_norm": 0.3971216082572937, "learning_rate": 7.541306719704988e-10, "loss": 3.0075, "step": 1905 }, { "epoch": 6.618861364624077, "grad_norm": 0.39623579382896423, "learning_rate": 3.3516965562419013e-10, "loss": 2.9764, "step": 1906 }, { "epoch": 6.622338113863537, "grad_norm": 0.39841023087501526, "learning_rate": 8.379248411793939e-11, "loss": 3.0144, "step": 1907 } ], "logging_steps": 1, "max_steps": 1907, "num_input_tokens_seen": 0, "num_train_epochs": 7, "save_steps": 200, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 1.8113861270435267e+18, "train_batch_size": 32, "trial_name": null, "trial_params": null }