diff --git "a/limo/full/checkpoint-1545/trainer_state.json" "b/limo/full/checkpoint-1545/trainer_state.json" new file mode 100644--- /dev/null +++ "b/limo/full/checkpoint-1545/trainer_state.json" @@ -0,0 +1,10848 @@ +{ + "best_metric": null, + "best_model_checkpoint": null, + "epoch": 15.0, + "eval_steps": 500, + "global_step": 1545, + "is_hyper_param_search": false, + "is_local_process_zero": true, + "is_world_process_zero": true, + "log_history": [ + { + "epoch": 0.009708737864077669, + "grad_norm": 4.871730834654797, + "learning_rate": 4.999994831641374e-06, + "loss": 1.8478, + "step": 1 + }, + { + "epoch": 0.019417475728155338, + "grad_norm": 3.669459516332945, + "learning_rate": 4.9999793265868636e-06, + "loss": 1.4574, + "step": 2 + }, + { + "epoch": 0.02912621359223301, + "grad_norm": 3.321268843922114, + "learning_rate": 4.999953484900578e-06, + "loss": 1.5713, + "step": 3 + }, + { + "epoch": 0.038834951456310676, + "grad_norm": 4.757625498508145, + "learning_rate": 4.9999173066893655e-06, + "loss": 2.1849, + "step": 4 + }, + { + "epoch": 0.04854368932038835, + "grad_norm": 4.484235162199009, + "learning_rate": 4.9998707921028104e-06, + "loss": 2.538, + "step": 5 + }, + { + "epoch": 0.05825242718446602, + "grad_norm": 3.0105096661858055, + "learning_rate": 4.999813941333237e-06, + "loss": 1.5868, + "step": 6 + }, + { + "epoch": 0.06796116504854369, + "grad_norm": 2.8411927476391083, + "learning_rate": 4.999746754615704e-06, + "loss": 1.6472, + "step": 7 + }, + { + "epoch": 0.07766990291262135, + "grad_norm": 1.506880067190551, + "learning_rate": 4.9996692322280085e-06, + "loss": 1.0478, + "step": 8 + }, + { + "epoch": 0.08737864077669903, + "grad_norm": 4.0008731506353445, + "learning_rate": 4.999581374490681e-06, + "loss": 2.373, + "step": 9 + }, + { + "epoch": 0.0970873786407767, + "grad_norm": 2.4382442959213715, + "learning_rate": 4.999483181766986e-06, + "loss": 1.7706, + "step": 10 + }, + { + "epoch": 0.10679611650485436, + "grad_norm": 1.6052608591178483, + "learning_rate": 4.999374654462919e-06, + "loss": 1.3104, + "step": 11 + }, + { + "epoch": 0.11650485436893204, + "grad_norm": 2.7437905836581296, + "learning_rate": 4.999255793027207e-06, + "loss": 2.048, + "step": 12 + }, + { + "epoch": 0.1262135922330097, + "grad_norm": 1.3906335136930252, + "learning_rate": 4.999126597951305e-06, + "loss": 1.0066, + "step": 13 + }, + { + "epoch": 0.13592233009708737, + "grad_norm": 1.3340600696507865, + "learning_rate": 4.998987069769394e-06, + "loss": 1.0998, + "step": 14 + }, + { + "epoch": 0.14563106796116504, + "grad_norm": 1.7130835123840018, + "learning_rate": 4.998837209058379e-06, + "loss": 1.6654, + "step": 15 + }, + { + "epoch": 0.1553398058252427, + "grad_norm": 1.7208415743567615, + "learning_rate": 4.998677016437888e-06, + "loss": 1.3388, + "step": 16 + }, + { + "epoch": 0.1650485436893204, + "grad_norm": 2.4073143065161937, + "learning_rate": 4.998506492570266e-06, + "loss": 1.8365, + "step": 17 + }, + { + "epoch": 0.17475728155339806, + "grad_norm": 1.4448713906198571, + "learning_rate": 4.998325638160576e-06, + "loss": 1.4415, + "step": 18 + }, + { + "epoch": 0.18446601941747573, + "grad_norm": 1.5964597845558501, + "learning_rate": 4.998134453956596e-06, + "loss": 1.1629, + "step": 19 + }, + { + "epoch": 0.1941747572815534, + "grad_norm": 1.3355139686727637, + "learning_rate": 4.997932940748811e-06, + "loss": 1.2245, + "step": 20 + }, + { + "epoch": 0.20388349514563106, + "grad_norm": 1.2386749477462127, + "learning_rate": 4.997721099370416e-06, + "loss": 1.2827, + "step": 21 + }, + { + "epoch": 0.21359223300970873, + "grad_norm": 1.3520067333735615, + "learning_rate": 4.997498930697308e-06, + "loss": 1.3963, + "step": 22 + }, + { + "epoch": 0.22330097087378642, + "grad_norm": 1.7137484637690432, + "learning_rate": 4.997266435648086e-06, + "loss": 1.4022, + "step": 23 + }, + { + "epoch": 0.23300970873786409, + "grad_norm": 1.9868889107654797, + "learning_rate": 4.997023615184044e-06, + "loss": 1.7246, + "step": 24 + }, + { + "epoch": 0.24271844660194175, + "grad_norm": 1.6032606855000195, + "learning_rate": 4.996770470309167e-06, + "loss": 1.7975, + "step": 25 + }, + { + "epoch": 0.2524271844660194, + "grad_norm": 1.268066436948988, + "learning_rate": 4.996507002070131e-06, + "loss": 1.3028, + "step": 26 + }, + { + "epoch": 0.2621359223300971, + "grad_norm": 1.2303902485186446, + "learning_rate": 4.996233211556295e-06, + "loss": 1.3613, + "step": 27 + }, + { + "epoch": 0.27184466019417475, + "grad_norm": 0.9549363286063691, + "learning_rate": 4.9959490998996974e-06, + "loss": 0.9778, + "step": 28 + }, + { + "epoch": 0.2815533980582524, + "grad_norm": 1.2603575097056405, + "learning_rate": 4.995654668275049e-06, + "loss": 1.4077, + "step": 29 + }, + { + "epoch": 0.2912621359223301, + "grad_norm": 1.5075543470098034, + "learning_rate": 4.995349917899735e-06, + "loss": 1.5452, + "step": 30 + }, + { + "epoch": 0.30097087378640774, + "grad_norm": 0.9482722379956937, + "learning_rate": 4.9950348500338005e-06, + "loss": 1.0589, + "step": 31 + }, + { + "epoch": 0.3106796116504854, + "grad_norm": 0.8370896925167312, + "learning_rate": 4.994709465979954e-06, + "loss": 0.977, + "step": 32 + }, + { + "epoch": 0.32038834951456313, + "grad_norm": 1.204641294852316, + "learning_rate": 4.994373767083556e-06, + "loss": 1.2391, + "step": 33 + }, + { + "epoch": 0.3300970873786408, + "grad_norm": 1.1272285807079887, + "learning_rate": 4.994027754732616e-06, + "loss": 1.2678, + "step": 34 + }, + { + "epoch": 0.33980582524271846, + "grad_norm": 1.0240110484845462, + "learning_rate": 4.993671430357788e-06, + "loss": 1.1705, + "step": 35 + }, + { + "epoch": 0.34951456310679613, + "grad_norm": 1.3831448718921984, + "learning_rate": 4.99330479543236e-06, + "loss": 1.6517, + "step": 36 + }, + { + "epoch": 0.3592233009708738, + "grad_norm": 1.0046287010394588, + "learning_rate": 4.992927851472254e-06, + "loss": 1.2714, + "step": 37 + }, + { + "epoch": 0.36893203883495146, + "grad_norm": 1.3527812004094968, + "learning_rate": 4.992540600036014e-06, + "loss": 1.6523, + "step": 38 + }, + { + "epoch": 0.3786407766990291, + "grad_norm": 1.162751011119127, + "learning_rate": 4.992143042724805e-06, + "loss": 1.4164, + "step": 39 + }, + { + "epoch": 0.3883495145631068, + "grad_norm": 1.183212466699315, + "learning_rate": 4.991735181182401e-06, + "loss": 1.6037, + "step": 40 + }, + { + "epoch": 0.39805825242718446, + "grad_norm": 1.0671445151315921, + "learning_rate": 4.991317017095182e-06, + "loss": 1.2074, + "step": 41 + }, + { + "epoch": 0.4077669902912621, + "grad_norm": 1.1374295195195832, + "learning_rate": 4.990888552192126e-06, + "loss": 1.2052, + "step": 42 + }, + { + "epoch": 0.4174757281553398, + "grad_norm": 1.1838914278145132, + "learning_rate": 4.9904497882448004e-06, + "loss": 1.3772, + "step": 43 + }, + { + "epoch": 0.42718446601941745, + "grad_norm": 1.1898023150727208, + "learning_rate": 4.990000727067357e-06, + "loss": 1.4616, + "step": 44 + }, + { + "epoch": 0.4368932038834951, + "grad_norm": 1.3045045420389443, + "learning_rate": 4.989541370516523e-06, + "loss": 1.2714, + "step": 45 + }, + { + "epoch": 0.44660194174757284, + "grad_norm": 1.1641307331044313, + "learning_rate": 4.989071720491595e-06, + "loss": 1.2003, + "step": 46 + }, + { + "epoch": 0.4563106796116505, + "grad_norm": 0.6576132196355234, + "learning_rate": 4.988591778934428e-06, + "loss": 0.791, + "step": 47 + }, + { + "epoch": 0.46601941747572817, + "grad_norm": 0.8500656663270045, + "learning_rate": 4.9881015478294294e-06, + "loss": 0.9369, + "step": 48 + }, + { + "epoch": 0.47572815533980584, + "grad_norm": 1.4233952816867905, + "learning_rate": 4.987601029203553e-06, + "loss": 1.4027, + "step": 49 + }, + { + "epoch": 0.4854368932038835, + "grad_norm": 1.1258574635846645, + "learning_rate": 4.987090225126285e-06, + "loss": 1.3511, + "step": 50 + }, + { + "epoch": 0.49514563106796117, + "grad_norm": 1.1687633946081721, + "learning_rate": 4.98656913770964e-06, + "loss": 1.1697, + "step": 51 + }, + { + "epoch": 0.5048543689320388, + "grad_norm": 1.2317336138068717, + "learning_rate": 4.986037769108154e-06, + "loss": 1.4503, + "step": 52 + }, + { + "epoch": 0.5145631067961165, + "grad_norm": 0.7574877986414429, + "learning_rate": 4.9854961215188676e-06, + "loss": 1.1225, + "step": 53 + }, + { + "epoch": 0.5242718446601942, + "grad_norm": 0.8037978880160027, + "learning_rate": 4.984944197181324e-06, + "loss": 0.9828, + "step": 54 + }, + { + "epoch": 0.5339805825242718, + "grad_norm": 1.1494946115521347, + "learning_rate": 4.9843819983775575e-06, + "loss": 1.2705, + "step": 55 + }, + { + "epoch": 0.5436893203883495, + "grad_norm": 1.0404784765183215, + "learning_rate": 4.983809527432086e-06, + "loss": 1.1189, + "step": 56 + }, + { + "epoch": 0.5533980582524272, + "grad_norm": 1.0072279484535855, + "learning_rate": 4.983226786711895e-06, + "loss": 1.2205, + "step": 57 + }, + { + "epoch": 0.5631067961165048, + "grad_norm": 0.9065921594118113, + "learning_rate": 4.982633778626437e-06, + "loss": 1.07, + "step": 58 + }, + { + "epoch": 0.5728155339805825, + "grad_norm": 1.2370148909491399, + "learning_rate": 4.982030505627613e-06, + "loss": 1.535, + "step": 59 + }, + { + "epoch": 0.5825242718446602, + "grad_norm": 0.7862714572091344, + "learning_rate": 4.98141697020977e-06, + "loss": 1.0275, + "step": 60 + }, + { + "epoch": 0.5922330097087378, + "grad_norm": 1.0179109472315906, + "learning_rate": 4.9807931749096836e-06, + "loss": 1.1225, + "step": 61 + }, + { + "epoch": 0.6019417475728155, + "grad_norm": 0.6647718819463199, + "learning_rate": 4.980159122306551e-06, + "loss": 0.9871, + "step": 62 + }, + { + "epoch": 0.6116504854368932, + "grad_norm": 0.7487245734536395, + "learning_rate": 4.979514815021984e-06, + "loss": 1.2099, + "step": 63 + }, + { + "epoch": 0.6213592233009708, + "grad_norm": 1.1029648607720772, + "learning_rate": 4.978860255719989e-06, + "loss": 1.0207, + "step": 64 + }, + { + "epoch": 0.6310679611650486, + "grad_norm": 0.7621265063758594, + "learning_rate": 4.978195447106965e-06, + "loss": 1.1363, + "step": 65 + }, + { + "epoch": 0.6407766990291263, + "grad_norm": 2.313544656936807, + "learning_rate": 4.9775203919316864e-06, + "loss": 0.9453, + "step": 66 + }, + { + "epoch": 0.6504854368932039, + "grad_norm": 0.8280847399908062, + "learning_rate": 4.976835092985297e-06, + "loss": 1.1131, + "step": 67 + }, + { + "epoch": 0.6601941747572816, + "grad_norm": 0.8927638160250211, + "learning_rate": 4.976139553101291e-06, + "loss": 1.133, + "step": 68 + }, + { + "epoch": 0.6699029126213593, + "grad_norm": 0.9610176243493397, + "learning_rate": 4.975433775155509e-06, + "loss": 1.1768, + "step": 69 + }, + { + "epoch": 0.6796116504854369, + "grad_norm": 1.194498620463663, + "learning_rate": 4.974717762066123e-06, + "loss": 1.3615, + "step": 70 + }, + { + "epoch": 0.6893203883495146, + "grad_norm": 0.8347048742413943, + "learning_rate": 4.973991516793621e-06, + "loss": 1.2546, + "step": 71 + }, + { + "epoch": 0.6990291262135923, + "grad_norm": 0.9172122226894214, + "learning_rate": 4.973255042340801e-06, + "loss": 1.1607, + "step": 72 + }, + { + "epoch": 0.7087378640776699, + "grad_norm": 1.03327431376727, + "learning_rate": 4.972508341752754e-06, + "loss": 1.361, + "step": 73 + }, + { + "epoch": 0.7184466019417476, + "grad_norm": 1.3504753816166224, + "learning_rate": 4.9717514181168534e-06, + "loss": 0.9757, + "step": 74 + }, + { + "epoch": 0.7281553398058253, + "grad_norm": 1.002604662508283, + "learning_rate": 4.970984274562741e-06, + "loss": 1.145, + "step": 75 + }, + { + "epoch": 0.7378640776699029, + "grad_norm": 0.8521212437833093, + "learning_rate": 4.970206914262315e-06, + "loss": 1.0841, + "step": 76 + }, + { + "epoch": 0.7475728155339806, + "grad_norm": 0.7268667125506638, + "learning_rate": 4.969419340429717e-06, + "loss": 1.0124, + "step": 77 + }, + { + "epoch": 0.7572815533980582, + "grad_norm": 1.0230017430471454, + "learning_rate": 4.968621556321319e-06, + "loss": 1.4082, + "step": 78 + }, + { + "epoch": 0.7669902912621359, + "grad_norm": 0.8075925843214921, + "learning_rate": 4.967813565235708e-06, + "loss": 0.9341, + "step": 79 + }, + { + "epoch": 0.7766990291262136, + "grad_norm": 0.7417143914647092, + "learning_rate": 4.966995370513675e-06, + "loss": 0.9597, + "step": 80 + }, + { + "epoch": 0.7864077669902912, + "grad_norm": 0.9927625054735497, + "learning_rate": 4.966166975538197e-06, + "loss": 1.298, + "step": 81 + }, + { + "epoch": 0.7961165048543689, + "grad_norm": 0.6918994370607547, + "learning_rate": 4.965328383734429e-06, + "loss": 0.9559, + "step": 82 + }, + { + "epoch": 0.8058252427184466, + "grad_norm": 0.9084696462282987, + "learning_rate": 4.964479598569686e-06, + "loss": 1.3896, + "step": 83 + }, + { + "epoch": 0.8155339805825242, + "grad_norm": 0.9240522865451553, + "learning_rate": 4.963620623553428e-06, + "loss": 1.2391, + "step": 84 + }, + { + "epoch": 0.8252427184466019, + "grad_norm": 0.9801285757696232, + "learning_rate": 4.962751462237248e-06, + "loss": 0.9936, + "step": 85 + }, + { + "epoch": 0.8349514563106796, + "grad_norm": 0.7316677725651993, + "learning_rate": 4.9618721182148564e-06, + "loss": 0.8096, + "step": 86 + }, + { + "epoch": 0.8446601941747572, + "grad_norm": 0.7986070833021874, + "learning_rate": 4.960982595122064e-06, + "loss": 0.9936, + "step": 87 + }, + { + "epoch": 0.8543689320388349, + "grad_norm": 2.991170115347129, + "learning_rate": 4.960082896636773e-06, + "loss": 1.2219, + "step": 88 + }, + { + "epoch": 0.8640776699029126, + "grad_norm": 0.841666379855367, + "learning_rate": 4.959173026478952e-06, + "loss": 1.0418, + "step": 89 + }, + { + "epoch": 0.8737864077669902, + "grad_norm": 0.8276404717553167, + "learning_rate": 4.958252988410631e-06, + "loss": 0.9154, + "step": 90 + }, + { + "epoch": 0.883495145631068, + "grad_norm": 0.9617539349436555, + "learning_rate": 4.9573227862358794e-06, + "loss": 1.1546, + "step": 91 + }, + { + "epoch": 0.8932038834951457, + "grad_norm": 0.9410894322853203, + "learning_rate": 4.956382423800791e-06, + "loss": 1.2453, + "step": 92 + }, + { + "epoch": 0.9029126213592233, + "grad_norm": 0.7461574178337397, + "learning_rate": 4.955431904993471e-06, + "loss": 1.0039, + "step": 93 + }, + { + "epoch": 0.912621359223301, + "grad_norm": 0.7787771559636139, + "learning_rate": 4.954471233744015e-06, + "loss": 0.7834, + "step": 94 + }, + { + "epoch": 0.9223300970873787, + "grad_norm": 0.8319061238375747, + "learning_rate": 4.9535004140245005e-06, + "loss": 0.9597, + "step": 95 + }, + { + "epoch": 0.9320388349514563, + "grad_norm": 1.241881123747875, + "learning_rate": 4.952519449848962e-06, + "loss": 1.2155, + "step": 96 + }, + { + "epoch": 0.941747572815534, + "grad_norm": 0.9416478821490492, + "learning_rate": 4.951528345273379e-06, + "loss": 0.9302, + "step": 97 + }, + { + "epoch": 0.9514563106796117, + "grad_norm": 0.6462620680684062, + "learning_rate": 4.950527104395659e-06, + "loss": 0.9483, + "step": 98 + }, + { + "epoch": 0.9611650485436893, + "grad_norm": 1.3945318287905581, + "learning_rate": 4.9495157313556185e-06, + "loss": 0.884, + "step": 99 + }, + { + "epoch": 0.970873786407767, + "grad_norm": 0.6127133377291775, + "learning_rate": 4.94849423033497e-06, + "loss": 0.6754, + "step": 100 + }, + { + "epoch": 0.9805825242718447, + "grad_norm": 0.8204199917952882, + "learning_rate": 4.9474626055573e-06, + "loss": 1.1493, + "step": 101 + }, + { + "epoch": 0.9902912621359223, + "grad_norm": 0.6412217993750089, + "learning_rate": 4.946420861288051e-06, + "loss": 0.9205, + "step": 102 + }, + { + "epoch": 1.0, + "grad_norm": 0.7254619944280207, + "learning_rate": 4.9453690018345144e-06, + "loss": 0.9894, + "step": 103 + }, + { + "epoch": 1.0097087378640777, + "grad_norm": 0.8838225821340072, + "learning_rate": 4.944307031545797e-06, + "loss": 1.0139, + "step": 104 + }, + { + "epoch": 1.0194174757281553, + "grad_norm": 0.8681724674727661, + "learning_rate": 4.943234954812812e-06, + "loss": 1.2911, + "step": 105 + }, + { + "epoch": 1.029126213592233, + "grad_norm": 0.6707492146981981, + "learning_rate": 4.942152776068264e-06, + "loss": 0.8712, + "step": 106 + }, + { + "epoch": 1.0388349514563107, + "grad_norm": 0.8258075866719828, + "learning_rate": 4.941060499786622e-06, + "loss": 1.2535, + "step": 107 + }, + { + "epoch": 1.0485436893203883, + "grad_norm": 0.9343039880211657, + "learning_rate": 4.939958130484106e-06, + "loss": 1.0248, + "step": 108 + }, + { + "epoch": 1.058252427184466, + "grad_norm": 0.6215772104671746, + "learning_rate": 4.938845672718668e-06, + "loss": 0.899, + "step": 109 + }, + { + "epoch": 1.0679611650485437, + "grad_norm": 0.6106502770977033, + "learning_rate": 4.937723131089974e-06, + "loss": 0.8534, + "step": 110 + }, + { + "epoch": 1.0776699029126213, + "grad_norm": 0.8527770514681675, + "learning_rate": 4.93659051023938e-06, + "loss": 1.0798, + "step": 111 + }, + { + "epoch": 1.087378640776699, + "grad_norm": 0.6013648488199381, + "learning_rate": 4.93544781484992e-06, + "loss": 0.7126, + "step": 112 + }, + { + "epoch": 1.0970873786407767, + "grad_norm": 0.673264040200857, + "learning_rate": 4.9342950496462815e-06, + "loss": 1.0433, + "step": 113 + }, + { + "epoch": 1.1067961165048543, + "grad_norm": 0.6367668762270327, + "learning_rate": 4.933132219394786e-06, + "loss": 0.8586, + "step": 114 + }, + { + "epoch": 1.116504854368932, + "grad_norm": 0.7326471152183239, + "learning_rate": 4.931959328903376e-06, + "loss": 1.1661, + "step": 115 + }, + { + "epoch": 1.1262135922330097, + "grad_norm": 0.7905138029016624, + "learning_rate": 4.930776383021584e-06, + "loss": 0.9155, + "step": 116 + }, + { + "epoch": 1.1359223300970873, + "grad_norm": 0.7113840452485917, + "learning_rate": 4.92958338664052e-06, + "loss": 0.7101, + "step": 117 + }, + { + "epoch": 1.145631067961165, + "grad_norm": 0.7085441798426322, + "learning_rate": 4.928380344692853e-06, + "loss": 0.9499, + "step": 118 + }, + { + "epoch": 1.1553398058252426, + "grad_norm": 0.6654689711573367, + "learning_rate": 4.927167262152784e-06, + "loss": 0.9453, + "step": 119 + }, + { + "epoch": 1.1650485436893203, + "grad_norm": 0.7093345141752957, + "learning_rate": 4.925944144036027e-06, + "loss": 1.0183, + "step": 120 + }, + { + "epoch": 1.174757281553398, + "grad_norm": 0.7761162990313378, + "learning_rate": 4.924710995399796e-06, + "loss": 0.9277, + "step": 121 + }, + { + "epoch": 1.1844660194174756, + "grad_norm": 0.8690880735001523, + "learning_rate": 4.923467821342773e-06, + "loss": 1.1022, + "step": 122 + }, + { + "epoch": 1.1941747572815533, + "grad_norm": 0.8588202527080991, + "learning_rate": 4.922214627005092e-06, + "loss": 1.0451, + "step": 123 + }, + { + "epoch": 1.203883495145631, + "grad_norm": 0.6415200336267318, + "learning_rate": 4.920951417568323e-06, + "loss": 1.004, + "step": 124 + }, + { + "epoch": 1.2135922330097086, + "grad_norm": 0.796282956613202, + "learning_rate": 4.919678198255438e-06, + "loss": 0.9759, + "step": 125 + }, + { + "epoch": 1.2233009708737863, + "grad_norm": 0.7236053821892523, + "learning_rate": 4.918394974330801e-06, + "loss": 1.0669, + "step": 126 + }, + { + "epoch": 1.233009708737864, + "grad_norm": 0.7465165265947691, + "learning_rate": 4.917101751100142e-06, + "loss": 1.0007, + "step": 127 + }, + { + "epoch": 1.2427184466019416, + "grad_norm": 0.6913202728148214, + "learning_rate": 4.915798533910534e-06, + "loss": 1.037, + "step": 128 + }, + { + "epoch": 1.2524271844660193, + "grad_norm": 0.6269818244555071, + "learning_rate": 4.9144853281503715e-06, + "loss": 0.7153, + "step": 129 + }, + { + "epoch": 1.262135922330097, + "grad_norm": 0.7329901927771046, + "learning_rate": 4.91316213924935e-06, + "loss": 1.0037, + "step": 130 + }, + { + "epoch": 1.2718446601941746, + "grad_norm": 0.9047110334486224, + "learning_rate": 4.911828972678441e-06, + "loss": 0.9842, + "step": 131 + }, + { + "epoch": 1.2815533980582523, + "grad_norm": 0.7849365731256595, + "learning_rate": 4.91048583394987e-06, + "loss": 0.8823, + "step": 132 + }, + { + "epoch": 1.29126213592233, + "grad_norm": 0.8276108169878443, + "learning_rate": 4.909132728617095e-06, + "loss": 1.2258, + "step": 133 + }, + { + "epoch": 1.3009708737864076, + "grad_norm": 0.6818450934871862, + "learning_rate": 4.907769662274785e-06, + "loss": 0.9031, + "step": 134 + }, + { + "epoch": 1.3106796116504853, + "grad_norm": 0.573255811123994, + "learning_rate": 4.90639664055879e-06, + "loss": 0.7234, + "step": 135 + }, + { + "epoch": 1.3203883495145632, + "grad_norm": 0.6504319050897613, + "learning_rate": 4.905013669146127e-06, + "loss": 0.8108, + "step": 136 + }, + { + "epoch": 1.3300970873786409, + "grad_norm": 0.6160025172155659, + "learning_rate": 4.903620753754949e-06, + "loss": 0.951, + "step": 137 + }, + { + "epoch": 1.3398058252427185, + "grad_norm": 0.7489527548993854, + "learning_rate": 4.902217900144524e-06, + "loss": 1.1361, + "step": 138 + }, + { + "epoch": 1.3495145631067962, + "grad_norm": 0.7112858054270321, + "learning_rate": 4.900805114115214e-06, + "loss": 1.1212, + "step": 139 + }, + { + "epoch": 1.3592233009708738, + "grad_norm": 0.6142945035367823, + "learning_rate": 4.899382401508446e-06, + "loss": 0.6799, + "step": 140 + }, + { + "epoch": 1.3689320388349515, + "grad_norm": 0.7192927924067215, + "learning_rate": 4.8979497682066916e-06, + "loss": 0.9943, + "step": 141 + }, + { + "epoch": 1.3786407766990292, + "grad_norm": 0.5629568582132167, + "learning_rate": 4.89650722013344e-06, + "loss": 0.6788, + "step": 142 + }, + { + "epoch": 1.3883495145631068, + "grad_norm": 1.1483194850326925, + "learning_rate": 4.895054763253177e-06, + "loss": 1.0813, + "step": 143 + }, + { + "epoch": 1.3980582524271845, + "grad_norm": 0.7435969328117086, + "learning_rate": 4.8935924035713564e-06, + "loss": 0.8197, + "step": 144 + }, + { + "epoch": 1.4077669902912622, + "grad_norm": 1.317031074434914, + "learning_rate": 4.892120147134378e-06, + "loss": 1.0709, + "step": 145 + }, + { + "epoch": 1.4174757281553398, + "grad_norm": 0.5996482737135205, + "learning_rate": 4.8906380000295615e-06, + "loss": 0.7866, + "step": 146 + }, + { + "epoch": 1.4271844660194175, + "grad_norm": 0.7791569768007636, + "learning_rate": 4.889145968385121e-06, + "loss": 0.8691, + "step": 147 + }, + { + "epoch": 1.4368932038834952, + "grad_norm": 0.6501971847860233, + "learning_rate": 4.887644058370139e-06, + "loss": 0.6976, + "step": 148 + }, + { + "epoch": 1.4466019417475728, + "grad_norm": 0.6511216890119076, + "learning_rate": 4.886132276194544e-06, + "loss": 0.7491, + "step": 149 + }, + { + "epoch": 1.4563106796116505, + "grad_norm": 0.612023999998433, + "learning_rate": 4.884610628109082e-06, + "loss": 0.717, + "step": 150 + }, + { + "epoch": 1.4660194174757282, + "grad_norm": 0.7773151050365769, + "learning_rate": 4.883079120405292e-06, + "loss": 1.004, + "step": 151 + }, + { + "epoch": 1.4757281553398058, + "grad_norm": 0.6158372837580877, + "learning_rate": 4.881537759415478e-06, + "loss": 0.6869, + "step": 152 + }, + { + "epoch": 1.4854368932038835, + "grad_norm": 0.8120612151123343, + "learning_rate": 4.879986551512684e-06, + "loss": 0.9272, + "step": 153 + }, + { + "epoch": 1.4951456310679612, + "grad_norm": 0.9467728776526185, + "learning_rate": 4.878425503110672e-06, + "loss": 0.7134, + "step": 154 + }, + { + "epoch": 1.5048543689320388, + "grad_norm": 0.7268524122061516, + "learning_rate": 4.876854620663887e-06, + "loss": 0.9148, + "step": 155 + }, + { + "epoch": 1.5145631067961165, + "grad_norm": 0.5995550004105531, + "learning_rate": 4.875273910667434e-06, + "loss": 0.74, + "step": 156 + }, + { + "epoch": 1.5242718446601942, + "grad_norm": 0.8211820446136313, + "learning_rate": 4.873683379657057e-06, + "loss": 0.7739, + "step": 157 + }, + { + "epoch": 1.5339805825242718, + "grad_norm": 0.6823628164842023, + "learning_rate": 4.8720830342091015e-06, + "loss": 1.1488, + "step": 158 + }, + { + "epoch": 1.5436893203883495, + "grad_norm": 0.7014265251368315, + "learning_rate": 4.870472880940496e-06, + "loss": 0.9573, + "step": 159 + }, + { + "epoch": 1.5533980582524272, + "grad_norm": 0.8352178057977285, + "learning_rate": 4.868852926508721e-06, + "loss": 0.9985, + "step": 160 + }, + { + "epoch": 1.5631067961165048, + "grad_norm": 0.756687144256477, + "learning_rate": 4.867223177611779e-06, + "loss": 0.938, + "step": 161 + }, + { + "epoch": 1.5728155339805825, + "grad_norm": 0.704162668693477, + "learning_rate": 4.865583640988173e-06, + "loss": 0.7891, + "step": 162 + }, + { + "epoch": 1.5825242718446602, + "grad_norm": 0.5937940435523842, + "learning_rate": 4.863934323416871e-06, + "loss": 0.7824, + "step": 163 + }, + { + "epoch": 1.5922330097087378, + "grad_norm": 0.5548908895552046, + "learning_rate": 4.862275231717288e-06, + "loss": 0.7329, + "step": 164 + }, + { + "epoch": 1.6019417475728155, + "grad_norm": 0.8668497443441112, + "learning_rate": 4.860606372749247e-06, + "loss": 0.7515, + "step": 165 + }, + { + "epoch": 1.6116504854368932, + "grad_norm": 0.8678487798318208, + "learning_rate": 4.858927753412958e-06, + "loss": 0.9059, + "step": 166 + }, + { + "epoch": 1.6213592233009708, + "grad_norm": 0.6236208019957972, + "learning_rate": 4.857239380648985e-06, + "loss": 0.7846, + "step": 167 + }, + { + "epoch": 1.6310679611650487, + "grad_norm": 0.6093138120471663, + "learning_rate": 4.855541261438223e-06, + "loss": 0.8363, + "step": 168 + }, + { + "epoch": 1.6407766990291264, + "grad_norm": 0.6297044385352409, + "learning_rate": 4.8538334028018605e-06, + "loss": 1.0275, + "step": 169 + }, + { + "epoch": 1.650485436893204, + "grad_norm": 0.7705428538880952, + "learning_rate": 4.8521158118013605e-06, + "loss": 0.8634, + "step": 170 + }, + { + "epoch": 1.6601941747572817, + "grad_norm": 0.5714628211852456, + "learning_rate": 4.850388495538423e-06, + "loss": 0.7612, + "step": 171 + }, + { + "epoch": 1.6699029126213594, + "grad_norm": 0.6093620351916359, + "learning_rate": 4.84865146115496e-06, + "loss": 0.6787, + "step": 172 + }, + { + "epoch": 1.679611650485437, + "grad_norm": 0.6119153926928719, + "learning_rate": 4.846904715833066e-06, + "loss": 0.6522, + "step": 173 + }, + { + "epoch": 1.6893203883495147, + "grad_norm": 1.4628803952904057, + "learning_rate": 4.8451482667949836e-06, + "loss": 0.9146, + "step": 174 + }, + { + "epoch": 1.6990291262135924, + "grad_norm": 0.6382465121072494, + "learning_rate": 4.843382121303082e-06, + "loss": 0.7933, + "step": 175 + }, + { + "epoch": 1.70873786407767, + "grad_norm": 0.6449697173083225, + "learning_rate": 4.841606286659819e-06, + "loss": 0.7666, + "step": 176 + }, + { + "epoch": 1.7184466019417477, + "grad_norm": 0.5895577438347263, + "learning_rate": 4.839820770207714e-06, + "loss": 0.6859, + "step": 177 + }, + { + "epoch": 1.7281553398058254, + "grad_norm": 0.6104244057063786, + "learning_rate": 4.8380255793293195e-06, + "loss": 0.691, + "step": 178 + }, + { + "epoch": 1.737864077669903, + "grad_norm": 0.6257950437898444, + "learning_rate": 4.8362207214471864e-06, + "loss": 0.667, + "step": 179 + }, + { + "epoch": 1.7475728155339807, + "grad_norm": 0.6404597691271853, + "learning_rate": 4.83440620402384e-06, + "loss": 0.7931, + "step": 180 + }, + { + "epoch": 1.7572815533980584, + "grad_norm": 0.554547749031446, + "learning_rate": 4.832582034561738e-06, + "loss": 0.7692, + "step": 181 + }, + { + "epoch": 1.766990291262136, + "grad_norm": 0.8379042242449998, + "learning_rate": 4.830748220603251e-06, + "loss": 1.0003, + "step": 182 + }, + { + "epoch": 1.7766990291262137, + "grad_norm": 1.371164550147673, + "learning_rate": 4.828904769730628e-06, + "loss": 1.1004, + "step": 183 + }, + { + "epoch": 1.7864077669902914, + "grad_norm": 0.663561268446458, + "learning_rate": 4.827051689565958e-06, + "loss": 0.7922, + "step": 184 + }, + { + "epoch": 1.796116504854369, + "grad_norm": 0.6107817143669849, + "learning_rate": 4.825188987771149e-06, + "loss": 0.9067, + "step": 185 + }, + { + "epoch": 1.8058252427184467, + "grad_norm": 0.5868092910997027, + "learning_rate": 4.82331667204789e-06, + "loss": 0.5892, + "step": 186 + }, + { + "epoch": 1.8155339805825244, + "grad_norm": 0.6461919671595425, + "learning_rate": 4.821434750137619e-06, + "loss": 0.725, + "step": 187 + }, + { + "epoch": 1.825242718446602, + "grad_norm": 0.6489841825930465, + "learning_rate": 4.819543229821494e-06, + "loss": 0.8711, + "step": 188 + }, + { + "epoch": 1.8349514563106797, + "grad_norm": 0.5584447600264582, + "learning_rate": 4.8176421189203605e-06, + "loss": 0.6982, + "step": 189 + }, + { + "epoch": 1.8446601941747574, + "grad_norm": 0.5659976253732854, + "learning_rate": 4.815731425294716e-06, + "loss": 0.7654, + "step": 190 + }, + { + "epoch": 1.854368932038835, + "grad_norm": 0.7825930846614281, + "learning_rate": 4.813811156844681e-06, + "loss": 0.9132, + "step": 191 + }, + { + "epoch": 1.8640776699029127, + "grad_norm": 0.662682846727773, + "learning_rate": 4.811881321509964e-06, + "loss": 0.849, + "step": 192 + }, + { + "epoch": 1.8737864077669903, + "grad_norm": 0.610458993645023, + "learning_rate": 4.809941927269829e-06, + "loss": 1.0337, + "step": 193 + }, + { + "epoch": 1.883495145631068, + "grad_norm": 0.6547992835486613, + "learning_rate": 4.807992982143064e-06, + "loss": 0.7589, + "step": 194 + }, + { + "epoch": 1.8932038834951457, + "grad_norm": 0.7542898472365844, + "learning_rate": 4.806034494187949e-06, + "loss": 1.0626, + "step": 195 + }, + { + "epoch": 1.9029126213592233, + "grad_norm": 0.6344494281728731, + "learning_rate": 4.804066471502216e-06, + "loss": 0.8039, + "step": 196 + }, + { + "epoch": 1.912621359223301, + "grad_norm": 0.6136658573521839, + "learning_rate": 4.802088922223024e-06, + "loss": 0.6425, + "step": 197 + }, + { + "epoch": 1.9223300970873787, + "grad_norm": 0.6483657220529678, + "learning_rate": 4.80010185452692e-06, + "loss": 0.8784, + "step": 198 + }, + { + "epoch": 1.9320388349514563, + "grad_norm": 0.5420123808990978, + "learning_rate": 4.798105276629806e-06, + "loss": 0.6578, + "step": 199 + }, + { + "epoch": 1.941747572815534, + "grad_norm": 0.6687950517102932, + "learning_rate": 4.796099196786908e-06, + "loss": 0.785, + "step": 200 + }, + { + "epoch": 1.9514563106796117, + "grad_norm": 0.6178617434820718, + "learning_rate": 4.794083623292737e-06, + "loss": 0.9773, + "step": 201 + }, + { + "epoch": 1.9611650485436893, + "grad_norm": 0.6711763346625357, + "learning_rate": 4.792058564481058e-06, + "loss": 0.7296, + "step": 202 + }, + { + "epoch": 1.970873786407767, + "grad_norm": 0.6084143889062773, + "learning_rate": 4.7900240287248554e-06, + "loss": 0.8921, + "step": 203 + }, + { + "epoch": 1.9805825242718447, + "grad_norm": 0.7549301985138527, + "learning_rate": 4.7879800244362975e-06, + "loss": 0.9962, + "step": 204 + }, + { + "epoch": 1.9902912621359223, + "grad_norm": 0.6828205600145024, + "learning_rate": 4.785926560066703e-06, + "loss": 0.9536, + "step": 205 + }, + { + "epoch": 2.0, + "grad_norm": 0.7048432912651719, + "learning_rate": 4.783863644106502e-06, + "loss": 0.91, + "step": 206 + }, + { + "epoch": 2.0097087378640777, + "grad_norm": 0.5925003641609397, + "learning_rate": 4.781791285085209e-06, + "loss": 0.7574, + "step": 207 + }, + { + "epoch": 2.0194174757281553, + "grad_norm": 0.6183118556276791, + "learning_rate": 4.779709491571378e-06, + "loss": 0.7517, + "step": 208 + }, + { + "epoch": 2.029126213592233, + "grad_norm": 0.5513269000902425, + "learning_rate": 4.777618272172573e-06, + "loss": 0.8483, + "step": 209 + }, + { + "epoch": 2.0388349514563107, + "grad_norm": 0.6449371069347959, + "learning_rate": 4.775517635535332e-06, + "loss": 0.9983, + "step": 210 + }, + { + "epoch": 2.0485436893203883, + "grad_norm": 0.6510146500554277, + "learning_rate": 4.77340759034513e-06, + "loss": 0.943, + "step": 211 + }, + { + "epoch": 2.058252427184466, + "grad_norm": 0.5516833176176236, + "learning_rate": 4.771288145326343e-06, + "loss": 0.8697, + "step": 212 + }, + { + "epoch": 2.0679611650485437, + "grad_norm": 0.5372833778405927, + "learning_rate": 4.769159309242213e-06, + "loss": 0.6181, + "step": 213 + }, + { + "epoch": 2.0776699029126213, + "grad_norm": 0.6989081916523086, + "learning_rate": 4.767021090894809e-06, + "loss": 0.9862, + "step": 214 + }, + { + "epoch": 2.087378640776699, + "grad_norm": 0.5906461946301448, + "learning_rate": 4.764873499124997e-06, + "loss": 0.9271, + "step": 215 + }, + { + "epoch": 2.0970873786407767, + "grad_norm": 0.546362671281846, + "learning_rate": 4.762716542812395e-06, + "loss": 0.7597, + "step": 216 + }, + { + "epoch": 2.1067961165048543, + "grad_norm": 0.5766822546336634, + "learning_rate": 4.7605502308753415e-06, + "loss": 0.7774, + "step": 217 + }, + { + "epoch": 2.116504854368932, + "grad_norm": 0.6388672043671925, + "learning_rate": 4.758374572270859e-06, + "loss": 0.9155, + "step": 218 + }, + { + "epoch": 2.1262135922330097, + "grad_norm": 0.5728853545011949, + "learning_rate": 4.756189575994614e-06, + "loss": 0.6017, + "step": 219 + }, + { + "epoch": 2.1359223300970873, + "grad_norm": 0.5366888703406183, + "learning_rate": 4.753995251080884e-06, + "loss": 0.7425, + "step": 220 + }, + { + "epoch": 2.145631067961165, + "grad_norm": 1.3209362433112186, + "learning_rate": 4.7517916066025126e-06, + "loss": 0.6868, + "step": 221 + }, + { + "epoch": 2.1553398058252426, + "grad_norm": 0.6484738703855876, + "learning_rate": 4.7495786516708806e-06, + "loss": 0.9286, + "step": 222 + }, + { + "epoch": 2.1650485436893203, + "grad_norm": 0.6262395253661787, + "learning_rate": 4.747356395435865e-06, + "loss": 0.6069, + "step": 223 + }, + { + "epoch": 2.174757281553398, + "grad_norm": 0.6311571664234611, + "learning_rate": 4.745124847085799e-06, + "loss": 0.7718, + "step": 224 + }, + { + "epoch": 2.1844660194174756, + "grad_norm": 0.613078099794739, + "learning_rate": 4.742884015847436e-06, + "loss": 0.805, + "step": 225 + }, + { + "epoch": 2.1941747572815533, + "grad_norm": 0.5854928160805744, + "learning_rate": 4.740633910985911e-06, + "loss": 0.8084, + "step": 226 + }, + { + "epoch": 2.203883495145631, + "grad_norm": 0.5976089299186353, + "learning_rate": 4.738374541804704e-06, + "loss": 0.6863, + "step": 227 + }, + { + "epoch": 2.2135922330097086, + "grad_norm": 0.6014124521376234, + "learning_rate": 4.7361059176456e-06, + "loss": 0.7472, + "step": 228 + }, + { + "epoch": 2.2233009708737863, + "grad_norm": 0.5685151477620302, + "learning_rate": 4.733828047888647e-06, + "loss": 0.8648, + "step": 229 + }, + { + "epoch": 2.233009708737864, + "grad_norm": 0.5695477126283384, + "learning_rate": 4.731540941952126e-06, + "loss": 0.5924, + "step": 230 + }, + { + "epoch": 2.2427184466019416, + "grad_norm": 0.5651868559249316, + "learning_rate": 4.7292446092925016e-06, + "loss": 0.6548, + "step": 231 + }, + { + "epoch": 2.2524271844660193, + "grad_norm": 0.5710356100550151, + "learning_rate": 4.726939059404392e-06, + "loss": 0.6689, + "step": 232 + }, + { + "epoch": 2.262135922330097, + "grad_norm": 0.5613313610051237, + "learning_rate": 4.724624301820524e-06, + "loss": 0.673, + "step": 233 + }, + { + "epoch": 2.2718446601941746, + "grad_norm": 0.7004250430825069, + "learning_rate": 4.722300346111695e-06, + "loss": 0.8458, + "step": 234 + }, + { + "epoch": 2.2815533980582523, + "grad_norm": 0.6216105948838587, + "learning_rate": 4.719967201886734e-06, + "loss": 0.7305, + "step": 235 + }, + { + "epoch": 2.29126213592233, + "grad_norm": 0.6326118950665, + "learning_rate": 4.717624878792461e-06, + "loss": 0.9773, + "step": 236 + }, + { + "epoch": 2.3009708737864076, + "grad_norm": 0.7022071993385745, + "learning_rate": 4.715273386513651e-06, + "loss": 0.9996, + "step": 237 + }, + { + "epoch": 2.3106796116504853, + "grad_norm": 0.5723457282915828, + "learning_rate": 4.712912734772988e-06, + "loss": 0.7054, + "step": 238 + }, + { + "epoch": 2.320388349514563, + "grad_norm": 0.6945599167433392, + "learning_rate": 4.710542933331025e-06, + "loss": 0.8439, + "step": 239 + }, + { + "epoch": 2.3300970873786406, + "grad_norm": 0.6298659607215348, + "learning_rate": 4.708163991986152e-06, + "loss": 0.9141, + "step": 240 + }, + { + "epoch": 2.3398058252427183, + "grad_norm": 0.5466796831126433, + "learning_rate": 4.705775920574546e-06, + "loss": 0.8445, + "step": 241 + }, + { + "epoch": 2.349514563106796, + "grad_norm": 0.8906548403954916, + "learning_rate": 4.703378728970134e-06, + "loss": 0.6775, + "step": 242 + }, + { + "epoch": 2.3592233009708736, + "grad_norm": 0.5453159831751623, + "learning_rate": 4.700972427084551e-06, + "loss": 0.8023, + "step": 243 + }, + { + "epoch": 2.3689320388349513, + "grad_norm": 0.6086212250840284, + "learning_rate": 4.698557024867105e-06, + "loss": 0.6731, + "step": 244 + }, + { + "epoch": 2.378640776699029, + "grad_norm": 0.5633438402824648, + "learning_rate": 4.696132532304727e-06, + "loss": 0.8508, + "step": 245 + }, + { + "epoch": 2.3883495145631066, + "grad_norm": 0.5713702706197664, + "learning_rate": 4.693698959421935e-06, + "loss": 0.7985, + "step": 246 + }, + { + "epoch": 2.3980582524271843, + "grad_norm": 0.7944485374871474, + "learning_rate": 4.691256316280789e-06, + "loss": 1.0714, + "step": 247 + }, + { + "epoch": 2.407766990291262, + "grad_norm": 0.7146714273823804, + "learning_rate": 4.688804612980855e-06, + "loss": 0.7486, + "step": 248 + }, + { + "epoch": 2.4174757281553396, + "grad_norm": 0.5620249512602807, + "learning_rate": 4.686343859659158e-06, + "loss": 0.9298, + "step": 249 + }, + { + "epoch": 2.4271844660194173, + "grad_norm": 0.573615588141239, + "learning_rate": 4.683874066490143e-06, + "loss": 0.7426, + "step": 250 + }, + { + "epoch": 2.436893203883495, + "grad_norm": 0.5949978059155007, + "learning_rate": 4.681395243685631e-06, + "loss": 0.7588, + "step": 251 + }, + { + "epoch": 2.4466019417475726, + "grad_norm": 0.5548882698312756, + "learning_rate": 4.67890740149478e-06, + "loss": 0.6808, + "step": 252 + }, + { + "epoch": 2.4563106796116507, + "grad_norm": 0.5998026064624129, + "learning_rate": 4.676410550204036e-06, + "loss": 0.9237, + "step": 253 + }, + { + "epoch": 2.466019417475728, + "grad_norm": 0.5343256980470983, + "learning_rate": 4.673904700137098e-06, + "loss": 0.607, + "step": 254 + }, + { + "epoch": 2.475728155339806, + "grad_norm": 0.5129919168354066, + "learning_rate": 4.671389861654873e-06, + "loss": 0.6515, + "step": 255 + }, + { + "epoch": 2.4854368932038833, + "grad_norm": 0.5256394449847143, + "learning_rate": 4.668866045155428e-06, + "loss": 0.7839, + "step": 256 + }, + { + "epoch": 2.4951456310679614, + "grad_norm": 0.6664881549664314, + "learning_rate": 4.666333261073956e-06, + "loss": 0.8259, + "step": 257 + }, + { + "epoch": 2.5048543689320386, + "grad_norm": 0.5878066563376306, + "learning_rate": 4.6637915198827265e-06, + "loss": 0.8741, + "step": 258 + }, + { + "epoch": 2.5145631067961167, + "grad_norm": 0.5762944457977316, + "learning_rate": 4.661240832091042e-06, + "loss": 0.719, + "step": 259 + }, + { + "epoch": 2.524271844660194, + "grad_norm": 0.635760206889188, + "learning_rate": 4.658681208245198e-06, + "loss": 0.8797, + "step": 260 + }, + { + "epoch": 2.533980582524272, + "grad_norm": 0.5296771268285247, + "learning_rate": 4.65611265892844e-06, + "loss": 0.7687, + "step": 261 + }, + { + "epoch": 2.5436893203883493, + "grad_norm": 0.5975151352963404, + "learning_rate": 4.653535194760912e-06, + "loss": 0.7371, + "step": 262 + }, + { + "epoch": 2.5533980582524274, + "grad_norm": 0.5669500366203135, + "learning_rate": 4.650948826399624e-06, + "loss": 0.5689, + "step": 263 + }, + { + "epoch": 2.5631067961165046, + "grad_norm": 0.5800778206254651, + "learning_rate": 4.648353564538397e-06, + "loss": 0.7246, + "step": 264 + }, + { + "epoch": 2.5728155339805827, + "grad_norm": 0.6321546100365262, + "learning_rate": 4.645749419907829e-06, + "loss": 0.7167, + "step": 265 + }, + { + "epoch": 2.58252427184466, + "grad_norm": 0.5319661367490109, + "learning_rate": 4.64313640327524e-06, + "loss": 0.5765, + "step": 266 + }, + { + "epoch": 2.592233009708738, + "grad_norm": 0.6013985476971264, + "learning_rate": 4.640514525444637e-06, + "loss": 0.8056, + "step": 267 + }, + { + "epoch": 2.6019417475728153, + "grad_norm": 0.6212321355444684, + "learning_rate": 4.637883797256663e-06, + "loss": 1.0431, + "step": 268 + }, + { + "epoch": 2.6116504854368934, + "grad_norm": 0.5534158811339417, + "learning_rate": 4.635244229588558e-06, + "loss": 0.7036, + "step": 269 + }, + { + "epoch": 2.6213592233009706, + "grad_norm": 0.6415745872801252, + "learning_rate": 4.632595833354105e-06, + "loss": 0.8674, + "step": 270 + }, + { + "epoch": 2.6310679611650487, + "grad_norm": 0.6285131287505951, + "learning_rate": 4.629938619503593e-06, + "loss": 0.7088, + "step": 271 + }, + { + "epoch": 2.6407766990291264, + "grad_norm": 0.5291268994493087, + "learning_rate": 4.627272599023772e-06, + "loss": 0.8775, + "step": 272 + }, + { + "epoch": 2.650485436893204, + "grad_norm": 0.5852978169428409, + "learning_rate": 4.6245977829378e-06, + "loss": 1.0301, + "step": 273 + }, + { + "epoch": 2.6601941747572817, + "grad_norm": 0.5619590148291557, + "learning_rate": 4.6219141823052035e-06, + "loss": 0.655, + "step": 274 + }, + { + "epoch": 2.6699029126213594, + "grad_norm": 0.586715896019821, + "learning_rate": 4.619221808221833e-06, + "loss": 0.9604, + "step": 275 + }, + { + "epoch": 2.679611650485437, + "grad_norm": 0.5855513689539318, + "learning_rate": 4.616520671819812e-06, + "loss": 0.6424, + "step": 276 + }, + { + "epoch": 2.6893203883495147, + "grad_norm": 0.647039460728171, + "learning_rate": 4.613810784267492e-06, + "loss": 0.8329, + "step": 277 + }, + { + "epoch": 2.6990291262135924, + "grad_norm": 0.5154279265816364, + "learning_rate": 4.61109215676941e-06, + "loss": 0.588, + "step": 278 + }, + { + "epoch": 2.70873786407767, + "grad_norm": 0.5832083686660431, + "learning_rate": 4.608364800566241e-06, + "loss": 0.7689, + "step": 279 + }, + { + "epoch": 2.7184466019417477, + "grad_norm": 0.5884877738799021, + "learning_rate": 4.605628726934747e-06, + "loss": 0.8834, + "step": 280 + }, + { + "epoch": 2.7281553398058254, + "grad_norm": 0.5126503459961328, + "learning_rate": 4.602883947187738e-06, + "loss": 0.7756, + "step": 281 + }, + { + "epoch": 2.737864077669903, + "grad_norm": 0.5509578989929416, + "learning_rate": 4.600130472674017e-06, + "loss": 0.63, + "step": 282 + }, + { + "epoch": 2.7475728155339807, + "grad_norm": 0.5937802430842546, + "learning_rate": 4.5973683147783405e-06, + "loss": 0.9692, + "step": 283 + }, + { + "epoch": 2.7572815533980584, + "grad_norm": 0.8437616641968025, + "learning_rate": 4.594597484921365e-06, + "loss": 0.7999, + "step": 284 + }, + { + "epoch": 2.766990291262136, + "grad_norm": 0.5815733690670376, + "learning_rate": 4.5918179945596055e-06, + "loss": 0.7197, + "step": 285 + }, + { + "epoch": 2.7766990291262137, + "grad_norm": 0.6766231348468958, + "learning_rate": 4.589029855185384e-06, + "loss": 0.8009, + "step": 286 + }, + { + "epoch": 2.7864077669902914, + "grad_norm": 0.6016607485166134, + "learning_rate": 4.586233078326785e-06, + "loss": 1.055, + "step": 287 + }, + { + "epoch": 2.796116504854369, + "grad_norm": 0.5385528216529567, + "learning_rate": 4.583427675547602e-06, + "loss": 0.8221, + "step": 288 + }, + { + "epoch": 2.8058252427184467, + "grad_norm": 0.5993566648276473, + "learning_rate": 4.580613658447301e-06, + "loss": 0.938, + "step": 289 + }, + { + "epoch": 2.8155339805825244, + "grad_norm": 0.5912484848249645, + "learning_rate": 4.577791038660959e-06, + "loss": 0.6534, + "step": 290 + }, + { + "epoch": 2.825242718446602, + "grad_norm": 0.5558723739620934, + "learning_rate": 4.574959827859226e-06, + "loss": 0.6328, + "step": 291 + }, + { + "epoch": 2.8349514563106797, + "grad_norm": 0.5215220174359476, + "learning_rate": 4.572120037748273e-06, + "loss": 0.6078, + "step": 292 + }, + { + "epoch": 2.8446601941747574, + "grad_norm": 0.5578352792075171, + "learning_rate": 4.5692716800697415e-06, + "loss": 0.7983, + "step": 293 + }, + { + "epoch": 2.854368932038835, + "grad_norm": 0.5631763967068151, + "learning_rate": 4.566414766600698e-06, + "loss": 0.7932, + "step": 294 + }, + { + "epoch": 2.8640776699029127, + "grad_norm": 0.5467170630989897, + "learning_rate": 4.563549309153589e-06, + "loss": 0.6513, + "step": 295 + }, + { + "epoch": 2.8737864077669903, + "grad_norm": 0.6087442326908157, + "learning_rate": 4.56067531957618e-06, + "loss": 0.7059, + "step": 296 + }, + { + "epoch": 2.883495145631068, + "grad_norm": 0.6345964624606057, + "learning_rate": 4.557792809751519e-06, + "loss": 0.8497, + "step": 297 + }, + { + "epoch": 2.8932038834951457, + "grad_norm": 0.6421474826199707, + "learning_rate": 4.554901791597883e-06, + "loss": 0.754, + "step": 298 + }, + { + "epoch": 2.9029126213592233, + "grad_norm": 0.5611662286254485, + "learning_rate": 4.552002277068725e-06, + "loss": 0.7587, + "step": 299 + }, + { + "epoch": 2.912621359223301, + "grad_norm": 0.6356794525355608, + "learning_rate": 4.549094278152631e-06, + "loss": 0.8163, + "step": 300 + }, + { + "epoch": 2.9223300970873787, + "grad_norm": 0.6336849057165008, + "learning_rate": 4.546177806873266e-06, + "loss": 0.682, + "step": 301 + }, + { + "epoch": 2.9320388349514563, + "grad_norm": 0.6059295762410952, + "learning_rate": 4.543252875289326e-06, + "loss": 0.6621, + "step": 302 + }, + { + "epoch": 2.941747572815534, + "grad_norm": 0.5887024077749718, + "learning_rate": 4.540319495494486e-06, + "loss": 0.7904, + "step": 303 + }, + { + "epoch": 2.9514563106796117, + "grad_norm": 0.6053171571638954, + "learning_rate": 4.537377679617353e-06, + "loss": 0.8159, + "step": 304 + }, + { + "epoch": 2.9611650485436893, + "grad_norm": 0.6162643989939205, + "learning_rate": 4.534427439821416e-06, + "loss": 0.7054, + "step": 305 + }, + { + "epoch": 2.970873786407767, + "grad_norm": 0.5653687306702669, + "learning_rate": 4.531468788304992e-06, + "loss": 0.6473, + "step": 306 + }, + { + "epoch": 2.9805825242718447, + "grad_norm": 0.6147243558106896, + "learning_rate": 4.5285017373011784e-06, + "loss": 0.6811, + "step": 307 + }, + { + "epoch": 2.9902912621359223, + "grad_norm": 0.5023058479913145, + "learning_rate": 4.5255262990778024e-06, + "loss": 0.5666, + "step": 308 + }, + { + "epoch": 3.0, + "grad_norm": 0.5339774541368152, + "learning_rate": 4.522542485937369e-06, + "loss": 0.7024, + "step": 309 + }, + { + "epoch": 3.0097087378640777, + "grad_norm": 0.5871214604141014, + "learning_rate": 4.519550310217013e-06, + "loss": 0.792, + "step": 310 + }, + { + "epoch": 3.0194174757281553, + "grad_norm": 0.550905273830553, + "learning_rate": 4.516549784288442e-06, + "loss": 0.6801, + "step": 311 + }, + { + "epoch": 3.029126213592233, + "grad_norm": 0.5295468316460928, + "learning_rate": 4.513540920557892e-06, + "loss": 0.6589, + "step": 312 + }, + { + "epoch": 3.0388349514563107, + "grad_norm": 0.5818378095787248, + "learning_rate": 4.510523731466072e-06, + "loss": 0.9006, + "step": 313 + }, + { + "epoch": 3.0485436893203883, + "grad_norm": 0.53839185417949, + "learning_rate": 4.507498229488116e-06, + "loss": 0.5935, + "step": 314 + }, + { + "epoch": 3.058252427184466, + "grad_norm": 0.5162101468182391, + "learning_rate": 4.504464427133527e-06, + "loss": 0.6615, + "step": 315 + }, + { + "epoch": 3.0679611650485437, + "grad_norm": 0.5627252995451077, + "learning_rate": 4.501422336946126e-06, + "loss": 0.6778, + "step": 316 + }, + { + "epoch": 3.0776699029126213, + "grad_norm": 0.5499924181430055, + "learning_rate": 4.498371971504005e-06, + "loss": 0.6824, + "step": 317 + }, + { + "epoch": 3.087378640776699, + "grad_norm": 0.5313000147551927, + "learning_rate": 4.49531334341947e-06, + "loss": 0.6953, + "step": 318 + }, + { + "epoch": 3.0970873786407767, + "grad_norm": 0.5053678892953669, + "learning_rate": 4.49224646533899e-06, + "loss": 0.624, + "step": 319 + }, + { + "epoch": 3.1067961165048543, + "grad_norm": 0.5607765091857444, + "learning_rate": 4.489171349943144e-06, + "loss": 0.6887, + "step": 320 + }, + { + "epoch": 3.116504854368932, + "grad_norm": 0.49971275746543925, + "learning_rate": 4.486088009946575e-06, + "loss": 0.599, + "step": 321 + }, + { + "epoch": 3.1262135922330097, + "grad_norm": 0.5463448565404533, + "learning_rate": 4.482996458097926e-06, + "loss": 0.7412, + "step": 322 + }, + { + "epoch": 3.1359223300970873, + "grad_norm": 0.5577175615372468, + "learning_rate": 4.479896707179796e-06, + "loss": 0.9504, + "step": 323 + }, + { + "epoch": 3.145631067961165, + "grad_norm": 0.739795464222875, + "learning_rate": 4.476788770008685e-06, + "loss": 0.9296, + "step": 324 + }, + { + "epoch": 3.1553398058252426, + "grad_norm": 0.5860377779118467, + "learning_rate": 4.473672659434941e-06, + "loss": 0.8433, + "step": 325 + }, + { + "epoch": 3.1650485436893203, + "grad_norm": 0.5234329633422101, + "learning_rate": 4.470548388342704e-06, + "loss": 0.7058, + "step": 326 + }, + { + "epoch": 3.174757281553398, + "grad_norm": 0.5307383652263797, + "learning_rate": 4.467415969649858e-06, + "loss": 0.7462, + "step": 327 + }, + { + "epoch": 3.1844660194174756, + "grad_norm": 0.5283773909342946, + "learning_rate": 4.464275416307973e-06, + "loss": 0.7497, + "step": 328 + }, + { + "epoch": 3.1941747572815533, + "grad_norm": 0.5706683381161813, + "learning_rate": 4.461126741302253e-06, + "loss": 0.7677, + "step": 329 + }, + { + "epoch": 3.203883495145631, + "grad_norm": 0.568157264873081, + "learning_rate": 4.457969957651485e-06, + "loss": 0.6759, + "step": 330 + }, + { + "epoch": 3.2135922330097086, + "grad_norm": 0.5665376751141662, + "learning_rate": 4.454805078407979e-06, + "loss": 0.9144, + "step": 331 + }, + { + "epoch": 3.2233009708737863, + "grad_norm": 0.5146029291558777, + "learning_rate": 4.451632116657521e-06, + "loss": 0.6786, + "step": 332 + }, + { + "epoch": 3.233009708737864, + "grad_norm": 0.4670736553738834, + "learning_rate": 4.448451085519314e-06, + "loss": 0.5646, + "step": 333 + }, + { + "epoch": 3.2427184466019416, + "grad_norm": 0.5983555089379993, + "learning_rate": 4.445261998145927e-06, + "loss": 0.7486, + "step": 334 + }, + { + "epoch": 3.2524271844660193, + "grad_norm": 0.5247722963016707, + "learning_rate": 4.442064867723236e-06, + "loss": 0.7228, + "step": 335 + }, + { + "epoch": 3.262135922330097, + "grad_norm": 0.5830371706199331, + "learning_rate": 4.438859707470376e-06, + "loss": 0.7545, + "step": 336 + }, + { + "epoch": 3.2718446601941746, + "grad_norm": 0.569317200535919, + "learning_rate": 4.435646530639679e-06, + "loss": 0.6446, + "step": 337 + }, + { + "epoch": 3.2815533980582523, + "grad_norm": 0.5229847762890999, + "learning_rate": 4.432425350516627e-06, + "loss": 0.7016, + "step": 338 + }, + { + "epoch": 3.29126213592233, + "grad_norm": 0.5556853198137727, + "learning_rate": 4.42919618041979e-06, + "loss": 0.581, + "step": 339 + }, + { + "epoch": 3.3009708737864076, + "grad_norm": 0.5504701249347626, + "learning_rate": 4.425959033700776e-06, + "loss": 0.4669, + "step": 340 + }, + { + "epoch": 3.3106796116504853, + "grad_norm": 0.5733436063811748, + "learning_rate": 4.422713923744174e-06, + "loss": 0.7609, + "step": 341 + }, + { + "epoch": 3.320388349514563, + "grad_norm": 0.5712264087934034, + "learning_rate": 4.419460863967496e-06, + "loss": 0.796, + "step": 342 + }, + { + "epoch": 3.3300970873786406, + "grad_norm": 0.5298396914849735, + "learning_rate": 4.416199867821126e-06, + "loss": 0.7347, + "step": 343 + }, + { + "epoch": 3.3398058252427183, + "grad_norm": 0.6167638285377818, + "learning_rate": 4.412930948788263e-06, + "loss": 0.7213, + "step": 344 + }, + { + "epoch": 3.349514563106796, + "grad_norm": 0.6769548528256155, + "learning_rate": 4.409654120384863e-06, + "loss": 0.9705, + "step": 345 + }, + { + "epoch": 3.3592233009708736, + "grad_norm": 0.5558234569754197, + "learning_rate": 4.406369396159585e-06, + "loss": 0.866, + "step": 346 + }, + { + "epoch": 3.3689320388349513, + "grad_norm": 0.5683210417700995, + "learning_rate": 4.403076789693735e-06, + "loss": 0.8529, + "step": 347 + }, + { + "epoch": 3.378640776699029, + "grad_norm": 0.6004152868285685, + "learning_rate": 4.399776314601212e-06, + "loss": 0.6924, + "step": 348 + }, + { + "epoch": 3.3883495145631066, + "grad_norm": 0.5915756102782764, + "learning_rate": 4.396467984528445e-06, + "loss": 0.809, + "step": 349 + }, + { + "epoch": 3.3980582524271843, + "grad_norm": 0.5726590426952238, + "learning_rate": 4.393151813154345e-06, + "loss": 0.6593, + "step": 350 + }, + { + "epoch": 3.407766990291262, + "grad_norm": 0.5071226489196179, + "learning_rate": 4.3898278141902396e-06, + "loss": 0.6473, + "step": 351 + }, + { + "epoch": 3.4174757281553396, + "grad_norm": 0.5733646491001667, + "learning_rate": 4.386496001379826e-06, + "loss": 0.8021, + "step": 352 + }, + { + "epoch": 3.4271844660194173, + "grad_norm": 0.5616644007222712, + "learning_rate": 4.383156388499106e-06, + "loss": 0.7276, + "step": 353 + }, + { + "epoch": 3.436893203883495, + "grad_norm": 0.5270931676906139, + "learning_rate": 4.3798089893563335e-06, + "loss": 0.6511, + "step": 354 + }, + { + "epoch": 3.4466019417475726, + "grad_norm": 0.5956176073578046, + "learning_rate": 4.3764538177919555e-06, + "loss": 0.5386, + "step": 355 + }, + { + "epoch": 3.4563106796116507, + "grad_norm": 0.6087361244099403, + "learning_rate": 4.3730908876785574e-06, + "loss": 0.7574, + "step": 356 + }, + { + "epoch": 3.466019417475728, + "grad_norm": 0.6124986417248809, + "learning_rate": 4.3697202129208e-06, + "loss": 0.797, + "step": 357 + }, + { + "epoch": 3.475728155339806, + "grad_norm": 0.5233985914467231, + "learning_rate": 4.36634180745537e-06, + "loss": 0.5764, + "step": 358 + }, + { + "epoch": 3.4854368932038833, + "grad_norm": 0.5590564532881283, + "learning_rate": 4.3629556852509145e-06, + "loss": 0.9866, + "step": 359 + }, + { + "epoch": 3.4951456310679614, + "grad_norm": 0.5100762691051512, + "learning_rate": 4.35956186030799e-06, + "loss": 0.6127, + "step": 360 + }, + { + "epoch": 3.5048543689320386, + "grad_norm": 0.6118806932618559, + "learning_rate": 4.356160346659001e-06, + "loss": 0.7548, + "step": 361 + }, + { + "epoch": 3.5145631067961167, + "grad_norm": 0.6631159487479685, + "learning_rate": 4.3527511583681384e-06, + "loss": 1.2605, + "step": 362 + }, + { + "epoch": 3.524271844660194, + "grad_norm": 0.6075337136532736, + "learning_rate": 4.34933430953133e-06, + "loss": 0.7971, + "step": 363 + }, + { + "epoch": 3.533980582524272, + "grad_norm": 0.6085963816816855, + "learning_rate": 4.345909814276177e-06, + "loss": 0.8115, + "step": 364 + }, + { + "epoch": 3.5436893203883493, + "grad_norm": 0.6489049958725044, + "learning_rate": 4.3424776867618935e-06, + "loss": 0.8511, + "step": 365 + }, + { + "epoch": 3.5533980582524274, + "grad_norm": 0.5533454922999531, + "learning_rate": 4.339037941179253e-06, + "loss": 0.7739, + "step": 366 + }, + { + "epoch": 3.5631067961165046, + "grad_norm": 0.5324277334095654, + "learning_rate": 4.335590591750526e-06, + "loss": 0.6825, + "step": 367 + }, + { + "epoch": 3.5728155339805827, + "grad_norm": 0.583139527089237, + "learning_rate": 4.332135652729423e-06, + "loss": 0.6584, + "step": 368 + }, + { + "epoch": 3.58252427184466, + "grad_norm": 0.5359360883573963, + "learning_rate": 4.328673138401036e-06, + "loss": 0.7771, + "step": 369 + }, + { + "epoch": 3.592233009708738, + "grad_norm": 0.5374666185272033, + "learning_rate": 4.325203063081776e-06, + "loss": 0.4958, + "step": 370 + }, + { + "epoch": 3.6019417475728153, + "grad_norm": 0.5182456153897488, + "learning_rate": 4.32172544111932e-06, + "loss": 0.6043, + "step": 371 + }, + { + "epoch": 3.6116504854368934, + "grad_norm": 0.5463365868711481, + "learning_rate": 4.318240286892544e-06, + "loss": 0.67, + "step": 372 + }, + { + "epoch": 3.6213592233009706, + "grad_norm": 0.5832479071806481, + "learning_rate": 4.314747614811471e-06, + "loss": 0.7424, + "step": 373 + }, + { + "epoch": 3.6310679611650487, + "grad_norm": 0.5054355197048168, + "learning_rate": 4.3112474393172055e-06, + "loss": 0.5487, + "step": 374 + }, + { + "epoch": 3.6407766990291264, + "grad_norm": 0.5382871622608469, + "learning_rate": 4.307739774881878e-06, + "loss": 0.6772, + "step": 375 + }, + { + "epoch": 3.650485436893204, + "grad_norm": 0.5458477824608565, + "learning_rate": 4.304224636008582e-06, + "loss": 0.5329, + "step": 376 + }, + { + "epoch": 3.6601941747572817, + "grad_norm": 0.5435683453057218, + "learning_rate": 4.300702037231318e-06, + "loss": 0.8113, + "step": 377 + }, + { + "epoch": 3.6699029126213594, + "grad_norm": 0.5383065920679537, + "learning_rate": 4.297171993114927e-06, + "loss": 0.5284, + "step": 378 + }, + { + "epoch": 3.679611650485437, + "grad_norm": 0.5880433935588353, + "learning_rate": 4.2936345182550365e-06, + "loss": 0.8539, + "step": 379 + }, + { + "epoch": 3.6893203883495147, + "grad_norm": 0.5952587100488826, + "learning_rate": 4.290089627277998e-06, + "loss": 0.5857, + "step": 380 + }, + { + "epoch": 3.6990291262135924, + "grad_norm": 0.5286616194694634, + "learning_rate": 4.286537334840825e-06, + "loss": 0.7462, + "step": 381 + }, + { + "epoch": 3.70873786407767, + "grad_norm": 0.6011010828422011, + "learning_rate": 4.2829776556311355e-06, + "loss": 0.6581, + "step": 382 + }, + { + "epoch": 3.7184466019417477, + "grad_norm": 0.59061291418775, + "learning_rate": 4.279410604367088e-06, + "loss": 0.8607, + "step": 383 + }, + { + "epoch": 3.7281553398058254, + "grad_norm": 0.6006128694830518, + "learning_rate": 4.275836195797323e-06, + "loss": 0.8253, + "step": 384 + }, + { + "epoch": 3.737864077669903, + "grad_norm": 0.559500801329847, + "learning_rate": 4.2722544447008995e-06, + "loss": 0.8451, + "step": 385 + }, + { + "epoch": 3.7475728155339807, + "grad_norm": 0.5834501093822622, + "learning_rate": 4.268665365887238e-06, + "loss": 0.8882, + "step": 386 + }, + { + "epoch": 3.7572815533980584, + "grad_norm": 0.5043323998416107, + "learning_rate": 4.265068974196056e-06, + "loss": 0.5369, + "step": 387 + }, + { + "epoch": 3.766990291262136, + "grad_norm": 0.5727251303865297, + "learning_rate": 4.261465284497307e-06, + "loss": 0.6106, + "step": 388 + }, + { + "epoch": 3.7766990291262137, + "grad_norm": 0.5372256278502759, + "learning_rate": 4.257854311691118e-06, + "loss": 0.7688, + "step": 389 + }, + { + "epoch": 3.7864077669902914, + "grad_norm": 0.53361182265545, + "learning_rate": 4.254236070707734e-06, + "loss": 0.6263, + "step": 390 + }, + { + "epoch": 3.796116504854369, + "grad_norm": 0.4761706762756821, + "learning_rate": 4.250610576507445e-06, + "loss": 0.5494, + "step": 391 + }, + { + "epoch": 3.8058252427184467, + "grad_norm": 0.5667551282882679, + "learning_rate": 4.246977844080537e-06, + "loss": 0.7841, + "step": 392 + }, + { + "epoch": 3.8155339805825244, + "grad_norm": 0.5581465960923723, + "learning_rate": 4.24333788844722e-06, + "loss": 0.727, + "step": 393 + }, + { + "epoch": 3.825242718446602, + "grad_norm": 0.641698316355312, + "learning_rate": 4.239690724657571e-06, + "loss": 0.7415, + "step": 394 + }, + { + "epoch": 3.8349514563106797, + "grad_norm": 0.5629852857617874, + "learning_rate": 4.236036367791471e-06, + "loss": 0.7685, + "step": 395 + }, + { + "epoch": 3.8446601941747574, + "grad_norm": 0.5910867032740321, + "learning_rate": 4.23237483295854e-06, + "loss": 0.797, + "step": 396 + }, + { + "epoch": 3.854368932038835, + "grad_norm": 0.5182753315022374, + "learning_rate": 4.228706135298081e-06, + "loss": 0.5786, + "step": 397 + }, + { + "epoch": 3.8640776699029127, + "grad_norm": 0.545028212220101, + "learning_rate": 4.225030289979006e-06, + "loss": 0.5823, + "step": 398 + }, + { + "epoch": 3.8737864077669903, + "grad_norm": 0.5228141064601322, + "learning_rate": 4.221347312199788e-06, + "loss": 0.5436, + "step": 399 + }, + { + "epoch": 3.883495145631068, + "grad_norm": 0.5397693619666596, + "learning_rate": 4.2176572171883865e-06, + "loss": 0.7937, + "step": 400 + }, + { + "epoch": 3.8932038834951457, + "grad_norm": 0.6886419465953945, + "learning_rate": 4.213960020202187e-06, + "loss": 0.8116, + "step": 401 + }, + { + "epoch": 3.9029126213592233, + "grad_norm": 0.5900622648932935, + "learning_rate": 4.2102557365279435e-06, + "loss": 0.917, + "step": 402 + }, + { + "epoch": 3.912621359223301, + "grad_norm": 0.544861080103798, + "learning_rate": 4.206544381481708e-06, + "loss": 0.8025, + "step": 403 + }, + { + "epoch": 3.9223300970873787, + "grad_norm": 0.518875885292913, + "learning_rate": 4.202825970408772e-06, + "loss": 0.5702, + "step": 404 + }, + { + "epoch": 3.9320388349514563, + "grad_norm": 0.656540638196741, + "learning_rate": 4.199100518683601e-06, + "loss": 0.6321, + "step": 405 + }, + { + "epoch": 3.941747572815534, + "grad_norm": 0.5191060858406762, + "learning_rate": 4.195368041709772e-06, + "loss": 0.7449, + "step": 406 + }, + { + "epoch": 3.9514563106796117, + "grad_norm": 0.4946680296221446, + "learning_rate": 4.191628554919907e-06, + "loss": 0.6497, + "step": 407 + }, + { + "epoch": 3.9611650485436893, + "grad_norm": 0.5824785363442938, + "learning_rate": 4.187882073775615e-06, + "loss": 0.7381, + "step": 408 + }, + { + "epoch": 3.970873786407767, + "grad_norm": 0.5193631938328553, + "learning_rate": 4.184128613767422e-06, + "loss": 0.6982, + "step": 409 + }, + { + "epoch": 3.9805825242718447, + "grad_norm": 0.5290930959018318, + "learning_rate": 4.18036819041471e-06, + "loss": 0.6859, + "step": 410 + }, + { + "epoch": 3.9902912621359223, + "grad_norm": 0.5272372038733293, + "learning_rate": 4.17660081926565e-06, + "loss": 0.5822, + "step": 411 + }, + { + "epoch": 4.0, + "grad_norm": 0.5630958714803344, + "learning_rate": 4.172826515897146e-06, + "loss": 0.6599, + "step": 412 + }, + { + "epoch": 4.009708737864078, + "grad_norm": 0.5200333011009248, + "learning_rate": 4.169045295914757e-06, + "loss": 0.7889, + "step": 413 + }, + { + "epoch": 4.019417475728155, + "grad_norm": 0.5331918145560857, + "learning_rate": 4.165257174952647e-06, + "loss": 0.5669, + "step": 414 + }, + { + "epoch": 4.029126213592233, + "grad_norm": 0.4838073648864831, + "learning_rate": 4.161462168673508e-06, + "loss": 0.6616, + "step": 415 + }, + { + "epoch": 4.038834951456311, + "grad_norm": 0.5285061889577369, + "learning_rate": 4.157660292768502e-06, + "loss": 0.7826, + "step": 416 + }, + { + "epoch": 4.048543689320389, + "grad_norm": 0.47876226674574307, + "learning_rate": 4.1538515629571985e-06, + "loss": 0.6324, + "step": 417 + }, + { + "epoch": 4.058252427184466, + "grad_norm": 0.4964106466439782, + "learning_rate": 4.1500359949875e-06, + "loss": 0.7882, + "step": 418 + }, + { + "epoch": 4.067961165048544, + "grad_norm": 0.5098927521794874, + "learning_rate": 4.1462136046355864e-06, + "loss": 0.679, + "step": 419 + }, + { + "epoch": 4.077669902912621, + "grad_norm": 0.5261711579674679, + "learning_rate": 4.142384407705846e-06, + "loss": 0.9621, + "step": 420 + }, + { + "epoch": 4.087378640776699, + "grad_norm": 0.4855524472712664, + "learning_rate": 4.138548420030808e-06, + "loss": 0.5577, + "step": 421 + }, + { + "epoch": 4.097087378640777, + "grad_norm": 0.46404425362535634, + "learning_rate": 4.13470565747108e-06, + "loss": 0.727, + "step": 422 + }, + { + "epoch": 4.106796116504855, + "grad_norm": 0.5013144099880142, + "learning_rate": 4.130856135915282e-06, + "loss": 0.7378, + "step": 423 + }, + { + "epoch": 4.116504854368932, + "grad_norm": 0.5316059543260906, + "learning_rate": 4.126999871279982e-06, + "loss": 0.6906, + "step": 424 + }, + { + "epoch": 4.12621359223301, + "grad_norm": 0.4944636869938843, + "learning_rate": 4.123136879509626e-06, + "loss": 0.6641, + "step": 425 + }, + { + "epoch": 4.135922330097087, + "grad_norm": 0.5095570582691661, + "learning_rate": 4.119267176576475e-06, + "loss": 0.7491, + "step": 426 + }, + { + "epoch": 4.145631067961165, + "grad_norm": 0.54005610895594, + "learning_rate": 4.11539077848054e-06, + "loss": 0.7558, + "step": 427 + }, + { + "epoch": 4.155339805825243, + "grad_norm": 0.5923182555591768, + "learning_rate": 4.111507701249513e-06, + "loss": 0.7929, + "step": 428 + }, + { + "epoch": 4.165048543689321, + "grad_norm": 0.6116168581706061, + "learning_rate": 4.107617960938702e-06, + "loss": 0.5406, + "step": 429 + }, + { + "epoch": 4.174757281553398, + "grad_norm": 0.5210538716839132, + "learning_rate": 4.103721573630965e-06, + "loss": 0.6724, + "step": 430 + }, + { + "epoch": 4.184466019417476, + "grad_norm": 0.47356150602269126, + "learning_rate": 4.099818555436645e-06, + "loss": 0.6737, + "step": 431 + }, + { + "epoch": 4.194174757281553, + "grad_norm": 0.6377655423855296, + "learning_rate": 4.095908922493499e-06, + "loss": 1.024, + "step": 432 + }, + { + "epoch": 4.203883495145631, + "grad_norm": 0.5471085748898301, + "learning_rate": 4.091992690966636e-06, + "loss": 0.5427, + "step": 433 + }, + { + "epoch": 4.213592233009709, + "grad_norm": 0.5818794345348097, + "learning_rate": 4.088069877048447e-06, + "loss": 0.6099, + "step": 434 + }, + { + "epoch": 4.223300970873787, + "grad_norm": 0.543595019351133, + "learning_rate": 4.084140496958539e-06, + "loss": 0.556, + "step": 435 + }, + { + "epoch": 4.233009708737864, + "grad_norm": 0.5101773462629047, + "learning_rate": 4.080204566943668e-06, + "loss": 0.5016, + "step": 436 + }, + { + "epoch": 4.242718446601942, + "grad_norm": 0.574864469322262, + "learning_rate": 4.076262103277673e-06, + "loss": 0.5518, + "step": 437 + }, + { + "epoch": 4.252427184466019, + "grad_norm": 0.529410686127538, + "learning_rate": 4.072313122261406e-06, + "loss": 0.9056, + "step": 438 + }, + { + "epoch": 4.262135922330097, + "grad_norm": 0.4724977490535202, + "learning_rate": 4.068357640222668e-06, + "loss": 0.5444, + "step": 439 + }, + { + "epoch": 4.271844660194175, + "grad_norm": 0.5371234765273634, + "learning_rate": 4.06439567351614e-06, + "loss": 0.5565, + "step": 440 + }, + { + "epoch": 4.281553398058253, + "grad_norm": 0.5554052963535526, + "learning_rate": 4.0604272385233105e-06, + "loss": 0.6978, + "step": 441 + }, + { + "epoch": 4.29126213592233, + "grad_norm": 0.5738857111157796, + "learning_rate": 4.056452351652418e-06, + "loss": 0.5298, + "step": 442 + }, + { + "epoch": 4.300970873786408, + "grad_norm": 0.5183938376729829, + "learning_rate": 4.052471029338375e-06, + "loss": 0.689, + "step": 443 + }, + { + "epoch": 4.310679611650485, + "grad_norm": 0.562235068394362, + "learning_rate": 4.048483288042703e-06, + "loss": 0.5329, + "step": 444 + }, + { + "epoch": 4.320388349514563, + "grad_norm": 0.5873359410798857, + "learning_rate": 4.0444891442534615e-06, + "loss": 0.6088, + "step": 445 + }, + { + "epoch": 4.330097087378641, + "grad_norm": 0.5232886853596697, + "learning_rate": 4.040488614485187e-06, + "loss": 0.6861, + "step": 446 + }, + { + "epoch": 4.339805825242719, + "grad_norm": 0.5707575802489561, + "learning_rate": 4.036481715278818e-06, + "loss": 0.7804, + "step": 447 + }, + { + "epoch": 4.349514563106796, + "grad_norm": 0.5743158436311879, + "learning_rate": 4.032468463201626e-06, + "loss": 0.7531, + "step": 448 + }, + { + "epoch": 4.359223300970874, + "grad_norm": 0.5272870524360327, + "learning_rate": 4.028448874847152e-06, + "loss": 0.4601, + "step": 449 + }, + { + "epoch": 4.368932038834951, + "grad_norm": 0.8245819176512555, + "learning_rate": 4.024422966835137e-06, + "loss": 0.7204, + "step": 450 + }, + { + "epoch": 4.378640776699029, + "grad_norm": 0.5818583543824783, + "learning_rate": 4.0203907558114475e-06, + "loss": 0.6319, + "step": 451 + }, + { + "epoch": 4.388349514563107, + "grad_norm": 0.5916617169196222, + "learning_rate": 4.016352258448016e-06, + "loss": 0.5578, + "step": 452 + }, + { + "epoch": 4.398058252427185, + "grad_norm": 0.5247948377375482, + "learning_rate": 4.0123074914427635e-06, + "loss": 0.5817, + "step": 453 + }, + { + "epoch": 4.407766990291262, + "grad_norm": 0.4841360885356806, + "learning_rate": 4.008256471519536e-06, + "loss": 0.5309, + "step": 454 + }, + { + "epoch": 4.41747572815534, + "grad_norm": 0.5386315901899501, + "learning_rate": 4.004199215428032e-06, + "loss": 0.6876, + "step": 455 + }, + { + "epoch": 4.427184466019417, + "grad_norm": 0.6303477188551428, + "learning_rate": 4.000135739943735e-06, + "loss": 0.7771, + "step": 456 + }, + { + "epoch": 4.436893203883495, + "grad_norm": 0.877305177258536, + "learning_rate": 3.996066061867844e-06, + "loss": 0.7183, + "step": 457 + }, + { + "epoch": 4.446601941747573, + "grad_norm": 0.5475353424276045, + "learning_rate": 3.991990198027203e-06, + "loss": 0.6071, + "step": 458 + }, + { + "epoch": 4.456310679611651, + "grad_norm": 0.5369581006252573, + "learning_rate": 3.987908165274233e-06, + "loss": 0.6663, + "step": 459 + }, + { + "epoch": 4.466019417475728, + "grad_norm": 0.5481338983710499, + "learning_rate": 3.9838199804868635e-06, + "loss": 0.7901, + "step": 460 + }, + { + "epoch": 4.475728155339806, + "grad_norm": 0.5673339500133648, + "learning_rate": 3.979725660568456e-06, + "loss": 0.7221, + "step": 461 + }, + { + "epoch": 4.485436893203883, + "grad_norm": 0.5276433542097172, + "learning_rate": 3.975625222447742e-06, + "loss": 0.5454, + "step": 462 + }, + { + "epoch": 4.495145631067961, + "grad_norm": 0.5773222334966291, + "learning_rate": 3.97151868307875e-06, + "loss": 0.8548, + "step": 463 + }, + { + "epoch": 4.504854368932039, + "grad_norm": 0.5288063685496589, + "learning_rate": 3.9674060594407345e-06, + "loss": 0.6481, + "step": 464 + }, + { + "epoch": 4.514563106796117, + "grad_norm": 0.5234908733008361, + "learning_rate": 3.963287368538105e-06, + "loss": 0.4874, + "step": 465 + }, + { + "epoch": 4.524271844660194, + "grad_norm": 0.5548781615871586, + "learning_rate": 3.959162627400361e-06, + "loss": 0.7774, + "step": 466 + }, + { + "epoch": 4.533980582524272, + "grad_norm": 0.6088390104886395, + "learning_rate": 3.9550318530820145e-06, + "loss": 0.7768, + "step": 467 + }, + { + "epoch": 4.543689320388349, + "grad_norm": 0.5362660929991437, + "learning_rate": 3.9508950626625244e-06, + "loss": 0.5664, + "step": 468 + }, + { + "epoch": 4.553398058252427, + "grad_norm": 0.5403037827442222, + "learning_rate": 3.946752273246224e-06, + "loss": 0.7226, + "step": 469 + }, + { + "epoch": 4.563106796116505, + "grad_norm": 0.5160659142912444, + "learning_rate": 3.942603501962249e-06, + "loss": 0.609, + "step": 470 + }, + { + "epoch": 4.572815533980583, + "grad_norm": 0.6197532352354348, + "learning_rate": 3.9384487659644716e-06, + "loss": 0.5509, + "step": 471 + }, + { + "epoch": 4.58252427184466, + "grad_norm": 0.6600383068172292, + "learning_rate": 3.934288082431423e-06, + "loss": 0.7257, + "step": 472 + }, + { + "epoch": 4.592233009708738, + "grad_norm": 0.5641531137481023, + "learning_rate": 3.930121468566227e-06, + "loss": 0.8232, + "step": 473 + }, + { + "epoch": 4.601941747572815, + "grad_norm": 0.5450663648523331, + "learning_rate": 3.925948941596528e-06, + "loss": 0.6493, + "step": 474 + }, + { + "epoch": 4.611650485436893, + "grad_norm": 0.5134107913642527, + "learning_rate": 3.92177051877442e-06, + "loss": 0.554, + "step": 475 + }, + { + "epoch": 4.621359223300971, + "grad_norm": 0.5988443116784232, + "learning_rate": 3.917586217376369e-06, + "loss": 0.7745, + "step": 476 + }, + { + "epoch": 4.631067961165049, + "grad_norm": 0.5138233596193919, + "learning_rate": 3.913396054703155e-06, + "loss": 0.7129, + "step": 477 + }, + { + "epoch": 4.640776699029126, + "grad_norm": 0.5323215028941234, + "learning_rate": 3.909200048079786e-06, + "loss": 0.7206, + "step": 478 + }, + { + "epoch": 4.650485436893204, + "grad_norm": 0.5875971843837734, + "learning_rate": 3.9049982148554384e-06, + "loss": 0.8862, + "step": 479 + }, + { + "epoch": 4.660194174757281, + "grad_norm": 0.5645023952141737, + "learning_rate": 3.900790572403376e-06, + "loss": 0.6745, + "step": 480 + }, + { + "epoch": 4.669902912621359, + "grad_norm": 0.5154008565576635, + "learning_rate": 3.896577138120881e-06, + "loss": 0.6947, + "step": 481 + }, + { + "epoch": 4.679611650485437, + "grad_norm": 0.5394552750418433, + "learning_rate": 3.892357929429187e-06, + "loss": 0.6791, + "step": 482 + }, + { + "epoch": 4.689320388349515, + "grad_norm": 0.5827930464161047, + "learning_rate": 3.8881329637734e-06, + "loss": 0.5154, + "step": 483 + }, + { + "epoch": 4.699029126213592, + "grad_norm": 0.5361181581166996, + "learning_rate": 3.883902258622431e-06, + "loss": 0.6468, + "step": 484 + }, + { + "epoch": 4.70873786407767, + "grad_norm": 0.5711060546895186, + "learning_rate": 3.8796658314689205e-06, + "loss": 0.7167, + "step": 485 + }, + { + "epoch": 4.718446601941747, + "grad_norm": 0.5434222419984399, + "learning_rate": 3.875423699829168e-06, + "loss": 0.5867, + "step": 486 + }, + { + "epoch": 4.728155339805825, + "grad_norm": 0.5096613481897871, + "learning_rate": 3.871175881243061e-06, + "loss": 0.6356, + "step": 487 + }, + { + "epoch": 4.737864077669903, + "grad_norm": 0.542382695866742, + "learning_rate": 3.866922393273999e-06, + "loss": 0.7997, + "step": 488 + }, + { + "epoch": 4.747572815533981, + "grad_norm": 0.5295202287930876, + "learning_rate": 3.862663253508822e-06, + "loss": 0.6506, + "step": 489 + }, + { + "epoch": 4.757281553398058, + "grad_norm": 0.5116429751055901, + "learning_rate": 3.858398479557739e-06, + "loss": 0.6037, + "step": 490 + }, + { + "epoch": 4.766990291262136, + "grad_norm": 0.5283075153391654, + "learning_rate": 3.8541280890542565e-06, + "loss": 0.6142, + "step": 491 + }, + { + "epoch": 4.776699029126213, + "grad_norm": 0.5741236822380017, + "learning_rate": 3.849852099655102e-06, + "loss": 0.7518, + "step": 492 + }, + { + "epoch": 4.786407766990291, + "grad_norm": 0.5062208506690622, + "learning_rate": 3.845570529040151e-06, + "loss": 0.6338, + "step": 493 + }, + { + "epoch": 4.796116504854369, + "grad_norm": 0.5321985087384002, + "learning_rate": 3.841283394912361e-06, + "loss": 0.6375, + "step": 494 + }, + { + "epoch": 4.805825242718447, + "grad_norm": 0.5310703753315527, + "learning_rate": 3.836990714997686e-06, + "loss": 0.7015, + "step": 495 + }, + { + "epoch": 4.815533980582524, + "grad_norm": 0.5620272702074033, + "learning_rate": 3.832692507045015e-06, + "loss": 0.731, + "step": 496 + }, + { + "epoch": 4.825242718446602, + "grad_norm": 0.5060945256107214, + "learning_rate": 3.828388788826091e-06, + "loss": 0.7914, + "step": 497 + }, + { + "epoch": 4.834951456310679, + "grad_norm": 0.5538607078294117, + "learning_rate": 3.824079578135442e-06, + "loss": 0.6944, + "step": 498 + }, + { + "epoch": 4.844660194174757, + "grad_norm": 0.5452852198799888, + "learning_rate": 3.819764892790307e-06, + "loss": 0.7015, + "step": 499 + }, + { + "epoch": 4.854368932038835, + "grad_norm": 0.557748542801134, + "learning_rate": 3.815444750630555e-06, + "loss": 0.6616, + "step": 500 + }, + { + "epoch": 4.864077669902913, + "grad_norm": 0.555226374195002, + "learning_rate": 3.811119169518624e-06, + "loss": 0.8484, + "step": 501 + }, + { + "epoch": 4.87378640776699, + "grad_norm": 0.583053708240317, + "learning_rate": 3.8067881673394363e-06, + "loss": 1.071, + "step": 502 + }, + { + "epoch": 4.883495145631068, + "grad_norm": 0.6078873319472481, + "learning_rate": 3.802451762000331e-06, + "loss": 0.6256, + "step": 503 + }, + { + "epoch": 4.893203883495145, + "grad_norm": 0.539423072387751, + "learning_rate": 3.7981099714309856e-06, + "loss": 0.5403, + "step": 504 + }, + { + "epoch": 4.902912621359223, + "grad_norm": 0.6603698148330428, + "learning_rate": 3.7937628135833453e-06, + "loss": 0.6906, + "step": 505 + }, + { + "epoch": 4.9126213592233015, + "grad_norm": 0.5562121401027199, + "learning_rate": 3.7894103064315463e-06, + "loss": 0.7039, + "step": 506 + }, + { + "epoch": 4.922330097087379, + "grad_norm": 0.5336235803797111, + "learning_rate": 3.7850524679718424e-06, + "loss": 0.59, + "step": 507 + }, + { + "epoch": 4.932038834951456, + "grad_norm": 0.5331917381976123, + "learning_rate": 3.7806893162225328e-06, + "loss": 0.54, + "step": 508 + }, + { + "epoch": 4.941747572815534, + "grad_norm": 0.5058791106293693, + "learning_rate": 3.7763208692238818e-06, + "loss": 0.6653, + "step": 509 + }, + { + "epoch": 4.951456310679612, + "grad_norm": 0.5299074738912896, + "learning_rate": 3.7719471450380518e-06, + "loss": 0.6194, + "step": 510 + }, + { + "epoch": 4.961165048543689, + "grad_norm": 0.5327819159805345, + "learning_rate": 3.7675681617490212e-06, + "loss": 0.805, + "step": 511 + }, + { + "epoch": 4.970873786407767, + "grad_norm": 0.5298660538103491, + "learning_rate": 3.7631839374625167e-06, + "loss": 0.7929, + "step": 512 + }, + { + "epoch": 4.980582524271845, + "grad_norm": 0.4764247185946088, + "learning_rate": 3.758794490305932e-06, + "loss": 0.611, + "step": 513 + }, + { + "epoch": 4.990291262135923, + "grad_norm": 0.5451924892487114, + "learning_rate": 3.7543998384282565e-06, + "loss": 0.6893, + "step": 514 + }, + { + "epoch": 5.0, + "grad_norm": 0.5454441184842063, + "learning_rate": 3.7500000000000005e-06, + "loss": 0.7546, + "step": 515 + }, + { + "epoch": 5.009708737864078, + "grad_norm": 0.5236580494464181, + "learning_rate": 3.745594993213118e-06, + "loss": 0.743, + "step": 516 + }, + { + "epoch": 5.019417475728155, + "grad_norm": 0.5322157623594023, + "learning_rate": 3.7411848362809324e-06, + "loss": 0.9126, + "step": 517 + }, + { + "epoch": 5.029126213592233, + "grad_norm": 0.4717932076034679, + "learning_rate": 3.7367695474380623e-06, + "loss": 0.6524, + "step": 518 + }, + { + "epoch": 5.038834951456311, + "grad_norm": 0.504345727123636, + "learning_rate": 3.7323491449403444e-06, + "loss": 0.74, + "step": 519 + }, + { + "epoch": 5.048543689320389, + "grad_norm": 0.4527014691294767, + "learning_rate": 3.7279236470647593e-06, + "loss": 0.5095, + "step": 520 + }, + { + "epoch": 5.058252427184466, + "grad_norm": 0.5507904251400323, + "learning_rate": 3.723493072109355e-06, + "loss": 0.8863, + "step": 521 + }, + { + "epoch": 5.067961165048544, + "grad_norm": 0.4965393682876186, + "learning_rate": 3.719057438393172e-06, + "loss": 0.6478, + "step": 522 + }, + { + "epoch": 5.077669902912621, + "grad_norm": 0.5218053726872672, + "learning_rate": 3.714616764256166e-06, + "loss": 0.8168, + "step": 523 + }, + { + "epoch": 5.087378640776699, + "grad_norm": 0.5407179591306065, + "learning_rate": 3.7101710680591353e-06, + "loss": 0.6817, + "step": 524 + }, + { + "epoch": 5.097087378640777, + "grad_norm": 0.5149668318951321, + "learning_rate": 3.7057203681836407e-06, + "loss": 0.7706, + "step": 525 + }, + { + "epoch": 5.106796116504855, + "grad_norm": 0.6260255413647343, + "learning_rate": 3.701264683031934e-06, + "loss": 0.8372, + "step": 526 + }, + { + "epoch": 5.116504854368932, + "grad_norm": 0.5147717662373354, + "learning_rate": 3.6968040310268766e-06, + "loss": 0.6642, + "step": 527 + }, + { + "epoch": 5.12621359223301, + "grad_norm": 0.5229036248727982, + "learning_rate": 3.692338430611869e-06, + "loss": 0.6629, + "step": 528 + }, + { + "epoch": 5.135922330097087, + "grad_norm": 0.5573618752917658, + "learning_rate": 3.687867900250771e-06, + "loss": 0.6025, + "step": 529 + }, + { + "epoch": 5.145631067961165, + "grad_norm": 0.4894399208639397, + "learning_rate": 3.683392458427825e-06, + "loss": 0.5302, + "step": 530 + }, + { + "epoch": 5.155339805825243, + "grad_norm": 0.5637535925415186, + "learning_rate": 3.6789121236475818e-06, + "loss": 0.7266, + "step": 531 + }, + { + "epoch": 5.165048543689321, + "grad_norm": 0.5789728245531511, + "learning_rate": 3.674426914434824e-06, + "loss": 0.7271, + "step": 532 + }, + { + "epoch": 5.174757281553398, + "grad_norm": 0.5466853817465158, + "learning_rate": 3.6699368493344856e-06, + "loss": 0.7348, + "step": 533 + }, + { + "epoch": 5.184466019417476, + "grad_norm": 0.5042069743820944, + "learning_rate": 3.665441946911582e-06, + "loss": 0.5876, + "step": 534 + }, + { + "epoch": 5.194174757281553, + "grad_norm": 0.55765274686579, + "learning_rate": 3.660942225751126e-06, + "loss": 0.6261, + "step": 535 + }, + { + "epoch": 5.203883495145631, + "grad_norm": 0.5585892588024256, + "learning_rate": 3.6564377044580558e-06, + "loss": 0.6696, + "step": 536 + }, + { + "epoch": 5.213592233009709, + "grad_norm": 0.5410505344470596, + "learning_rate": 3.6519284016571567e-06, + "loss": 0.5517, + "step": 537 + }, + { + "epoch": 5.223300970873787, + "grad_norm": 0.5655367086406735, + "learning_rate": 3.647414335992985e-06, + "loss": 0.8221, + "step": 538 + }, + { + "epoch": 5.233009708737864, + "grad_norm": 0.5496575434925196, + "learning_rate": 3.642895526129787e-06, + "loss": 0.7155, + "step": 539 + }, + { + "epoch": 5.242718446601942, + "grad_norm": 0.527282587966023, + "learning_rate": 3.638371990751428e-06, + "loss": 0.7697, + "step": 540 + }, + { + "epoch": 5.252427184466019, + "grad_norm": 0.5886591311740469, + "learning_rate": 3.63384374856131e-06, + "loss": 0.7738, + "step": 541 + }, + { + "epoch": 5.262135922330097, + "grad_norm": 0.547315818665875, + "learning_rate": 3.629310818282297e-06, + "loss": 0.849, + "step": 542 + }, + { + "epoch": 5.271844660194175, + "grad_norm": 0.5281918770795211, + "learning_rate": 3.6247732186566365e-06, + "loss": 0.6471, + "step": 543 + }, + { + "epoch": 5.281553398058253, + "grad_norm": 0.5082451180767762, + "learning_rate": 3.6202309684458813e-06, + "loss": 0.5249, + "step": 544 + }, + { + "epoch": 5.29126213592233, + "grad_norm": 0.5048524465631024, + "learning_rate": 3.615684086430815e-06, + "loss": 0.5654, + "step": 545 + }, + { + "epoch": 5.300970873786408, + "grad_norm": 0.5489198982092451, + "learning_rate": 3.61113259141137e-06, + "loss": 0.7281, + "step": 546 + }, + { + "epoch": 5.310679611650485, + "grad_norm": 0.5248421669868224, + "learning_rate": 3.606576502206554e-06, + "loss": 0.4695, + "step": 547 + }, + { + "epoch": 5.320388349514563, + "grad_norm": 0.5525806088263864, + "learning_rate": 3.602015837654369e-06, + "loss": 0.5076, + "step": 548 + }, + { + "epoch": 5.330097087378641, + "grad_norm": 0.5147558593612964, + "learning_rate": 3.5974506166117355e-06, + "loss": 0.5361, + "step": 549 + }, + { + "epoch": 5.339805825242719, + "grad_norm": 0.5230812119454316, + "learning_rate": 3.592880857954413e-06, + "loss": 0.79, + "step": 550 + }, + { + "epoch": 5.349514563106796, + "grad_norm": 0.47610639093924234, + "learning_rate": 3.588306580576922e-06, + "loss": 0.5074, + "step": 551 + }, + { + "epoch": 5.359223300970874, + "grad_norm": 0.5076655865626652, + "learning_rate": 3.583727803392468e-06, + "loss": 0.4614, + "step": 552 + }, + { + "epoch": 5.368932038834951, + "grad_norm": 0.5034407309618866, + "learning_rate": 3.57914454533286e-06, + "loss": 0.7016, + "step": 553 + }, + { + "epoch": 5.378640776699029, + "grad_norm": 0.5956638435834678, + "learning_rate": 3.5745568253484363e-06, + "loss": 0.574, + "step": 554 + }, + { + "epoch": 5.388349514563107, + "grad_norm": 0.5374526756995527, + "learning_rate": 3.5699646624079824e-06, + "loss": 0.6492, + "step": 555 + }, + { + "epoch": 5.398058252427185, + "grad_norm": 0.5748361523110773, + "learning_rate": 3.5653680754986543e-06, + "loss": 0.4815, + "step": 556 + }, + { + "epoch": 5.407766990291262, + "grad_norm": 0.54567644758393, + "learning_rate": 3.560767083625899e-06, + "loss": 0.8402, + "step": 557 + }, + { + "epoch": 5.41747572815534, + "grad_norm": 0.5659255771617656, + "learning_rate": 3.556161705813378e-06, + "loss": 0.5835, + "step": 558 + }, + { + "epoch": 5.427184466019417, + "grad_norm": 0.6247712187120456, + "learning_rate": 3.5515519611028863e-06, + "loss": 0.8306, + "step": 559 + }, + { + "epoch": 5.436893203883495, + "grad_norm": 0.5173918216113128, + "learning_rate": 3.5469378685542742e-06, + "loss": 0.733, + "step": 560 + }, + { + "epoch": 5.446601941747573, + "grad_norm": 0.5518176200795789, + "learning_rate": 3.542319447245372e-06, + "loss": 0.9392, + "step": 561 + }, + { + "epoch": 5.456310679611651, + "grad_norm": 0.5349757875135974, + "learning_rate": 3.537696716271904e-06, + "loss": 0.8355, + "step": 562 + }, + { + "epoch": 5.466019417475728, + "grad_norm": 0.5211503905687772, + "learning_rate": 3.533069694747415e-06, + "loss": 0.5956, + "step": 563 + }, + { + "epoch": 5.475728155339806, + "grad_norm": 0.4837470952780332, + "learning_rate": 3.528438401803192e-06, + "loss": 0.5892, + "step": 564 + }, + { + "epoch": 5.485436893203883, + "grad_norm": 0.5125612599343026, + "learning_rate": 3.52380285658818e-06, + "loss": 0.5399, + "step": 565 + }, + { + "epoch": 5.495145631067961, + "grad_norm": 0.5410485375898842, + "learning_rate": 3.5191630782689074e-06, + "loss": 0.6512, + "step": 566 + }, + { + "epoch": 5.504854368932039, + "grad_norm": 0.47850608014355017, + "learning_rate": 3.5145190860294043e-06, + "loss": 0.5441, + "step": 567 + }, + { + "epoch": 5.514563106796117, + "grad_norm": 0.49368651452695383, + "learning_rate": 3.5098708990711254e-06, + "loss": 0.4864, + "step": 568 + }, + { + "epoch": 5.524271844660194, + "grad_norm": 0.5210530061794847, + "learning_rate": 3.505218536612869e-06, + "loss": 0.6663, + "step": 569 + }, + { + "epoch": 5.533980582524272, + "grad_norm": 0.5402383645877286, + "learning_rate": 3.500562017890695e-06, + "loss": 0.5224, + "step": 570 + }, + { + "epoch": 5.543689320388349, + "grad_norm": 0.5069706433380721, + "learning_rate": 3.495901362157853e-06, + "loss": 0.4391, + "step": 571 + }, + { + "epoch": 5.553398058252427, + "grad_norm": 0.5370346510366669, + "learning_rate": 3.4912365886846934e-06, + "loss": 0.8669, + "step": 572 + }, + { + "epoch": 5.563106796116505, + "grad_norm": 0.4976926429074089, + "learning_rate": 3.4865677167585942e-06, + "loss": 0.678, + "step": 573 + }, + { + "epoch": 5.572815533980583, + "grad_norm": 0.5118571857656103, + "learning_rate": 3.4818947656838796e-06, + "loss": 0.5575, + "step": 574 + }, + { + "epoch": 5.58252427184466, + "grad_norm": 0.575107709638005, + "learning_rate": 3.4772177547817387e-06, + "loss": 0.5642, + "step": 575 + }, + { + "epoch": 5.592233009708738, + "grad_norm": 0.544943705330846, + "learning_rate": 3.472536703390148e-06, + "loss": 0.5671, + "step": 576 + }, + { + "epoch": 5.601941747572815, + "grad_norm": 0.48590763199396203, + "learning_rate": 3.467851630863789e-06, + "loss": 0.5398, + "step": 577 + }, + { + "epoch": 5.611650485436893, + "grad_norm": 0.48995907325112287, + "learning_rate": 3.463162556573969e-06, + "loss": 0.6857, + "step": 578 + }, + { + "epoch": 5.621359223300971, + "grad_norm": 0.5250045703440803, + "learning_rate": 3.4584694999085424e-06, + "loss": 0.7885, + "step": 579 + }, + { + "epoch": 5.631067961165049, + "grad_norm": 0.5629591675833219, + "learning_rate": 3.4537724802718294e-06, + "loss": 0.5854, + "step": 580 + }, + { + "epoch": 5.640776699029126, + "grad_norm": 0.4846489281273186, + "learning_rate": 3.4490715170845356e-06, + "loss": 0.5208, + "step": 581 + }, + { + "epoch": 5.650485436893204, + "grad_norm": 0.5888541854252343, + "learning_rate": 3.4443666297836715e-06, + "loss": 0.5512, + "step": 582 + }, + { + "epoch": 5.660194174757281, + "grad_norm": 0.6442464798240836, + "learning_rate": 3.4396578378224734e-06, + "loss": 0.5946, + "step": 583 + }, + { + "epoch": 5.669902912621359, + "grad_norm": 0.493758970029001, + "learning_rate": 3.4349451606703214e-06, + "loss": 0.6526, + "step": 584 + }, + { + "epoch": 5.679611650485437, + "grad_norm": 0.581861416552549, + "learning_rate": 3.430228617812661e-06, + "loss": 0.5385, + "step": 585 + }, + { + "epoch": 5.689320388349515, + "grad_norm": 0.496120024238278, + "learning_rate": 3.4255082287509183e-06, + "loss": 0.4038, + "step": 586 + }, + { + "epoch": 5.699029126213592, + "grad_norm": 0.5767006067460728, + "learning_rate": 3.420784013002426e-06, + "loss": 0.5677, + "step": 587 + }, + { + "epoch": 5.70873786407767, + "grad_norm": 0.5158422312441495, + "learning_rate": 3.416055990100336e-06, + "loss": 0.5685, + "step": 588 + }, + { + "epoch": 5.718446601941747, + "grad_norm": 0.5162552308353006, + "learning_rate": 3.4113241795935427e-06, + "loss": 0.5916, + "step": 589 + }, + { + "epoch": 5.728155339805825, + "grad_norm": 0.5431343552918371, + "learning_rate": 3.4065886010466014e-06, + "loss": 0.6086, + "step": 590 + }, + { + "epoch": 5.737864077669903, + "grad_norm": 0.5119365013009938, + "learning_rate": 3.401849274039647e-06, + "loss": 0.6584, + "step": 591 + }, + { + "epoch": 5.747572815533981, + "grad_norm": 0.4947683856841006, + "learning_rate": 3.3971062181683117e-06, + "loss": 0.5387, + "step": 592 + }, + { + "epoch": 5.757281553398058, + "grad_norm": 0.5434375331427597, + "learning_rate": 3.3923594530436477e-06, + "loss": 0.7175, + "step": 593 + }, + { + "epoch": 5.766990291262136, + "grad_norm": 0.5577372563704153, + "learning_rate": 3.387608998292041e-06, + "loss": 0.6719, + "step": 594 + }, + { + "epoch": 5.776699029126213, + "grad_norm": 0.4837142305485574, + "learning_rate": 3.382854873555137e-06, + "loss": 0.4739, + "step": 595 + }, + { + "epoch": 5.786407766990291, + "grad_norm": 0.5306514259002709, + "learning_rate": 3.3780970984897504e-06, + "loss": 0.5225, + "step": 596 + }, + { + "epoch": 5.796116504854369, + "grad_norm": 0.5194892402061854, + "learning_rate": 3.373335692767793e-06, + "loss": 0.776, + "step": 597 + }, + { + "epoch": 5.805825242718447, + "grad_norm": 0.518491658036596, + "learning_rate": 3.3685706760761865e-06, + "loss": 0.6919, + "step": 598 + }, + { + "epoch": 5.815533980582524, + "grad_norm": 0.5938926587571165, + "learning_rate": 3.3638020681167827e-06, + "loss": 0.5489, + "step": 599 + }, + { + "epoch": 5.825242718446602, + "grad_norm": 0.5206196560346433, + "learning_rate": 3.3590298886062833e-06, + "loss": 0.5532, + "step": 600 + }, + { + "epoch": 5.834951456310679, + "grad_norm": 0.4898920438591441, + "learning_rate": 3.354254157276155e-06, + "loss": 0.7708, + "step": 601 + }, + { + "epoch": 5.844660194174757, + "grad_norm": 0.7027567743473708, + "learning_rate": 3.3494748938725525e-06, + "loss": 0.5813, + "step": 602 + }, + { + "epoch": 5.854368932038835, + "grad_norm": 0.5373706241783007, + "learning_rate": 3.3446921181562326e-06, + "loss": 0.5484, + "step": 603 + }, + { + "epoch": 5.864077669902913, + "grad_norm": 0.6287988932770455, + "learning_rate": 3.3399058499024767e-06, + "loss": 0.4705, + "step": 604 + }, + { + "epoch": 5.87378640776699, + "grad_norm": 0.6420488331857355, + "learning_rate": 3.3351161089010055e-06, + "loss": 0.8357, + "step": 605 + }, + { + "epoch": 5.883495145631068, + "grad_norm": 0.483984948258574, + "learning_rate": 3.330322914955897e-06, + "loss": 0.6679, + "step": 606 + }, + { + "epoch": 5.893203883495145, + "grad_norm": 0.5633594210474601, + "learning_rate": 3.325526287885509e-06, + "loss": 0.6853, + "step": 607 + }, + { + "epoch": 5.902912621359223, + "grad_norm": 0.5099944120499966, + "learning_rate": 3.3207262475223913e-06, + "loss": 0.653, + "step": 608 + }, + { + "epoch": 5.9126213592233015, + "grad_norm": 0.6133535314483775, + "learning_rate": 3.315922813713209e-06, + "loss": 0.8421, + "step": 609 + }, + { + "epoch": 5.922330097087379, + "grad_norm": 0.5514042535577254, + "learning_rate": 3.3111160063186553e-06, + "loss": 0.6114, + "step": 610 + }, + { + "epoch": 5.932038834951456, + "grad_norm": 0.515401175634321, + "learning_rate": 3.3063058452133756e-06, + "loss": 0.7155, + "step": 611 + }, + { + "epoch": 5.941747572815534, + "grad_norm": 0.5483222829849357, + "learning_rate": 3.301492350285879e-06, + "loss": 0.6461, + "step": 612 + }, + { + "epoch": 5.951456310679612, + "grad_norm": 0.5102148189624786, + "learning_rate": 3.296675541438461e-06, + "loss": 0.5192, + "step": 613 + }, + { + "epoch": 5.961165048543689, + "grad_norm": 0.5459577941840908, + "learning_rate": 3.2918554385871163e-06, + "loss": 0.7572, + "step": 614 + }, + { + "epoch": 5.970873786407767, + "grad_norm": 0.4986561470940389, + "learning_rate": 3.2870320616614626e-06, + "loss": 0.4979, + "step": 615 + }, + { + "epoch": 5.980582524271845, + "grad_norm": 0.5366419322997734, + "learning_rate": 3.282205430604653e-06, + "loss": 0.5367, + "step": 616 + }, + { + "epoch": 5.990291262135923, + "grad_norm": 0.6234877265968874, + "learning_rate": 3.2773755653732954e-06, + "loss": 0.6123, + "step": 617 + }, + { + "epoch": 6.0, + "grad_norm": 0.5139134055855142, + "learning_rate": 3.272542485937369e-06, + "loss": 0.6528, + "step": 618 + }, + { + "epoch": 6.009708737864078, + "grad_norm": 0.5535462185153359, + "learning_rate": 3.267706212280146e-06, + "loss": 0.5759, + "step": 619 + }, + { + "epoch": 6.019417475728155, + "grad_norm": 0.5263133197453993, + "learning_rate": 3.2628667643981036e-06, + "loss": 0.6369, + "step": 620 + }, + { + "epoch": 6.029126213592233, + "grad_norm": 0.5823149773756441, + "learning_rate": 3.2580241623008426e-06, + "loss": 0.7007, + "step": 621 + }, + { + "epoch": 6.038834951456311, + "grad_norm": 0.5698523756699659, + "learning_rate": 3.2531784260110067e-06, + "loss": 0.6968, + "step": 622 + }, + { + "epoch": 6.048543689320389, + "grad_norm": 0.5323935991893491, + "learning_rate": 3.2483295755641986e-06, + "loss": 0.6614, + "step": 623 + }, + { + "epoch": 6.058252427184466, + "grad_norm": 0.5579966093733544, + "learning_rate": 3.243477631008897e-06, + "loss": 0.7018, + "step": 624 + }, + { + "epoch": 6.067961165048544, + "grad_norm": 0.5672146611605858, + "learning_rate": 3.238622612406373e-06, + "loss": 0.4811, + "step": 625 + }, + { + "epoch": 6.077669902912621, + "grad_norm": 0.5487784461526284, + "learning_rate": 3.233764539830608e-06, + "loss": 0.6582, + "step": 626 + }, + { + "epoch": 6.087378640776699, + "grad_norm": 0.4968156455806434, + "learning_rate": 3.228903433368212e-06, + "loss": 0.4246, + "step": 627 + }, + { + "epoch": 6.097087378640777, + "grad_norm": 0.6508059967941122, + "learning_rate": 3.224039313118338e-06, + "loss": 0.3601, + "step": 628 + }, + { + "epoch": 6.106796116504855, + "grad_norm": 0.5223437529762169, + "learning_rate": 3.2191721991925993e-06, + "loss": 0.5353, + "step": 629 + }, + { + "epoch": 6.116504854368932, + "grad_norm": 0.4969256578725589, + "learning_rate": 3.21430211171499e-06, + "loss": 0.6791, + "step": 630 + }, + { + "epoch": 6.12621359223301, + "grad_norm": 0.5033067440842812, + "learning_rate": 3.209429070821795e-06, + "loss": 0.9006, + "step": 631 + }, + { + "epoch": 6.135922330097087, + "grad_norm": 0.5293076970287124, + "learning_rate": 3.2045530966615136e-06, + "loss": 0.7065, + "step": 632 + }, + { + "epoch": 6.145631067961165, + "grad_norm": 0.5232595313548619, + "learning_rate": 3.1996742093947724e-06, + "loss": 0.5669, + "step": 633 + }, + { + "epoch": 6.155339805825243, + "grad_norm": 0.5096836013728645, + "learning_rate": 3.1947924291942423e-06, + "loss": 0.6983, + "step": 634 + }, + { + "epoch": 6.165048543689321, + "grad_norm": 0.5367180425599433, + "learning_rate": 3.189907776244556e-06, + "loss": 0.5669, + "step": 635 + }, + { + "epoch": 6.174757281553398, + "grad_norm": 0.5645144344049964, + "learning_rate": 3.185020270742225e-06, + "loss": 0.6902, + "step": 636 + }, + { + "epoch": 6.184466019417476, + "grad_norm": 0.5634972380711424, + "learning_rate": 3.180129932895553e-06, + "loss": 0.4903, + "step": 637 + }, + { + "epoch": 6.194174757281553, + "grad_norm": 0.5589418053348287, + "learning_rate": 3.1752367829245563e-06, + "loss": 0.7803, + "step": 638 + }, + { + "epoch": 6.203883495145631, + "grad_norm": 0.5558458265176147, + "learning_rate": 3.1703408410608777e-06, + "loss": 0.8576, + "step": 639 + }, + { + "epoch": 6.213592233009709, + "grad_norm": 0.4994812128687224, + "learning_rate": 3.1654421275477045e-06, + "loss": 0.598, + "step": 640 + }, + { + "epoch": 6.223300970873787, + "grad_norm": 0.50112018228175, + "learning_rate": 3.1605406626396826e-06, + "loss": 0.6546, + "step": 641 + }, + { + "epoch": 6.233009708737864, + "grad_norm": 0.48779980049930693, + "learning_rate": 3.155636466602836e-06, + "loss": 0.629, + "step": 642 + }, + { + "epoch": 6.242718446601942, + "grad_norm": 0.5251674835972288, + "learning_rate": 3.150729559714478e-06, + "loss": 0.7687, + "step": 643 + }, + { + "epoch": 6.252427184466019, + "grad_norm": 0.5145810139817877, + "learning_rate": 3.145819962263134e-06, + "loss": 0.6442, + "step": 644 + }, + { + "epoch": 6.262135922330097, + "grad_norm": 0.5189234129866919, + "learning_rate": 3.1409076945484513e-06, + "loss": 0.536, + "step": 645 + }, + { + "epoch": 6.271844660194175, + "grad_norm": 0.5247294425181219, + "learning_rate": 3.135992776881119e-06, + "loss": 0.4885, + "step": 646 + }, + { + "epoch": 6.281553398058253, + "grad_norm": 0.561750334008629, + "learning_rate": 3.1310752295827818e-06, + "loss": 0.4685, + "step": 647 + }, + { + "epoch": 6.29126213592233, + "grad_norm": 0.48682689595750156, + "learning_rate": 3.1261550729859602e-06, + "loss": 0.6406, + "step": 648 + }, + { + "epoch": 6.300970873786408, + "grad_norm": 0.48042953552521844, + "learning_rate": 3.12123232743396e-06, + "loss": 0.5257, + "step": 649 + }, + { + "epoch": 6.310679611650485, + "grad_norm": 0.5265214331531651, + "learning_rate": 3.116307013280793e-06, + "loss": 0.8245, + "step": 650 + }, + { + "epoch": 6.320388349514563, + "grad_norm": 0.5216967258162386, + "learning_rate": 3.1113791508910913e-06, + "loss": 0.6038, + "step": 651 + }, + { + "epoch": 6.330097087378641, + "grad_norm": 0.4684172774395295, + "learning_rate": 3.106448760640022e-06, + "loss": 0.7096, + "step": 652 + }, + { + "epoch": 6.339805825242719, + "grad_norm": 0.5535208394965718, + "learning_rate": 3.1015158629132066e-06, + "loss": 0.5093, + "step": 653 + }, + { + "epoch": 6.349514563106796, + "grad_norm": 0.5230336669127887, + "learning_rate": 3.096580478106631e-06, + "loss": 0.6788, + "step": 654 + }, + { + "epoch": 6.359223300970874, + "grad_norm": 0.5497093072547722, + "learning_rate": 3.0916426266265676e-06, + "loss": 0.5919, + "step": 655 + }, + { + "epoch": 6.368932038834951, + "grad_norm": 0.5462224534842463, + "learning_rate": 3.086702328889486e-06, + "loss": 0.5686, + "step": 656 + }, + { + "epoch": 6.378640776699029, + "grad_norm": 0.5234947411014694, + "learning_rate": 3.0817596053219697e-06, + "loss": 0.5988, + "step": 657 + }, + { + "epoch": 6.388349514563107, + "grad_norm": 0.5588189733664695, + "learning_rate": 3.076814476360634e-06, + "loss": 0.5349, + "step": 658 + }, + { + "epoch": 6.398058252427185, + "grad_norm": 0.5779610115951219, + "learning_rate": 3.071866962452038e-06, + "loss": 0.3482, + "step": 659 + }, + { + "epoch": 6.407766990291262, + "grad_norm": 0.5479800145154113, + "learning_rate": 3.066917084052603e-06, + "loss": 0.7218, + "step": 660 + }, + { + "epoch": 6.41747572815534, + "grad_norm": 0.5397084718954609, + "learning_rate": 3.061964861628527e-06, + "loss": 0.6851, + "step": 661 + }, + { + "epoch": 6.427184466019417, + "grad_norm": 0.5329899425540183, + "learning_rate": 3.057010315655698e-06, + "loss": 0.4821, + "step": 662 + }, + { + "epoch": 6.436893203883495, + "grad_norm": 0.5458422785815938, + "learning_rate": 3.0520534666196134e-06, + "loss": 0.6327, + "step": 663 + }, + { + "epoch": 6.446601941747573, + "grad_norm": 0.47708680903135803, + "learning_rate": 3.0470943350152914e-06, + "loss": 0.5923, + "step": 664 + }, + { + "epoch": 6.456310679611651, + "grad_norm": 0.5144616542269372, + "learning_rate": 3.042132941347189e-06, + "loss": 0.6899, + "step": 665 + }, + { + "epoch": 6.466019417475728, + "grad_norm": 0.5433270155772785, + "learning_rate": 3.037169306129115e-06, + "loss": 0.6198, + "step": 666 + }, + { + "epoch": 6.475728155339806, + "grad_norm": 0.538855795352053, + "learning_rate": 3.0322034498841475e-06, + "loss": 0.6547, + "step": 667 + }, + { + "epoch": 6.485436893203883, + "grad_norm": 0.5708846163744057, + "learning_rate": 3.027235393144547e-06, + "loss": 0.6127, + "step": 668 + }, + { + "epoch": 6.495145631067961, + "grad_norm": 0.5317812805085081, + "learning_rate": 3.0222651564516715e-06, + "loss": 0.7369, + "step": 669 + }, + { + "epoch": 6.504854368932039, + "grad_norm": 0.5550895845970475, + "learning_rate": 3.017292760355896e-06, + "loss": 0.6136, + "step": 670 + }, + { + "epoch": 6.514563106796117, + "grad_norm": 0.553837992106348, + "learning_rate": 3.0123182254165194e-06, + "loss": 0.4291, + "step": 671 + }, + { + "epoch": 6.524271844660194, + "grad_norm": 0.5290319496985271, + "learning_rate": 3.0073415722016875e-06, + "loss": 0.8316, + "step": 672 + }, + { + "epoch": 6.533980582524272, + "grad_norm": 0.5733723058046754, + "learning_rate": 3.002362821288302e-06, + "loss": 0.5706, + "step": 673 + }, + { + "epoch": 6.543689320388349, + "grad_norm": 0.5372610455400619, + "learning_rate": 2.9973819932619404e-06, + "loss": 0.6444, + "step": 674 + }, + { + "epoch": 6.553398058252427, + "grad_norm": 0.5193287209030268, + "learning_rate": 2.9923991087167657e-06, + "loss": 0.6096, + "step": 675 + }, + { + "epoch": 6.563106796116505, + "grad_norm": 0.4949824248830205, + "learning_rate": 2.987414188255446e-06, + "loss": 0.6912, + "step": 676 + }, + { + "epoch": 6.572815533980583, + "grad_norm": 0.5814542413549719, + "learning_rate": 2.9824272524890664e-06, + "loss": 0.727, + "step": 677 + }, + { + "epoch": 6.58252427184466, + "grad_norm": 0.5117293174087842, + "learning_rate": 2.977438322037046e-06, + "loss": 0.5774, + "step": 678 + }, + { + "epoch": 6.592233009708738, + "grad_norm": 0.5409173310731737, + "learning_rate": 2.9724474175270485e-06, + "loss": 0.5585, + "step": 679 + }, + { + "epoch": 6.601941747572815, + "grad_norm": 0.5529086956863897, + "learning_rate": 2.967454559594903e-06, + "loss": 0.5848, + "step": 680 + }, + { + "epoch": 6.611650485436893, + "grad_norm": 0.5142526842665167, + "learning_rate": 2.9624597688845126e-06, + "loss": 0.5823, + "step": 681 + }, + { + "epoch": 6.621359223300971, + "grad_norm": 0.5056072060581505, + "learning_rate": 2.957463066047773e-06, + "loss": 0.6629, + "step": 682 + }, + { + "epoch": 6.631067961165049, + "grad_norm": 0.5648376543089534, + "learning_rate": 2.9524644717444866e-06, + "loss": 0.4144, + "step": 683 + }, + { + "epoch": 6.640776699029126, + "grad_norm": 0.5577514630301723, + "learning_rate": 2.9474640066422757e-06, + "loss": 0.8292, + "step": 684 + }, + { + "epoch": 6.650485436893204, + "grad_norm": 0.6179761336134023, + "learning_rate": 2.9424616914164982e-06, + "loss": 0.5197, + "step": 685 + }, + { + "epoch": 6.660194174757281, + "grad_norm": 0.5762308527579274, + "learning_rate": 2.9374575467501605e-06, + "loss": 0.5277, + "step": 686 + }, + { + "epoch": 6.669902912621359, + "grad_norm": 0.5104768442736158, + "learning_rate": 2.9324515933338343e-06, + "loss": 0.6059, + "step": 687 + }, + { + "epoch": 6.679611650485437, + "grad_norm": 0.5258016358630682, + "learning_rate": 2.9274438518655703e-06, + "loss": 0.753, + "step": 688 + }, + { + "epoch": 6.689320388349515, + "grad_norm": 0.5813275530174288, + "learning_rate": 2.9224343430508105e-06, + "loss": 0.476, + "step": 689 + }, + { + "epoch": 6.699029126213592, + "grad_norm": 0.5889679759396246, + "learning_rate": 2.917423087602306e-06, + "loss": 0.6838, + "step": 690 + }, + { + "epoch": 6.70873786407767, + "grad_norm": 0.5076859976205057, + "learning_rate": 2.9124101062400283e-06, + "loss": 0.6557, + "step": 691 + }, + { + "epoch": 6.718446601941747, + "grad_norm": 0.482464943160344, + "learning_rate": 2.907395419691087e-06, + "loss": 0.7261, + "step": 692 + }, + { + "epoch": 6.728155339805825, + "grad_norm": 0.5332220257567927, + "learning_rate": 2.9023790486896404e-06, + "loss": 0.5912, + "step": 693 + }, + { + "epoch": 6.737864077669903, + "grad_norm": 0.4682638337187069, + "learning_rate": 2.8973610139768114e-06, + "loss": 0.4614, + "step": 694 + }, + { + "epoch": 6.747572815533981, + "grad_norm": 0.558184511949571, + "learning_rate": 2.8923413363006038e-06, + "loss": 0.7464, + "step": 695 + }, + { + "epoch": 6.757281553398058, + "grad_norm": 0.5233023988021802, + "learning_rate": 2.887320036415811e-06, + "loss": 0.5608, + "step": 696 + }, + { + "epoch": 6.766990291262136, + "grad_norm": 0.5834246097072383, + "learning_rate": 2.882297135083937e-06, + "loss": 0.5184, + "step": 697 + }, + { + "epoch": 6.776699029126213, + "grad_norm": 0.5429366118270437, + "learning_rate": 2.877272653073107e-06, + "loss": 0.4984, + "step": 698 + }, + { + "epoch": 6.786407766990291, + "grad_norm": 0.5277264921068271, + "learning_rate": 2.87224661115798e-06, + "loss": 0.5662, + "step": 699 + }, + { + "epoch": 6.796116504854369, + "grad_norm": 0.5382979837388214, + "learning_rate": 2.8672190301196655e-06, + "loss": 0.6262, + "step": 700 + }, + { + "epoch": 6.805825242718447, + "grad_norm": 0.6097486357607171, + "learning_rate": 2.8621899307456376e-06, + "loss": 0.6926, + "step": 701 + }, + { + "epoch": 6.815533980582524, + "grad_norm": 0.5937487238676019, + "learning_rate": 2.8571593338296473e-06, + "loss": 0.6452, + "step": 702 + }, + { + "epoch": 6.825242718446602, + "grad_norm": 0.5132985773419537, + "learning_rate": 2.8521272601716376e-06, + "loss": 0.3947, + "step": 703 + }, + { + "epoch": 6.834951456310679, + "grad_norm": 0.6139505653367293, + "learning_rate": 2.8470937305776567e-06, + "loss": 0.8517, + "step": 704 + }, + { + "epoch": 6.844660194174757, + "grad_norm": 0.5362944543425278, + "learning_rate": 2.842058765859776e-06, + "loss": 0.5163, + "step": 705 + }, + { + "epoch": 6.854368932038835, + "grad_norm": 0.5271174346444056, + "learning_rate": 2.837022386835996e-06, + "loss": 0.5, + "step": 706 + }, + { + "epoch": 6.864077669902913, + "grad_norm": 0.5154656661302897, + "learning_rate": 2.8319846143301676e-06, + "loss": 0.4827, + "step": 707 + }, + { + "epoch": 6.87378640776699, + "grad_norm": 0.46846839589872596, + "learning_rate": 2.826945469171903e-06, + "loss": 0.5299, + "step": 708 + }, + { + "epoch": 6.883495145631068, + "grad_norm": 0.48940397947973197, + "learning_rate": 2.82190497219649e-06, + "loss": 0.4866, + "step": 709 + }, + { + "epoch": 6.893203883495145, + "grad_norm": 0.5913425687153925, + "learning_rate": 2.8168631442448046e-06, + "loss": 0.6692, + "step": 710 + }, + { + "epoch": 6.902912621359223, + "grad_norm": 0.5202999575403233, + "learning_rate": 2.8118200061632273e-06, + "loss": 0.5154, + "step": 711 + }, + { + "epoch": 6.9126213592233015, + "grad_norm": 0.5477222203271073, + "learning_rate": 2.8067755788035544e-06, + "loss": 0.8153, + "step": 712 + }, + { + "epoch": 6.922330097087379, + "grad_norm": 0.5412406643013349, + "learning_rate": 2.801729883022915e-06, + "loss": 0.5411, + "step": 713 + }, + { + "epoch": 6.932038834951456, + "grad_norm": 0.5157277209771828, + "learning_rate": 2.7966829396836804e-06, + "loss": 0.6641, + "step": 714 + }, + { + "epoch": 6.941747572815534, + "grad_norm": 0.585569198546952, + "learning_rate": 2.791634769653381e-06, + "loss": 0.5722, + "step": 715 + }, + { + "epoch": 6.951456310679612, + "grad_norm": 0.518068495388716, + "learning_rate": 2.78658539380462e-06, + "loss": 0.7203, + "step": 716 + }, + { + "epoch": 6.961165048543689, + "grad_norm": 0.5043833753127558, + "learning_rate": 2.781534833014985e-06, + "loss": 0.5923, + "step": 717 + }, + { + "epoch": 6.970873786407767, + "grad_norm": 0.50783508334967, + "learning_rate": 2.7764831081669635e-06, + "loss": 0.7243, + "step": 718 + }, + { + "epoch": 6.980582524271845, + "grad_norm": 0.523109640167941, + "learning_rate": 2.771430240147856e-06, + "loss": 0.4553, + "step": 719 + }, + { + "epoch": 6.990291262135923, + "grad_norm": 0.5501112401053831, + "learning_rate": 2.7663762498496905e-06, + "loss": 0.6997, + "step": 720 + }, + { + "epoch": 7.0, + "grad_norm": 0.6013031662632521, + "learning_rate": 2.761321158169134e-06, + "loss": 0.5361, + "step": 721 + }, + { + "epoch": 7.009708737864078, + "grad_norm": 0.5233359625858339, + "learning_rate": 2.7562649860074077e-06, + "loss": 0.6119, + "step": 722 + }, + { + "epoch": 7.019417475728155, + "grad_norm": 0.517203456333739, + "learning_rate": 2.7512077542702005e-06, + "loss": 0.9122, + "step": 723 + }, + { + "epoch": 7.029126213592233, + "grad_norm": 0.5348182520574054, + "learning_rate": 2.746149483867582e-06, + "loss": 0.5543, + "step": 724 + }, + { + "epoch": 7.038834951456311, + "grad_norm": 0.5551672858403643, + "learning_rate": 2.741090195713917e-06, + "loss": 0.5383, + "step": 725 + }, + { + "epoch": 7.048543689320389, + "grad_norm": 0.5589328402666769, + "learning_rate": 2.736029910727777e-06, + "loss": 0.5495, + "step": 726 + }, + { + "epoch": 7.058252427184466, + "grad_norm": 0.9295358484591534, + "learning_rate": 2.730968649831858e-06, + "loss": 0.4708, + "step": 727 + }, + { + "epoch": 7.067961165048544, + "grad_norm": 0.5641431302765963, + "learning_rate": 2.7259064339528875e-06, + "loss": 0.7656, + "step": 728 + }, + { + "epoch": 7.077669902912621, + "grad_norm": 0.5150837177025048, + "learning_rate": 2.720843284021543e-06, + "loss": 0.5798, + "step": 729 + }, + { + "epoch": 7.087378640776699, + "grad_norm": 0.49904621404216537, + "learning_rate": 2.7157792209723654e-06, + "loss": 0.4392, + "step": 730 + }, + { + "epoch": 7.097087378640777, + "grad_norm": 0.5020142101967577, + "learning_rate": 2.7107142657436696e-06, + "loss": 0.6109, + "step": 731 + }, + { + "epoch": 7.106796116504855, + "grad_norm": 0.508200057178077, + "learning_rate": 2.705648439277459e-06, + "loss": 0.6078, + "step": 732 + }, + { + "epoch": 7.116504854368932, + "grad_norm": 0.48671766693455193, + "learning_rate": 2.7005817625193398e-06, + "loss": 0.4544, + "step": 733 + }, + { + "epoch": 7.12621359223301, + "grad_norm": 0.5649840605126365, + "learning_rate": 2.695514256418435e-06, + "loss": 0.4394, + "step": 734 + }, + { + "epoch": 7.135922330097087, + "grad_norm": 0.60875887483297, + "learning_rate": 2.6904459419272955e-06, + "loss": 0.6065, + "step": 735 + }, + { + "epoch": 7.145631067961165, + "grad_norm": 0.5974558558874661, + "learning_rate": 2.685376840001814e-06, + "loss": 0.6386, + "step": 736 + }, + { + "epoch": 7.155339805825243, + "grad_norm": 0.5530398341248075, + "learning_rate": 2.6803069716011405e-06, + "loss": 0.4432, + "step": 737 + }, + { + "epoch": 7.165048543689321, + "grad_norm": 0.5102871169083648, + "learning_rate": 2.6752363576875933e-06, + "loss": 0.662, + "step": 738 + }, + { + "epoch": 7.174757281553398, + "grad_norm": 0.5054879661184567, + "learning_rate": 2.6701650192265734e-06, + "loss": 0.6198, + "step": 739 + }, + { + "epoch": 7.184466019417476, + "grad_norm": 0.5089332252690112, + "learning_rate": 2.6650929771864776e-06, + "loss": 0.533, + "step": 740 + }, + { + "epoch": 7.194174757281553, + "grad_norm": 0.503785749624171, + "learning_rate": 2.660020252538611e-06, + "loss": 0.7417, + "step": 741 + }, + { + "epoch": 7.203883495145631, + "grad_norm": 0.5271479384718044, + "learning_rate": 2.6549468662571026e-06, + "loss": 0.5881, + "step": 742 + }, + { + "epoch": 7.213592233009709, + "grad_norm": 0.5635104879136624, + "learning_rate": 2.6498728393188157e-06, + "loss": 0.6668, + "step": 743 + }, + { + "epoch": 7.223300970873787, + "grad_norm": 0.48666286774042283, + "learning_rate": 2.6447981927032634e-06, + "loss": 0.5292, + "step": 744 + }, + { + "epoch": 7.233009708737864, + "grad_norm": 0.5252410611230779, + "learning_rate": 2.639722947392521e-06, + "loss": 0.7362, + "step": 745 + }, + { + "epoch": 7.242718446601942, + "grad_norm": 0.5055513935431378, + "learning_rate": 2.6346471243711376e-06, + "loss": 0.6224, + "step": 746 + }, + { + "epoch": 7.252427184466019, + "grad_norm": 0.4945880898396502, + "learning_rate": 2.629570744626052e-06, + "loss": 0.631, + "step": 747 + }, + { + "epoch": 7.262135922330097, + "grad_norm": 0.47517967426654084, + "learning_rate": 2.624493829146507e-06, + "loss": 0.6347, + "step": 748 + }, + { + "epoch": 7.271844660194175, + "grad_norm": 0.6161713749712753, + "learning_rate": 2.619416398923957e-06, + "loss": 0.8685, + "step": 749 + }, + { + "epoch": 7.281553398058253, + "grad_norm": 0.6026630303868498, + "learning_rate": 2.614338474951987e-06, + "loss": 0.4722, + "step": 750 + }, + { + "epoch": 7.29126213592233, + "grad_norm": 0.5081080900411881, + "learning_rate": 2.6092600782262213e-06, + "loss": 0.4761, + "step": 751 + }, + { + "epoch": 7.300970873786408, + "grad_norm": 0.4914556776074673, + "learning_rate": 2.6041812297442417e-06, + "loss": 0.5394, + "step": 752 + }, + { + "epoch": 7.310679611650485, + "grad_norm": 0.5174239393590538, + "learning_rate": 2.5991019505054965e-06, + "loss": 0.5791, + "step": 753 + }, + { + "epoch": 7.320388349514563, + "grad_norm": 0.577016310911, + "learning_rate": 2.5940222615112143e-06, + "loss": 0.4506, + "step": 754 + }, + { + "epoch": 7.330097087378641, + "grad_norm": 0.4931993425278694, + "learning_rate": 2.5889421837643186e-06, + "loss": 0.501, + "step": 755 + }, + { + "epoch": 7.339805825242719, + "grad_norm": 0.4880724860087956, + "learning_rate": 2.5838617382693415e-06, + "loss": 0.6661, + "step": 756 + }, + { + "epoch": 7.349514563106796, + "grad_norm": 0.5210458539063239, + "learning_rate": 2.5787809460323337e-06, + "loss": 0.6234, + "step": 757 + }, + { + "epoch": 7.359223300970874, + "grad_norm": 0.5135992991117062, + "learning_rate": 2.57369982806078e-06, + "loss": 0.641, + "step": 758 + }, + { + "epoch": 7.368932038834951, + "grad_norm": 0.5505536500994148, + "learning_rate": 2.5686184053635127e-06, + "loss": 0.5763, + "step": 759 + }, + { + "epoch": 7.378640776699029, + "grad_norm": 0.4778646758208665, + "learning_rate": 2.563536698950624e-06, + "loss": 0.4859, + "step": 760 + }, + { + "epoch": 7.388349514563107, + "grad_norm": 0.540107781162827, + "learning_rate": 2.5584547298333772e-06, + "loss": 0.4128, + "step": 761 + }, + { + "epoch": 7.398058252427185, + "grad_norm": 0.5303028011574588, + "learning_rate": 2.5533725190241255e-06, + "loss": 0.3898, + "step": 762 + }, + { + "epoch": 7.407766990291262, + "grad_norm": 0.5105167298780968, + "learning_rate": 2.5482900875362184e-06, + "loss": 0.5078, + "step": 763 + }, + { + "epoch": 7.41747572815534, + "grad_norm": 0.565588338746564, + "learning_rate": 2.543207456383919e-06, + "loss": 0.6164, + "step": 764 + }, + { + "epoch": 7.427184466019417, + "grad_norm": 0.5525554215167167, + "learning_rate": 2.538124646582315e-06, + "loss": 0.785, + "step": 765 + }, + { + "epoch": 7.436893203883495, + "grad_norm": 0.5695569930727707, + "learning_rate": 2.533041679147235e-06, + "loss": 0.5288, + "step": 766 + }, + { + "epoch": 7.446601941747573, + "grad_norm": 0.5101624379639444, + "learning_rate": 2.527958575095157e-06, + "loss": 0.5563, + "step": 767 + }, + { + "epoch": 7.456310679611651, + "grad_norm": 0.5432188733614521, + "learning_rate": 2.522875355443124e-06, + "loss": 0.7357, + "step": 768 + }, + { + "epoch": 7.466019417475728, + "grad_norm": 0.5464004709948751, + "learning_rate": 2.5177920412086586e-06, + "loss": 0.7825, + "step": 769 + }, + { + "epoch": 7.475728155339806, + "grad_norm": 0.529743426710544, + "learning_rate": 2.512708653409674e-06, + "loss": 0.6695, + "step": 770 + }, + { + "epoch": 7.485436893203883, + "grad_norm": 0.5145423159262458, + "learning_rate": 2.507625213064386e-06, + "loss": 0.5038, + "step": 771 + }, + { + "epoch": 7.495145631067961, + "grad_norm": 0.47269595307770995, + "learning_rate": 2.5025417411912307e-06, + "loss": 0.4515, + "step": 772 + }, + { + "epoch": 7.504854368932039, + "grad_norm": 0.5580818879826553, + "learning_rate": 2.4974582588087697e-06, + "loss": 0.6909, + "step": 773 + }, + { + "epoch": 7.514563106796117, + "grad_norm": 0.5100233581400443, + "learning_rate": 2.492374786935614e-06, + "loss": 0.6848, + "step": 774 + }, + { + "epoch": 7.524271844660194, + "grad_norm": 0.5522174720334939, + "learning_rate": 2.487291346590326e-06, + "loss": 0.7382, + "step": 775 + }, + { + "epoch": 7.533980582524272, + "grad_norm": 0.4890510705027455, + "learning_rate": 2.4822079587913414e-06, + "loss": 0.5869, + "step": 776 + }, + { + "epoch": 7.543689320388349, + "grad_norm": 0.518568061347995, + "learning_rate": 2.4771246445568763e-06, + "loss": 0.4913, + "step": 777 + }, + { + "epoch": 7.553398058252427, + "grad_norm": 0.5727634652449346, + "learning_rate": 2.472041424904844e-06, + "loss": 0.5031, + "step": 778 + }, + { + "epoch": 7.563106796116505, + "grad_norm": 0.49387541758342474, + "learning_rate": 2.466958320852766e-06, + "loss": 0.5388, + "step": 779 + }, + { + "epoch": 7.572815533980583, + "grad_norm": 0.5217948655090912, + "learning_rate": 2.4618753534176854e-06, + "loss": 0.6361, + "step": 780 + }, + { + "epoch": 7.58252427184466, + "grad_norm": 0.5332175891151626, + "learning_rate": 2.4567925436160823e-06, + "loss": 0.6208, + "step": 781 + }, + { + "epoch": 7.592233009708738, + "grad_norm": 0.5318535876006302, + "learning_rate": 2.4517099124637824e-06, + "loss": 0.7728, + "step": 782 + }, + { + "epoch": 7.601941747572815, + "grad_norm": 0.5502888061073115, + "learning_rate": 2.4466274809758757e-06, + "loss": 0.6001, + "step": 783 + }, + { + "epoch": 7.611650485436893, + "grad_norm": 0.4936319038862824, + "learning_rate": 2.4415452701666236e-06, + "loss": 0.5221, + "step": 784 + }, + { + "epoch": 7.621359223300971, + "grad_norm": 0.5022890404063105, + "learning_rate": 2.436463301049378e-06, + "loss": 0.6604, + "step": 785 + }, + { + "epoch": 7.631067961165049, + "grad_norm": 0.5272732094633668, + "learning_rate": 2.431381594636488e-06, + "loss": 0.6919, + "step": 786 + }, + { + "epoch": 7.640776699029126, + "grad_norm": 0.5919653641625273, + "learning_rate": 2.42630017193922e-06, + "loss": 0.8888, + "step": 787 + }, + { + "epoch": 7.650485436893204, + "grad_norm": 0.5004825929075615, + "learning_rate": 2.4212190539676667e-06, + "loss": 0.3034, + "step": 788 + }, + { + "epoch": 7.660194174757281, + "grad_norm": 0.5188376934752825, + "learning_rate": 2.4161382617306585e-06, + "loss": 0.5967, + "step": 789 + }, + { + "epoch": 7.669902912621359, + "grad_norm": 0.50648375774632, + "learning_rate": 2.4110578162356814e-06, + "loss": 0.5533, + "step": 790 + }, + { + "epoch": 7.679611650485437, + "grad_norm": 0.4995699684525013, + "learning_rate": 2.405977738488786e-06, + "loss": 0.5644, + "step": 791 + }, + { + "epoch": 7.689320388349515, + "grad_norm": 0.5177586536034464, + "learning_rate": 2.4008980494945044e-06, + "loss": 0.6069, + "step": 792 + }, + { + "epoch": 7.699029126213592, + "grad_norm": 0.5143304969975345, + "learning_rate": 2.3958187702557587e-06, + "loss": 0.4916, + "step": 793 + }, + { + "epoch": 7.70873786407767, + "grad_norm": 0.5576884681772939, + "learning_rate": 2.39073992177378e-06, + "loss": 0.4913, + "step": 794 + }, + { + "epoch": 7.718446601941747, + "grad_norm": 0.5320452230495075, + "learning_rate": 2.385661525048014e-06, + "loss": 0.6293, + "step": 795 + }, + { + "epoch": 7.728155339805825, + "grad_norm": 0.7020840522130559, + "learning_rate": 2.3805836010760435e-06, + "loss": 0.6646, + "step": 796 + }, + { + "epoch": 7.737864077669903, + "grad_norm": 0.5230839966091113, + "learning_rate": 2.375506170853494e-06, + "loss": 0.5772, + "step": 797 + }, + { + "epoch": 7.747572815533981, + "grad_norm": 0.5734597379091115, + "learning_rate": 2.3704292553739487e-06, + "loss": 0.4759, + "step": 798 + }, + { + "epoch": 7.757281553398058, + "grad_norm": 0.5117329377962575, + "learning_rate": 2.3653528756288636e-06, + "loss": 0.4828, + "step": 799 + }, + { + "epoch": 7.766990291262136, + "grad_norm": 0.47543907618671766, + "learning_rate": 2.3602770526074804e-06, + "loss": 0.6375, + "step": 800 + }, + { + "epoch": 7.776699029126213, + "grad_norm": 0.5109119486237039, + "learning_rate": 2.3552018072967375e-06, + "loss": 0.6483, + "step": 801 + }, + { + "epoch": 7.786407766990291, + "grad_norm": 0.5323210647105892, + "learning_rate": 2.3501271606811848e-06, + "loss": 0.4874, + "step": 802 + }, + { + "epoch": 7.796116504854369, + "grad_norm": 0.5730319631624969, + "learning_rate": 2.345053133742898e-06, + "loss": 0.7191, + "step": 803 + }, + { + "epoch": 7.805825242718447, + "grad_norm": 0.5071812370282966, + "learning_rate": 2.3399797474613894e-06, + "loss": 0.3434, + "step": 804 + }, + { + "epoch": 7.815533980582524, + "grad_norm": 0.5461829256062627, + "learning_rate": 2.334907022813523e-06, + "loss": 0.485, + "step": 805 + }, + { + "epoch": 7.825242718446602, + "grad_norm": 0.5636593344058092, + "learning_rate": 2.329834980773427e-06, + "loss": 0.5785, + "step": 806 + }, + { + "epoch": 7.834951456310679, + "grad_norm": 0.5773502744704323, + "learning_rate": 2.324763642312407e-06, + "loss": 0.6205, + "step": 807 + }, + { + "epoch": 7.844660194174757, + "grad_norm": 0.48736301511057184, + "learning_rate": 2.3196930283988603e-06, + "loss": 0.6431, + "step": 808 + }, + { + "epoch": 7.854368932038835, + "grad_norm": 0.5161473184954976, + "learning_rate": 2.3146231599981865e-06, + "loss": 0.5869, + "step": 809 + }, + { + "epoch": 7.864077669902913, + "grad_norm": 0.5499369220986959, + "learning_rate": 2.3095540580727054e-06, + "loss": 0.7587, + "step": 810 + }, + { + "epoch": 7.87378640776699, + "grad_norm": 0.5625597573563249, + "learning_rate": 2.304485743581566e-06, + "loss": 0.6425, + "step": 811 + }, + { + "epoch": 7.883495145631068, + "grad_norm": 0.5629938236856635, + "learning_rate": 2.299418237480661e-06, + "loss": 0.4925, + "step": 812 + }, + { + "epoch": 7.893203883495145, + "grad_norm": 0.49533812712631564, + "learning_rate": 2.294351560722542e-06, + "loss": 0.6207, + "step": 813 + }, + { + "epoch": 7.902912621359223, + "grad_norm": 0.6428692294617694, + "learning_rate": 2.2892857342563316e-06, + "loss": 0.4045, + "step": 814 + }, + { + "epoch": 7.9126213592233015, + "grad_norm": 0.5474333139164419, + "learning_rate": 2.2842207790276355e-06, + "loss": 0.6509, + "step": 815 + }, + { + "epoch": 7.922330097087379, + "grad_norm": 0.5193991986662039, + "learning_rate": 2.279156715978457e-06, + "loss": 0.7148, + "step": 816 + }, + { + "epoch": 7.932038834951456, + "grad_norm": 0.499433426250342, + "learning_rate": 2.274093566047113e-06, + "loss": 0.5728, + "step": 817 + }, + { + "epoch": 7.941747572815534, + "grad_norm": 0.5512612849904238, + "learning_rate": 2.2690313501681426e-06, + "loss": 0.5075, + "step": 818 + }, + { + "epoch": 7.951456310679612, + "grad_norm": 0.5208798190855127, + "learning_rate": 2.263970089272223e-06, + "loss": 0.5729, + "step": 819 + }, + { + "epoch": 7.961165048543689, + "grad_norm": 0.5298051120860908, + "learning_rate": 2.2589098042860838e-06, + "loss": 0.5755, + "step": 820 + }, + { + "epoch": 7.970873786407767, + "grad_norm": 0.6511172387364248, + "learning_rate": 2.2538505161324186e-06, + "loss": 0.4285, + "step": 821 + }, + { + "epoch": 7.980582524271845, + "grad_norm": 0.5520636005584219, + "learning_rate": 2.2487922457298007e-06, + "loss": 0.4915, + "step": 822 + }, + { + "epoch": 7.990291262135923, + "grad_norm": 0.6008177376268479, + "learning_rate": 2.243735013992593e-06, + "loss": 0.7512, + "step": 823 + }, + { + "epoch": 8.0, + "grad_norm": 0.5164857595576907, + "learning_rate": 2.238678841830867e-06, + "loss": 0.6132, + "step": 824 + }, + { + "epoch": 8.009708737864077, + "grad_norm": 0.5927201492916486, + "learning_rate": 2.2336237501503103e-06, + "loss": 0.4526, + "step": 825 + }, + { + "epoch": 8.019417475728156, + "grad_norm": 0.5149776302383433, + "learning_rate": 2.2285697598521446e-06, + "loss": 0.5165, + "step": 826 + }, + { + "epoch": 8.029126213592233, + "grad_norm": 0.4668940317507629, + "learning_rate": 2.2235168918330374e-06, + "loss": 0.567, + "step": 827 + }, + { + "epoch": 8.03883495145631, + "grad_norm": 0.5154594445027514, + "learning_rate": 2.2184651669850164e-06, + "loss": 0.7516, + "step": 828 + }, + { + "epoch": 8.048543689320388, + "grad_norm": 0.4940207401959282, + "learning_rate": 2.2134146061953814e-06, + "loss": 0.2917, + "step": 829 + }, + { + "epoch": 8.058252427184467, + "grad_norm": 0.5178752504635592, + "learning_rate": 2.2083652303466196e-06, + "loss": 0.8172, + "step": 830 + }, + { + "epoch": 8.067961165048544, + "grad_norm": 0.546827519101471, + "learning_rate": 2.20331706031632e-06, + "loss": 0.7023, + "step": 831 + }, + { + "epoch": 8.077669902912621, + "grad_norm": 0.4582055070562166, + "learning_rate": 2.1982701169770853e-06, + "loss": 0.4083, + "step": 832 + }, + { + "epoch": 8.087378640776699, + "grad_norm": 0.4683301177297805, + "learning_rate": 2.1932244211964456e-06, + "loss": 0.5582, + "step": 833 + }, + { + "epoch": 8.097087378640778, + "grad_norm": 0.5178375602465826, + "learning_rate": 2.1881799938367735e-06, + "loss": 0.356, + "step": 834 + }, + { + "epoch": 8.106796116504855, + "grad_norm": 0.504467565940944, + "learning_rate": 2.1831368557551962e-06, + "loss": 0.4254, + "step": 835 + }, + { + "epoch": 8.116504854368932, + "grad_norm": 0.5226565525141085, + "learning_rate": 2.1780950278035114e-06, + "loss": 0.6666, + "step": 836 + }, + { + "epoch": 8.12621359223301, + "grad_norm": 0.587266082983635, + "learning_rate": 2.173054530828098e-06, + "loss": 0.4792, + "step": 837 + }, + { + "epoch": 8.135922330097088, + "grad_norm": 0.5562710685791401, + "learning_rate": 2.168015385669833e-06, + "loss": 0.4674, + "step": 838 + }, + { + "epoch": 8.145631067961165, + "grad_norm": 0.5181070514066362, + "learning_rate": 2.162977613164005e-06, + "loss": 0.7922, + "step": 839 + }, + { + "epoch": 8.155339805825243, + "grad_norm": 0.5332598920234418, + "learning_rate": 2.157941234140225e-06, + "loss": 0.8334, + "step": 840 + }, + { + "epoch": 8.16504854368932, + "grad_norm": 0.508519951977822, + "learning_rate": 2.1529062694223437e-06, + "loss": 0.562, + "step": 841 + }, + { + "epoch": 8.174757281553399, + "grad_norm": 0.5623882061470568, + "learning_rate": 2.147872739828364e-06, + "loss": 0.3732, + "step": 842 + }, + { + "epoch": 8.184466019417476, + "grad_norm": 0.581097080186488, + "learning_rate": 2.142840666170354e-06, + "loss": 0.3534, + "step": 843 + }, + { + "epoch": 8.194174757281553, + "grad_norm": 0.5552223251444132, + "learning_rate": 2.1378100692543637e-06, + "loss": 0.5417, + "step": 844 + }, + { + "epoch": 8.20388349514563, + "grad_norm": 0.6036836009498026, + "learning_rate": 2.1327809698803354e-06, + "loss": 0.6192, + "step": 845 + }, + { + "epoch": 8.21359223300971, + "grad_norm": 0.4918145204654767, + "learning_rate": 2.1277533888420203e-06, + "loss": 0.5632, + "step": 846 + }, + { + "epoch": 8.223300970873787, + "grad_norm": 0.54420380543783, + "learning_rate": 2.1227273469268932e-06, + "loss": 0.5219, + "step": 847 + }, + { + "epoch": 8.233009708737864, + "grad_norm": 0.5919068430953334, + "learning_rate": 2.117702864916063e-06, + "loss": 0.4869, + "step": 848 + }, + { + "epoch": 8.242718446601941, + "grad_norm": 0.5748938854690556, + "learning_rate": 2.1126799635841897e-06, + "loss": 0.6345, + "step": 849 + }, + { + "epoch": 8.25242718446602, + "grad_norm": 0.5669754510278896, + "learning_rate": 2.1076586636993975e-06, + "loss": 0.6422, + "step": 850 + }, + { + "epoch": 8.262135922330097, + "grad_norm": 0.5578707537573963, + "learning_rate": 2.102638986023189e-06, + "loss": 0.5943, + "step": 851 + }, + { + "epoch": 8.271844660194175, + "grad_norm": 0.5244178006438305, + "learning_rate": 2.0976209513103604e-06, + "loss": 0.5996, + "step": 852 + }, + { + "epoch": 8.281553398058252, + "grad_norm": 0.4735653903118842, + "learning_rate": 2.0926045803089135e-06, + "loss": 0.5118, + "step": 853 + }, + { + "epoch": 8.29126213592233, + "grad_norm": 0.5610192165309852, + "learning_rate": 2.087589893759972e-06, + "loss": 0.6315, + "step": 854 + }, + { + "epoch": 8.300970873786408, + "grad_norm": 0.586016198505159, + "learning_rate": 2.0825769123976954e-06, + "loss": 0.484, + "step": 855 + }, + { + "epoch": 8.310679611650485, + "grad_norm": 0.5474063939519695, + "learning_rate": 2.077565656949191e-06, + "loss": 0.6091, + "step": 856 + }, + { + "epoch": 8.320388349514563, + "grad_norm": 0.5292483811678337, + "learning_rate": 2.072556148134431e-06, + "loss": 0.6521, + "step": 857 + }, + { + "epoch": 8.330097087378642, + "grad_norm": 0.4874932647375569, + "learning_rate": 2.0675484066661666e-06, + "loss": 0.5259, + "step": 858 + }, + { + "epoch": 8.339805825242719, + "grad_norm": 0.4851303580845472, + "learning_rate": 2.0625424532498407e-06, + "loss": 0.4161, + "step": 859 + }, + { + "epoch": 8.349514563106796, + "grad_norm": 0.5545793735034753, + "learning_rate": 2.057538308583502e-06, + "loss": 0.5025, + "step": 860 + }, + { + "epoch": 8.359223300970873, + "grad_norm": 0.5080809023551803, + "learning_rate": 2.0525359933577243e-06, + "loss": 0.8568, + "step": 861 + }, + { + "epoch": 8.368932038834952, + "grad_norm": 0.46911329394510076, + "learning_rate": 2.047535528255514e-06, + "loss": 0.6254, + "step": 862 + }, + { + "epoch": 8.37864077669903, + "grad_norm": 0.48403193573667014, + "learning_rate": 2.0425369339522276e-06, + "loss": 0.3043, + "step": 863 + }, + { + "epoch": 8.388349514563107, + "grad_norm": 0.5370398217757195, + "learning_rate": 2.0375402311154886e-06, + "loss": 0.5867, + "step": 864 + }, + { + "epoch": 8.398058252427184, + "grad_norm": 0.6080681817574075, + "learning_rate": 2.0325454404050983e-06, + "loss": 0.3763, + "step": 865 + }, + { + "epoch": 8.407766990291263, + "grad_norm": 0.530337907125514, + "learning_rate": 2.0275525824729523e-06, + "loss": 0.7515, + "step": 866 + }, + { + "epoch": 8.41747572815534, + "grad_norm": 0.4954396437597861, + "learning_rate": 2.022561677962955e-06, + "loss": 0.5687, + "step": 867 + }, + { + "epoch": 8.427184466019417, + "grad_norm": 0.5599603984905624, + "learning_rate": 2.017572747510934e-06, + "loss": 0.4983, + "step": 868 + }, + { + "epoch": 8.436893203883495, + "grad_norm": 0.5593697940698449, + "learning_rate": 2.012585811744555e-06, + "loss": 0.6475, + "step": 869 + }, + { + "epoch": 8.446601941747574, + "grad_norm": 0.5608755353144496, + "learning_rate": 2.0076008912832355e-06, + "loss": 0.7165, + "step": 870 + }, + { + "epoch": 8.45631067961165, + "grad_norm": 0.49618292660157465, + "learning_rate": 2.002618006738061e-06, + "loss": 0.6876, + "step": 871 + }, + { + "epoch": 8.466019417475728, + "grad_norm": 0.52539938762679, + "learning_rate": 1.9976371787116992e-06, + "loss": 0.6282, + "step": 872 + }, + { + "epoch": 8.475728155339805, + "grad_norm": 0.6197757961176346, + "learning_rate": 1.9926584277983134e-06, + "loss": 0.4668, + "step": 873 + }, + { + "epoch": 8.485436893203884, + "grad_norm": 0.5982098576330306, + "learning_rate": 1.9876817745834805e-06, + "loss": 0.7359, + "step": 874 + }, + { + "epoch": 8.495145631067961, + "grad_norm": 0.5184171696351032, + "learning_rate": 1.9827072396441044e-06, + "loss": 0.629, + "step": 875 + }, + { + "epoch": 8.504854368932039, + "grad_norm": 0.5704731732012165, + "learning_rate": 1.9777348435483285e-06, + "loss": 0.5434, + "step": 876 + }, + { + "epoch": 8.514563106796116, + "grad_norm": 0.5914779709255895, + "learning_rate": 1.972764606855454e-06, + "loss": 0.8657, + "step": 877 + }, + { + "epoch": 8.524271844660195, + "grad_norm": 0.4998262061570929, + "learning_rate": 1.9677965501158534e-06, + "loss": 0.5069, + "step": 878 + }, + { + "epoch": 8.533980582524272, + "grad_norm": 0.5060351233776778, + "learning_rate": 1.9628306938708857e-06, + "loss": 0.5947, + "step": 879 + }, + { + "epoch": 8.54368932038835, + "grad_norm": 0.477718552398954, + "learning_rate": 1.957867058652812e-06, + "loss": 0.5787, + "step": 880 + }, + { + "epoch": 8.553398058252426, + "grad_norm": 0.5163268989253287, + "learning_rate": 1.952905664984709e-06, + "loss": 0.6632, + "step": 881 + }, + { + "epoch": 8.563106796116505, + "grad_norm": 0.5530459189954213, + "learning_rate": 1.947946533380387e-06, + "loss": 0.455, + "step": 882 + }, + { + "epoch": 8.572815533980583, + "grad_norm": 0.5116903651673903, + "learning_rate": 1.9429896843443025e-06, + "loss": 0.4705, + "step": 883 + }, + { + "epoch": 8.58252427184466, + "grad_norm": 0.480475135243548, + "learning_rate": 1.938035138371474e-06, + "loss": 0.7274, + "step": 884 + }, + { + "epoch": 8.592233009708737, + "grad_norm": 0.5297320744221382, + "learning_rate": 1.933082915947398e-06, + "loss": 0.4558, + "step": 885 + }, + { + "epoch": 8.601941747572816, + "grad_norm": 0.505128857471151, + "learning_rate": 1.928133037547963e-06, + "loss": 0.3586, + "step": 886 + }, + { + "epoch": 8.611650485436893, + "grad_norm": 0.5928628788232783, + "learning_rate": 1.9231855236393677e-06, + "loss": 0.6312, + "step": 887 + }, + { + "epoch": 8.62135922330097, + "grad_norm": 0.649524612781142, + "learning_rate": 1.9182403946780316e-06, + "loss": 0.6427, + "step": 888 + }, + { + "epoch": 8.631067961165048, + "grad_norm": 0.5876898341916988, + "learning_rate": 1.9132976711105146e-06, + "loss": 0.6925, + "step": 889 + }, + { + "epoch": 8.640776699029127, + "grad_norm": 0.58652849068603, + "learning_rate": 1.9083573733734328e-06, + "loss": 0.6009, + "step": 890 + }, + { + "epoch": 8.650485436893204, + "grad_norm": 0.5556588390992402, + "learning_rate": 1.903419521893369e-06, + "loss": 0.5995, + "step": 891 + }, + { + "epoch": 8.660194174757281, + "grad_norm": 0.5220315347935208, + "learning_rate": 1.898484137086794e-06, + "loss": 0.5321, + "step": 892 + }, + { + "epoch": 8.669902912621358, + "grad_norm": 0.569643001305556, + "learning_rate": 1.8935512393599784e-06, + "loss": 0.4225, + "step": 893 + }, + { + "epoch": 8.679611650485437, + "grad_norm": 0.5373129439323331, + "learning_rate": 1.8886208491089095e-06, + "loss": 0.5555, + "step": 894 + }, + { + "epoch": 8.689320388349515, + "grad_norm": 0.5400895369165816, + "learning_rate": 1.8836929867192077e-06, + "loss": 0.4617, + "step": 895 + }, + { + "epoch": 8.699029126213592, + "grad_norm": 0.487412932412484, + "learning_rate": 1.8787676725660405e-06, + "loss": 0.4752, + "step": 896 + }, + { + "epoch": 8.70873786407767, + "grad_norm": 0.48707203079572564, + "learning_rate": 1.8738449270140404e-06, + "loss": 0.6907, + "step": 897 + }, + { + "epoch": 8.718446601941748, + "grad_norm": 0.48910391480682963, + "learning_rate": 1.8689247704172187e-06, + "loss": 0.5326, + "step": 898 + }, + { + "epoch": 8.728155339805825, + "grad_norm": 0.5147613851501489, + "learning_rate": 1.8640072231188825e-06, + "loss": 0.5996, + "step": 899 + }, + { + "epoch": 8.737864077669903, + "grad_norm": 0.4917113961710611, + "learning_rate": 1.8590923054515504e-06, + "loss": 0.5886, + "step": 900 + }, + { + "epoch": 8.74757281553398, + "grad_norm": 0.5083586615609125, + "learning_rate": 1.8541800377368673e-06, + "loss": 0.6473, + "step": 901 + }, + { + "epoch": 8.757281553398059, + "grad_norm": 0.5294044154978094, + "learning_rate": 1.8492704402855229e-06, + "loss": 0.6822, + "step": 902 + }, + { + "epoch": 8.766990291262136, + "grad_norm": 0.5804348364355945, + "learning_rate": 1.8443635333971643e-06, + "loss": 0.421, + "step": 903 + }, + { + "epoch": 8.776699029126213, + "grad_norm": 0.5203615517682456, + "learning_rate": 1.8394593373603173e-06, + "loss": 0.6694, + "step": 904 + }, + { + "epoch": 8.78640776699029, + "grad_norm": 0.5109166593919562, + "learning_rate": 1.8345578724522957e-06, + "loss": 0.5472, + "step": 905 + }, + { + "epoch": 8.79611650485437, + "grad_norm": 0.560325487440104, + "learning_rate": 1.8296591589391227e-06, + "loss": 0.5814, + "step": 906 + }, + { + "epoch": 8.805825242718447, + "grad_norm": 0.5421411914919385, + "learning_rate": 1.8247632170754443e-06, + "loss": 0.5985, + "step": 907 + }, + { + "epoch": 8.815533980582524, + "grad_norm": 0.5770835806642348, + "learning_rate": 1.8198700671044477e-06, + "loss": 0.5228, + "step": 908 + }, + { + "epoch": 8.825242718446601, + "grad_norm": 0.4895076081734303, + "learning_rate": 1.8149797292577757e-06, + "loss": 0.6435, + "step": 909 + }, + { + "epoch": 8.83495145631068, + "grad_norm": 0.4873199553388694, + "learning_rate": 1.8100922237554442e-06, + "loss": 0.6997, + "step": 910 + }, + { + "epoch": 8.844660194174757, + "grad_norm": 0.5490067414210736, + "learning_rate": 1.8052075708057581e-06, + "loss": 0.3939, + "step": 911 + }, + { + "epoch": 8.854368932038835, + "grad_norm": 0.6321945383667227, + "learning_rate": 1.8003257906052284e-06, + "loss": 0.4784, + "step": 912 + }, + { + "epoch": 8.864077669902912, + "grad_norm": 0.604869131460347, + "learning_rate": 1.7954469033384868e-06, + "loss": 0.5676, + "step": 913 + }, + { + "epoch": 8.87378640776699, + "grad_norm": 0.5447552422923585, + "learning_rate": 1.790570929178206e-06, + "loss": 0.713, + "step": 914 + }, + { + "epoch": 8.883495145631068, + "grad_norm": 0.5302556571289168, + "learning_rate": 1.7856978882850112e-06, + "loss": 0.5198, + "step": 915 + }, + { + "epoch": 8.893203883495145, + "grad_norm": 0.4973051756135799, + "learning_rate": 1.780827800807401e-06, + "loss": 0.5464, + "step": 916 + }, + { + "epoch": 8.902912621359224, + "grad_norm": 0.6496512810930873, + "learning_rate": 1.7759606868816623e-06, + "loss": 0.6319, + "step": 917 + }, + { + "epoch": 8.912621359223301, + "grad_norm": 0.5065859495112356, + "learning_rate": 1.771096566631788e-06, + "loss": 0.4308, + "step": 918 + }, + { + "epoch": 8.922330097087379, + "grad_norm": 0.5437885245827361, + "learning_rate": 1.766235460169392e-06, + "loss": 0.6901, + "step": 919 + }, + { + "epoch": 8.932038834951456, + "grad_norm": 0.6349112557712641, + "learning_rate": 1.7613773875936274e-06, + "loss": 0.5075, + "step": 920 + }, + { + "epoch": 8.941747572815533, + "grad_norm": 0.5653114938530557, + "learning_rate": 1.7565223689911038e-06, + "loss": 0.5145, + "step": 921 + }, + { + "epoch": 8.951456310679612, + "grad_norm": 0.5191163631219877, + "learning_rate": 1.7516704244358018e-06, + "loss": 0.4828, + "step": 922 + }, + { + "epoch": 8.96116504854369, + "grad_norm": 0.49692045826136555, + "learning_rate": 1.7468215739889941e-06, + "loss": 0.4133, + "step": 923 + }, + { + "epoch": 8.970873786407767, + "grad_norm": 0.580247056070986, + "learning_rate": 1.741975837699158e-06, + "loss": 0.6603, + "step": 924 + }, + { + "epoch": 8.980582524271846, + "grad_norm": 0.562012613485115, + "learning_rate": 1.7371332356018972e-06, + "loss": 0.4871, + "step": 925 + }, + { + "epoch": 8.990291262135923, + "grad_norm": 0.598353497274599, + "learning_rate": 1.7322937877198545e-06, + "loss": 0.606, + "step": 926 + }, + { + "epoch": 9.0, + "grad_norm": 0.5771508102246277, + "learning_rate": 1.7274575140626318e-06, + "loss": 0.44, + "step": 927 + }, + { + "epoch": 9.009708737864077, + "grad_norm": 0.6340958349337429, + "learning_rate": 1.7226244346267063e-06, + "loss": 0.5384, + "step": 928 + }, + { + "epoch": 9.019417475728156, + "grad_norm": 0.5464524313638762, + "learning_rate": 1.7177945693953486e-06, + "loss": 0.5522, + "step": 929 + }, + { + "epoch": 9.029126213592233, + "grad_norm": 0.509758461452797, + "learning_rate": 1.7129679383385384e-06, + "loss": 0.5649, + "step": 930 + }, + { + "epoch": 9.03883495145631, + "grad_norm": 0.6123362850624466, + "learning_rate": 1.7081445614128845e-06, + "loss": 0.4895, + "step": 931 + }, + { + "epoch": 9.048543689320388, + "grad_norm": 0.5077335227225634, + "learning_rate": 1.7033244585615393e-06, + "loss": 0.612, + "step": 932 + }, + { + "epoch": 9.058252427184467, + "grad_norm": 0.5392346989255187, + "learning_rate": 1.698507649714121e-06, + "loss": 0.7241, + "step": 933 + }, + { + "epoch": 9.067961165048544, + "grad_norm": 0.5141203869994445, + "learning_rate": 1.6936941547866248e-06, + "loss": 0.4577, + "step": 934 + }, + { + "epoch": 9.077669902912621, + "grad_norm": 0.5106376436372138, + "learning_rate": 1.688883993681345e-06, + "loss": 0.5075, + "step": 935 + }, + { + "epoch": 9.087378640776699, + "grad_norm": 0.5179058552152446, + "learning_rate": 1.6840771862867922e-06, + "loss": 0.5869, + "step": 936 + }, + { + "epoch": 9.097087378640778, + "grad_norm": 0.4977753472362273, + "learning_rate": 1.6792737524776093e-06, + "loss": 0.6939, + "step": 937 + }, + { + "epoch": 9.106796116504855, + "grad_norm": 0.5543634504769569, + "learning_rate": 1.674473712114492e-06, + "loss": 0.4248, + "step": 938 + }, + { + "epoch": 9.116504854368932, + "grad_norm": 0.5846797847400803, + "learning_rate": 1.6696770850441036e-06, + "loss": 0.458, + "step": 939 + }, + { + "epoch": 9.12621359223301, + "grad_norm": 0.5457809797121611, + "learning_rate": 1.6648838910989955e-06, + "loss": 0.7876, + "step": 940 + }, + { + "epoch": 9.135922330097088, + "grad_norm": 0.5144940860816298, + "learning_rate": 1.6600941500975237e-06, + "loss": 0.5822, + "step": 941 + }, + { + "epoch": 9.145631067961165, + "grad_norm": 0.5046446642153924, + "learning_rate": 1.6553078818437678e-06, + "loss": 0.6312, + "step": 942 + }, + { + "epoch": 9.155339805825243, + "grad_norm": 0.5401161645711199, + "learning_rate": 1.6505251061274492e-06, + "loss": 0.7804, + "step": 943 + }, + { + "epoch": 9.16504854368932, + "grad_norm": 0.6409458250404014, + "learning_rate": 1.6457458427238464e-06, + "loss": 0.6905, + "step": 944 + }, + { + "epoch": 9.174757281553399, + "grad_norm": 0.5273386933789973, + "learning_rate": 1.6409701113937182e-06, + "loss": 0.7029, + "step": 945 + }, + { + "epoch": 9.184466019417476, + "grad_norm": 0.5109661246472791, + "learning_rate": 1.6361979318832173e-06, + "loss": 0.4894, + "step": 946 + }, + { + "epoch": 9.194174757281553, + "grad_norm": 0.5511532623064545, + "learning_rate": 1.6314293239238134e-06, + "loss": 0.5167, + "step": 947 + }, + { + "epoch": 9.20388349514563, + "grad_norm": 0.6610489196315156, + "learning_rate": 1.626664307232207e-06, + "loss": 0.5094, + "step": 948 + }, + { + "epoch": 9.21359223300971, + "grad_norm": 0.5736305051432473, + "learning_rate": 1.62190290151025e-06, + "loss": 0.6754, + "step": 949 + }, + { + "epoch": 9.223300970873787, + "grad_norm": 0.5279615724891207, + "learning_rate": 1.617145126444864e-06, + "loss": 0.5783, + "step": 950 + }, + { + "epoch": 9.233009708737864, + "grad_norm": 0.5163456645251102, + "learning_rate": 1.6123910017079591e-06, + "loss": 0.6769, + "step": 951 + }, + { + "epoch": 9.242718446601941, + "grad_norm": 0.5833509127515675, + "learning_rate": 1.6076405469563533e-06, + "loss": 0.3916, + "step": 952 + }, + { + "epoch": 9.25242718446602, + "grad_norm": 0.4895578621974047, + "learning_rate": 1.6028937818316889e-06, + "loss": 0.6144, + "step": 953 + }, + { + "epoch": 9.262135922330097, + "grad_norm": 0.5425400198843934, + "learning_rate": 1.598150725960354e-06, + "loss": 0.2586, + "step": 954 + }, + { + "epoch": 9.271844660194175, + "grad_norm": 0.5501148487291081, + "learning_rate": 1.5934113989533992e-06, + "loss": 0.6158, + "step": 955 + }, + { + "epoch": 9.281553398058252, + "grad_norm": 0.5544881707377486, + "learning_rate": 1.5886758204064582e-06, + "loss": 0.5528, + "step": 956 + }, + { + "epoch": 9.29126213592233, + "grad_norm": 0.48248060557065603, + "learning_rate": 1.583944009899665e-06, + "loss": 0.2886, + "step": 957 + }, + { + "epoch": 9.300970873786408, + "grad_norm": 0.6941081918833938, + "learning_rate": 1.579215986997575e-06, + "loss": 0.4662, + "step": 958 + }, + { + "epoch": 9.310679611650485, + "grad_norm": 0.5465640944202089, + "learning_rate": 1.5744917712490821e-06, + "loss": 0.6941, + "step": 959 + }, + { + "epoch": 9.320388349514563, + "grad_norm": 0.5319670471627715, + "learning_rate": 1.5697713821873401e-06, + "loss": 0.4341, + "step": 960 + }, + { + "epoch": 9.330097087378642, + "grad_norm": 0.48885465810429257, + "learning_rate": 1.5650548393296788e-06, + "loss": 0.4523, + "step": 961 + }, + { + "epoch": 9.339805825242719, + "grad_norm": 0.7022004361946983, + "learning_rate": 1.5603421621775273e-06, + "loss": 0.629, + "step": 962 + }, + { + "epoch": 9.349514563106796, + "grad_norm": 0.5632088911699985, + "learning_rate": 1.555633370216329e-06, + "loss": 0.6666, + "step": 963 + }, + { + "epoch": 9.359223300970873, + "grad_norm": 0.49380965838592933, + "learning_rate": 1.5509284829154652e-06, + "loss": 0.4755, + "step": 964 + }, + { + "epoch": 9.368932038834952, + "grad_norm": 0.5020708429363245, + "learning_rate": 1.5462275197281717e-06, + "loss": 0.5441, + "step": 965 + }, + { + "epoch": 9.37864077669903, + "grad_norm": 0.5195307949196531, + "learning_rate": 1.5415305000914587e-06, + "loss": 0.7319, + "step": 966 + }, + { + "epoch": 9.388349514563107, + "grad_norm": 0.48395870668183194, + "learning_rate": 1.536837443426032e-06, + "loss": 0.3737, + "step": 967 + }, + { + "epoch": 9.398058252427184, + "grad_norm": 0.6288833157472449, + "learning_rate": 1.5321483691362121e-06, + "loss": 0.3642, + "step": 968 + }, + { + "epoch": 9.407766990291263, + "grad_norm": 0.7231122831566812, + "learning_rate": 1.5274632966098527e-06, + "loss": 0.4566, + "step": 969 + }, + { + "epoch": 9.41747572815534, + "grad_norm": 0.5043165423307169, + "learning_rate": 1.5227822452182617e-06, + "loss": 0.4137, + "step": 970 + }, + { + "epoch": 9.427184466019417, + "grad_norm": 0.7582603233246429, + "learning_rate": 1.5181052343161212e-06, + "loss": 0.4781, + "step": 971 + }, + { + "epoch": 9.436893203883495, + "grad_norm": 0.5366974044741509, + "learning_rate": 1.5134322832414066e-06, + "loss": 0.4163, + "step": 972 + }, + { + "epoch": 9.446601941747574, + "grad_norm": 0.5799499713204818, + "learning_rate": 1.508763411315308e-06, + "loss": 0.5431, + "step": 973 + }, + { + "epoch": 9.45631067961165, + "grad_norm": 0.5604178353146644, + "learning_rate": 1.5040986378421485e-06, + "loss": 0.4337, + "step": 974 + }, + { + "epoch": 9.466019417475728, + "grad_norm": 0.4513197289000593, + "learning_rate": 1.499437982109305e-06, + "loss": 0.4195, + "step": 975 + }, + { + "epoch": 9.475728155339805, + "grad_norm": 0.6109174978216692, + "learning_rate": 1.4947814633871316e-06, + "loss": 0.6013, + "step": 976 + }, + { + "epoch": 9.485436893203884, + "grad_norm": 0.4822809855329016, + "learning_rate": 1.4901291009288748e-06, + "loss": 0.3593, + "step": 977 + }, + { + "epoch": 9.495145631067961, + "grad_norm": 0.5640776707095488, + "learning_rate": 1.4854809139705961e-06, + "loss": 0.5642, + "step": 978 + }, + { + "epoch": 9.504854368932039, + "grad_norm": 0.5595919599675019, + "learning_rate": 1.4808369217310937e-06, + "loss": 0.5531, + "step": 979 + }, + { + "epoch": 9.514563106796116, + "grad_norm": 0.5302464691788346, + "learning_rate": 1.4761971434118207e-06, + "loss": 0.6372, + "step": 980 + }, + { + "epoch": 9.524271844660195, + "grad_norm": 0.5300165066761687, + "learning_rate": 1.4715615981968088e-06, + "loss": 0.5707, + "step": 981 + }, + { + "epoch": 9.533980582524272, + "grad_norm": 0.4668375811587118, + "learning_rate": 1.4669303052525852e-06, + "loss": 0.4989, + "step": 982 + }, + { + "epoch": 9.54368932038835, + "grad_norm": 0.5560569559935798, + "learning_rate": 1.4623032837280971e-06, + "loss": 0.3493, + "step": 983 + }, + { + "epoch": 9.553398058252426, + "grad_norm": 0.5577549092239438, + "learning_rate": 1.4576805527546293e-06, + "loss": 0.8323, + "step": 984 + }, + { + "epoch": 9.563106796116505, + "grad_norm": 0.5677422426756373, + "learning_rate": 1.4530621314457255e-06, + "loss": 0.6881, + "step": 985 + }, + { + "epoch": 9.572815533980583, + "grad_norm": 0.5279934542850471, + "learning_rate": 1.4484480388971141e-06, + "loss": 0.7699, + "step": 986 + }, + { + "epoch": 9.58252427184466, + "grad_norm": 0.5350476163708038, + "learning_rate": 1.4438382941866224e-06, + "loss": 0.7338, + "step": 987 + }, + { + "epoch": 9.592233009708737, + "grad_norm": 0.5110629848586472, + "learning_rate": 1.4392329163741015e-06, + "loss": 0.5176, + "step": 988 + }, + { + "epoch": 9.601941747572816, + "grad_norm": 0.5687212051187402, + "learning_rate": 1.4346319245013463e-06, + "loss": 0.5527, + "step": 989 + }, + { + "epoch": 9.611650485436893, + "grad_norm": 0.4974392454544525, + "learning_rate": 1.430035337592018e-06, + "loss": 0.4942, + "step": 990 + }, + { + "epoch": 9.62135922330097, + "grad_norm": 0.5629010223404443, + "learning_rate": 1.425443174651564e-06, + "loss": 0.4756, + "step": 991 + }, + { + "epoch": 9.631067961165048, + "grad_norm": 0.5524080467073957, + "learning_rate": 1.4208554546671407e-06, + "loss": 0.6035, + "step": 992 + }, + { + "epoch": 9.640776699029127, + "grad_norm": 0.4804538845483175, + "learning_rate": 1.4162721966075323e-06, + "loss": 0.7249, + "step": 993 + }, + { + "epoch": 9.650485436893204, + "grad_norm": 0.4993730994583247, + "learning_rate": 1.411693419423078e-06, + "loss": 0.3848, + "step": 994 + }, + { + "epoch": 9.660194174757281, + "grad_norm": 0.5354151376861092, + "learning_rate": 1.4071191420455873e-06, + "loss": 0.5479, + "step": 995 + }, + { + "epoch": 9.669902912621358, + "grad_norm": 0.5224179916255741, + "learning_rate": 1.4025493833882645e-06, + "loss": 0.428, + "step": 996 + }, + { + "epoch": 9.679611650485437, + "grad_norm": 0.5441066168724764, + "learning_rate": 1.3979841623456309e-06, + "loss": 0.485, + "step": 997 + }, + { + "epoch": 9.689320388349515, + "grad_norm": 0.5162503314510989, + "learning_rate": 1.3934234977934463e-06, + "loss": 0.6244, + "step": 998 + }, + { + "epoch": 9.699029126213592, + "grad_norm": 0.5088420073204567, + "learning_rate": 1.3888674085886302e-06, + "loss": 0.5502, + "step": 999 + }, + { + "epoch": 9.70873786407767, + "grad_norm": 0.4891658312631955, + "learning_rate": 1.3843159135691859e-06, + "loss": 0.4974, + "step": 1000 + }, + { + "epoch": 9.718446601941748, + "grad_norm": 0.5247239478758943, + "learning_rate": 1.3797690315541193e-06, + "loss": 0.5293, + "step": 1001 + }, + { + "epoch": 9.728155339805825, + "grad_norm": 0.5579036546703239, + "learning_rate": 1.3752267813433645e-06, + "loss": 0.5561, + "step": 1002 + }, + { + "epoch": 9.737864077669903, + "grad_norm": 0.5128549281129826, + "learning_rate": 1.3706891817177036e-06, + "loss": 0.392, + "step": 1003 + }, + { + "epoch": 9.74757281553398, + "grad_norm": 0.5506859233210426, + "learning_rate": 1.3661562514386895e-06, + "loss": 0.5688, + "step": 1004 + }, + { + "epoch": 9.757281553398059, + "grad_norm": 0.5704140162541189, + "learning_rate": 1.3616280092485719e-06, + "loss": 0.6288, + "step": 1005 + }, + { + "epoch": 9.766990291262136, + "grad_norm": 0.5764246997137785, + "learning_rate": 1.357104473870213e-06, + "loss": 0.655, + "step": 1006 + }, + { + "epoch": 9.776699029126213, + "grad_norm": 0.5123137189616399, + "learning_rate": 1.3525856640070156e-06, + "loss": 0.4107, + "step": 1007 + }, + { + "epoch": 9.78640776699029, + "grad_norm": 0.5456395663842477, + "learning_rate": 1.3480715983428433e-06, + "loss": 0.5157, + "step": 1008 + }, + { + "epoch": 9.79611650485437, + "grad_norm": 0.6018889487173135, + "learning_rate": 1.3435622955419447e-06, + "loss": 0.5223, + "step": 1009 + }, + { + "epoch": 9.805825242718447, + "grad_norm": 0.5288879566344424, + "learning_rate": 1.3390577742488747e-06, + "loss": 0.402, + "step": 1010 + }, + { + "epoch": 9.815533980582524, + "grad_norm": 0.5060531265013377, + "learning_rate": 1.334558053088419e-06, + "loss": 0.5625, + "step": 1011 + }, + { + "epoch": 9.825242718446601, + "grad_norm": 0.5983276437902583, + "learning_rate": 1.3300631506655148e-06, + "loss": 0.7724, + "step": 1012 + }, + { + "epoch": 9.83495145631068, + "grad_norm": 0.5421716359030226, + "learning_rate": 1.3255730855651772e-06, + "loss": 0.6375, + "step": 1013 + }, + { + "epoch": 9.844660194174757, + "grad_norm": 0.5137714833067758, + "learning_rate": 1.3210878763524186e-06, + "loss": 0.565, + "step": 1014 + }, + { + "epoch": 9.854368932038835, + "grad_norm": 0.5224752768506955, + "learning_rate": 1.3166075415721762e-06, + "loss": 0.7022, + "step": 1015 + }, + { + "epoch": 9.864077669902912, + "grad_norm": 0.5131878611886128, + "learning_rate": 1.3121320997492305e-06, + "loss": 0.5082, + "step": 1016 + }, + { + "epoch": 9.87378640776699, + "grad_norm": 0.5203915202506901, + "learning_rate": 1.307661569388132e-06, + "loss": 0.538, + "step": 1017 + }, + { + "epoch": 9.883495145631068, + "grad_norm": 0.5259467137163731, + "learning_rate": 1.3031959689731236e-06, + "loss": 0.4584, + "step": 1018 + }, + { + "epoch": 9.893203883495145, + "grad_norm": 0.5245481953235169, + "learning_rate": 1.2987353169680667e-06, + "loss": 0.6089, + "step": 1019 + }, + { + "epoch": 9.902912621359224, + "grad_norm": 0.5303857378086793, + "learning_rate": 1.2942796318163595e-06, + "loss": 0.5585, + "step": 1020 + }, + { + "epoch": 9.912621359223301, + "grad_norm": 0.5041008237960473, + "learning_rate": 1.2898289319408653e-06, + "loss": 0.5354, + "step": 1021 + }, + { + "epoch": 9.922330097087379, + "grad_norm": 0.5552017542716956, + "learning_rate": 1.2853832357438346e-06, + "loss": 0.6052, + "step": 1022 + }, + { + "epoch": 9.932038834951456, + "grad_norm": 0.47893428879532324, + "learning_rate": 1.2809425616068288e-06, + "loss": 0.5291, + "step": 1023 + }, + { + "epoch": 9.941747572815533, + "grad_norm": 0.6660999509501955, + "learning_rate": 1.2765069278906456e-06, + "loss": 0.6806, + "step": 1024 + }, + { + "epoch": 9.951456310679612, + "grad_norm": 0.6474829457761733, + "learning_rate": 1.2720763529352415e-06, + "loss": 0.5242, + "step": 1025 + }, + { + "epoch": 9.96116504854369, + "grad_norm": 0.5121227651736198, + "learning_rate": 1.2676508550596562e-06, + "loss": 0.4134, + "step": 1026 + }, + { + "epoch": 9.970873786407767, + "grad_norm": 0.5377519390532532, + "learning_rate": 1.2632304525619388e-06, + "loss": 0.429, + "step": 1027 + }, + { + "epoch": 9.980582524271846, + "grad_norm": 0.5861629446040602, + "learning_rate": 1.2588151637190687e-06, + "loss": 0.8707, + "step": 1028 + }, + { + "epoch": 9.990291262135923, + "grad_norm": 0.5691854746366819, + "learning_rate": 1.2544050067868834e-06, + "loss": 0.4264, + "step": 1029 + }, + { + "epoch": 10.0, + "grad_norm": 0.7478699669714269, + "learning_rate": 1.2500000000000007e-06, + "loss": 0.5084, + "step": 1030 + }, + { + "epoch": 10.009708737864077, + "grad_norm": 0.46151381211002906, + "learning_rate": 1.2456001615717445e-06, + "loss": 0.5594, + "step": 1031 + }, + { + "epoch": 10.019417475728156, + "grad_norm": 0.5102877537048928, + "learning_rate": 1.2412055096940692e-06, + "loss": 0.4949, + "step": 1032 + }, + { + "epoch": 10.029126213592233, + "grad_norm": 0.5822245866471158, + "learning_rate": 1.2368160625374835e-06, + "loss": 0.797, + "step": 1033 + }, + { + "epoch": 10.03883495145631, + "grad_norm": 0.4956969860994467, + "learning_rate": 1.2324318382509787e-06, + "loss": 0.3893, + "step": 1034 + }, + { + "epoch": 10.048543689320388, + "grad_norm": 0.5208191592763981, + "learning_rate": 1.2280528549619487e-06, + "loss": 0.3821, + "step": 1035 + }, + { + "epoch": 10.058252427184467, + "grad_norm": 0.5448550745622244, + "learning_rate": 1.2236791307761184e-06, + "loss": 0.6886, + "step": 1036 + }, + { + "epoch": 10.067961165048544, + "grad_norm": 0.6039482131792457, + "learning_rate": 1.2193106837774678e-06, + "loss": 0.5576, + "step": 1037 + }, + { + "epoch": 10.077669902912621, + "grad_norm": 0.5479538068077612, + "learning_rate": 1.2149475320281578e-06, + "loss": 0.4806, + "step": 1038 + }, + { + "epoch": 10.087378640776699, + "grad_norm": 0.5705314303946492, + "learning_rate": 1.2105896935684545e-06, + "loss": 0.5369, + "step": 1039 + }, + { + "epoch": 10.097087378640778, + "grad_norm": 0.5271533020970816, + "learning_rate": 1.2062371864166553e-06, + "loss": 0.4633, + "step": 1040 + }, + { + "epoch": 10.106796116504855, + "grad_norm": 0.5456418396000027, + "learning_rate": 1.2018900285690148e-06, + "loss": 0.5174, + "step": 1041 + }, + { + "epoch": 10.116504854368932, + "grad_norm": 0.5368574554830884, + "learning_rate": 1.1975482379996697e-06, + "loss": 0.5409, + "step": 1042 + }, + { + "epoch": 10.12621359223301, + "grad_norm": 0.5035113911073187, + "learning_rate": 1.1932118326605644e-06, + "loss": 0.4153, + "step": 1043 + }, + { + "epoch": 10.135922330097088, + "grad_norm": 0.5630447002734048, + "learning_rate": 1.188880830481377e-06, + "loss": 0.3517, + "step": 1044 + }, + { + "epoch": 10.145631067961165, + "grad_norm": 0.46046579136925914, + "learning_rate": 1.1845552493694462e-06, + "loss": 0.3997, + "step": 1045 + }, + { + "epoch": 10.155339805825243, + "grad_norm": 0.5566449442651117, + "learning_rate": 1.1802351072096948e-06, + "loss": 0.473, + "step": 1046 + }, + { + "epoch": 10.16504854368932, + "grad_norm": 0.5372526771927345, + "learning_rate": 1.1759204218645577e-06, + "loss": 0.4501, + "step": 1047 + }, + { + "epoch": 10.174757281553399, + "grad_norm": 0.6565634479247288, + "learning_rate": 1.1716112111739095e-06, + "loss": 0.4763, + "step": 1048 + }, + { + "epoch": 10.184466019417476, + "grad_norm": 0.5550268695193769, + "learning_rate": 1.167307492954986e-06, + "loss": 0.695, + "step": 1049 + }, + { + "epoch": 10.194174757281553, + "grad_norm": 0.5101472665103614, + "learning_rate": 1.1630092850023148e-06, + "loss": 0.6151, + "step": 1050 + }, + { + "epoch": 10.20388349514563, + "grad_norm": 0.5663960308823205, + "learning_rate": 1.15871660508764e-06, + "loss": 0.7663, + "step": 1051 + }, + { + "epoch": 10.21359223300971, + "grad_norm": 0.5189202841990675, + "learning_rate": 1.1544294709598491e-06, + "loss": 0.7497, + "step": 1052 + }, + { + "epoch": 10.223300970873787, + "grad_norm": 0.5286405812389172, + "learning_rate": 1.1501479003448992e-06, + "loss": 0.6158, + "step": 1053 + }, + { + "epoch": 10.233009708737864, + "grad_norm": 0.5004315366131019, + "learning_rate": 1.1458719109457445e-06, + "loss": 0.4122, + "step": 1054 + }, + { + "epoch": 10.242718446601941, + "grad_norm": 0.5071667711625752, + "learning_rate": 1.141601520442262e-06, + "loss": 0.6746, + "step": 1055 + }, + { + "epoch": 10.25242718446602, + "grad_norm": 0.5057112811278313, + "learning_rate": 1.1373367464911798e-06, + "loss": 0.3907, + "step": 1056 + }, + { + "epoch": 10.262135922330097, + "grad_norm": 0.5310657265619477, + "learning_rate": 1.1330776067260026e-06, + "loss": 0.6539, + "step": 1057 + }, + { + "epoch": 10.271844660194175, + "grad_norm": 0.5114027839132974, + "learning_rate": 1.12882411875694e-06, + "loss": 0.4251, + "step": 1058 + }, + { + "epoch": 10.281553398058252, + "grad_norm": 0.5828286358595048, + "learning_rate": 1.1245763001708326e-06, + "loss": 0.4021, + "step": 1059 + }, + { + "epoch": 10.29126213592233, + "grad_norm": 0.4828687846960337, + "learning_rate": 1.120334168531081e-06, + "loss": 0.5349, + "step": 1060 + }, + { + "epoch": 10.300970873786408, + "grad_norm": 0.5241573566112172, + "learning_rate": 1.1160977413775704e-06, + "loss": 0.6077, + "step": 1061 + }, + { + "epoch": 10.310679611650485, + "grad_norm": 0.5066991756324611, + "learning_rate": 1.1118670362266003e-06, + "loss": 0.4968, + "step": 1062 + }, + { + "epoch": 10.320388349514563, + "grad_norm": 0.5118698345971918, + "learning_rate": 1.1076420705708137e-06, + "loss": 0.4295, + "step": 1063 + }, + { + "epoch": 10.330097087378642, + "grad_norm": 0.5509488966267911, + "learning_rate": 1.1034228618791197e-06, + "loss": 0.3386, + "step": 1064 + }, + { + "epoch": 10.339805825242719, + "grad_norm": 0.6303421970689016, + "learning_rate": 1.0992094275966256e-06, + "loss": 0.3737, + "step": 1065 + }, + { + "epoch": 10.349514563106796, + "grad_norm": 0.5369953531447155, + "learning_rate": 1.0950017851445624e-06, + "loss": 0.6595, + "step": 1066 + }, + { + "epoch": 10.359223300970873, + "grad_norm": 0.6048647515454525, + "learning_rate": 1.0907999519202142e-06, + "loss": 0.5782, + "step": 1067 + }, + { + "epoch": 10.368932038834952, + "grad_norm": 0.5011839711044318, + "learning_rate": 1.0866039452968464e-06, + "loss": 0.4836, + "step": 1068 + }, + { + "epoch": 10.37864077669903, + "grad_norm": 0.549369060527332, + "learning_rate": 1.0824137826236318e-06, + "loss": 0.3571, + "step": 1069 + }, + { + "epoch": 10.388349514563107, + "grad_norm": 0.5046590280856795, + "learning_rate": 1.078229481225582e-06, + "loss": 0.381, + "step": 1070 + }, + { + "epoch": 10.398058252427184, + "grad_norm": 0.49366158846755176, + "learning_rate": 1.074051058403472e-06, + "loss": 0.414, + "step": 1071 + }, + { + "epoch": 10.407766990291263, + "grad_norm": 0.566545758141253, + "learning_rate": 1.069878531433773e-06, + "loss": 0.4003, + "step": 1072 + }, + { + "epoch": 10.41747572815534, + "grad_norm": 0.5472254994058976, + "learning_rate": 1.0657119175685776e-06, + "loss": 0.5429, + "step": 1073 + }, + { + "epoch": 10.427184466019417, + "grad_norm": 0.5887149186721222, + "learning_rate": 1.061551234035529e-06, + "loss": 0.3878, + "step": 1074 + }, + { + "epoch": 10.436893203883495, + "grad_norm": 0.5282433190533067, + "learning_rate": 1.0573964980377517e-06, + "loss": 0.6502, + "step": 1075 + }, + { + "epoch": 10.446601941747574, + "grad_norm": 0.44374712602971006, + "learning_rate": 1.0532477267537772e-06, + "loss": 0.511, + "step": 1076 + }, + { + "epoch": 10.45631067961165, + "grad_norm": 0.5490486215222025, + "learning_rate": 1.0491049373374762e-06, + "loss": 0.6757, + "step": 1077 + }, + { + "epoch": 10.466019417475728, + "grad_norm": 0.49442276500022064, + "learning_rate": 1.044968146917986e-06, + "loss": 0.4764, + "step": 1078 + }, + { + "epoch": 10.475728155339805, + "grad_norm": 0.5608038269974015, + "learning_rate": 1.0408373725996386e-06, + "loss": 0.5028, + "step": 1079 + }, + { + "epoch": 10.485436893203884, + "grad_norm": 0.5687457204009948, + "learning_rate": 1.0367126314618946e-06, + "loss": 0.5785, + "step": 1080 + }, + { + "epoch": 10.495145631067961, + "grad_norm": 0.6968670389266779, + "learning_rate": 1.0325939405592661e-06, + "loss": 0.6075, + "step": 1081 + }, + { + "epoch": 10.504854368932039, + "grad_norm": 0.5304379314551411, + "learning_rate": 1.0284813169212502e-06, + "loss": 0.3914, + "step": 1082 + }, + { + "epoch": 10.514563106796116, + "grad_norm": 0.5419260179618722, + "learning_rate": 1.024374777552258e-06, + "loss": 0.5441, + "step": 1083 + }, + { + "epoch": 10.524271844660195, + "grad_norm": 1.194505571995219, + "learning_rate": 1.0202743394315444e-06, + "loss": 0.5646, + "step": 1084 + }, + { + "epoch": 10.533980582524272, + "grad_norm": 0.5014060531536156, + "learning_rate": 1.0161800195131372e-06, + "loss": 0.5548, + "step": 1085 + }, + { + "epoch": 10.54368932038835, + "grad_norm": 0.49937660543182716, + "learning_rate": 1.0120918347257669e-06, + "loss": 0.5122, + "step": 1086 + }, + { + "epoch": 10.553398058252426, + "grad_norm": 0.5462501234068766, + "learning_rate": 1.0080098019727979e-06, + "loss": 0.452, + "step": 1087 + }, + { + "epoch": 10.563106796116505, + "grad_norm": 0.5631895834373033, + "learning_rate": 1.0039339381321572e-06, + "loss": 0.7399, + "step": 1088 + }, + { + "epoch": 10.572815533980583, + "grad_norm": 0.4736886401897993, + "learning_rate": 9.998642600562664e-07, + "loss": 0.6179, + "step": 1089 + }, + { + "epoch": 10.58252427184466, + "grad_norm": 0.46820689865613146, + "learning_rate": 9.95800784571969e-07, + "loss": 0.6221, + "step": 1090 + }, + { + "epoch": 10.592233009708737, + "grad_norm": 0.5195369073586825, + "learning_rate": 9.91743528480464e-07, + "loss": 0.5183, + "step": 1091 + }, + { + "epoch": 10.601941747572816, + "grad_norm": 0.5432189978052531, + "learning_rate": 9.876925085572365e-07, + "loss": 0.5503, + "step": 1092 + }, + { + "epoch": 10.611650485436893, + "grad_norm": 0.5493319095400769, + "learning_rate": 9.836477415519843e-07, + "loss": 0.5628, + "step": 1093 + }, + { + "epoch": 10.62135922330097, + "grad_norm": 0.5150871722296954, + "learning_rate": 9.79609244188553e-07, + "loss": 0.5485, + "step": 1094 + }, + { + "epoch": 10.631067961165048, + "grad_norm": 0.5523462804112262, + "learning_rate": 9.755770331648642e-07, + "loss": 0.4576, + "step": 1095 + }, + { + "epoch": 10.640776699029127, + "grad_norm": 0.5157125198942374, + "learning_rate": 9.715511251528486e-07, + "loss": 0.5491, + "step": 1096 + }, + { + "epoch": 10.650485436893204, + "grad_norm": 0.5227068552137706, + "learning_rate": 9.67531536798375e-07, + "loss": 0.6811, + "step": 1097 + }, + { + "epoch": 10.660194174757281, + "grad_norm": 0.5192500487387761, + "learning_rate": 9.635182847211827e-07, + "loss": 0.6348, + "step": 1098 + }, + { + "epoch": 10.669902912621358, + "grad_norm": 0.5401758469516798, + "learning_rate": 9.595113855148128e-07, + "loss": 0.5902, + "step": 1099 + }, + { + "epoch": 10.679611650485437, + "grad_norm": 0.9322574427952275, + "learning_rate": 9.555108557465383e-07, + "loss": 0.7462, + "step": 1100 + }, + { + "epoch": 10.689320388349515, + "grad_norm": 0.5179604641829436, + "learning_rate": 9.51516711957298e-07, + "loss": 0.5137, + "step": 1101 + }, + { + "epoch": 10.699029126213592, + "grad_norm": 0.5210907828158898, + "learning_rate": 9.475289706616256e-07, + "loss": 0.5653, + "step": 1102 + }, + { + "epoch": 10.70873786407767, + "grad_norm": 0.5610819470304531, + "learning_rate": 9.435476483475825e-07, + "loss": 0.4509, + "step": 1103 + }, + { + "epoch": 10.718446601941748, + "grad_norm": 0.5239244795742548, + "learning_rate": 9.395727614766903e-07, + "loss": 0.419, + "step": 1104 + }, + { + "epoch": 10.728155339805825, + "grad_norm": 0.5043201994579133, + "learning_rate": 9.356043264838607e-07, + "loss": 0.5223, + "step": 1105 + }, + { + "epoch": 10.737864077669903, + "grad_norm": 0.6710199227966344, + "learning_rate": 9.316423597773316e-07, + "loss": 0.5586, + "step": 1106 + }, + { + "epoch": 10.74757281553398, + "grad_norm": 0.5060889411301873, + "learning_rate": 9.276868777385942e-07, + "loss": 0.5578, + "step": 1107 + }, + { + "epoch": 10.757281553398059, + "grad_norm": 0.5180089569639259, + "learning_rate": 9.237378967223279e-07, + "loss": 0.3907, + "step": 1108 + }, + { + "epoch": 10.766990291262136, + "grad_norm": 0.6040964912815993, + "learning_rate": 9.197954330563327e-07, + "loss": 0.5639, + "step": 1109 + }, + { + "epoch": 10.776699029126213, + "grad_norm": 0.5816718750599607, + "learning_rate": 9.158595030414621e-07, + "loss": 0.5752, + "step": 1110 + }, + { + "epoch": 10.78640776699029, + "grad_norm": 0.5417325173104717, + "learning_rate": 9.11930122951554e-07, + "loss": 0.4933, + "step": 1111 + }, + { + "epoch": 10.79611650485437, + "grad_norm": 0.600237968659638, + "learning_rate": 9.080073090333646e-07, + "loss": 0.4976, + "step": 1112 + }, + { + "epoch": 10.805825242718447, + "grad_norm": 0.49463419377226303, + "learning_rate": 9.040910775065015e-07, + "loss": 0.5993, + "step": 1113 + }, + { + "epoch": 10.815533980582524, + "grad_norm": 0.510920823880218, + "learning_rate": 9.001814445633558e-07, + "loss": 0.4481, + "step": 1114 + }, + { + "epoch": 10.825242718446601, + "grad_norm": 0.5307522441544542, + "learning_rate": 8.962784263690358e-07, + "loss": 0.6389, + "step": 1115 + }, + { + "epoch": 10.83495145631068, + "grad_norm": 0.5001773473983797, + "learning_rate": 8.923820390612991e-07, + "loss": 0.431, + "step": 1116 + }, + { + "epoch": 10.844660194174757, + "grad_norm": 0.5452911240935504, + "learning_rate": 8.884922987504882e-07, + "loss": 0.5698, + "step": 1117 + }, + { + "epoch": 10.854368932038835, + "grad_norm": 0.5875035644254444, + "learning_rate": 8.846092215194607e-07, + "loss": 0.5176, + "step": 1118 + }, + { + "epoch": 10.864077669902912, + "grad_norm": 0.593676774427124, + "learning_rate": 8.807328234235254e-07, + "loss": 0.4337, + "step": 1119 + }, + { + "epoch": 10.87378640776699, + "grad_norm": 0.5292460765839232, + "learning_rate": 8.768631204903738e-07, + "loss": 0.6187, + "step": 1120 + }, + { + "epoch": 10.883495145631068, + "grad_norm": 0.49638389592552545, + "learning_rate": 8.730001287200177e-07, + "loss": 0.7939, + "step": 1121 + }, + { + "epoch": 10.893203883495145, + "grad_norm": 0.49414826625658015, + "learning_rate": 8.691438640847177e-07, + "loss": 0.4413, + "step": 1122 + }, + { + "epoch": 10.902912621359224, + "grad_norm": 0.5295200176718633, + "learning_rate": 8.652943425289206e-07, + "loss": 0.7966, + "step": 1123 + }, + { + "epoch": 10.912621359223301, + "grad_norm": 0.5870264475278172, + "learning_rate": 8.61451579969193e-07, + "loss": 0.4587, + "step": 1124 + }, + { + "epoch": 10.922330097087379, + "grad_norm": 0.5685870573511718, + "learning_rate": 8.576155922941548e-07, + "loss": 0.6506, + "step": 1125 + }, + { + "epoch": 10.932038834951456, + "grad_norm": 0.44300049758866367, + "learning_rate": 8.537863953644138e-07, + "loss": 0.4497, + "step": 1126 + }, + { + "epoch": 10.941747572815533, + "grad_norm": 0.5555503272916815, + "learning_rate": 8.499640050125007e-07, + "loss": 0.6713, + "step": 1127 + }, + { + "epoch": 10.951456310679612, + "grad_norm": 0.5182699503371718, + "learning_rate": 8.461484370428025e-07, + "loss": 0.55, + "step": 1128 + }, + { + "epoch": 10.96116504854369, + "grad_norm": 0.5219385783314696, + "learning_rate": 8.423397072314985e-07, + "loss": 0.6125, + "step": 1129 + }, + { + "epoch": 10.970873786407767, + "grad_norm": 0.5394176739754208, + "learning_rate": 8.385378313264933e-07, + "loss": 0.5786, + "step": 1130 + }, + { + "epoch": 10.980582524271846, + "grad_norm": 0.5661425078708068, + "learning_rate": 8.347428250473541e-07, + "loss": 0.6141, + "step": 1131 + }, + { + "epoch": 10.990291262135923, + "grad_norm": 0.7696322655449727, + "learning_rate": 8.309547040852434e-07, + "loss": 0.4752, + "step": 1132 + }, + { + "epoch": 11.0, + "grad_norm": 0.47444374346121526, + "learning_rate": 8.271734841028553e-07, + "loss": 0.6056, + "step": 1133 + }, + { + "epoch": 11.009708737864077, + "grad_norm": 0.5035082193387115, + "learning_rate": 8.233991807343497e-07, + "loss": 0.6257, + "step": 1134 + }, + { + "epoch": 11.019417475728156, + "grad_norm": 0.5349228147551298, + "learning_rate": 8.196318095852909e-07, + "loss": 0.3682, + "step": 1135 + }, + { + "epoch": 11.029126213592233, + "grad_norm": 0.560456795350431, + "learning_rate": 8.158713862325782e-07, + "loss": 0.5356, + "step": 1136 + }, + { + "epoch": 11.03883495145631, + "grad_norm": 0.5244174908608809, + "learning_rate": 8.12117926224385e-07, + "loss": 0.4204, + "step": 1137 + }, + { + "epoch": 11.048543689320388, + "grad_norm": 0.5586110284911971, + "learning_rate": 8.08371445080093e-07, + "loss": 0.4261, + "step": 1138 + }, + { + "epoch": 11.058252427184467, + "grad_norm": 0.5836602488957451, + "learning_rate": 8.04631958290229e-07, + "loss": 0.4734, + "step": 1139 + }, + { + "epoch": 11.067961165048544, + "grad_norm": 0.5112021875362739, + "learning_rate": 8.008994813163995e-07, + "loss": 0.6441, + "step": 1140 + }, + { + "epoch": 11.077669902912621, + "grad_norm": 0.5202806365311643, + "learning_rate": 7.971740295912289e-07, + "loss": 0.4098, + "step": 1141 + }, + { + "epoch": 11.087378640776699, + "grad_norm": 0.5160042718236523, + "learning_rate": 7.934556185182928e-07, + "loss": 0.4041, + "step": 1142 + }, + { + "epoch": 11.097087378640778, + "grad_norm": 0.5204797673157634, + "learning_rate": 7.897442634720576e-07, + "loss": 0.372, + "step": 1143 + }, + { + "epoch": 11.106796116504855, + "grad_norm": 0.5067704301983367, + "learning_rate": 7.860399797978138e-07, + "loss": 0.5713, + "step": 1144 + }, + { + "epoch": 11.116504854368932, + "grad_norm": 0.4880515826350417, + "learning_rate": 7.823427828116148e-07, + "loss": 0.6707, + "step": 1145 + }, + { + "epoch": 11.12621359223301, + "grad_norm": 0.5001107410419231, + "learning_rate": 7.786526878002126e-07, + "loss": 0.4481, + "step": 1146 + }, + { + "epoch": 11.135922330097088, + "grad_norm": 0.5033124797385321, + "learning_rate": 7.749697100209947e-07, + "loss": 0.4927, + "step": 1147 + }, + { + "epoch": 11.145631067961165, + "grad_norm": 0.5491536863139684, + "learning_rate": 7.7129386470192e-07, + "loss": 0.4699, + "step": 1148 + }, + { + "epoch": 11.155339805825243, + "grad_norm": 0.5559995427878067, + "learning_rate": 7.6762516704146e-07, + "loss": 0.4684, + "step": 1149 + }, + { + "epoch": 11.16504854368932, + "grad_norm": 0.4642492344086255, + "learning_rate": 7.6396363220853e-07, + "loss": 0.6739, + "step": 1150 + }, + { + "epoch": 11.174757281553399, + "grad_norm": 0.488509927755893, + "learning_rate": 7.603092753424298e-07, + "loss": 0.5208, + "step": 1151 + }, + { + "epoch": 11.184466019417476, + "grad_norm": 0.5224426261949789, + "learning_rate": 7.566621115527811e-07, + "loss": 0.3606, + "step": 1152 + }, + { + "epoch": 11.194174757281553, + "grad_norm": 0.5187700185641912, + "learning_rate": 7.530221559194643e-07, + "loss": 0.3599, + "step": 1153 + }, + { + "epoch": 11.20388349514563, + "grad_norm": 0.46970734175456236, + "learning_rate": 7.493894234925558e-07, + "loss": 0.4689, + "step": 1154 + }, + { + "epoch": 11.21359223300971, + "grad_norm": 0.5340977887603126, + "learning_rate": 7.457639292922675e-07, + "loss": 0.4931, + "step": 1155 + }, + { + "epoch": 11.223300970873787, + "grad_norm": 0.6399746017320636, + "learning_rate": 7.421456883088826e-07, + "loss": 0.4575, + "step": 1156 + }, + { + "epoch": 11.233009708737864, + "grad_norm": 0.5315209210773844, + "learning_rate": 7.385347155026934e-07, + "loss": 0.6337, + "step": 1157 + }, + { + "epoch": 11.242718446601941, + "grad_norm": 0.5118418712712183, + "learning_rate": 7.349310258039441e-07, + "loss": 0.6285, + "step": 1158 + }, + { + "epoch": 11.25242718446602, + "grad_norm": 0.530069361401702, + "learning_rate": 7.31334634112762e-07, + "loss": 0.7002, + "step": 1159 + }, + { + "epoch": 11.262135922330097, + "grad_norm": 0.5172157588149657, + "learning_rate": 7.277455552991011e-07, + "loss": 0.6858, + "step": 1160 + }, + { + "epoch": 11.271844660194175, + "grad_norm": 0.5749812236582913, + "learning_rate": 7.241638042026783e-07, + "loss": 0.3873, + "step": 1161 + }, + { + "epoch": 11.281553398058252, + "grad_norm": 0.5951213032992472, + "learning_rate": 7.20589395632913e-07, + "loss": 0.4891, + "step": 1162 + }, + { + "epoch": 11.29126213592233, + "grad_norm": 0.5134339950348145, + "learning_rate": 7.170223443688654e-07, + "loss": 0.7757, + "step": 1163 + }, + { + "epoch": 11.300970873786408, + "grad_norm": 0.48002921717189445, + "learning_rate": 7.134626651591758e-07, + "loss": 0.5223, + "step": 1164 + }, + { + "epoch": 11.310679611650485, + "grad_norm": 0.5324335789694284, + "learning_rate": 7.099103727220024e-07, + "loss": 0.267, + "step": 1165 + }, + { + "epoch": 11.320388349514563, + "grad_norm": 0.5082511836996648, + "learning_rate": 7.063654817449638e-07, + "loss": 0.7224, + "step": 1166 + }, + { + "epoch": 11.330097087378642, + "grad_norm": 0.4699534351048162, + "learning_rate": 7.028280068850734e-07, + "loss": 0.3477, + "step": 1167 + }, + { + "epoch": 11.339805825242719, + "grad_norm": 0.5290710230891463, + "learning_rate": 6.992979627686821e-07, + "loss": 0.5673, + "step": 1168 + }, + { + "epoch": 11.349514563106796, + "grad_norm": 0.5635904119108134, + "learning_rate": 6.957753639914175e-07, + "loss": 0.6496, + "step": 1169 + }, + { + "epoch": 11.359223300970873, + "grad_norm": 0.4785277786459809, + "learning_rate": 6.922602251181221e-07, + "loss": 0.5035, + "step": 1170 + }, + { + "epoch": 11.368932038834952, + "grad_norm": 0.48631251601303854, + "learning_rate": 6.887525606827947e-07, + "loss": 0.5194, + "step": 1171 + }, + { + "epoch": 11.37864077669903, + "grad_norm": 0.4530281507214442, + "learning_rate": 6.852523851885295e-07, + "loss": 0.2014, + "step": 1172 + }, + { + "epoch": 11.388349514563107, + "grad_norm": 0.5485167968205401, + "learning_rate": 6.817597131074566e-07, + "loss": 0.5967, + "step": 1173 + }, + { + "epoch": 11.398058252427184, + "grad_norm": 0.5659426811727628, + "learning_rate": 6.782745588806811e-07, + "loss": 0.4703, + "step": 1174 + }, + { + "epoch": 11.407766990291263, + "grad_norm": 0.5844594075015748, + "learning_rate": 6.747969369182248e-07, + "loss": 0.5578, + "step": 1175 + }, + { + "epoch": 11.41747572815534, + "grad_norm": 0.6114807287741806, + "learning_rate": 6.713268615989654e-07, + "loss": 0.6004, + "step": 1176 + }, + { + "epoch": 11.427184466019417, + "grad_norm": 0.536472460310527, + "learning_rate": 6.678643472705773e-07, + "loss": 0.6322, + "step": 1177 + }, + { + "epoch": 11.436893203883495, + "grad_norm": 0.5884548398285022, + "learning_rate": 6.644094082494746e-07, + "loss": 0.4211, + "step": 1178 + }, + { + "epoch": 11.446601941747574, + "grad_norm": 0.5293934881901676, + "learning_rate": 6.609620588207474e-07, + "loss": 0.4222, + "step": 1179 + }, + { + "epoch": 11.45631067961165, + "grad_norm": 0.5298417575144992, + "learning_rate": 6.575223132381067e-07, + "loss": 0.5372, + "step": 1180 + }, + { + "epoch": 11.466019417475728, + "grad_norm": 0.5329508454700512, + "learning_rate": 6.540901857238233e-07, + "loss": 0.3954, + "step": 1181 + }, + { + "epoch": 11.475728155339805, + "grad_norm": 0.5330378729803877, + "learning_rate": 6.506656904686698e-07, + "loss": 0.4937, + "step": 1182 + }, + { + "epoch": 11.485436893203884, + "grad_norm": 0.4883863303942178, + "learning_rate": 6.472488416318621e-07, + "loss": 0.4685, + "step": 1183 + }, + { + "epoch": 11.495145631067961, + "grad_norm": 0.5433426004305278, + "learning_rate": 6.438396533410002e-07, + "loss": 0.5392, + "step": 1184 + }, + { + "epoch": 11.504854368932039, + "grad_norm": 0.5155163462176132, + "learning_rate": 6.4043813969201e-07, + "loss": 0.4761, + "step": 1185 + }, + { + "epoch": 11.514563106796116, + "grad_norm": 0.5282705137900666, + "learning_rate": 6.370443147490857e-07, + "loss": 0.6882, + "step": 1186 + }, + { + "epoch": 11.524271844660195, + "grad_norm": 0.5028031458659833, + "learning_rate": 6.336581925446309e-07, + "loss": 0.4767, + "step": 1187 + }, + { + "epoch": 11.533980582524272, + "grad_norm": 0.5977403377061767, + "learning_rate": 6.302797870792007e-07, + "loss": 0.7029, + "step": 1188 + }, + { + "epoch": 11.54368932038835, + "grad_norm": 0.47504759941605157, + "learning_rate": 6.269091123214438e-07, + "loss": 0.4299, + "step": 1189 + }, + { + "epoch": 11.553398058252426, + "grad_norm": 0.5677960420156702, + "learning_rate": 6.235461822080449e-07, + "loss": 0.7392, + "step": 1190 + }, + { + "epoch": 11.563106796116505, + "grad_norm": 0.5148644708372068, + "learning_rate": 6.201910106436673e-07, + "loss": 0.462, + "step": 1191 + }, + { + "epoch": 11.572815533980583, + "grad_norm": 0.5634224658071493, + "learning_rate": 6.168436115008941e-07, + "loss": 0.6542, + "step": 1192 + }, + { + "epoch": 11.58252427184466, + "grad_norm": 0.5270200209649619, + "learning_rate": 6.135039986201744e-07, + "loss": 0.5779, + "step": 1193 + }, + { + "epoch": 11.592233009708737, + "grad_norm": 0.572184122415712, + "learning_rate": 6.101721858097606e-07, + "loss": 0.5775, + "step": 1194 + }, + { + "epoch": 11.601941747572816, + "grad_norm": 0.5022620535938661, + "learning_rate": 6.068481868456558e-07, + "loss": 0.535, + "step": 1195 + }, + { + "epoch": 11.611650485436893, + "grad_norm": 0.5002295261145933, + "learning_rate": 6.035320154715549e-07, + "loss": 0.4669, + "step": 1196 + }, + { + "epoch": 11.62135922330097, + "grad_norm": 0.5684134153610501, + "learning_rate": 6.00223685398788e-07, + "loss": 0.4175, + "step": 1197 + }, + { + "epoch": 11.631067961165048, + "grad_norm": 0.5951925773731871, + "learning_rate": 5.969232103062647e-07, + "loss": 0.4168, + "step": 1198 + }, + { + "epoch": 11.640776699029127, + "grad_norm": 0.5451041361161728, + "learning_rate": 5.936306038404158e-07, + "loss": 0.3661, + "step": 1199 + }, + { + "epoch": 11.650485436893204, + "grad_norm": 0.5600180852134692, + "learning_rate": 5.903458796151382e-07, + "loss": 0.832, + "step": 1200 + }, + { + "epoch": 11.660194174757281, + "grad_norm": 0.5679137831594563, + "learning_rate": 5.870690512117377e-07, + "loss": 0.4264, + "step": 1201 + }, + { + "epoch": 11.669902912621358, + "grad_norm": 0.5512583377764644, + "learning_rate": 5.838001321788744e-07, + "loss": 0.5387, + "step": 1202 + }, + { + "epoch": 11.679611650485437, + "grad_norm": 0.5202492691551494, + "learning_rate": 5.80539136032505e-07, + "loss": 0.4072, + "step": 1203 + }, + { + "epoch": 11.689320388349515, + "grad_norm": 0.5303414222747518, + "learning_rate": 5.772860762558269e-07, + "loss": 0.7383, + "step": 1204 + }, + { + "epoch": 11.699029126213592, + "grad_norm": 0.5747472031996176, + "learning_rate": 5.740409662992244e-07, + "loss": 0.4772, + "step": 1205 + }, + { + "epoch": 11.70873786407767, + "grad_norm": 0.5344538177636208, + "learning_rate": 5.708038195802098e-07, + "loss": 0.5894, + "step": 1206 + }, + { + "epoch": 11.718446601941748, + "grad_norm": 0.6162700091163664, + "learning_rate": 5.675746494833733e-07, + "loss": 0.6199, + "step": 1207 + }, + { + "epoch": 11.728155339805825, + "grad_norm": 0.5212416584936878, + "learning_rate": 5.643534693603214e-07, + "loss": 0.5544, + "step": 1208 + }, + { + "epoch": 11.737864077669903, + "grad_norm": 0.5002178765849649, + "learning_rate": 5.61140292529625e-07, + "loss": 0.4494, + "step": 1209 + }, + { + "epoch": 11.74757281553398, + "grad_norm": 0.49081842853898583, + "learning_rate": 5.579351322767643e-07, + "loss": 0.4325, + "step": 1210 + }, + { + "epoch": 11.757281553398059, + "grad_norm": 0.677193839042659, + "learning_rate": 5.547380018540735e-07, + "loss": 0.582, + "step": 1211 + }, + { + "epoch": 11.766990291262136, + "grad_norm": 0.6177020335391481, + "learning_rate": 5.515489144806862e-07, + "loss": 0.3849, + "step": 1212 + }, + { + "epoch": 11.776699029126213, + "grad_norm": 0.5418788069046763, + "learning_rate": 5.483678833424796e-07, + "loss": 0.655, + "step": 1213 + }, + { + "epoch": 11.78640776699029, + "grad_norm": 0.5041790619256601, + "learning_rate": 5.451949215920221e-07, + "loss": 0.5462, + "step": 1214 + }, + { + "epoch": 11.79611650485437, + "grad_norm": 0.5833392895268761, + "learning_rate": 5.420300423485167e-07, + "loss": 0.4945, + "step": 1215 + }, + { + "epoch": 11.805825242718447, + "grad_norm": 0.49267471870126267, + "learning_rate": 5.38873258697748e-07, + "loss": 0.5889, + "step": 1216 + }, + { + "epoch": 11.815533980582524, + "grad_norm": 0.48439741671139047, + "learning_rate": 5.357245836920286e-07, + "loss": 0.5955, + "step": 1217 + }, + { + "epoch": 11.825242718446601, + "grad_norm": 0.5572093766556138, + "learning_rate": 5.325840303501431e-07, + "loss": 0.6049, + "step": 1218 + }, + { + "epoch": 11.83495145631068, + "grad_norm": 0.5237802353774021, + "learning_rate": 5.29451611657297e-07, + "loss": 0.5523, + "step": 1219 + }, + { + "epoch": 11.844660194174757, + "grad_norm": 0.5624258745574067, + "learning_rate": 5.263273405650601e-07, + "loss": 0.6399, + "step": 1220 + }, + { + "epoch": 11.854368932038835, + "grad_norm": 0.7999810849053399, + "learning_rate": 5.232112299913151e-07, + "loss": 0.5699, + "step": 1221 + }, + { + "epoch": 11.864077669902912, + "grad_norm": 0.4742127157028219, + "learning_rate": 5.201032928202043e-07, + "loss": 0.4969, + "step": 1222 + }, + { + "epoch": 11.87378640776699, + "grad_norm": 0.5137383431300119, + "learning_rate": 5.17003541902075e-07, + "loss": 0.772, + "step": 1223 + }, + { + "epoch": 11.883495145631068, + "grad_norm": 0.46497503447213573, + "learning_rate": 5.139119900534259e-07, + "loss": 0.5661, + "step": 1224 + }, + { + "epoch": 11.893203883495145, + "grad_norm": 0.6079453183093874, + "learning_rate": 5.108286500568562e-07, + "loss": 0.3941, + "step": 1225 + }, + { + "epoch": 11.902912621359224, + "grad_norm": 0.5273197405999158, + "learning_rate": 5.077535346610115e-07, + "loss": 0.5251, + "step": 1226 + }, + { + "epoch": 11.912621359223301, + "grad_norm": 0.48161444879759807, + "learning_rate": 5.046866565805311e-07, + "loss": 0.3903, + "step": 1227 + }, + { + "epoch": 11.922330097087379, + "grad_norm": 0.49154732760101316, + "learning_rate": 5.016280284959957e-07, + "loss": 0.538, + "step": 1228 + }, + { + "epoch": 11.932038834951456, + "grad_norm": 0.4885700992202129, + "learning_rate": 4.985776630538746e-07, + "loss": 0.6219, + "step": 1229 + }, + { + "epoch": 11.941747572815533, + "grad_norm": 0.5594706359230254, + "learning_rate": 4.95535572866474e-07, + "loss": 0.4817, + "step": 1230 + }, + { + "epoch": 11.951456310679612, + "grad_norm": 0.5257629598562028, + "learning_rate": 4.925017705118843e-07, + "loss": 0.4894, + "step": 1231 + }, + { + "epoch": 11.96116504854369, + "grad_norm": 0.5767484443957133, + "learning_rate": 4.89476268533928e-07, + "loss": 0.7011, + "step": 1232 + }, + { + "epoch": 11.970873786407767, + "grad_norm": 0.5950735781630769, + "learning_rate": 4.864590794421092e-07, + "loss": 0.7114, + "step": 1233 + }, + { + "epoch": 11.980582524271846, + "grad_norm": 0.540369700371263, + "learning_rate": 4.834502157115597e-07, + "loss": 0.4854, + "step": 1234 + }, + { + "epoch": 11.990291262135923, + "grad_norm": 0.5818146270683455, + "learning_rate": 4.804496897829883e-07, + "loss": 0.2926, + "step": 1235 + }, + { + "epoch": 12.0, + "grad_norm": 0.5089608589552385, + "learning_rate": 4.774575140626317e-07, + "loss": 0.6658, + "step": 1236 + }, + { + "epoch": 12.009708737864077, + "grad_norm": 0.4547147316741764, + "learning_rate": 4.744737009221986e-07, + "loss": 0.483, + "step": 1237 + }, + { + "epoch": 12.019417475728156, + "grad_norm": 0.5829159491473124, + "learning_rate": 4.7149826269882294e-07, + "loss": 0.616, + "step": 1238 + }, + { + "epoch": 12.029126213592233, + "grad_norm": 0.5043098105333009, + "learning_rate": 4.6853121169500914e-07, + "loss": 0.4629, + "step": 1239 + }, + { + "epoch": 12.03883495145631, + "grad_norm": 0.571022287176784, + "learning_rate": 4.6557256017858485e-07, + "loss": 0.4794, + "step": 1240 + }, + { + "epoch": 12.048543689320388, + "grad_norm": 0.5399154525227847, + "learning_rate": 4.626223203826477e-07, + "loss": 0.5385, + "step": 1241 + }, + { + "epoch": 12.058252427184467, + "grad_norm": 0.5198642674251133, + "learning_rate": 4.5968050450551527e-07, + "loss": 0.3339, + "step": 1242 + }, + { + "epoch": 12.067961165048544, + "grad_norm": 0.49500784542537424, + "learning_rate": 4.56747124710675e-07, + "loss": 0.4985, + "step": 1243 + }, + { + "epoch": 12.077669902912621, + "grad_norm": 0.5475203658604212, + "learning_rate": 4.5382219312673364e-07, + "loss": 0.5487, + "step": 1244 + }, + { + "epoch": 12.087378640776699, + "grad_norm": 0.48035513204470476, + "learning_rate": 4.5090572184736863e-07, + "loss": 0.3491, + "step": 1245 + }, + { + "epoch": 12.097087378640778, + "grad_norm": 0.5166017045705675, + "learning_rate": 4.4799772293127486e-07, + "loss": 0.752, + "step": 1246 + }, + { + "epoch": 12.106796116504855, + "grad_norm": 0.5229778700888286, + "learning_rate": 4.4509820840211745e-07, + "loss": 0.5296, + "step": 1247 + }, + { + "epoch": 12.116504854368932, + "grad_norm": 0.5224021014520674, + "learning_rate": 4.422071902484812e-07, + "loss": 0.4395, + "step": 1248 + }, + { + "epoch": 12.12621359223301, + "grad_norm": 0.4859120821940226, + "learning_rate": 4.3932468042382075e-07, + "loss": 0.3914, + "step": 1249 + }, + { + "epoch": 12.135922330097088, + "grad_norm": 0.49206048744899616, + "learning_rate": 4.3645069084641195e-07, + "loss": 0.5086, + "step": 1250 + }, + { + "epoch": 12.145631067961165, + "grad_norm": 0.5507080078693621, + "learning_rate": 4.335852333993018e-07, + "loss": 0.6545, + "step": 1251 + }, + { + "epoch": 12.155339805825243, + "grad_norm": 0.5256375066912108, + "learning_rate": 4.3072831993025895e-07, + "loss": 0.6475, + "step": 1252 + }, + { + "epoch": 12.16504854368932, + "grad_norm": 0.5165162373708071, + "learning_rate": 4.278799622517274e-07, + "loss": 0.5665, + "step": 1253 + }, + { + "epoch": 12.174757281553399, + "grad_norm": 0.48585634241986353, + "learning_rate": 4.2504017214077374e-07, + "loss": 0.5531, + "step": 1254 + }, + { + "epoch": 12.184466019417476, + "grad_norm": 0.5727064553331418, + "learning_rate": 4.222089613390412e-07, + "loss": 0.4, + "step": 1255 + }, + { + "epoch": 12.194174757281553, + "grad_norm": 0.5655692683446615, + "learning_rate": 4.1938634155269944e-07, + "loss": 0.5369, + "step": 1256 + }, + { + "epoch": 12.20388349514563, + "grad_norm": 0.5339261343495418, + "learning_rate": 4.165723244523978e-07, + "loss": 0.548, + "step": 1257 + }, + { + "epoch": 12.21359223300971, + "grad_norm": 0.5995207842725745, + "learning_rate": 4.1376692167321626e-07, + "loss": 0.7545, + "step": 1258 + }, + { + "epoch": 12.223300970873787, + "grad_norm": 0.49946940718627203, + "learning_rate": 4.109701448146164e-07, + "loss": 0.4537, + "step": 1259 + }, + { + "epoch": 12.233009708737864, + "grad_norm": 0.48406505308218556, + "learning_rate": 4.0818200544039484e-07, + "loss": 0.5457, + "step": 1260 + }, + { + "epoch": 12.242718446601941, + "grad_norm": 0.5761404083738131, + "learning_rate": 4.054025150786356e-07, + "loss": 0.4459, + "step": 1261 + }, + { + "epoch": 12.25242718446602, + "grad_norm": 0.5368464804303047, + "learning_rate": 4.026316852216605e-07, + "loss": 0.5884, + "step": 1262 + }, + { + "epoch": 12.262135922330097, + "grad_norm": 0.5455773855091807, + "learning_rate": 3.998695273259834e-07, + "loss": 0.4718, + "step": 1263 + }, + { + "epoch": 12.271844660194175, + "grad_norm": 0.5462489365415362, + "learning_rate": 3.971160528122622e-07, + "loss": 0.621, + "step": 1264 + }, + { + "epoch": 12.281553398058252, + "grad_norm": 0.556552861635233, + "learning_rate": 3.9437127306525295e-07, + "loss": 0.6744, + "step": 1265 + }, + { + "epoch": 12.29126213592233, + "grad_norm": 0.5297638757980944, + "learning_rate": 3.9163519943375973e-07, + "loss": 0.4573, + "step": 1266 + }, + { + "epoch": 12.300970873786408, + "grad_norm": 0.49902627020942136, + "learning_rate": 3.889078432305904e-07, + "loss": 0.4568, + "step": 1267 + }, + { + "epoch": 12.310679611650485, + "grad_norm": 0.5535869111612749, + "learning_rate": 3.8618921573250896e-07, + "loss": 0.528, + "step": 1268 + }, + { + "epoch": 12.320388349514563, + "grad_norm": 0.5204509920890341, + "learning_rate": 3.834793281801891e-07, + "loss": 0.3842, + "step": 1269 + }, + { + "epoch": 12.330097087378642, + "grad_norm": 0.5692383110945788, + "learning_rate": 3.8077819177816695e-07, + "loss": 0.5053, + "step": 1270 + }, + { + "epoch": 12.339805825242719, + "grad_norm": 0.47822476540581443, + "learning_rate": 3.780858176947963e-07, + "loss": 0.3833, + "step": 1271 + }, + { + "epoch": 12.349514563106796, + "grad_norm": 0.5186445853503381, + "learning_rate": 3.754022170622007e-07, + "loss": 0.3893, + "step": 1272 + }, + { + "epoch": 12.359223300970873, + "grad_norm": 0.49920957568724805, + "learning_rate": 3.7272740097622884e-07, + "loss": 0.6484, + "step": 1273 + }, + { + "epoch": 12.368932038834952, + "grad_norm": 0.5579303155302804, + "learning_rate": 3.700613804964073e-07, + "loss": 0.6875, + "step": 1274 + }, + { + "epoch": 12.37864077669903, + "grad_norm": 0.5539922366049171, + "learning_rate": 3.6740416664589634e-07, + "loss": 0.5997, + "step": 1275 + }, + { + "epoch": 12.388349514563107, + "grad_norm": 0.4840847496708577, + "learning_rate": 3.6475577041144324e-07, + "loss": 0.7178, + "step": 1276 + }, + { + "epoch": 12.398058252427184, + "grad_norm": 0.5501337207898733, + "learning_rate": 3.6211620274333727e-07, + "loss": 0.3739, + "step": 1277 + }, + { + "epoch": 12.407766990291263, + "grad_norm": 0.5541786710200026, + "learning_rate": 3.594854745553636e-07, + "loss": 0.7565, + "step": 1278 + }, + { + "epoch": 12.41747572815534, + "grad_norm": 0.5175636279750693, + "learning_rate": 3.568635967247605e-07, + "loss": 0.7672, + "step": 1279 + }, + { + "epoch": 12.427184466019417, + "grad_norm": 0.46450923628601365, + "learning_rate": 3.5425058009217193e-07, + "loss": 0.5094, + "step": 1280 + }, + { + "epoch": 12.436893203883495, + "grad_norm": 0.5786871968641614, + "learning_rate": 3.516464354616031e-07, + "loss": 0.6925, + "step": 1281 + }, + { + "epoch": 12.446601941747574, + "grad_norm": 0.5724996419603651, + "learning_rate": 3.4905117360037683e-07, + "loss": 0.5977, + "step": 1282 + }, + { + "epoch": 12.45631067961165, + "grad_norm": 0.540971499700757, + "learning_rate": 3.4646480523908813e-07, + "loss": 0.4706, + "step": 1283 + }, + { + "epoch": 12.466019417475728, + "grad_norm": 0.5182006459005003, + "learning_rate": 3.43887341071561e-07, + "loss": 0.6454, + "step": 1284 + }, + { + "epoch": 12.475728155339805, + "grad_norm": 0.5425832252884726, + "learning_rate": 3.413187917548019e-07, + "loss": 0.339, + "step": 1285 + }, + { + "epoch": 12.485436893203884, + "grad_norm": 0.539013979648478, + "learning_rate": 3.3875916790895883e-07, + "loss": 0.828, + "step": 1286 + }, + { + "epoch": 12.495145631067961, + "grad_norm": 0.6096747704359214, + "learning_rate": 3.3620848011727437e-07, + "loss": 0.4499, + "step": 1287 + }, + { + "epoch": 12.504854368932039, + "grad_norm": 0.5278542481454789, + "learning_rate": 3.336667389260445e-07, + "loss": 0.422, + "step": 1288 + }, + { + "epoch": 12.514563106796116, + "grad_norm": 0.5403848073760052, + "learning_rate": 3.311339548445727e-07, + "loss": 0.4152, + "step": 1289 + }, + { + "epoch": 12.524271844660195, + "grad_norm": 0.48656658093770444, + "learning_rate": 3.2861013834512844e-07, + "loss": 0.5419, + "step": 1290 + }, + { + "epoch": 12.533980582524272, + "grad_norm": 0.5218058868156833, + "learning_rate": 3.2609529986290246e-07, + "loss": 0.4006, + "step": 1291 + }, + { + "epoch": 12.54368932038835, + "grad_norm": 0.5053568098333768, + "learning_rate": 3.235894497959649e-07, + "loss": 0.402, + "step": 1292 + }, + { + "epoch": 12.553398058252426, + "grad_norm": 0.5028842562515405, + "learning_rate": 3.2109259850522045e-07, + "loss": 0.5259, + "step": 1293 + }, + { + "epoch": 12.563106796116505, + "grad_norm": 0.5127746566227459, + "learning_rate": 3.186047563143685e-07, + "loss": 0.524, + "step": 1294 + }, + { + "epoch": 12.572815533980583, + "grad_norm": 0.502598831483758, + "learning_rate": 3.161259335098571e-07, + "loss": 0.5442, + "step": 1295 + }, + { + "epoch": 12.58252427184466, + "grad_norm": 0.5953762476232632, + "learning_rate": 3.1365614034084224e-07, + "loss": 0.286, + "step": 1296 + }, + { + "epoch": 12.592233009708737, + "grad_norm": 0.5102868867073259, + "learning_rate": 3.111953870191459e-07, + "loss": 0.447, + "step": 1297 + }, + { + "epoch": 12.601941747572816, + "grad_norm": 0.5245488599975241, + "learning_rate": 3.087436837192118e-07, + "loss": 0.8385, + "step": 1298 + }, + { + "epoch": 12.611650485436893, + "grad_norm": 0.5645317531008295, + "learning_rate": 3.0630104057806616e-07, + "loss": 0.5325, + "step": 1299 + }, + { + "epoch": 12.62135922330097, + "grad_norm": 0.5131734216726613, + "learning_rate": 3.0386746769527323e-07, + "loss": 0.542, + "step": 1300 + }, + { + "epoch": 12.631067961165048, + "grad_norm": 0.521794966830633, + "learning_rate": 3.0144297513289483e-07, + "loss": 0.4628, + "step": 1301 + }, + { + "epoch": 12.640776699029127, + "grad_norm": 0.5396439134195453, + "learning_rate": 2.9902757291544905e-07, + "loss": 0.548, + "step": 1302 + }, + { + "epoch": 12.650485436893204, + "grad_norm": 0.5622776596142265, + "learning_rate": 2.966212710298674e-07, + "loss": 0.371, + "step": 1303 + }, + { + "epoch": 12.660194174757281, + "grad_norm": 0.4790177990779847, + "learning_rate": 2.94224079425455e-07, + "loss": 0.432, + "step": 1304 + }, + { + "epoch": 12.669902912621358, + "grad_norm": 0.539797943283852, + "learning_rate": 2.9183600801384853e-07, + "loss": 0.4002, + "step": 1305 + }, + { + "epoch": 12.679611650485437, + "grad_norm": 0.5483551448669917, + "learning_rate": 2.8945706666897555e-07, + "loss": 0.3479, + "step": 1306 + }, + { + "epoch": 12.689320388349515, + "grad_norm": 0.49713629082843547, + "learning_rate": 2.870872652270129e-07, + "loss": 0.3679, + "step": 1307 + }, + { + "epoch": 12.699029126213592, + "grad_norm": 0.4992602508121686, + "learning_rate": 2.8472661348634883e-07, + "loss": 0.3935, + "step": 1308 + }, + { + "epoch": 12.70873786407767, + "grad_norm": 0.5115896966629423, + "learning_rate": 2.82375121207539e-07, + "loss": 0.4042, + "step": 1309 + }, + { + "epoch": 12.718446601941748, + "grad_norm": 0.480675662078779, + "learning_rate": 2.8003279811326724e-07, + "loss": 0.4149, + "step": 1310 + }, + { + "epoch": 12.728155339805825, + "grad_norm": 0.5305606289822804, + "learning_rate": 2.776996538883062e-07, + "loss": 0.2728, + "step": 1311 + }, + { + "epoch": 12.737864077669903, + "grad_norm": 0.49638478140651515, + "learning_rate": 2.7537569817947694e-07, + "loss": 0.5558, + "step": 1312 + }, + { + "epoch": 12.74757281553398, + "grad_norm": 0.5161627991330044, + "learning_rate": 2.730609405956083e-07, + "loss": 0.4527, + "step": 1313 + }, + { + "epoch": 12.757281553398059, + "grad_norm": 0.5053870345750662, + "learning_rate": 2.707553907074989e-07, + "loss": 0.4976, + "step": 1314 + }, + { + "epoch": 12.766990291262136, + "grad_norm": 0.6117896664980851, + "learning_rate": 2.684590580478749e-07, + "loss": 0.7138, + "step": 1315 + }, + { + "epoch": 12.776699029126213, + "grad_norm": 0.5424702681942861, + "learning_rate": 2.6617195211135343e-07, + "loss": 0.7378, + "step": 1316 + }, + { + "epoch": 12.78640776699029, + "grad_norm": 0.45282710489101363, + "learning_rate": 2.638940823544012e-07, + "loss": 0.4519, + "step": 1317 + }, + { + "epoch": 12.79611650485437, + "grad_norm": 0.5634124208174777, + "learning_rate": 2.6162545819529624e-07, + "loss": 0.5306, + "step": 1318 + }, + { + "epoch": 12.805825242718447, + "grad_norm": 0.5317119729976713, + "learning_rate": 2.593660890140895e-07, + "loss": 0.5186, + "step": 1319 + }, + { + "epoch": 12.815533980582524, + "grad_norm": 0.5164784363125361, + "learning_rate": 2.57115984152565e-07, + "loss": 0.4913, + "step": 1320 + }, + { + "epoch": 12.825242718446601, + "grad_norm": 0.583079797245975, + "learning_rate": 2.548751529142018e-07, + "loss": 0.5579, + "step": 1321 + }, + { + "epoch": 12.83495145631068, + "grad_norm": 0.4937883663752087, + "learning_rate": 2.526436045641351e-07, + "loss": 0.6115, + "step": 1322 + }, + { + "epoch": 12.844660194174757, + "grad_norm": 0.4886740713037102, + "learning_rate": 2.504213483291193e-07, + "loss": 0.5438, + "step": 1323 + }, + { + "epoch": 12.854368932038835, + "grad_norm": 0.4742082781225294, + "learning_rate": 2.482083933974883e-07, + "loss": 0.3571, + "step": 1324 + }, + { + "epoch": 12.864077669902912, + "grad_norm": 0.5090813269543996, + "learning_rate": 2.4600474891911696e-07, + "loss": 0.626, + "step": 1325 + }, + { + "epoch": 12.87378640776699, + "grad_norm": 0.5756284766855548, + "learning_rate": 2.43810424005386e-07, + "loss": 0.4204, + "step": 1326 + }, + { + "epoch": 12.883495145631068, + "grad_norm": 0.5302071313339889, + "learning_rate": 2.416254277291416e-07, + "loss": 0.4967, + "step": 1327 + }, + { + "epoch": 12.893203883495145, + "grad_norm": 0.519224974871366, + "learning_rate": 2.3944976912465916e-07, + "loss": 0.3222, + "step": 1328 + }, + { + "epoch": 12.902912621359224, + "grad_norm": 0.5125376672237484, + "learning_rate": 2.3728345718760622e-07, + "loss": 0.604, + "step": 1329 + }, + { + "epoch": 12.912621359223301, + "grad_norm": 0.5025700910016356, + "learning_rate": 2.3512650087500338e-07, + "loss": 0.6674, + "step": 1330 + }, + { + "epoch": 12.922330097087379, + "grad_norm": 0.5472841146691036, + "learning_rate": 2.3297890910519093e-07, + "loss": 0.4614, + "step": 1331 + }, + { + "epoch": 12.932038834951456, + "grad_norm": 0.6618994321490155, + "learning_rate": 2.3084069075778758e-07, + "loss": 0.3552, + "step": 1332 + }, + { + "epoch": 12.941747572815533, + "grad_norm": 0.5653535449986341, + "learning_rate": 2.287118546736572e-07, + "loss": 0.515, + "step": 1333 + }, + { + "epoch": 12.951456310679612, + "grad_norm": 0.5393697804104346, + "learning_rate": 2.2659240965487023e-07, + "loss": 0.704, + "step": 1334 + }, + { + "epoch": 12.96116504854369, + "grad_norm": 0.5516311443510744, + "learning_rate": 2.2448236446466847e-07, + "loss": 0.7699, + "step": 1335 + }, + { + "epoch": 12.970873786407767, + "grad_norm": 0.49351094018608216, + "learning_rate": 2.2238172782742763e-07, + "loss": 0.4795, + "step": 1336 + }, + { + "epoch": 12.980582524271846, + "grad_norm": 0.6006692564530559, + "learning_rate": 2.2029050842862277e-07, + "loss": 0.5718, + "step": 1337 + }, + { + "epoch": 12.990291262135923, + "grad_norm": 0.5026296445519245, + "learning_rate": 2.1820871491479102e-07, + "loss": 0.5669, + "step": 1338 + }, + { + "epoch": 13.0, + "grad_norm": 0.4662986293212389, + "learning_rate": 2.1613635589349756e-07, + "loss": 0.6033, + "step": 1339 + }, + { + "epoch": 13.009708737864077, + "grad_norm": 0.5762881710629336, + "learning_rate": 2.140734399332975e-07, + "loss": 0.4637, + "step": 1340 + }, + { + "epoch": 13.019417475728156, + "grad_norm": 0.5100299787885914, + "learning_rate": 2.1201997556370284e-07, + "loss": 0.6593, + "step": 1341 + }, + { + "epoch": 13.029126213592233, + "grad_norm": 0.5142901147631517, + "learning_rate": 2.0997597127514507e-07, + "loss": 0.4614, + "step": 1342 + }, + { + "epoch": 13.03883495145631, + "grad_norm": 0.5610525704881512, + "learning_rate": 2.079414355189427e-07, + "loss": 0.4303, + "step": 1343 + }, + { + "epoch": 13.048543689320388, + "grad_norm": 0.565400240725852, + "learning_rate": 2.059163767072639e-07, + "loss": 0.6574, + "step": 1344 + }, + { + "epoch": 13.058252427184467, + "grad_norm": 0.49074149271912093, + "learning_rate": 2.0390080321309236e-07, + "loss": 0.3981, + "step": 1345 + }, + { + "epoch": 13.067961165048544, + "grad_norm": 0.5120110967792576, + "learning_rate": 2.01894723370194e-07, + "loss": 0.2239, + "step": 1346 + }, + { + "epoch": 13.077669902912621, + "grad_norm": 0.5018096971105644, + "learning_rate": 1.9989814547308056e-07, + "loss": 0.4467, + "step": 1347 + }, + { + "epoch": 13.087378640776699, + "grad_norm": 0.5476918123606207, + "learning_rate": 1.9791107777697633e-07, + "loss": 0.6297, + "step": 1348 + }, + { + "epoch": 13.097087378640778, + "grad_norm": 0.572988197563463, + "learning_rate": 1.9593352849778453e-07, + "loss": 0.6092, + "step": 1349 + }, + { + "epoch": 13.106796116504855, + "grad_norm": 0.5261661637028531, + "learning_rate": 1.9396550581205208e-07, + "loss": 0.4294, + "step": 1350 + }, + { + "epoch": 13.116504854368932, + "grad_norm": 0.50494082943497, + "learning_rate": 1.920070178569361e-07, + "loss": 0.387, + "step": 1351 + }, + { + "epoch": 13.12621359223301, + "grad_norm": 0.5111855540996291, + "learning_rate": 1.900580727301718e-07, + "loss": 0.5553, + "step": 1352 + }, + { + "epoch": 13.135922330097088, + "grad_norm": 0.5066506430565386, + "learning_rate": 1.8811867849003684e-07, + "loss": 0.4342, + "step": 1353 + }, + { + "epoch": 13.145631067961165, + "grad_norm": 0.5595564814476269, + "learning_rate": 1.8618884315531939e-07, + "loss": 0.8207, + "step": 1354 + }, + { + "epoch": 13.155339805825243, + "grad_norm": 0.5505727997556397, + "learning_rate": 1.8426857470528414e-07, + "loss": 0.4033, + "step": 1355 + }, + { + "epoch": 13.16504854368932, + "grad_norm": 0.5766520644750025, + "learning_rate": 1.8235788107963948e-07, + "loss": 0.5713, + "step": 1356 + }, + { + "epoch": 13.174757281553399, + "grad_norm": 0.5600331852928223, + "learning_rate": 1.8045677017850595e-07, + "loss": 0.3118, + "step": 1357 + }, + { + "epoch": 13.184466019417476, + "grad_norm": 0.4745484708437102, + "learning_rate": 1.785652498623816e-07, + "loss": 0.4337, + "step": 1358 + }, + { + "epoch": 13.194174757281553, + "grad_norm": 0.527479211214423, + "learning_rate": 1.7668332795211074e-07, + "loss": 0.6401, + "step": 1359 + }, + { + "epoch": 13.20388349514563, + "grad_norm": 0.5317399547389916, + "learning_rate": 1.7481101222885126e-07, + "loss": 0.4149, + "step": 1360 + }, + { + "epoch": 13.21359223300971, + "grad_norm": 0.5417102266482062, + "learning_rate": 1.7294831043404264e-07, + "loss": 0.4646, + "step": 1361 + }, + { + "epoch": 13.223300970873787, + "grad_norm": 0.4964064829393023, + "learning_rate": 1.7109523026937302e-07, + "loss": 0.4473, + "step": 1362 + }, + { + "epoch": 13.233009708737864, + "grad_norm": 0.5687847935424973, + "learning_rate": 1.6925177939674936e-07, + "loss": 0.6734, + "step": 1363 + }, + { + "epoch": 13.242718446601941, + "grad_norm": 0.5204693568021522, + "learning_rate": 1.6741796543826321e-07, + "loss": 0.5916, + "step": 1364 + }, + { + "epoch": 13.25242718446602, + "grad_norm": 0.5230106632722077, + "learning_rate": 1.6559379597616136e-07, + "loss": 0.4952, + "step": 1365 + }, + { + "epoch": 13.262135922330097, + "grad_norm": 0.5125645899558829, + "learning_rate": 1.6377927855281362e-07, + "loss": 0.5947, + "step": 1366 + }, + { + "epoch": 13.271844660194175, + "grad_norm": 0.5122079677737931, + "learning_rate": 1.6197442067068136e-07, + "loss": 0.561, + "step": 1367 + }, + { + "epoch": 13.281553398058252, + "grad_norm": 0.5645728565635226, + "learning_rate": 1.6017922979228662e-07, + "loss": 0.3307, + "step": 1368 + }, + { + "epoch": 13.29126213592233, + "grad_norm": 0.5262239005191955, + "learning_rate": 1.5839371334018193e-07, + "loss": 0.6279, + "step": 1369 + }, + { + "epoch": 13.300970873786408, + "grad_norm": 0.5325025169012865, + "learning_rate": 1.5661787869691858e-07, + "loss": 0.5475, + "step": 1370 + }, + { + "epoch": 13.310679611650485, + "grad_norm": 0.5606899292139784, + "learning_rate": 1.5485173320501673e-07, + "loss": 0.583, + "step": 1371 + }, + { + "epoch": 13.320388349514563, + "grad_norm": 0.5167998308377985, + "learning_rate": 1.5309528416693503e-07, + "loss": 0.4803, + "step": 1372 + }, + { + "epoch": 13.330097087378642, + "grad_norm": 0.5309442940524358, + "learning_rate": 1.513485388450403e-07, + "loss": 0.5416, + "step": 1373 + }, + { + "epoch": 13.339805825242719, + "grad_norm": 0.4917882458055186, + "learning_rate": 1.4961150446157759e-07, + "loss": 0.5256, + "step": 1374 + }, + { + "epoch": 13.349514563106796, + "grad_norm": 0.48576191372876565, + "learning_rate": 1.4788418819864037e-07, + "loss": 0.4438, + "step": 1375 + }, + { + "epoch": 13.359223300970873, + "grad_norm": 0.5502263061660566, + "learning_rate": 1.461665971981402e-07, + "loss": 0.6946, + "step": 1376 + }, + { + "epoch": 13.368932038834952, + "grad_norm": 0.5378410979734453, + "learning_rate": 1.444587385617785e-07, + "loss": 0.4166, + "step": 1377 + }, + { + "epoch": 13.37864077669903, + "grad_norm": 0.49693973418779785, + "learning_rate": 1.4276061935101586e-07, + "loss": 0.6703, + "step": 1378 + }, + { + "epoch": 13.388349514563107, + "grad_norm": 0.6231776665301004, + "learning_rate": 1.4107224658704288e-07, + "loss": 0.5784, + "step": 1379 + }, + { + "epoch": 13.398058252427184, + "grad_norm": 0.5586955074187786, + "learning_rate": 1.3939362725075344e-07, + "loss": 0.3888, + "step": 1380 + }, + { + "epoch": 13.407766990291263, + "grad_norm": 0.5217161899840396, + "learning_rate": 1.3772476828271236e-07, + "loss": 0.5822, + "step": 1381 + }, + { + "epoch": 13.41747572815534, + "grad_norm": 0.5153464617820017, + "learning_rate": 1.360656765831289e-07, + "loss": 0.5918, + "step": 1382 + }, + { + "epoch": 13.427184466019417, + "grad_norm": 0.5657762923904565, + "learning_rate": 1.3441635901182803e-07, + "loss": 0.5361, + "step": 1383 + }, + { + "epoch": 13.436893203883495, + "grad_norm": 0.5323531737775129, + "learning_rate": 1.3277682238822142e-07, + "loss": 0.5163, + "step": 1384 + }, + { + "epoch": 13.446601941747574, + "grad_norm": 0.5177243720403791, + "learning_rate": 1.3114707349127954e-07, + "loss": 0.5323, + "step": 1385 + }, + { + "epoch": 13.45631067961165, + "grad_norm": 0.5406186027574795, + "learning_rate": 1.2952711905950377e-07, + "loss": 0.4687, + "step": 1386 + }, + { + "epoch": 13.466019417475728, + "grad_norm": 0.5716720820723694, + "learning_rate": 1.279169657908988e-07, + "loss": 0.5792, + "step": 1387 + }, + { + "epoch": 13.475728155339805, + "grad_norm": 0.4877829559887591, + "learning_rate": 1.263166203429439e-07, + "loss": 0.3873, + "step": 1388 + }, + { + "epoch": 13.485436893203884, + "grad_norm": 0.5546173120250771, + "learning_rate": 1.2472608933256637e-07, + "loss": 0.3873, + "step": 1389 + }, + { + "epoch": 13.495145631067961, + "grad_norm": 0.48230172753514133, + "learning_rate": 1.2314537933611425e-07, + "loss": 0.4945, + "step": 1390 + }, + { + "epoch": 13.504854368932039, + "grad_norm": 0.4574368189573909, + "learning_rate": 1.2157449688932872e-07, + "loss": 0.4456, + "step": 1391 + }, + { + "epoch": 13.514563106796116, + "grad_norm": 0.5242392231721918, + "learning_rate": 1.2001344848731612e-07, + "loss": 0.4399, + "step": 1392 + }, + { + "epoch": 13.524271844660195, + "grad_norm": 0.5942067255673704, + "learning_rate": 1.1846224058452316e-07, + "loss": 0.3374, + "step": 1393 + }, + { + "epoch": 13.533980582524272, + "grad_norm": 0.5808625655096099, + "learning_rate": 1.1692087959470882e-07, + "loss": 0.523, + "step": 1394 + }, + { + "epoch": 13.54368932038835, + "grad_norm": 0.4945556802549258, + "learning_rate": 1.1538937189091825e-07, + "loss": 0.4227, + "step": 1395 + }, + { + "epoch": 13.553398058252426, + "grad_norm": 0.5403255251924856, + "learning_rate": 1.1386772380545669e-07, + "loss": 0.4746, + "step": 1396 + }, + { + "epoch": 13.563106796116505, + "grad_norm": 0.47061852831174805, + "learning_rate": 1.1235594162986168e-07, + "loss": 0.5258, + "step": 1397 + }, + { + "epoch": 13.572815533980583, + "grad_norm": 0.45606330482815727, + "learning_rate": 1.1085403161488012e-07, + "loss": 0.5838, + "step": 1398 + }, + { + "epoch": 13.58252427184466, + "grad_norm": 0.5701345027363025, + "learning_rate": 1.09361999970439e-07, + "loss": 0.4101, + "step": 1399 + }, + { + "epoch": 13.592233009708737, + "grad_norm": 0.5900234582182388, + "learning_rate": 1.0787985286562219e-07, + "loss": 0.5734, + "step": 1400 + }, + { + "epoch": 13.601941747572816, + "grad_norm": 0.5947961921849346, + "learning_rate": 1.0640759642864401e-07, + "loss": 0.5931, + "step": 1401 + }, + { + "epoch": 13.611650485436893, + "grad_norm": 0.5357137024816893, + "learning_rate": 1.0494523674682372e-07, + "loss": 0.6879, + "step": 1402 + }, + { + "epoch": 13.62135922330097, + "grad_norm": 0.5559096380768329, + "learning_rate": 1.0349277986656081e-07, + "loss": 0.769, + "step": 1403 + }, + { + "epoch": 13.631067961165048, + "grad_norm": 0.5771642974798561, + "learning_rate": 1.0205023179330975e-07, + "loss": 0.2985, + "step": 1404 + }, + { + "epoch": 13.640776699029127, + "grad_norm": 0.5304015816283653, + "learning_rate": 1.00617598491555e-07, + "loss": 0.481, + "step": 1405 + }, + { + "epoch": 13.650485436893204, + "grad_norm": 0.5286982826502439, + "learning_rate": 9.919488588478715e-08, + "loss": 0.3866, + "step": 1406 + }, + { + "epoch": 13.660194174757281, + "grad_norm": 0.5494253997430171, + "learning_rate": 9.778209985547682e-08, + "loss": 0.7586, + "step": 1407 + }, + { + "epoch": 13.669902912621358, + "grad_norm": 0.48745009825917535, + "learning_rate": 9.637924624505191e-08, + "loss": 0.4863, + "step": 1408 + }, + { + "epoch": 13.679611650485437, + "grad_norm": 0.6027383636383639, + "learning_rate": 9.498633085387343e-08, + "loss": 0.4851, + "step": 1409 + }, + { + "epoch": 13.689320388349515, + "grad_norm": 0.6028302352156594, + "learning_rate": 9.360335944121029e-08, + "loss": 0.502, + "step": 1410 + }, + { + "epoch": 13.699029126213592, + "grad_norm": 0.5291390026157702, + "learning_rate": 9.223033772521594e-08, + "loss": 0.563, + "step": 1411 + }, + { + "epoch": 13.70873786407767, + "grad_norm": 0.524461539567668, + "learning_rate": 9.086727138290535e-08, + "loss": 0.579, + "step": 1412 + }, + { + "epoch": 13.718446601941748, + "grad_norm": 0.5531924624277184, + "learning_rate": 8.951416605013114e-08, + "loss": 0.5283, + "step": 1413 + }, + { + "epoch": 13.728155339805825, + "grad_norm": 0.498875465737702, + "learning_rate": 8.817102732155996e-08, + "loss": 0.493, + "step": 1414 + }, + { + "epoch": 13.737864077669903, + "grad_norm": 0.5277705069065116, + "learning_rate": 8.683786075065065e-08, + "loss": 0.5852, + "step": 1415 + }, + { + "epoch": 13.74757281553398, + "grad_norm": 0.48811477594662267, + "learning_rate": 8.55146718496283e-08, + "loss": 0.5653, + "step": 1416 + }, + { + "epoch": 13.757281553398059, + "grad_norm": 0.49370146088269673, + "learning_rate": 8.420146608946605e-08, + "loss": 0.4668, + "step": 1417 + }, + { + "epoch": 13.766990291262136, + "grad_norm": 0.5084353639944768, + "learning_rate": 8.28982488998581e-08, + "loss": 0.5095, + "step": 1418 + }, + { + "epoch": 13.776699029126213, + "grad_norm": 0.46703565271667047, + "learning_rate": 8.160502566919942e-08, + "loss": 0.4907, + "step": 1419 + }, + { + "epoch": 13.78640776699029, + "grad_norm": 0.5068684892515162, + "learning_rate": 8.032180174456283e-08, + "loss": 0.5642, + "step": 1420 + }, + { + "epoch": 13.79611650485437, + "grad_norm": 0.5215588251395539, + "learning_rate": 7.904858243167806e-08, + "loss": 0.3992, + "step": 1421 + }, + { + "epoch": 13.805825242718447, + "grad_norm": 0.48149979973118395, + "learning_rate": 7.778537299490796e-08, + "loss": 0.4448, + "step": 1422 + }, + { + "epoch": 13.815533980582524, + "grad_norm": 0.5316505800654157, + "learning_rate": 7.653217865722817e-08, + "loss": 0.5612, + "step": 1423 + }, + { + "epoch": 13.825242718446601, + "grad_norm": 0.5345196593770813, + "learning_rate": 7.528900460020444e-08, + "loss": 0.4618, + "step": 1424 + }, + { + "epoch": 13.83495145631068, + "grad_norm": 0.5536257142626635, + "learning_rate": 7.405585596397314e-08, + "loss": 0.4619, + "step": 1425 + }, + { + "epoch": 13.844660194174757, + "grad_norm": 0.5307437379543056, + "learning_rate": 7.283273784721739e-08, + "loss": 0.6937, + "step": 1426 + }, + { + "epoch": 13.854368932038835, + "grad_norm": 0.4649068565884641, + "learning_rate": 7.161965530714743e-08, + "loss": 0.4241, + "step": 1427 + }, + { + "epoch": 13.864077669902912, + "grad_norm": 0.4921857856166554, + "learning_rate": 7.041661335948024e-08, + "loss": 0.4411, + "step": 1428 + }, + { + "epoch": 13.87378640776699, + "grad_norm": 0.5207340849219146, + "learning_rate": 6.92236169784169e-08, + "loss": 0.7874, + "step": 1429 + }, + { + "epoch": 13.883495145631068, + "grad_norm": 0.49196505164477133, + "learning_rate": 6.804067109662443e-08, + "loss": 0.5332, + "step": 1430 + }, + { + "epoch": 13.893203883495145, + "grad_norm": 0.5075431393301558, + "learning_rate": 6.68677806052137e-08, + "loss": 0.5008, + "step": 1431 + }, + { + "epoch": 13.902912621359224, + "grad_norm": 0.546839111052632, + "learning_rate": 6.57049503537191e-08, + "loss": 0.5992, + "step": 1432 + }, + { + "epoch": 13.912621359223301, + "grad_norm": 0.48530918747896695, + "learning_rate": 6.455218515008049e-08, + "loss": 0.5208, + "step": 1433 + }, + { + "epoch": 13.922330097087379, + "grad_norm": 0.4639108474015221, + "learning_rate": 6.340948976062023e-08, + "loss": 0.4323, + "step": 1434 + }, + { + "epoch": 13.932038834951456, + "grad_norm": 0.4580616182119806, + "learning_rate": 6.227686891002671e-08, + "loss": 0.4902, + "step": 1435 + }, + { + "epoch": 13.941747572815533, + "grad_norm": 0.5092774475295171, + "learning_rate": 6.115432728133198e-08, + "loss": 0.679, + "step": 1436 + }, + { + "epoch": 13.951456310679612, + "grad_norm": 0.5166759750145309, + "learning_rate": 6.004186951589414e-08, + "loss": 0.5287, + "step": 1437 + }, + { + "epoch": 13.96116504854369, + "grad_norm": 0.6029338550408697, + "learning_rate": 5.8939500213378296e-08, + "loss": 0.3729, + "step": 1438 + }, + { + "epoch": 13.970873786407767, + "grad_norm": 0.4820897061950064, + "learning_rate": 5.7847223931735974e-08, + "loss": 0.4876, + "step": 1439 + }, + { + "epoch": 13.980582524271846, + "grad_norm": 0.5218779280339735, + "learning_rate": 5.6765045187187614e-08, + "loss": 0.5849, + "step": 1440 + }, + { + "epoch": 13.990291262135923, + "grad_norm": 0.5215155120681154, + "learning_rate": 5.569296845420375e-08, + "loss": 0.7779, + "step": 1441 + }, + { + "epoch": 14.0, + "grad_norm": 0.5656825108193669, + "learning_rate": 5.463099816548578e-08, + "loss": 0.4209, + "step": 1442 + }, + { + "epoch": 14.009708737864077, + "grad_norm": 0.5218440790701329, + "learning_rate": 5.3579138711948587e-08, + "loss": 0.4079, + "step": 1443 + }, + { + "epoch": 14.019417475728156, + "grad_norm": 0.5184483586663422, + "learning_rate": 5.253739444270128e-08, + "loss": 0.6546, + "step": 1444 + }, + { + "epoch": 14.029126213592233, + "grad_norm": 0.5560840017158974, + "learning_rate": 5.150576966503063e-08, + "loss": 0.5656, + "step": 1445 + }, + { + "epoch": 14.03883495145631, + "grad_norm": 0.5145055616244204, + "learning_rate": 5.048426864438183e-08, + "loss": 0.736, + "step": 1446 + }, + { + "epoch": 14.048543689320388, + "grad_norm": 0.5477079295048156, + "learning_rate": 4.9472895604341655e-08, + "loss": 0.4869, + "step": 1447 + }, + { + "epoch": 14.058252427184467, + "grad_norm": 0.4733588081568726, + "learning_rate": 4.8471654726621464e-08, + "loss": 0.5382, + "step": 1448 + }, + { + "epoch": 14.067961165048544, + "grad_norm": 0.5101422793371593, + "learning_rate": 4.7480550151038365e-08, + "loss": 0.6127, + "step": 1449 + }, + { + "epoch": 14.077669902912621, + "grad_norm": 0.5382006222610713, + "learning_rate": 4.649958597549964e-08, + "loss": 0.5527, + "step": 1450 + }, + { + "epoch": 14.087378640776699, + "grad_norm": 0.5275430010182721, + "learning_rate": 4.552876625598501e-08, + "loss": 0.7032, + "step": 1451 + }, + { + "epoch": 14.097087378640778, + "grad_norm": 0.5458051914647247, + "learning_rate": 4.4568095006529975e-08, + "loss": 0.4345, + "step": 1452 + }, + { + "epoch": 14.106796116504855, + "grad_norm": 0.7051966998510896, + "learning_rate": 4.361757619920942e-08, + "loss": 0.5567, + "step": 1453 + }, + { + "epoch": 14.116504854368932, + "grad_norm": 0.49381508246509925, + "learning_rate": 4.2677213764120986e-08, + "loss": 0.3299, + "step": 1454 + }, + { + "epoch": 14.12621359223301, + "grad_norm": 0.5048558481830228, + "learning_rate": 4.174701158936895e-08, + "loss": 0.451, + "step": 1455 + }, + { + "epoch": 14.135922330097088, + "grad_norm": 0.482626507365039, + "learning_rate": 4.082697352104814e-08, + "loss": 0.533, + "step": 1456 + }, + { + "epoch": 14.145631067961165, + "grad_norm": 0.5382580640297336, + "learning_rate": 3.991710336322757e-08, + "loss": 0.7137, + "step": 1457 + }, + { + "epoch": 14.155339805825243, + "grad_norm": 0.536897740958693, + "learning_rate": 3.9017404877935986e-08, + "loss": 0.5705, + "step": 1458 + }, + { + "epoch": 14.16504854368932, + "grad_norm": 0.5211520199572297, + "learning_rate": 3.812788178514437e-08, + "loss": 0.7333, + "step": 1459 + }, + { + "epoch": 14.174757281553399, + "grad_norm": 0.5267984123710193, + "learning_rate": 3.7248537762752666e-08, + "loss": 0.5171, + "step": 1460 + }, + { + "epoch": 14.184466019417476, + "grad_norm": 0.5584448575561238, + "learning_rate": 3.637937644657308e-08, + "loss": 0.7025, + "step": 1461 + }, + { + "epoch": 14.194174757281553, + "grad_norm": 0.5239791697552872, + "learning_rate": 3.55204014303151e-08, + "loss": 0.8438, + "step": 1462 + }, + { + "epoch": 14.20388349514563, + "grad_norm": 0.5245481621288842, + "learning_rate": 3.467161626557164e-08, + "loss": 0.6344, + "step": 1463 + }, + { + "epoch": 14.21359223300971, + "grad_norm": 0.5069515035185672, + "learning_rate": 3.3833024461803756e-08, + "loss": 0.4982, + "step": 1464 + }, + { + "epoch": 14.223300970873787, + "grad_norm": 0.5281513334975837, + "learning_rate": 3.300462948632593e-08, + "loss": 0.432, + "step": 1465 + }, + { + "epoch": 14.233009708737864, + "grad_norm": 0.5471335526184092, + "learning_rate": 3.218643476429167e-08, + "loss": 0.5776, + "step": 1466 + }, + { + "epoch": 14.242718446601941, + "grad_norm": 0.6037098746834603, + "learning_rate": 3.1378443678680706e-08, + "loss": 0.4294, + "step": 1467 + }, + { + "epoch": 14.25242718446602, + "grad_norm": 0.5288263190310578, + "learning_rate": 3.0580659570282886e-08, + "loss": 0.5654, + "step": 1468 + }, + { + "epoch": 14.262135922330097, + "grad_norm": 0.5553795697688602, + "learning_rate": 2.979308573768547e-08, + "loss": 0.5792, + "step": 1469 + }, + { + "epoch": 14.271844660194175, + "grad_norm": 0.5225773956434618, + "learning_rate": 2.9015725437259724e-08, + "loss": 0.4668, + "step": 1470 + }, + { + "epoch": 14.281553398058252, + "grad_norm": 0.5131128030358816, + "learning_rate": 2.8248581883147387e-08, + "loss": 0.3315, + "step": 1471 + }, + { + "epoch": 14.29126213592233, + "grad_norm": 0.45903451471678286, + "learning_rate": 2.7491658247246478e-08, + "loss": 0.4191, + "step": 1472 + }, + { + "epoch": 14.300970873786408, + "grad_norm": 0.4953151621096038, + "learning_rate": 2.6744957659199376e-08, + "loss": 0.5154, + "step": 1473 + }, + { + "epoch": 14.310679611650485, + "grad_norm": 0.519499457859041, + "learning_rate": 2.6008483206379497e-08, + "loss": 0.5541, + "step": 1474 + }, + { + "epoch": 14.320388349514563, + "grad_norm": 0.5083323776230902, + "learning_rate": 2.5282237933877962e-08, + "loss": 0.4306, + "step": 1475 + }, + { + "epoch": 14.330097087378642, + "grad_norm": 0.5335621363530555, + "learning_rate": 2.4566224844491393e-08, + "loss": 0.5205, + "step": 1476 + }, + { + "epoch": 14.339805825242719, + "grad_norm": 0.501971009726888, + "learning_rate": 2.38604468987097e-08, + "loss": 0.6609, + "step": 1477 + }, + { + "epoch": 14.349514563106796, + "grad_norm": 0.5261429789373724, + "learning_rate": 2.316490701470414e-08, + "loss": 0.5117, + "step": 1478 + }, + { + "epoch": 14.359223300970873, + "grad_norm": 0.5293420600011806, + "learning_rate": 2.247960806831373e-08, + "loss": 0.5102, + "step": 1479 + }, + { + "epoch": 14.368932038834952, + "grad_norm": 0.5026358641196522, + "learning_rate": 2.180455289303579e-08, + "loss": 0.4931, + "step": 1480 + }, + { + "epoch": 14.37864077669903, + "grad_norm": 0.5133562751158053, + "learning_rate": 2.113974428001153e-08, + "loss": 0.433, + "step": 1481 + }, + { + "epoch": 14.388349514563107, + "grad_norm": 0.5265411531668414, + "learning_rate": 2.0485184978016604e-08, + "loss": 0.4764, + "step": 1482 + }, + { + "epoch": 14.398058252427184, + "grad_norm": 0.5367964213460488, + "learning_rate": 1.984087769344889e-08, + "loss": 0.5355, + "step": 1483 + }, + { + "epoch": 14.407766990291263, + "grad_norm": 0.5217857607014399, + "learning_rate": 1.9206825090317126e-08, + "loss": 0.4859, + "step": 1484 + }, + { + "epoch": 14.41747572815534, + "grad_norm": 0.5169259518658883, + "learning_rate": 1.8583029790230356e-08, + "loss": 0.5476, + "step": 1485 + }, + { + "epoch": 14.427184466019417, + "grad_norm": 0.4904849259610178, + "learning_rate": 1.796949437238682e-08, + "loss": 0.6665, + "step": 1486 + }, + { + "epoch": 14.436893203883495, + "grad_norm": 0.49108376586122854, + "learning_rate": 1.736622137356342e-08, + "loss": 0.551, + "step": 1487 + }, + { + "epoch": 14.446601941747574, + "grad_norm": 0.5142950491406901, + "learning_rate": 1.677321328810516e-08, + "loss": 0.3045, + "step": 1488 + }, + { + "epoch": 14.45631067961165, + "grad_norm": 0.503112558064198, + "learning_rate": 1.6190472567914617e-08, + "loss": 0.5419, + "step": 1489 + }, + { + "epoch": 14.466019417475728, + "grad_norm": 0.4622893048563594, + "learning_rate": 1.561800162244248e-08, + "loss": 0.338, + "step": 1490 + }, + { + "epoch": 14.475728155339805, + "grad_norm": 0.4830240169733427, + "learning_rate": 1.5055802818676745e-08, + "loss": 0.5756, + "step": 1491 + }, + { + "epoch": 14.485436893203884, + "grad_norm": 0.5659073116377321, + "learning_rate": 1.450387848113327e-08, + "loss": 0.4251, + "step": 1492 + }, + { + "epoch": 14.495145631067961, + "grad_norm": 0.5067553896210469, + "learning_rate": 1.3962230891846618e-08, + "loss": 0.5445, + "step": 1493 + }, + { + "epoch": 14.504854368932039, + "grad_norm": 0.5077181092700885, + "learning_rate": 1.3430862290359781e-08, + "loss": 0.6045, + "step": 1494 + }, + { + "epoch": 14.514563106796116, + "grad_norm": 0.5136305065251573, + "learning_rate": 1.2909774873715585e-08, + "loss": 0.6548, + "step": 1495 + }, + { + "epoch": 14.524271844660195, + "grad_norm": 0.49226391309275364, + "learning_rate": 1.2398970796447807e-08, + "loss": 0.5737, + "step": 1496 + }, + { + "epoch": 14.533980582524272, + "grad_norm": 0.5238814510404914, + "learning_rate": 1.1898452170570618e-08, + "loss": 0.4521, + "step": 1497 + }, + { + "epoch": 14.54368932038835, + "grad_norm": 0.5053130339074384, + "learning_rate": 1.140822106557249e-08, + "loss": 0.3477, + "step": 1498 + }, + { + "epoch": 14.553398058252426, + "grad_norm": 0.5913005570564529, + "learning_rate": 1.0928279508405082e-08, + "loss": 0.5268, + "step": 1499 + }, + { + "epoch": 14.563106796116505, + "grad_norm": 0.6238009868766736, + "learning_rate": 1.0458629483476868e-08, + "loss": 0.4591, + "step": 1500 + }, + { + "epoch": 14.572815533980583, + "grad_norm": 0.5980143147512443, + "learning_rate": 9.999272932643134e-09, + "loss": 0.6164, + "step": 1501 + }, + { + "epoch": 14.58252427184466, + "grad_norm": 0.5003040722536184, + "learning_rate": 9.550211755199879e-09, + "loss": 0.5502, + "step": 1502 + }, + { + "epoch": 14.592233009708737, + "grad_norm": 0.5932523805715173, + "learning_rate": 9.111447807874374e-09, + "loss": 0.5563, + "step": 1503 + }, + { + "epoch": 14.601941747572816, + "grad_norm": 0.6369284594646255, + "learning_rate": 8.682982904817948e-09, + "loss": 0.3706, + "step": 1504 + }, + { + "epoch": 14.611650485436893, + "grad_norm": 0.4671794754568133, + "learning_rate": 8.264818817599052e-09, + "loss": 0.6647, + "step": 1505 + }, + { + "epoch": 14.62135922330097, + "grad_norm": 0.45435838697605796, + "learning_rate": 7.856957275194921e-09, + "loss": 0.447, + "step": 1506 + }, + { + "epoch": 14.631067961165048, + "grad_norm": 0.49035303412272524, + "learning_rate": 7.459399963985758e-09, + "loss": 0.4693, + "step": 1507 + }, + { + "epoch": 14.640776699029127, + "grad_norm": 0.5547141125879742, + "learning_rate": 7.072148527746403e-09, + "loss": 0.7602, + "step": 1508 + }, + { + "epoch": 14.650485436893204, + "grad_norm": 0.586251716363603, + "learning_rate": 6.6952045676405005e-09, + "loss": 0.5752, + "step": 1509 + }, + { + "epoch": 14.660194174757281, + "grad_norm": 0.5166126228384136, + "learning_rate": 6.328569642212734e-09, + "loss": 0.5488, + "step": 1510 + }, + { + "epoch": 14.669902912621358, + "grad_norm": 0.489603168687867, + "learning_rate": 5.972245267384102e-09, + "loss": 0.586, + "step": 1511 + }, + { + "epoch": 14.679611650485437, + "grad_norm": 0.5365602268760269, + "learning_rate": 5.62623291644443e-09, + "loss": 0.6064, + "step": 1512 + }, + { + "epoch": 14.689320388349515, + "grad_norm": 0.4765356527868554, + "learning_rate": 5.290534020046256e-09, + "loss": 0.256, + "step": 1513 + }, + { + "epoch": 14.699029126213592, + "grad_norm": 0.5282645667952264, + "learning_rate": 4.965149966199567e-09, + "loss": 0.524, + "step": 1514 + }, + { + "epoch": 14.70873786407767, + "grad_norm": 0.5345977975069597, + "learning_rate": 4.6500821002654075e-09, + "loss": 0.4932, + "step": 1515 + }, + { + "epoch": 14.718446601941748, + "grad_norm": 0.5012155423781401, + "learning_rate": 4.345331724950885e-09, + "loss": 0.5275, + "step": 1516 + }, + { + "epoch": 14.728155339805825, + "grad_norm": 0.4761432206250365, + "learning_rate": 4.050900100303068e-09, + "loss": 0.5482, + "step": 1517 + }, + { + "epoch": 14.737864077669903, + "grad_norm": 0.4722854556901931, + "learning_rate": 3.766788443705094e-09, + "loss": 0.486, + "step": 1518 + }, + { + "epoch": 14.74757281553398, + "grad_norm": 0.48685808472088393, + "learning_rate": 3.492997929869235e-09, + "loss": 0.4182, + "step": 1519 + }, + { + "epoch": 14.757281553398059, + "grad_norm": 0.5368190413406362, + "learning_rate": 3.2295296908338437e-09, + "loss": 0.4503, + "step": 1520 + }, + { + "epoch": 14.766990291262136, + "grad_norm": 0.4718713824984552, + "learning_rate": 2.976384815957245e-09, + "loss": 0.4848, + "step": 1521 + }, + { + "epoch": 14.776699029126213, + "grad_norm": 0.5644203137176583, + "learning_rate": 2.7335643519144086e-09, + "loss": 0.6387, + "step": 1522 + }, + { + "epoch": 14.78640776699029, + "grad_norm": 0.4362591665856203, + "learning_rate": 2.5010693026922273e-09, + "loss": 0.4436, + "step": 1523 + }, + { + "epoch": 14.79611650485437, + "grad_norm": 0.5404439955190101, + "learning_rate": 2.278900629584524e-09, + "loss": 0.3487, + "step": 1524 + }, + { + "epoch": 14.805825242718447, + "grad_norm": 0.5112825487443058, + "learning_rate": 2.067059251189274e-09, + "loss": 0.6591, + "step": 1525 + }, + { + "epoch": 14.815533980582524, + "grad_norm": 0.5176623109055024, + "learning_rate": 1.8655460434044427e-09, + "loss": 0.3372, + "step": 1526 + }, + { + "epoch": 14.825242718446601, + "grad_norm": 0.5412558465607696, + "learning_rate": 1.6743618394238215e-09, + "loss": 0.7489, + "step": 1527 + }, + { + "epoch": 14.83495145631068, + "grad_norm": 0.5609397261408766, + "learning_rate": 1.493507429734531e-09, + "loss": 0.555, + "step": 1528 + }, + { + "epoch": 14.844660194174757, + "grad_norm": 0.5233104588853983, + "learning_rate": 1.3229835621125786e-09, + "loss": 0.4006, + "step": 1529 + }, + { + "epoch": 14.854368932038835, + "grad_norm": 0.49863211232575066, + "learning_rate": 1.1627909416211947e-09, + "loss": 0.3629, + "step": 1530 + }, + { + "epoch": 14.864077669902912, + "grad_norm": 0.5077470970611437, + "learning_rate": 1.0129302306061128e-09, + "loss": 0.4257, + "step": 1531 + }, + { + "epoch": 14.87378640776699, + "grad_norm": 0.49886786092314855, + "learning_rate": 8.734020486950157e-10, + "loss": 0.5751, + "step": 1532 + }, + { + "epoch": 14.883495145631068, + "grad_norm": 0.5398797190248956, + "learning_rate": 7.442069727930934e-10, + "loss": 0.2788, + "step": 1533 + }, + { + "epoch": 14.893203883495145, + "grad_norm": 0.5452560970159284, + "learning_rate": 6.253455370811012e-10, + "loss": 0.5946, + "step": 1534 + }, + { + "epoch": 14.902912621359224, + "grad_norm": 0.5511414946543856, + "learning_rate": 5.168182330145266e-10, + "loss": 0.4084, + "step": 1535 + }, + { + "epoch": 14.912621359223301, + "grad_norm": 0.5429125042489947, + "learning_rate": 4.186255093194258e-10, + "loss": 0.4831, + "step": 1536 + }, + { + "epoch": 14.922330097087379, + "grad_norm": 0.5720561401215973, + "learning_rate": 3.3076777199186894e-10, + "loss": 0.4999, + "step": 1537 + }, + { + "epoch": 14.932038834951456, + "grad_norm": 0.5021834442307801, + "learning_rate": 2.532453842965521e-10, + "loss": 0.366, + "step": 1538 + }, + { + "epoch": 14.941747572815533, + "grad_norm": 0.5277645181797495, + "learning_rate": 1.8605866676374428e-10, + "loss": 0.419, + "step": 1539 + }, + { + "epoch": 14.951456310679612, + "grad_norm": 0.5207502460141687, + "learning_rate": 1.292078971898425e-10, + "loss": 0.427, + "step": 1540 + }, + { + "epoch": 14.96116504854369, + "grad_norm": 0.5656367527963858, + "learning_rate": 8.269331063459618e-11, + "loss": 0.2437, + "step": 1541 + }, + { + "epoch": 14.970873786407767, + "grad_norm": 0.5623100607316655, + "learning_rate": 4.651509942193988e-11, + "loss": 0.5833, + "step": 1542 + }, + { + "epoch": 14.980582524271846, + "grad_norm": 0.5028687329515733, + "learning_rate": 2.06734131366626e-11, + "loss": 0.6296, + "step": 1543 + }, + { + "epoch": 14.990291262135923, + "grad_norm": 0.5297985306979774, + "learning_rate": 5.168358626628234e-12, + "loss": 0.4812, + "step": 1544 + }, + { + "epoch": 15.0, + "grad_norm": 0.5181687092759953, + "learning_rate": 0.0, + "loss": 0.4149, + "step": 1545 + } + ], + "logging_steps": 1, + "max_steps": 1545, + "num_input_tokens_seen": 0, + "num_train_epochs": 15, + "save_steps": 500, + "stateful_callbacks": { + "TrainerControl": { + "args": { + "should_epoch_stop": false, + "should_evaluate": false, + "should_log": false, + "should_save": true, + "should_training_stop": true + }, + "attributes": {} + } + }, + "total_flos": 74526993309696.0, + "train_batch_size": 1, + "trial_name": null, + "trial_params": null +}