{ "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9663236217809344, "eval_steps": 500, "global_step": 20000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 4.831618108904672e-05, "grad_norm": 158.98464965820312, "learning_rate": 9.99951683818911e-07, "loss": 8.4116, "step": 1 }, { "epoch": 9.663236217809344e-05, "grad_norm": 368.3272399902344, "learning_rate": 9.99903367637822e-07, "loss": 9.5014, "step": 2 }, { "epoch": 0.00014494854326714017, "grad_norm": 633.87890625, "learning_rate": 9.998550514567327e-07, "loss": 7.6215, "step": 3 }, { "epoch": 0.0001932647243561869, "grad_norm": 224.53778076171875, "learning_rate": 9.998067352756437e-07, "loss": 8.9014, "step": 4 }, { "epoch": 0.0002415809054452336, "grad_norm": 1099.088134765625, "learning_rate": 9.997584190945547e-07, "loss": 8.9736, "step": 5 }, { "epoch": 0.00028989708653428034, "grad_norm": 536.1260986328125, "learning_rate": 9.997101029134657e-07, "loss": 5.917, "step": 6 }, { "epoch": 0.00033821326762332706, "grad_norm": 163.71018981933594, "learning_rate": 9.996617867323767e-07, "loss": 8.1104, "step": 7 }, { "epoch": 0.0003865294487123738, "grad_norm": 350.3841552734375, "learning_rate": 9.996134705512875e-07, "loss": 9.8364, "step": 8 }, { "epoch": 0.0004348456298014205, "grad_norm": 239.17550659179688, "learning_rate": 9.995651543701984e-07, "loss": 9.4617, "step": 9 }, { "epoch": 0.0004831618108904672, "grad_norm": 336.7843933105469, "learning_rate": 9.995168381891094e-07, "loss": 9.3641, "step": 10 }, { "epoch": 0.000531477991979514, "grad_norm": 404.41180419921875, "learning_rate": 9.994685220080204e-07, "loss": 7.5099, "step": 11 }, { "epoch": 0.0005797941730685607, "grad_norm": 282.79974365234375, "learning_rate": 9.994202058269314e-07, "loss": 7.3996, "step": 12 }, { "epoch": 0.0006281103541576074, "grad_norm": 118.2531509399414, "learning_rate": 9.993718896458422e-07, "loss": 6.0803, "step": 13 }, { "epoch": 0.0006764265352466541, "grad_norm": 157.37123107910156, "learning_rate": 9.993235734647532e-07, "loss": 9.8236, "step": 14 }, { "epoch": 0.0007247427163357008, "grad_norm": 859.8961791992188, "learning_rate": 9.992752572836642e-07, "loss": 10.9691, "step": 15 }, { "epoch": 0.0007730588974247475, "grad_norm": 193.165283203125, "learning_rate": 9.992269411025752e-07, "loss": 7.0075, "step": 16 }, { "epoch": 0.0008213750785137943, "grad_norm": 197.92843627929688, "learning_rate": 9.991786249214862e-07, "loss": 5.9484, "step": 17 }, { "epoch": 0.000869691259602841, "grad_norm": 129.3522491455078, "learning_rate": 9.991303087403971e-07, "loss": 5.5798, "step": 18 }, { "epoch": 0.0009180074406918877, "grad_norm": 782.004638671875, "learning_rate": 9.990819925593081e-07, "loss": 5.0737, "step": 19 }, { "epoch": 0.0009663236217809344, "grad_norm": 194.71310424804688, "learning_rate": 9.99033676378219e-07, "loss": 8.6226, "step": 20 }, { "epoch": 0.0010146398028699811, "grad_norm": 486.0804443359375, "learning_rate": 9.9898536019713e-07, "loss": 6.8446, "step": 21 }, { "epoch": 0.001062955983959028, "grad_norm": 66.16017150878906, "learning_rate": 9.989370440160409e-07, "loss": 3.6099, "step": 22 }, { "epoch": 0.0011112721650480746, "grad_norm": 105.3721923828125, "learning_rate": 9.988887278349519e-07, "loss": 4.0868, "step": 23 }, { "epoch": 0.0011595883461371214, "grad_norm": 717.3970336914062, "learning_rate": 9.988404116538629e-07, "loss": 7.7091, "step": 24 }, { "epoch": 0.001207904527226168, "grad_norm": 195.83522033691406, "learning_rate": 9.987920954727739e-07, "loss": 6.6062, "step": 25 }, { "epoch": 0.0012562207083152148, "grad_norm": 85.42451477050781, "learning_rate": 9.987437792916846e-07, "loss": 4.4446, "step": 26 }, { "epoch": 0.0013045368894042614, "grad_norm": 84.41814422607422, "learning_rate": 9.986954631105956e-07, "loss": 6.2296, "step": 27 }, { "epoch": 0.0013528530704933082, "grad_norm": 217.5702667236328, "learning_rate": 9.986471469295066e-07, "loss": 4.1675, "step": 28 }, { "epoch": 0.0014011692515823548, "grad_norm": 43.83252716064453, "learning_rate": 9.985988307484176e-07, "loss": 3.2357, "step": 29 }, { "epoch": 0.0014494854326714017, "grad_norm": 155.4297332763672, "learning_rate": 9.985505145673286e-07, "loss": 5.8568, "step": 30 }, { "epoch": 0.0014978016137604483, "grad_norm": 69.5805892944336, "learning_rate": 9.985021983862396e-07, "loss": 4.0453, "step": 31 }, { "epoch": 0.001546117794849495, "grad_norm": 98.12931060791016, "learning_rate": 9.984538822051506e-07, "loss": 5.8816, "step": 32 }, { "epoch": 0.001594433975938542, "grad_norm": 55.2652473449707, "learning_rate": 9.984055660240614e-07, "loss": 3.052, "step": 33 }, { "epoch": 0.0016427501570275885, "grad_norm": 110.13990020751953, "learning_rate": 9.983572498429724e-07, "loss": 4.0042, "step": 34 }, { "epoch": 0.0016910663381166354, "grad_norm": 75.99415588378906, "learning_rate": 9.983089336618833e-07, "loss": 4.4497, "step": 35 }, { "epoch": 0.001739382519205682, "grad_norm": 70.36664581298828, "learning_rate": 9.982606174807943e-07, "loss": 3.7751, "step": 36 }, { "epoch": 0.0017876987002947288, "grad_norm": 69.84972381591797, "learning_rate": 9.982123012997053e-07, "loss": 4.4502, "step": 37 }, { "epoch": 0.0018360148813837754, "grad_norm": 66.83287811279297, "learning_rate": 9.981639851186163e-07, "loss": 5.3068, "step": 38 }, { "epoch": 0.0018843310624728222, "grad_norm": 40.00603485107422, "learning_rate": 9.98115668937527e-07, "loss": 3.4305, "step": 39 }, { "epoch": 0.0019326472435618688, "grad_norm": 102.47992706298828, "learning_rate": 9.98067352756438e-07, "loss": 4.1107, "step": 40 }, { "epoch": 0.0019809634246509156, "grad_norm": 15.808629989624023, "learning_rate": 9.98019036575349e-07, "loss": 1.7982, "step": 41 }, { "epoch": 0.0020292796057399622, "grad_norm": 65.91925048828125, "learning_rate": 9.9797072039426e-07, "loss": 4.7159, "step": 42 }, { "epoch": 0.002077595786829009, "grad_norm": 103.07284545898438, "learning_rate": 9.97922404213171e-07, "loss": 5.4709, "step": 43 }, { "epoch": 0.002125911967918056, "grad_norm": 91.8626937866211, "learning_rate": 9.97874088032082e-07, "loss": 6.0436, "step": 44 }, { "epoch": 0.0021742281490071025, "grad_norm": 28.601743698120117, "learning_rate": 9.978257718509928e-07, "loss": 2.4029, "step": 45 }, { "epoch": 0.002222544330096149, "grad_norm": 40.12073516845703, "learning_rate": 9.977774556699038e-07, "loss": 2.9379, "step": 46 }, { "epoch": 0.0022708605111851957, "grad_norm": 92.3554916381836, "learning_rate": 9.977291394888148e-07, "loss": 5.5964, "step": 47 }, { "epoch": 0.0023191766922742428, "grad_norm": 43.766273498535156, "learning_rate": 9.976808233077258e-07, "loss": 2.9347, "step": 48 }, { "epoch": 0.0023674928733632894, "grad_norm": 73.58697509765625, "learning_rate": 9.976325071266368e-07, "loss": 4.3044, "step": 49 }, { "epoch": 0.002415809054452336, "grad_norm": 80.0179672241211, "learning_rate": 9.975841909455476e-07, "loss": 4.3078, "step": 50 }, { "epoch": 0.002464125235541383, "grad_norm": 80.28998565673828, "learning_rate": 9.975358747644585e-07, "loss": 4.1845, "step": 51 }, { "epoch": 0.0025124414166304296, "grad_norm": 70.19136810302734, "learning_rate": 9.974875585833695e-07, "loss": 3.6404, "step": 52 }, { "epoch": 0.0025607575977194762, "grad_norm": 85.42420959472656, "learning_rate": 9.974392424022805e-07, "loss": 4.1986, "step": 53 }, { "epoch": 0.002609073778808523, "grad_norm": 91.57319641113281, "learning_rate": 9.973909262211915e-07, "loss": 4.1525, "step": 54 }, { "epoch": 0.00265738995989757, "grad_norm": 75.6168212890625, "learning_rate": 9.973426100401023e-07, "loss": 3.4413, "step": 55 }, { "epoch": 0.0027057061409866165, "grad_norm": 57.48381042480469, "learning_rate": 9.972942938590133e-07, "loss": 2.7079, "step": 56 }, { "epoch": 0.002754022322075663, "grad_norm": 97.36280059814453, "learning_rate": 9.972459776779243e-07, "loss": 3.8505, "step": 57 }, { "epoch": 0.0028023385031647097, "grad_norm": 104.30175018310547, "learning_rate": 9.971976614968353e-07, "loss": 3.9823, "step": 58 }, { "epoch": 0.0028506546842537567, "grad_norm": 61.70418167114258, "learning_rate": 9.971493453157463e-07, "loss": 2.5855, "step": 59 }, { "epoch": 0.0028989708653428033, "grad_norm": 65.60269165039062, "learning_rate": 9.97101029134657e-07, "loss": 2.5763, "step": 60 }, { "epoch": 0.00294728704643185, "grad_norm": 89.63236236572266, "learning_rate": 9.97052712953568e-07, "loss": 3.2051, "step": 61 }, { "epoch": 0.0029956032275208965, "grad_norm": 135.55206298828125, "learning_rate": 9.97004396772479e-07, "loss": 4.4888, "step": 62 }, { "epoch": 0.0030439194086099436, "grad_norm": 114.31710815429688, "learning_rate": 9.9695608059139e-07, "loss": 3.746, "step": 63 }, { "epoch": 0.00309223558969899, "grad_norm": 71.6878433227539, "learning_rate": 9.96907764410301e-07, "loss": 2.5809, "step": 64 }, { "epoch": 0.003140551770788037, "grad_norm": 71.28953552246094, "learning_rate": 9.968594482292118e-07, "loss": 2.5754, "step": 65 }, { "epoch": 0.003188867951877084, "grad_norm": 96.28130340576172, "learning_rate": 9.968111320481228e-07, "loss": 3.099, "step": 66 }, { "epoch": 0.0032371841329661304, "grad_norm": 98.53373718261719, "learning_rate": 9.967628158670338e-07, "loss": 3.0711, "step": 67 }, { "epoch": 0.003285500314055177, "grad_norm": 120.86969757080078, "learning_rate": 9.967144996859447e-07, "loss": 3.5978, "step": 68 }, { "epoch": 0.0033338164951442237, "grad_norm": 116.77488708496094, "learning_rate": 9.966661835048557e-07, "loss": 3.4207, "step": 69 }, { "epoch": 0.0033821326762332707, "grad_norm": 118.27132415771484, "learning_rate": 9.966178673237667e-07, "loss": 3.5943, "step": 70 }, { "epoch": 0.0034304488573223173, "grad_norm": 72.4471664428711, "learning_rate": 9.965695511426775e-07, "loss": 2.3837, "step": 71 }, { "epoch": 0.003478765038411364, "grad_norm": 47.264156341552734, "learning_rate": 9.965212349615885e-07, "loss": 1.9458, "step": 72 }, { "epoch": 0.0035270812195004105, "grad_norm": 142.44178771972656, "learning_rate": 9.964729187804995e-07, "loss": 4.0847, "step": 73 }, { "epoch": 0.0035753974005894576, "grad_norm": 143.8776397705078, "learning_rate": 9.964246025994105e-07, "loss": 4.0802, "step": 74 }, { "epoch": 0.003623713581678504, "grad_norm": 143.40234375, "learning_rate": 9.963762864183215e-07, "loss": 4.0128, "step": 75 }, { "epoch": 0.0036720297627675508, "grad_norm": 64.01071166992188, "learning_rate": 9.963279702372325e-07, "loss": 2.079, "step": 76 }, { "epoch": 0.0037203459438565974, "grad_norm": 94.44830322265625, "learning_rate": 9.962796540561432e-07, "loss": 2.8595, "step": 77 }, { "epoch": 0.0037686621249456444, "grad_norm": 143.44125366210938, "learning_rate": 9.962313378750542e-07, "loss": 3.918, "step": 78 }, { "epoch": 0.003816978306034691, "grad_norm": 94.51136016845703, "learning_rate": 9.961830216939652e-07, "loss": 2.8806, "step": 79 }, { "epoch": 0.0038652944871237376, "grad_norm": 64.87667083740234, "learning_rate": 9.961347055128762e-07, "loss": 2.1726, "step": 80 }, { "epoch": 0.003913610668212784, "grad_norm": 96.31803894042969, "learning_rate": 9.960863893317872e-07, "loss": 2.8386, "step": 81 }, { "epoch": 0.003961926849301831, "grad_norm": 71.16743469238281, "learning_rate": 9.960380731506982e-07, "loss": 2.2649, "step": 82 }, { "epoch": 0.004010243030390878, "grad_norm": 116.75038146972656, "learning_rate": 9.959897569696092e-07, "loss": 3.334, "step": 83 }, { "epoch": 0.0040585592114799245, "grad_norm": 70.79693603515625, "learning_rate": 9.9594144078852e-07, "loss": 2.1316, "step": 84 }, { "epoch": 0.0041068753925689715, "grad_norm": 71.8719253540039, "learning_rate": 9.95893124607431e-07, "loss": 2.3541, "step": 85 }, { "epoch": 0.004155191573658018, "grad_norm": 113.59060668945312, "learning_rate": 9.95844808426342e-07, "loss": 3.1761, "step": 86 }, { "epoch": 0.004203507754747065, "grad_norm": 67.64000701904297, "learning_rate": 9.95796492245253e-07, "loss": 2.0665, "step": 87 }, { "epoch": 0.004251823935836112, "grad_norm": 142.01724243164062, "learning_rate": 9.95748176064164e-07, "loss": 3.8028, "step": 88 }, { "epoch": 0.004300140116925158, "grad_norm": 90.7448501586914, "learning_rate": 9.95699859883075e-07, "loss": 2.5496, "step": 89 }, { "epoch": 0.004348456298014205, "grad_norm": 90.19241333007812, "learning_rate": 9.956515437019857e-07, "loss": 2.6339, "step": 90 }, { "epoch": 0.004396772479103252, "grad_norm": 87.08958435058594, "learning_rate": 9.956032275208967e-07, "loss": 2.491, "step": 91 }, { "epoch": 0.004445088660192298, "grad_norm": 120.17505645751953, "learning_rate": 9.955549113398077e-07, "loss": 3.2949, "step": 92 }, { "epoch": 0.004493404841281345, "grad_norm": 23.137706756591797, "learning_rate": 9.955065951587187e-07, "loss": 1.0726, "step": 93 }, { "epoch": 0.004541721022370391, "grad_norm": 92.95460510253906, "learning_rate": 9.954582789776296e-07, "loss": 2.5857, "step": 94 }, { "epoch": 0.0045900372034594385, "grad_norm": 92.36193084716797, "learning_rate": 9.954099627965406e-07, "loss": 2.7206, "step": 95 }, { "epoch": 0.0046383533845484855, "grad_norm": 115.16254425048828, "learning_rate": 9.953616466154516e-07, "loss": 3.0352, "step": 96 }, { "epoch": 0.004686669565637532, "grad_norm": 25.61468505859375, "learning_rate": 9.953133304343624e-07, "loss": 1.0781, "step": 97 }, { "epoch": 0.004734985746726579, "grad_norm": 44.19015884399414, "learning_rate": 9.952650142532734e-07, "loss": 1.4567, "step": 98 }, { "epoch": 0.004783301927815626, "grad_norm": 68.40250396728516, "learning_rate": 9.952166980721844e-07, "loss": 2.158, "step": 99 }, { "epoch": 0.004831618108904672, "grad_norm": 26.773723602294922, "learning_rate": 9.951683818910954e-07, "loss": 1.0607, "step": 100 }, { "epoch": 0.004879934289993719, "grad_norm": 71.00688934326172, "learning_rate": 9.951200657100064e-07, "loss": 2.048, "step": 101 }, { "epoch": 0.004928250471082766, "grad_norm": 70.83213806152344, "learning_rate": 9.950717495289171e-07, "loss": 2.0534, "step": 102 }, { "epoch": 0.004976566652171812, "grad_norm": 141.72703552246094, "learning_rate": 9.950234333478281e-07, "loss": 3.4261, "step": 103 }, { "epoch": 0.005024882833260859, "grad_norm": 88.92850494384766, "learning_rate": 9.949751171667391e-07, "loss": 2.4481, "step": 104 }, { "epoch": 0.005073199014349905, "grad_norm": 46.23762130737305, "learning_rate": 9.949268009856501e-07, "loss": 1.5239, "step": 105 }, { "epoch": 0.0051215151954389524, "grad_norm": 91.52803802490234, "learning_rate": 9.948784848045611e-07, "loss": 2.5907, "step": 106 }, { "epoch": 0.0051698313765279995, "grad_norm": 138.78880310058594, "learning_rate": 9.948301686234719e-07, "loss": 3.3497, "step": 107 }, { "epoch": 0.005218147557617046, "grad_norm": 117.28173828125, "learning_rate": 9.947818524423829e-07, "loss": 2.9648, "step": 108 }, { "epoch": 0.005266463738706093, "grad_norm": 115.50651550292969, "learning_rate": 9.947335362612939e-07, "loss": 2.867, "step": 109 }, { "epoch": 0.00531477991979514, "grad_norm": 93.17166137695312, "learning_rate": 9.946852200802049e-07, "loss": 2.3593, "step": 110 }, { "epoch": 0.005363096100884186, "grad_norm": 106.12985229492188, "learning_rate": 9.946369038991158e-07, "loss": 2.6004, "step": 111 }, { "epoch": 0.005411412281973233, "grad_norm": 116.74195861816406, "learning_rate": 9.945885877180266e-07, "loss": 2.743, "step": 112 }, { "epoch": 0.00545972846306228, "grad_norm": 91.2406005859375, "learning_rate": 9.945402715369376e-07, "loss": 2.3477, "step": 113 }, { "epoch": 0.005508044644151326, "grad_norm": 22.97930335998535, "learning_rate": 9.944919553558486e-07, "loss": 0.9568, "step": 114 }, { "epoch": 0.005556360825240373, "grad_norm": 92.10697937011719, "learning_rate": 9.944436391747596e-07, "loss": 2.2629, "step": 115 }, { "epoch": 0.005604677006329419, "grad_norm": 71.2599868774414, "learning_rate": 9.943953229936706e-07, "loss": 1.9427, "step": 116 }, { "epoch": 0.005652993187418466, "grad_norm": 115.42411041259766, "learning_rate": 9.943470068125814e-07, "loss": 2.6945, "step": 117 }, { "epoch": 0.0057013093685075135, "grad_norm": 96.96400451660156, "learning_rate": 9.942986906314924e-07, "loss": 2.3275, "step": 118 }, { "epoch": 0.00574962554959656, "grad_norm": 115.00272369384766, "learning_rate": 9.942503744504033e-07, "loss": 2.6229, "step": 119 }, { "epoch": 0.005797941730685607, "grad_norm": 119.42229461669922, "learning_rate": 9.942020582693143e-07, "loss": 2.7242, "step": 120 }, { "epoch": 0.005846257911774654, "grad_norm": 142.2580108642578, "learning_rate": 9.941537420882253e-07, "loss": 3.0541, "step": 121 }, { "epoch": 0.0058945740928637, "grad_norm": 109.79370880126953, "learning_rate": 9.941054259071363e-07, "loss": 2.4251, "step": 122 }, { "epoch": 0.005942890273952747, "grad_norm": 91.64277648925781, "learning_rate": 9.94057109726047e-07, "loss": 2.1767, "step": 123 }, { "epoch": 0.005991206455041793, "grad_norm": 95.20866394042969, "learning_rate": 9.94008793544958e-07, "loss": 2.1722, "step": 124 }, { "epoch": 0.00603952263613084, "grad_norm": 167.27127075195312, "learning_rate": 9.93960477363869e-07, "loss": 3.438, "step": 125 }, { "epoch": 0.006087838817219887, "grad_norm": 87.11947631835938, "learning_rate": 9.9391216118278e-07, "loss": 2.0652, "step": 126 }, { "epoch": 0.006136154998308933, "grad_norm": 73.68755340576172, "learning_rate": 9.93863845001691e-07, "loss": 1.8726, "step": 127 }, { "epoch": 0.00618447117939798, "grad_norm": 142.22396850585938, "learning_rate": 9.938155288206018e-07, "loss": 2.947, "step": 128 }, { "epoch": 0.006232787360487027, "grad_norm": 37.565834045410156, "learning_rate": 9.937672126395128e-07, "loss": 1.1286, "step": 129 }, { "epoch": 0.006281103541576074, "grad_norm": 44.991329193115234, "learning_rate": 9.937188964584238e-07, "loss": 1.4555, "step": 130 }, { "epoch": 0.006329419722665121, "grad_norm": 145.14056396484375, "learning_rate": 9.936705802773348e-07, "loss": 2.8478, "step": 131 }, { "epoch": 0.006377735903754168, "grad_norm": 133.0084991455078, "learning_rate": 9.936222640962458e-07, "loss": 2.726, "step": 132 }, { "epoch": 0.006426052084843214, "grad_norm": 94.54267120361328, "learning_rate": 9.935739479151568e-07, "loss": 2.0369, "step": 133 }, { "epoch": 0.006474368265932261, "grad_norm": 112.41687774658203, "learning_rate": 9.935256317340678e-07, "loss": 2.2662, "step": 134 }, { "epoch": 0.006522684447021307, "grad_norm": 67.57015991210938, "learning_rate": 9.934773155529786e-07, "loss": 1.6409, "step": 135 }, { "epoch": 0.006571000628110354, "grad_norm": 136.4622344970703, "learning_rate": 9.934289993718895e-07, "loss": 2.6013, "step": 136 }, { "epoch": 0.006619316809199401, "grad_norm": 41.240604400634766, "learning_rate": 9.933806831908005e-07, "loss": 1.16, "step": 137 }, { "epoch": 0.006667632990288447, "grad_norm": 140.4503936767578, "learning_rate": 9.933323670097115e-07, "loss": 2.6622, "step": 138 }, { "epoch": 0.006715949171377494, "grad_norm": 46.429786682128906, "learning_rate": 9.932840508286225e-07, "loss": 1.3198, "step": 139 }, { "epoch": 0.006764265352466541, "grad_norm": 24.86690330505371, "learning_rate": 9.932357346475335e-07, "loss": 0.9898, "step": 140 }, { "epoch": 0.006812581533555588, "grad_norm": 114.14783477783203, "learning_rate": 9.931874184664443e-07, "loss": 2.2655, "step": 141 }, { "epoch": 0.006860897714644635, "grad_norm": 22.31276512145996, "learning_rate": 9.931391022853553e-07, "loss": 0.9626, "step": 142 }, { "epoch": 0.006909213895733681, "grad_norm": 89.01710510253906, "learning_rate": 9.930907861042663e-07, "loss": 1.7938, "step": 143 }, { "epoch": 0.006957530076822728, "grad_norm": 96.43439483642578, "learning_rate": 9.930424699231773e-07, "loss": 2.0059, "step": 144 }, { "epoch": 0.007005846257911775, "grad_norm": 113.6407699584961, "learning_rate": 9.929941537420882e-07, "loss": 2.1795, "step": 145 }, { "epoch": 0.007054162439000821, "grad_norm": 70.96408081054688, "learning_rate": 9.929458375609992e-07, "loss": 1.5772, "step": 146 }, { "epoch": 0.007102478620089868, "grad_norm": 110.25071716308594, "learning_rate": 9.928975213799102e-07, "loss": 2.0858, "step": 147 }, { "epoch": 0.007150794801178915, "grad_norm": 69.33943939208984, "learning_rate": 9.92849205198821e-07, "loss": 1.4949, "step": 148 }, { "epoch": 0.007199110982267961, "grad_norm": 118.11092376708984, "learning_rate": 9.92800889017732e-07, "loss": 2.1281, "step": 149 }, { "epoch": 0.007247427163357008, "grad_norm": 90.46385192871094, "learning_rate": 9.92752572836643e-07, "loss": 1.8212, "step": 150 }, { "epoch": 0.007295743344446055, "grad_norm": 88.58406829833984, "learning_rate": 9.92704256655554e-07, "loss": 1.8066, "step": 151 }, { "epoch": 0.0073440595255351016, "grad_norm": 91.06085205078125, "learning_rate": 9.92655940474465e-07, "loss": 1.7057, "step": 152 }, { "epoch": 0.007392375706624149, "grad_norm": 112.01631927490234, "learning_rate": 9.92607624293376e-07, "loss": 1.9877, "step": 153 }, { "epoch": 0.007440691887713195, "grad_norm": 93.9548110961914, "learning_rate": 9.925593081122867e-07, "loss": 1.755, "step": 154 }, { "epoch": 0.007489008068802242, "grad_norm": 45.9187126159668, "learning_rate": 9.925109919311977e-07, "loss": 1.2311, "step": 155 }, { "epoch": 0.007537324249891289, "grad_norm": 88.1593246459961, "learning_rate": 9.924626757501087e-07, "loss": 1.7084, "step": 156 }, { "epoch": 0.007585640430980335, "grad_norm": 112.14048767089844, "learning_rate": 9.924143595690197e-07, "loss": 1.9455, "step": 157 }, { "epoch": 0.007633956612069382, "grad_norm": 116.33192443847656, "learning_rate": 9.923660433879307e-07, "loss": 2.0103, "step": 158 }, { "epoch": 0.007682272793158429, "grad_norm": 95.44628143310547, "learning_rate": 9.923177272068415e-07, "loss": 1.7382, "step": 159 }, { "epoch": 0.007730588974247475, "grad_norm": 87.68425750732422, "learning_rate": 9.922694110257525e-07, "loss": 1.6528, "step": 160 }, { "epoch": 0.007778905155336522, "grad_norm": 110.35762786865234, "learning_rate": 9.922210948446635e-07, "loss": 1.9561, "step": 161 }, { "epoch": 0.007827221336425568, "grad_norm": 88.20429992675781, "learning_rate": 9.921727786635744e-07, "loss": 1.7133, "step": 162 }, { "epoch": 0.007875537517514616, "grad_norm": 69.46304321289062, "learning_rate": 9.921244624824854e-07, "loss": 1.4006, "step": 163 }, { "epoch": 0.007923853698603663, "grad_norm": 90.39278411865234, "learning_rate": 9.920761463013962e-07, "loss": 1.7403, "step": 164 }, { "epoch": 0.007972169879692709, "grad_norm": 137.5623779296875, "learning_rate": 9.920278301203072e-07, "loss": 2.2553, "step": 165 }, { "epoch": 0.008020486060781757, "grad_norm": 139.99380493164062, "learning_rate": 9.919795139392182e-07, "loss": 2.1333, "step": 166 }, { "epoch": 0.008068802241870803, "grad_norm": 69.59623718261719, "learning_rate": 9.919311977581292e-07, "loss": 1.4324, "step": 167 }, { "epoch": 0.008117118422959849, "grad_norm": 114.44952392578125, "learning_rate": 9.918828815770402e-07, "loss": 1.821, "step": 168 }, { "epoch": 0.008165434604048895, "grad_norm": 68.33834075927734, "learning_rate": 9.91834565395951e-07, "loss": 1.3473, "step": 169 }, { "epoch": 0.008213750785137943, "grad_norm": 68.0626449584961, "learning_rate": 9.91786249214862e-07, "loss": 1.3877, "step": 170 }, { "epoch": 0.00826206696622699, "grad_norm": 159.64620971679688, "learning_rate": 9.91737933033773e-07, "loss": 2.2695, "step": 171 }, { "epoch": 0.008310383147316035, "grad_norm": 115.35440063476562, "learning_rate": 9.91689616852684e-07, "loss": 1.7088, "step": 172 }, { "epoch": 0.008358699328405083, "grad_norm": 115.47564697265625, "learning_rate": 9.91641300671595e-07, "loss": 1.8927, "step": 173 }, { "epoch": 0.00840701550949413, "grad_norm": 110.61624908447266, "learning_rate": 9.91592984490506e-07, "loss": 1.8893, "step": 174 }, { "epoch": 0.008455331690583176, "grad_norm": 153.2813262939453, "learning_rate": 9.915446683094167e-07, "loss": 2.1358, "step": 175 }, { "epoch": 0.008503647871672224, "grad_norm": 85.23792266845703, "learning_rate": 9.914963521283277e-07, "loss": 1.4653, "step": 176 }, { "epoch": 0.00855196405276127, "grad_norm": 72.77533721923828, "learning_rate": 9.914480359472387e-07, "loss": 1.252, "step": 177 }, { "epoch": 0.008600280233850316, "grad_norm": 133.8021240234375, "learning_rate": 9.913997197661496e-07, "loss": 1.9365, "step": 178 }, { "epoch": 0.008648596414939364, "grad_norm": 110.2817611694336, "learning_rate": 9.913514035850606e-07, "loss": 1.604, "step": 179 }, { "epoch": 0.00869691259602841, "grad_norm": 110.23932647705078, "learning_rate": 9.913030874039714e-07, "loss": 1.6638, "step": 180 }, { "epoch": 0.008745228777117456, "grad_norm": 70.38236999511719, "learning_rate": 9.912547712228824e-07, "loss": 1.2438, "step": 181 }, { "epoch": 0.008793544958206504, "grad_norm": 90.7051010131836, "learning_rate": 9.912064550417934e-07, "loss": 1.4672, "step": 182 }, { "epoch": 0.00884186113929555, "grad_norm": 87.38855743408203, "learning_rate": 9.911581388607044e-07, "loss": 1.4282, "step": 183 }, { "epoch": 0.008890177320384596, "grad_norm": 110.9583969116211, "learning_rate": 9.911098226796154e-07, "loss": 1.5995, "step": 184 }, { "epoch": 0.008938493501473644, "grad_norm": 125.34140014648438, "learning_rate": 9.910615064985264e-07, "loss": 1.7299, "step": 185 }, { "epoch": 0.00898680968256269, "grad_norm": 92.15584564208984, "learning_rate": 9.910131903174371e-07, "loss": 1.378, "step": 186 }, { "epoch": 0.009035125863651737, "grad_norm": 90.27934265136719, "learning_rate": 9.909648741363481e-07, "loss": 1.34, "step": 187 }, { "epoch": 0.009083442044740783, "grad_norm": 44.68595504760742, "learning_rate": 9.909165579552591e-07, "loss": 0.9377, "step": 188 }, { "epoch": 0.00913175822582983, "grad_norm": 79.5791244506836, "learning_rate": 9.908682417741701e-07, "loss": 1.2405, "step": 189 }, { "epoch": 0.009180074406918877, "grad_norm": 103.50055694580078, "learning_rate": 9.908199255930811e-07, "loss": 1.5241, "step": 190 }, { "epoch": 0.009228390588007923, "grad_norm": 103.86858367919922, "learning_rate": 9.90771609411992e-07, "loss": 1.4413, "step": 191 }, { "epoch": 0.009276706769096971, "grad_norm": 23.024311065673828, "learning_rate": 9.907232932309029e-07, "loss": 0.7703, "step": 192 }, { "epoch": 0.009325022950186017, "grad_norm": 86.52326965332031, "learning_rate": 9.906749770498139e-07, "loss": 1.3639, "step": 193 }, { "epoch": 0.009373339131275063, "grad_norm": 63.65388870239258, "learning_rate": 9.906266608687249e-07, "loss": 1.0558, "step": 194 }, { "epoch": 0.009421655312364111, "grad_norm": 107.47823333740234, "learning_rate": 9.905783446876358e-07, "loss": 1.4038, "step": 195 }, { "epoch": 0.009469971493453157, "grad_norm": 110.24325561523438, "learning_rate": 9.905300285065468e-07, "loss": 1.4298, "step": 196 }, { "epoch": 0.009518287674542204, "grad_norm": 128.51759338378906, "learning_rate": 9.904817123254578e-07, "loss": 1.5534, "step": 197 }, { "epoch": 0.009566603855631252, "grad_norm": 63.28202438354492, "learning_rate": 9.904333961443688e-07, "loss": 1.1504, "step": 198 }, { "epoch": 0.009614920036720298, "grad_norm": 103.50233459472656, "learning_rate": 9.903850799632796e-07, "loss": 1.3448, "step": 199 }, { "epoch": 0.009663236217809344, "grad_norm": 63.5787353515625, "learning_rate": 9.903367637821906e-07, "loss": 1.2118, "step": 200 }, { "epoch": 0.009711552398898392, "grad_norm": 62.27947235107422, "learning_rate": 9.902884476011016e-07, "loss": 1.0825, "step": 201 }, { "epoch": 0.009759868579987438, "grad_norm": 104.09855651855469, "learning_rate": 9.902401314200126e-07, "loss": 1.4045, "step": 202 }, { "epoch": 0.009808184761076484, "grad_norm": 41.37382125854492, "learning_rate": 9.901918152389236e-07, "loss": 0.8092, "step": 203 }, { "epoch": 0.009856500942165532, "grad_norm": 102.7940902709961, "learning_rate": 9.901434990578345e-07, "loss": 1.2496, "step": 204 }, { "epoch": 0.009904817123254578, "grad_norm": 80.55416870117188, "learning_rate": 9.900951828767453e-07, "loss": 1.0724, "step": 205 }, { "epoch": 0.009953133304343624, "grad_norm": 128.12469482421875, "learning_rate": 9.900468666956563e-07, "loss": 1.4384, "step": 206 }, { "epoch": 0.01000144948543267, "grad_norm": 101.4955062866211, "learning_rate": 9.899985505145673e-07, "loss": 1.2347, "step": 207 }, { "epoch": 0.010049765666521718, "grad_norm": 104.55001068115234, "learning_rate": 9.899502343334783e-07, "loss": 1.2129, "step": 208 }, { "epoch": 0.010098081847610765, "grad_norm": 60.86736297607422, "learning_rate": 9.899019181523893e-07, "loss": 0.9215, "step": 209 }, { "epoch": 0.01014639802869981, "grad_norm": 101.03289031982422, "learning_rate": 9.898536019713003e-07, "loss": 1.1399, "step": 210 }, { "epoch": 0.010194714209788859, "grad_norm": 160.86532592773438, "learning_rate": 9.89805285790211e-07, "loss": 1.5416, "step": 211 }, { "epoch": 0.010243030390877905, "grad_norm": 18.148487091064453, "learning_rate": 9.89756969609122e-07, "loss": 0.6606, "step": 212 }, { "epoch": 0.010291346571966951, "grad_norm": 58.20994186401367, "learning_rate": 9.89708653428033e-07, "loss": 1.0258, "step": 213 }, { "epoch": 0.010339662753055999, "grad_norm": 59.08200454711914, "learning_rate": 9.89660337246944e-07, "loss": 0.933, "step": 214 }, { "epoch": 0.010387978934145045, "grad_norm": 59.00634002685547, "learning_rate": 9.89612021065855e-07, "loss": 0.9763, "step": 215 }, { "epoch": 0.010436295115234091, "grad_norm": 59.72798538208008, "learning_rate": 9.895637048847658e-07, "loss": 0.9146, "step": 216 }, { "epoch": 0.01048461129632314, "grad_norm": 59.65439987182617, "learning_rate": 9.895153887036768e-07, "loss": 0.8862, "step": 217 }, { "epoch": 0.010532927477412185, "grad_norm": 77.82144165039062, "learning_rate": 9.894670725225878e-07, "loss": 0.9722, "step": 218 }, { "epoch": 0.010581243658501232, "grad_norm": 55.18349075317383, "learning_rate": 9.894187563414988e-07, "loss": 0.8257, "step": 219 }, { "epoch": 0.01062955983959028, "grad_norm": 96.62494659423828, "learning_rate": 9.893704401604098e-07, "loss": 1.0927, "step": 220 }, { "epoch": 0.010677876020679326, "grad_norm": 109.0641860961914, "learning_rate": 9.893221239793205e-07, "loss": 1.1335, "step": 221 }, { "epoch": 0.010726192201768372, "grad_norm": 88.37156677246094, "learning_rate": 9.892738077982315e-07, "loss": 0.939, "step": 222 }, { "epoch": 0.01077450838285742, "grad_norm": 92.60873413085938, "learning_rate": 9.892254916171425e-07, "loss": 1.0472, "step": 223 }, { "epoch": 0.010822824563946466, "grad_norm": 56.704288482666016, "learning_rate": 9.891771754360535e-07, "loss": 0.8097, "step": 224 }, { "epoch": 0.010871140745035512, "grad_norm": 73.02754211425781, "learning_rate": 9.891288592549645e-07, "loss": 0.8853, "step": 225 }, { "epoch": 0.01091945692612456, "grad_norm": 19.05266571044922, "learning_rate": 9.890805430738755e-07, "loss": 0.6501, "step": 226 }, { "epoch": 0.010967773107213606, "grad_norm": 106.98467254638672, "learning_rate": 9.890322268927863e-07, "loss": 1.0325, "step": 227 }, { "epoch": 0.011016089288302652, "grad_norm": 54.38882064819336, "learning_rate": 9.889839107116973e-07, "loss": 0.826, "step": 228 }, { "epoch": 0.011064405469391698, "grad_norm": 36.014137268066406, "learning_rate": 9.889355945306082e-07, "loss": 0.6251, "step": 229 }, { "epoch": 0.011112721650480746, "grad_norm": 66.1773681640625, "learning_rate": 9.888872783495192e-07, "loss": 0.7737, "step": 230 }, { "epoch": 0.011161037831569793, "grad_norm": 66.53890228271484, "learning_rate": 9.888389621684302e-07, "loss": 0.8716, "step": 231 }, { "epoch": 0.011209354012658839, "grad_norm": 85.62137603759766, "learning_rate": 9.88790645987341e-07, "loss": 0.937, "step": 232 }, { "epoch": 0.011257670193747887, "grad_norm": 81.63236999511719, "learning_rate": 9.88742329806252e-07, "loss": 0.8507, "step": 233 }, { "epoch": 0.011305986374836933, "grad_norm": 68.67433166503906, "learning_rate": 9.88694013625163e-07, "loss": 0.8822, "step": 234 }, { "epoch": 0.011354302555925979, "grad_norm": 50.33929443359375, "learning_rate": 9.88645697444074e-07, "loss": 0.7648, "step": 235 }, { "epoch": 0.011402618737015027, "grad_norm": 68.32503509521484, "learning_rate": 9.88597381262985e-07, "loss": 0.8658, "step": 236 }, { "epoch": 0.011450934918104073, "grad_norm": 84.97394561767578, "learning_rate": 9.885490650818957e-07, "loss": 0.8829, "step": 237 }, { "epoch": 0.01149925109919312, "grad_norm": 68.14761352539062, "learning_rate": 9.885007489008067e-07, "loss": 0.806, "step": 238 }, { "epoch": 0.011547567280282167, "grad_norm": 80.87867736816406, "learning_rate": 9.884524327197177e-07, "loss": 0.9106, "step": 239 }, { "epoch": 0.011595883461371213, "grad_norm": 61.519203186035156, "learning_rate": 9.884041165386287e-07, "loss": 0.7858, "step": 240 }, { "epoch": 0.01164419964246026, "grad_norm": 76.84468841552734, "learning_rate": 9.883558003575397e-07, "loss": 0.817, "step": 241 }, { "epoch": 0.011692515823549307, "grad_norm": 47.87452697753906, "learning_rate": 9.883074841764507e-07, "loss": 0.7644, "step": 242 }, { "epoch": 0.011740832004638354, "grad_norm": 31.398197174072266, "learning_rate": 9.882591679953615e-07, "loss": 0.624, "step": 243 }, { "epoch": 0.0117891481857274, "grad_norm": 30.396467208862305, "learning_rate": 9.882108518142725e-07, "loss": 0.6019, "step": 244 }, { "epoch": 0.011837464366816448, "grad_norm": 72.82440185546875, "learning_rate": 9.881625356331835e-07, "loss": 0.7635, "step": 245 }, { "epoch": 0.011885780547905494, "grad_norm": 59.49244689941406, "learning_rate": 9.881142194520944e-07, "loss": 0.6393, "step": 246 }, { "epoch": 0.01193409672899454, "grad_norm": 30.555376052856445, "learning_rate": 9.880659032710054e-07, "loss": 0.6211, "step": 247 }, { "epoch": 0.011982412910083586, "grad_norm": 56.2292594909668, "learning_rate": 9.880175870899164e-07, "loss": 0.7954, "step": 248 }, { "epoch": 0.012030729091172634, "grad_norm": 14.801816940307617, "learning_rate": 9.879692709088274e-07, "loss": 0.5503, "step": 249 }, { "epoch": 0.01207904527226168, "grad_norm": 67.59768676757812, "learning_rate": 9.879209547277382e-07, "loss": 0.7316, "step": 250 }, { "epoch": 0.012127361453350726, "grad_norm": 70.6972427368164, "learning_rate": 9.878726385466492e-07, "loss": 0.6569, "step": 251 }, { "epoch": 0.012175677634439774, "grad_norm": 55.15121841430664, "learning_rate": 9.878243223655602e-07, "loss": 0.7698, "step": 252 }, { "epoch": 0.01222399381552882, "grad_norm": 51.99075698852539, "learning_rate": 9.877760061844712e-07, "loss": 0.5819, "step": 253 }, { "epoch": 0.012272309996617867, "grad_norm": 52.39459228515625, "learning_rate": 9.877276900033822e-07, "loss": 0.6143, "step": 254 }, { "epoch": 0.012320626177706915, "grad_norm": 61.716957092285156, "learning_rate": 9.876793738222931e-07, "loss": 0.6401, "step": 255 }, { "epoch": 0.01236894235879596, "grad_norm": 50.716941833496094, "learning_rate": 9.87631057641204e-07, "loss": 0.5913, "step": 256 }, { "epoch": 0.012417258539885007, "grad_norm": 51.793792724609375, "learning_rate": 9.87582741460115e-07, "loss": 0.6758, "step": 257 }, { "epoch": 0.012465574720974055, "grad_norm": 48.380699157714844, "learning_rate": 9.87534425279026e-07, "loss": 0.6501, "step": 258 }, { "epoch": 0.012513890902063101, "grad_norm": 36.92835998535156, "learning_rate": 9.874861090979369e-07, "loss": 0.5673, "step": 259 }, { "epoch": 0.012562207083152147, "grad_norm": 85.30402374267578, "learning_rate": 9.874377929168479e-07, "loss": 0.7044, "step": 260 }, { "epoch": 0.012610523264241195, "grad_norm": 57.04265213012695, "learning_rate": 9.873894767357589e-07, "loss": 0.5826, "step": 261 }, { "epoch": 0.012658839445330241, "grad_norm": 59.843746185302734, "learning_rate": 9.873411605546699e-07, "loss": 0.6571, "step": 262 }, { "epoch": 0.012707155626419287, "grad_norm": 45.02401351928711, "learning_rate": 9.872928443735806e-07, "loss": 0.5646, "step": 263 }, { "epoch": 0.012755471807508335, "grad_norm": 23.121313095092773, "learning_rate": 9.872445281924916e-07, "loss": 0.5835, "step": 264 }, { "epoch": 0.012803787988597382, "grad_norm": 31.9121150970459, "learning_rate": 9.871962120114026e-07, "loss": 0.5397, "step": 265 }, { "epoch": 0.012852104169686428, "grad_norm": 40.900020599365234, "learning_rate": 9.871478958303136e-07, "loss": 0.6544, "step": 266 }, { "epoch": 0.012900420350775474, "grad_norm": 42.06022644042969, "learning_rate": 9.870995796492246e-07, "loss": 0.6612, "step": 267 }, { "epoch": 0.012948736531864522, "grad_norm": 23.04738426208496, "learning_rate": 9.870512634681354e-07, "loss": 0.6577, "step": 268 }, { "epoch": 0.012997052712953568, "grad_norm": 40.43238830566406, "learning_rate": 9.870029472870464e-07, "loss": 0.642, "step": 269 }, { "epoch": 0.013045368894042614, "grad_norm": 40.472007751464844, "learning_rate": 9.869546311059574e-07, "loss": 0.5236, "step": 270 }, { "epoch": 0.013093685075131662, "grad_norm": 51.133514404296875, "learning_rate": 9.869063149248684e-07, "loss": 0.6391, "step": 271 }, { "epoch": 0.013142001256220708, "grad_norm": 39.87445831298828, "learning_rate": 9.868579987437793e-07, "loss": 0.5293, "step": 272 }, { "epoch": 0.013190317437309754, "grad_norm": 27.491060256958008, "learning_rate": 9.868096825626901e-07, "loss": 0.5589, "step": 273 }, { "epoch": 0.013238633618398802, "grad_norm": 28.730104446411133, "learning_rate": 9.867613663816011e-07, "loss": 0.603, "step": 274 }, { "epoch": 0.013286949799487848, "grad_norm": 53.985469818115234, "learning_rate": 9.86713050200512e-07, "loss": 0.5503, "step": 275 }, { "epoch": 0.013335265980576895, "grad_norm": 38.809104919433594, "learning_rate": 9.86664734019423e-07, "loss": 0.6009, "step": 276 }, { "epoch": 0.013383582161665943, "grad_norm": 51.63545227050781, "learning_rate": 9.86616417838334e-07, "loss": 0.4933, "step": 277 }, { "epoch": 0.013431898342754989, "grad_norm": 60.88602066040039, "learning_rate": 9.86568101657245e-07, "loss": 0.4708, "step": 278 }, { "epoch": 0.013480214523844035, "grad_norm": 25.06388282775879, "learning_rate": 9.865197854761558e-07, "loss": 0.5234, "step": 279 }, { "epoch": 0.013528530704933083, "grad_norm": 40.38130187988281, "learning_rate": 9.864714692950668e-07, "loss": 0.572, "step": 280 }, { "epoch": 0.013576846886022129, "grad_norm": 55.97913360595703, "learning_rate": 9.864231531139778e-07, "loss": 0.5683, "step": 281 }, { "epoch": 0.013625163067111175, "grad_norm": 31.990449905395508, "learning_rate": 9.863748369328888e-07, "loss": 0.4734, "step": 282 }, { "epoch": 0.013673479248200223, "grad_norm": 44.03947448730469, "learning_rate": 9.863265207517998e-07, "loss": 0.5222, "step": 283 }, { "epoch": 0.01372179542928927, "grad_norm": 51.840110778808594, "learning_rate": 9.862782045707106e-07, "loss": 0.4621, "step": 284 }, { "epoch": 0.013770111610378315, "grad_norm": 47.18116760253906, "learning_rate": 9.862298883896216e-07, "loss": 0.4105, "step": 285 }, { "epoch": 0.013818427791467362, "grad_norm": 28.105363845825195, "learning_rate": 9.861815722085326e-07, "loss": 0.4485, "step": 286 }, { "epoch": 0.01386674397255641, "grad_norm": 28.617143630981445, "learning_rate": 9.861332560274436e-07, "loss": 0.4725, "step": 287 }, { "epoch": 0.013915060153645456, "grad_norm": 33.20792770385742, "learning_rate": 9.860849398463545e-07, "loss": 0.4847, "step": 288 }, { "epoch": 0.013963376334734502, "grad_norm": 8.20564079284668, "learning_rate": 9.860366236652653e-07, "loss": 0.57, "step": 289 }, { "epoch": 0.01401169251582355, "grad_norm": 27.0456600189209, "learning_rate": 9.859883074841763e-07, "loss": 0.3856, "step": 290 }, { "epoch": 0.014060008696912596, "grad_norm": 32.23746109008789, "learning_rate": 9.859399913030873e-07, "loss": 0.4407, "step": 291 }, { "epoch": 0.014108324878001642, "grad_norm": 25.526561737060547, "learning_rate": 9.858916751219983e-07, "loss": 0.5201, "step": 292 }, { "epoch": 0.01415664105909069, "grad_norm": 36.878578186035156, "learning_rate": 9.858433589409093e-07, "loss": 0.3968, "step": 293 }, { "epoch": 0.014204957240179736, "grad_norm": 29.679506301879883, "learning_rate": 9.8579504275982e-07, "loss": 0.4239, "step": 294 }, { "epoch": 0.014253273421268782, "grad_norm": 29.540163040161133, "learning_rate": 9.85746726578731e-07, "loss": 0.4652, "step": 295 }, { "epoch": 0.01430158960235783, "grad_norm": 16.892515182495117, "learning_rate": 9.85698410397642e-07, "loss": 0.431, "step": 296 }, { "epoch": 0.014349905783446876, "grad_norm": 17.860170364379883, "learning_rate": 9.85650094216553e-07, "loss": 0.4374, "step": 297 }, { "epoch": 0.014398221964535923, "grad_norm": 10.808305740356445, "learning_rate": 9.85601778035464e-07, "loss": 0.5821, "step": 298 }, { "epoch": 0.01444653814562497, "grad_norm": 13.809274673461914, "learning_rate": 9.85553461854375e-07, "loss": 0.5204, "step": 299 }, { "epoch": 0.014494854326714017, "grad_norm": 25.96785545349121, "learning_rate": 9.85505145673286e-07, "loss": 0.4185, "step": 300 }, { "epoch": 0.014543170507803063, "grad_norm": 19.215608596801758, "learning_rate": 9.854568294921968e-07, "loss": 0.5135, "step": 301 }, { "epoch": 0.01459148668889211, "grad_norm": 10.994481086730957, "learning_rate": 9.854085133111078e-07, "loss": 0.5347, "step": 302 }, { "epoch": 0.014639802869981157, "grad_norm": 38.306640625, "learning_rate": 9.853601971300188e-07, "loss": 0.3382, "step": 303 }, { "epoch": 0.014688119051070203, "grad_norm": 18.480009078979492, "learning_rate": 9.853118809489298e-07, "loss": 0.4539, "step": 304 }, { "epoch": 0.01473643523215925, "grad_norm": 19.531566619873047, "learning_rate": 9.852635647678407e-07, "loss": 0.4022, "step": 305 }, { "epoch": 0.014784751413248297, "grad_norm": 14.084431648254395, "learning_rate": 9.852152485867517e-07, "loss": 0.5053, "step": 306 }, { "epoch": 0.014833067594337343, "grad_norm": 6.586619853973389, "learning_rate": 9.851669324056625e-07, "loss": 0.5756, "step": 307 }, { "epoch": 0.01488138377542639, "grad_norm": 12.97571849822998, "learning_rate": 9.851186162245735e-07, "loss": 0.4244, "step": 308 }, { "epoch": 0.014929699956515437, "grad_norm": 40.84330749511719, "learning_rate": 9.850703000434845e-07, "loss": 0.3999, "step": 309 }, { "epoch": 0.014978016137604484, "grad_norm": 16.034976959228516, "learning_rate": 9.850219838623955e-07, "loss": 0.4274, "step": 310 }, { "epoch": 0.01502633231869353, "grad_norm": 19.961519241333008, "learning_rate": 9.849736676813065e-07, "loss": 0.3583, "step": 311 }, { "epoch": 0.015074648499782578, "grad_norm": 22.720081329345703, "learning_rate": 9.849253515002175e-07, "loss": 0.4407, "step": 312 }, { "epoch": 0.015122964680871624, "grad_norm": 21.990365982055664, "learning_rate": 9.848770353191285e-07, "loss": 0.3958, "step": 313 }, { "epoch": 0.01517128086196067, "grad_norm": 22.633352279663086, "learning_rate": 9.848287191380392e-07, "loss": 0.3733, "step": 314 }, { "epoch": 0.015219597043049718, "grad_norm": 18.09235191345215, "learning_rate": 9.847804029569502e-07, "loss": 0.3399, "step": 315 }, { "epoch": 0.015267913224138764, "grad_norm": 17.60904884338379, "learning_rate": 9.847320867758612e-07, "loss": 0.401, "step": 316 }, { "epoch": 0.01531622940522781, "grad_norm": 12.72434139251709, "learning_rate": 9.846837705947722e-07, "loss": 0.3406, "step": 317 }, { "epoch": 0.015364545586316858, "grad_norm": 10.152015686035156, "learning_rate": 9.846354544136832e-07, "loss": 0.4547, "step": 318 }, { "epoch": 0.015412861767405904, "grad_norm": 6.935280799865723, "learning_rate": 9.845871382325942e-07, "loss": 0.4629, "step": 319 }, { "epoch": 0.01546117794849495, "grad_norm": 12.665138244628906, "learning_rate": 9.84538822051505e-07, "loss": 0.4643, "step": 320 }, { "epoch": 0.015509494129583998, "grad_norm": 11.194241523742676, "learning_rate": 9.84490505870416e-07, "loss": 0.339, "step": 321 }, { "epoch": 0.015557810310673045, "grad_norm": 8.179733276367188, "learning_rate": 9.84442189689327e-07, "loss": 0.3665, "step": 322 }, { "epoch": 0.01560612649176209, "grad_norm": 11.779800415039062, "learning_rate": 9.84393873508238e-07, "loss": 0.3883, "step": 323 }, { "epoch": 0.015654442672851137, "grad_norm": 14.43346881866455, "learning_rate": 9.84345557327149e-07, "loss": 0.4864, "step": 324 }, { "epoch": 0.015702758853940183, "grad_norm": 6.0207037925720215, "learning_rate": 9.842972411460597e-07, "loss": 0.5012, "step": 325 }, { "epoch": 0.015751075035029233, "grad_norm": 20.621604919433594, "learning_rate": 9.842489249649707e-07, "loss": 0.4505, "step": 326 }, { "epoch": 0.01579939121611828, "grad_norm": 13.213973999023438, "learning_rate": 9.842006087838817e-07, "loss": 0.2588, "step": 327 }, { "epoch": 0.015847707397207325, "grad_norm": 6.511539936065674, "learning_rate": 9.841522926027927e-07, "loss": 0.5502, "step": 328 }, { "epoch": 0.01589602357829637, "grad_norm": 7.498532772064209, "learning_rate": 9.841039764217037e-07, "loss": 0.4498, "step": 329 }, { "epoch": 0.015944339759385417, "grad_norm": 5.720475196838379, "learning_rate": 9.840556602406144e-07, "loss": 0.4601, "step": 330 }, { "epoch": 0.015992655940474464, "grad_norm": 7.24036979675293, "learning_rate": 9.840073440595254e-07, "loss": 0.3372, "step": 331 }, { "epoch": 0.016040972121563513, "grad_norm": 11.252107620239258, "learning_rate": 9.839590278784364e-07, "loss": 0.3471, "step": 332 }, { "epoch": 0.01608928830265256, "grad_norm": 11.276805877685547, "learning_rate": 9.839107116973474e-07, "loss": 0.3531, "step": 333 }, { "epoch": 0.016137604483741606, "grad_norm": 13.437745094299316, "learning_rate": 9.838623955162584e-07, "loss": 0.3621, "step": 334 }, { "epoch": 0.016185920664830652, "grad_norm": 8.880374908447266, "learning_rate": 9.838140793351694e-07, "loss": 0.4336, "step": 335 }, { "epoch": 0.016234236845919698, "grad_norm": 6.110656261444092, "learning_rate": 9.837657631540802e-07, "loss": 0.3413, "step": 336 }, { "epoch": 0.016282553027008744, "grad_norm": 9.685194969177246, "learning_rate": 9.837174469729912e-07, "loss": 0.4685, "step": 337 }, { "epoch": 0.01633086920809779, "grad_norm": 8.669172286987305, "learning_rate": 9.836691307919022e-07, "loss": 0.3996, "step": 338 }, { "epoch": 0.01637918538918684, "grad_norm": 11.544371604919434, "learning_rate": 9.836208146108131e-07, "loss": 0.281, "step": 339 }, { "epoch": 0.016427501570275886, "grad_norm": 7.83820104598999, "learning_rate": 9.835724984297241e-07, "loss": 0.3916, "step": 340 }, { "epoch": 0.016475817751364932, "grad_norm": 5.995692729949951, "learning_rate": 9.83524182248635e-07, "loss": 0.4362, "step": 341 }, { "epoch": 0.01652413393245398, "grad_norm": 7.17012882232666, "learning_rate": 9.83475866067546e-07, "loss": 0.3227, "step": 342 }, { "epoch": 0.016572450113543025, "grad_norm": 11.932387351989746, "learning_rate": 9.834275498864569e-07, "loss": 0.2869, "step": 343 }, { "epoch": 0.01662076629463207, "grad_norm": 6.221382141113281, "learning_rate": 9.833792337053679e-07, "loss": 0.4441, "step": 344 }, { "epoch": 0.01666908247572112, "grad_norm": 8.745169639587402, "learning_rate": 9.833309175242789e-07, "loss": 0.2628, "step": 345 }, { "epoch": 0.016717398656810167, "grad_norm": 11.14468765258789, "learning_rate": 9.832826013431897e-07, "loss": 0.2431, "step": 346 }, { "epoch": 0.016765714837899213, "grad_norm": 5.589805603027344, "learning_rate": 9.832342851621006e-07, "loss": 0.442, "step": 347 }, { "epoch": 0.01681403101898826, "grad_norm": 31.858070373535156, "learning_rate": 9.831859689810116e-07, "loss": 0.251, "step": 348 }, { "epoch": 0.016862347200077305, "grad_norm": 6.123062610626221, "learning_rate": 9.831376527999226e-07, "loss": 0.3746, "step": 349 }, { "epoch": 0.01691066338116635, "grad_norm": 7.616664409637451, "learning_rate": 9.830893366188336e-07, "loss": 0.3616, "step": 350 }, { "epoch": 0.0169589795622554, "grad_norm": 5.669783592224121, "learning_rate": 9.830410204377446e-07, "loss": 0.3425, "step": 351 }, { "epoch": 0.017007295743344447, "grad_norm": 9.210264205932617, "learning_rate": 9.829927042566554e-07, "loss": 0.3082, "step": 352 }, { "epoch": 0.017055611924433493, "grad_norm": 4.259627819061279, "learning_rate": 9.829443880755664e-07, "loss": 0.4859, "step": 353 }, { "epoch": 0.01710392810552254, "grad_norm": 7.9227118492126465, "learning_rate": 9.828960718944774e-07, "loss": 0.3495, "step": 354 }, { "epoch": 0.017152244286611586, "grad_norm": 5.306040287017822, "learning_rate": 9.828477557133884e-07, "loss": 0.3339, "step": 355 }, { "epoch": 0.017200560467700632, "grad_norm": 6.638487339019775, "learning_rate": 9.827994395322993e-07, "loss": 0.3233, "step": 356 }, { "epoch": 0.017248876648789678, "grad_norm": 5.220907211303711, "learning_rate": 9.827511233512103e-07, "loss": 0.4093, "step": 357 }, { "epoch": 0.017297192829878728, "grad_norm": 8.701929092407227, "learning_rate": 9.827028071701213e-07, "loss": 0.3121, "step": 358 }, { "epoch": 0.017345509010967774, "grad_norm": 7.364223480224609, "learning_rate": 9.82654490989032e-07, "loss": 0.2771, "step": 359 }, { "epoch": 0.01739382519205682, "grad_norm": 4.26821231842041, "learning_rate": 9.82606174807943e-07, "loss": 0.4393, "step": 360 }, { "epoch": 0.017442141373145866, "grad_norm": 7.30900239944458, "learning_rate": 9.82557858626854e-07, "loss": 0.2671, "step": 361 }, { "epoch": 0.017490457554234912, "grad_norm": 6.1524977684021, "learning_rate": 9.82509542445765e-07, "loss": 0.4385, "step": 362 }, { "epoch": 0.01753877373532396, "grad_norm": 3.829899549484253, "learning_rate": 9.82461226264676e-07, "loss": 0.3722, "step": 363 }, { "epoch": 0.017587089916413008, "grad_norm": 16.7507381439209, "learning_rate": 9.82412910083587e-07, "loss": 0.2991, "step": 364 }, { "epoch": 0.017635406097502054, "grad_norm": 9.395427703857422, "learning_rate": 9.823645939024978e-07, "loss": 0.3326, "step": 365 }, { "epoch": 0.0176837222785911, "grad_norm": 7.401327133178711, "learning_rate": 9.823162777214088e-07, "loss": 0.2992, "step": 366 }, { "epoch": 0.017732038459680147, "grad_norm": 6.537594318389893, "learning_rate": 9.822679615403198e-07, "loss": 0.327, "step": 367 }, { "epoch": 0.017780354640769193, "grad_norm": 5.165227890014648, "learning_rate": 9.822196453592308e-07, "loss": 0.2622, "step": 368 }, { "epoch": 0.01782867082185824, "grad_norm": 8.773951530456543, "learning_rate": 9.821713291781418e-07, "loss": 0.5338, "step": 369 }, { "epoch": 0.01787698700294729, "grad_norm": 4.609565734863281, "learning_rate": 9.821230129970528e-07, "loss": 0.3659, "step": 370 }, { "epoch": 0.017925303184036335, "grad_norm": 3.303905725479126, "learning_rate": 9.820746968159636e-07, "loss": 0.3168, "step": 371 }, { "epoch": 0.01797361936512538, "grad_norm": 6.679924964904785, "learning_rate": 9.820263806348746e-07, "loss": 0.2554, "step": 372 }, { "epoch": 0.018021935546214427, "grad_norm": 4.242708206176758, "learning_rate": 9.819780644537855e-07, "loss": 0.4025, "step": 373 }, { "epoch": 0.018070251727303473, "grad_norm": 5.4395060539245605, "learning_rate": 9.819297482726965e-07, "loss": 0.2377, "step": 374 }, { "epoch": 0.01811856790839252, "grad_norm": 4.4214091300964355, "learning_rate": 9.818814320916075e-07, "loss": 0.2598, "step": 375 }, { "epoch": 0.018166884089481566, "grad_norm": 12.523144721984863, "learning_rate": 9.818331159105185e-07, "loss": 0.4547, "step": 376 }, { "epoch": 0.018215200270570615, "grad_norm": 3.6031417846679688, "learning_rate": 9.817847997294293e-07, "loss": 0.4425, "step": 377 }, { "epoch": 0.01826351645165966, "grad_norm": 4.908614635467529, "learning_rate": 9.817364835483403e-07, "loss": 0.2625, "step": 378 }, { "epoch": 0.018311832632748708, "grad_norm": 5.530620574951172, "learning_rate": 9.816881673672513e-07, "loss": 0.2093, "step": 379 }, { "epoch": 0.018360148813837754, "grad_norm": 4.781846523284912, "learning_rate": 9.816398511861623e-07, "loss": 0.5021, "step": 380 }, { "epoch": 0.0184084649949268, "grad_norm": 5.215845584869385, "learning_rate": 9.815915350050733e-07, "loss": 0.3972, "step": 381 }, { "epoch": 0.018456781176015846, "grad_norm": 4.437341213226318, "learning_rate": 9.81543218823984e-07, "loss": 0.332, "step": 382 }, { "epoch": 0.018505097357104896, "grad_norm": 4.646346092224121, "learning_rate": 9.81494902642895e-07, "loss": 0.2916, "step": 383 }, { "epoch": 0.018553413538193942, "grad_norm": 5.644554615020752, "learning_rate": 9.81446586461806e-07, "loss": 0.4255, "step": 384 }, { "epoch": 0.018601729719282988, "grad_norm": 41.7437629699707, "learning_rate": 9.81398270280717e-07, "loss": 0.2661, "step": 385 }, { "epoch": 0.018650045900372034, "grad_norm": 3.083519220352173, "learning_rate": 9.81349954099628e-07, "loss": 0.3426, "step": 386 }, { "epoch": 0.01869836208146108, "grad_norm": 5.881302833557129, "learning_rate": 9.81301637918539e-07, "loss": 0.3286, "step": 387 }, { "epoch": 0.018746678262550127, "grad_norm": 3.6041393280029297, "learning_rate": 9.812533217374498e-07, "loss": 0.3446, "step": 388 }, { "epoch": 0.018794994443639176, "grad_norm": 3.4815993309020996, "learning_rate": 9.812050055563607e-07, "loss": 0.4243, "step": 389 }, { "epoch": 0.018843310624728223, "grad_norm": 3.732922315597534, "learning_rate": 9.811566893752717e-07, "loss": 0.3789, "step": 390 }, { "epoch": 0.01889162680581727, "grad_norm": 4.128118991851807, "learning_rate": 9.811083731941827e-07, "loss": 0.5137, "step": 391 }, { "epoch": 0.018939942986906315, "grad_norm": 3.623991012573242, "learning_rate": 9.810600570130937e-07, "loss": 0.4449, "step": 392 }, { "epoch": 0.01898825916799536, "grad_norm": 4.722195625305176, "learning_rate": 9.810117408320045e-07, "loss": 0.29, "step": 393 }, { "epoch": 0.019036575349084407, "grad_norm": 3.1039135456085205, "learning_rate": 9.809634246509155e-07, "loss": 0.2813, "step": 394 }, { "epoch": 0.019084891530173453, "grad_norm": 3.1731393337249756, "learning_rate": 9.809151084698265e-07, "loss": 0.4272, "step": 395 }, { "epoch": 0.019133207711262503, "grad_norm": 4.119685173034668, "learning_rate": 9.808667922887375e-07, "loss": 0.361, "step": 396 }, { "epoch": 0.01918152389235155, "grad_norm": 4.165393829345703, "learning_rate": 9.808184761076485e-07, "loss": 0.3531, "step": 397 }, { "epoch": 0.019229840073440595, "grad_norm": 7.950779438018799, "learning_rate": 9.807701599265592e-07, "loss": 0.3714, "step": 398 }, { "epoch": 0.01927815625452964, "grad_norm": 3.8319132328033447, "learning_rate": 9.807218437454702e-07, "loss": 0.271, "step": 399 }, { "epoch": 0.019326472435618688, "grad_norm": 2.8254339694976807, "learning_rate": 9.806735275643812e-07, "loss": 0.3238, "step": 400 }, { "epoch": 0.019374788616707734, "grad_norm": 3.5678088665008545, "learning_rate": 9.806252113832922e-07, "loss": 0.3744, "step": 401 }, { "epoch": 0.019423104797796784, "grad_norm": 3.7996773719787598, "learning_rate": 9.805768952022032e-07, "loss": 0.4688, "step": 402 }, { "epoch": 0.01947142097888583, "grad_norm": 3.146111488342285, "learning_rate": 9.80528579021114e-07, "loss": 0.3421, "step": 403 }, { "epoch": 0.019519737159974876, "grad_norm": 3.629199743270874, "learning_rate": 9.80480262840025e-07, "loss": 0.2668, "step": 404 }, { "epoch": 0.019568053341063922, "grad_norm": 3.28444504737854, "learning_rate": 9.80431946658936e-07, "loss": 0.4586, "step": 405 }, { "epoch": 0.019616369522152968, "grad_norm": 3.0093719959259033, "learning_rate": 9.80383630477847e-07, "loss": 0.2633, "step": 406 }, { "epoch": 0.019664685703242014, "grad_norm": 3.5111095905303955, "learning_rate": 9.80335314296758e-07, "loss": 0.4369, "step": 407 }, { "epoch": 0.019713001884331064, "grad_norm": 2.9169740676879883, "learning_rate": 9.80286998115669e-07, "loss": 0.2878, "step": 408 }, { "epoch": 0.01976131806542011, "grad_norm": 3.487293243408203, "learning_rate": 9.8023868193458e-07, "loss": 0.3488, "step": 409 }, { "epoch": 0.019809634246509156, "grad_norm": 3.509518623352051, "learning_rate": 9.801903657534907e-07, "loss": 0.4293, "step": 410 }, { "epoch": 0.019857950427598203, "grad_norm": 4.9561028480529785, "learning_rate": 9.801420495724017e-07, "loss": 0.5386, "step": 411 }, { "epoch": 0.01990626660868725, "grad_norm": 3.3470301628112793, "learning_rate": 9.800937333913127e-07, "loss": 0.2439, "step": 412 }, { "epoch": 0.019954582789776295, "grad_norm": 3.7768776416778564, "learning_rate": 9.800454172102237e-07, "loss": 0.3648, "step": 413 }, { "epoch": 0.02000289897086534, "grad_norm": 8.392770767211914, "learning_rate": 9.799971010291347e-07, "loss": 0.5006, "step": 414 }, { "epoch": 0.02005121515195439, "grad_norm": 3.104482889175415, "learning_rate": 9.799487848480456e-07, "loss": 0.3399, "step": 415 }, { "epoch": 0.020099531333043437, "grad_norm": 7.136111259460449, "learning_rate": 9.799004686669564e-07, "loss": 0.2977, "step": 416 }, { "epoch": 0.020147847514132483, "grad_norm": 4.938168525695801, "learning_rate": 9.798521524858674e-07, "loss": 0.4122, "step": 417 }, { "epoch": 0.02019616369522153, "grad_norm": 2.926515579223633, "learning_rate": 9.798038363047784e-07, "loss": 0.3202, "step": 418 }, { "epoch": 0.020244479876310575, "grad_norm": 3.2249867916107178, "learning_rate": 9.797555201236894e-07, "loss": 0.4042, "step": 419 }, { "epoch": 0.02029279605739962, "grad_norm": 2.90742564201355, "learning_rate": 9.797072039426004e-07, "loss": 0.289, "step": 420 }, { "epoch": 0.02034111223848867, "grad_norm": 3.1158111095428467, "learning_rate": 9.796588877615114e-07, "loss": 0.3662, "step": 421 }, { "epoch": 0.020389428419577717, "grad_norm": 4.570650577545166, "learning_rate": 9.796105715804224e-07, "loss": 0.3526, "step": 422 }, { "epoch": 0.020437744600666764, "grad_norm": 2.940622091293335, "learning_rate": 9.795622553993331e-07, "loss": 0.4222, "step": 423 }, { "epoch": 0.02048606078175581, "grad_norm": 3.4018609523773193, "learning_rate": 9.795139392182441e-07, "loss": 0.4545, "step": 424 }, { "epoch": 0.020534376962844856, "grad_norm": 4.354815483093262, "learning_rate": 9.794656230371551e-07, "loss": 0.3244, "step": 425 }, { "epoch": 0.020582693143933902, "grad_norm": 3.3759443759918213, "learning_rate": 9.794173068560661e-07, "loss": 0.233, "step": 426 }, { "epoch": 0.020631009325022952, "grad_norm": 4.769083023071289, "learning_rate": 9.793689906749771e-07, "loss": 0.3238, "step": 427 }, { "epoch": 0.020679325506111998, "grad_norm": 3.3851613998413086, "learning_rate": 9.79320674493888e-07, "loss": 0.2822, "step": 428 }, { "epoch": 0.020727641687201044, "grad_norm": 3.1707046031951904, "learning_rate": 9.792723583127989e-07, "loss": 0.3811, "step": 429 }, { "epoch": 0.02077595786829009, "grad_norm": 3.64534330368042, "learning_rate": 9.792240421317099e-07, "loss": 0.3169, "step": 430 }, { "epoch": 0.020824274049379136, "grad_norm": 4.074503421783447, "learning_rate": 9.791757259506209e-07, "loss": 0.507, "step": 431 }, { "epoch": 0.020872590230468183, "grad_norm": 3.039674997329712, "learning_rate": 9.791274097695318e-07, "loss": 0.3362, "step": 432 }, { "epoch": 0.02092090641155723, "grad_norm": 3.4387049674987793, "learning_rate": 9.790790935884428e-07, "loss": 0.3153, "step": 433 }, { "epoch": 0.02096922259264628, "grad_norm": 2.7715041637420654, "learning_rate": 9.790307774073536e-07, "loss": 0.2612, "step": 434 }, { "epoch": 0.021017538773735325, "grad_norm": 5.62885046005249, "learning_rate": 9.789824612262646e-07, "loss": 0.3185, "step": 435 }, { "epoch": 0.02106585495482437, "grad_norm": 3.294832944869995, "learning_rate": 9.789341450451756e-07, "loss": 0.438, "step": 436 }, { "epoch": 0.021114171135913417, "grad_norm": 5.170270919799805, "learning_rate": 9.788858288640866e-07, "loss": 0.3815, "step": 437 }, { "epoch": 0.021162487317002463, "grad_norm": 3.370877981185913, "learning_rate": 9.788375126829976e-07, "loss": 0.4276, "step": 438 }, { "epoch": 0.02121080349809151, "grad_norm": 2.701573133468628, "learning_rate": 9.787891965019086e-07, "loss": 0.3384, "step": 439 }, { "epoch": 0.02125911967918056, "grad_norm": 3.839859962463379, "learning_rate": 9.787408803208193e-07, "loss": 0.2438, "step": 440 }, { "epoch": 0.021307435860269605, "grad_norm": 3.1781163215637207, "learning_rate": 9.786925641397303e-07, "loss": 0.2734, "step": 441 }, { "epoch": 0.02135575204135865, "grad_norm": 3.0526866912841797, "learning_rate": 9.786442479586413e-07, "loss": 0.2754, "step": 442 }, { "epoch": 0.021404068222447697, "grad_norm": 4.099555015563965, "learning_rate": 9.785959317775523e-07, "loss": 0.5114, "step": 443 }, { "epoch": 0.021452384403536744, "grad_norm": 5.918056011199951, "learning_rate": 9.785476155964633e-07, "loss": 0.3193, "step": 444 }, { "epoch": 0.02150070058462579, "grad_norm": 4.311163425445557, "learning_rate": 9.78499299415374e-07, "loss": 0.4458, "step": 445 }, { "epoch": 0.02154901676571484, "grad_norm": 2.585665225982666, "learning_rate": 9.78450983234285e-07, "loss": 0.289, "step": 446 }, { "epoch": 0.021597332946803886, "grad_norm": 3.854249954223633, "learning_rate": 9.78402667053196e-07, "loss": 0.4993, "step": 447 }, { "epoch": 0.021645649127892932, "grad_norm": 2.714472770690918, "learning_rate": 9.78354350872107e-07, "loss": 0.2207, "step": 448 }, { "epoch": 0.021693965308981978, "grad_norm": 2.8523995876312256, "learning_rate": 9.78306034691018e-07, "loss": 0.3331, "step": 449 }, { "epoch": 0.021742281490071024, "grad_norm": 2.948734998703003, "learning_rate": 9.782577185099288e-07, "loss": 0.358, "step": 450 }, { "epoch": 0.02179059767116007, "grad_norm": 9.310885429382324, "learning_rate": 9.782094023288398e-07, "loss": 0.3244, "step": 451 }, { "epoch": 0.02183891385224912, "grad_norm": 2.6029322147369385, "learning_rate": 9.781610861477508e-07, "loss": 0.2553, "step": 452 }, { "epoch": 0.021887230033338166, "grad_norm": 2.710512638092041, "learning_rate": 9.781127699666618e-07, "loss": 0.2588, "step": 453 }, { "epoch": 0.021935546214427212, "grad_norm": 3.6317481994628906, "learning_rate": 9.780644537855728e-07, "loss": 0.4491, "step": 454 }, { "epoch": 0.02198386239551626, "grad_norm": 3.7179038524627686, "learning_rate": 9.780161376044836e-07, "loss": 0.3612, "step": 455 }, { "epoch": 0.022032178576605305, "grad_norm": 3.233677864074707, "learning_rate": 9.779678214233946e-07, "loss": 0.3663, "step": 456 }, { "epoch": 0.02208049475769435, "grad_norm": 2.6833808422088623, "learning_rate": 9.779195052423055e-07, "loss": 0.4345, "step": 457 }, { "epoch": 0.022128810938783397, "grad_norm": 3.8961822986602783, "learning_rate": 9.778711890612165e-07, "loss": 0.3121, "step": 458 }, { "epoch": 0.022177127119872447, "grad_norm": 3.0824756622314453, "learning_rate": 9.778228728801275e-07, "loss": 0.2958, "step": 459 }, { "epoch": 0.022225443300961493, "grad_norm": 2.9355263710021973, "learning_rate": 9.777745566990385e-07, "loss": 0.3195, "step": 460 }, { "epoch": 0.02227375948205054, "grad_norm": 2.559811592102051, "learning_rate": 9.777262405179493e-07, "loss": 0.1868, "step": 461 }, { "epoch": 0.022322075663139585, "grad_norm": 8.567280769348145, "learning_rate": 9.776779243368603e-07, "loss": 0.2339, "step": 462 }, { "epoch": 0.02237039184422863, "grad_norm": 6.460943698883057, "learning_rate": 9.776296081557713e-07, "loss": 0.4277, "step": 463 }, { "epoch": 0.022418708025317678, "grad_norm": 2.517951011657715, "learning_rate": 9.775812919746823e-07, "loss": 0.208, "step": 464 }, { "epoch": 0.022467024206406727, "grad_norm": 3.0611653327941895, "learning_rate": 9.775329757935933e-07, "loss": 0.6126, "step": 465 }, { "epoch": 0.022515340387495773, "grad_norm": 3.735119104385376, "learning_rate": 9.774846596125042e-07, "loss": 0.3938, "step": 466 }, { "epoch": 0.02256365656858482, "grad_norm": 5.450608253479004, "learning_rate": 9.77436343431415e-07, "loss": 0.336, "step": 467 }, { "epoch": 0.022611972749673866, "grad_norm": 3.2728590965270996, "learning_rate": 9.77388027250326e-07, "loss": 0.3476, "step": 468 }, { "epoch": 0.022660288930762912, "grad_norm": 2.134758949279785, "learning_rate": 9.77339711069237e-07, "loss": 0.2466, "step": 469 }, { "epoch": 0.022708605111851958, "grad_norm": 2.7927608489990234, "learning_rate": 9.77291394888148e-07, "loss": 0.2361, "step": 470 }, { "epoch": 0.022756921292941008, "grad_norm": 2.5517585277557373, "learning_rate": 9.77243078707059e-07, "loss": 0.2563, "step": 471 }, { "epoch": 0.022805237474030054, "grad_norm": 2.8972911834716797, "learning_rate": 9.7719476252597e-07, "loss": 0.2201, "step": 472 }, { "epoch": 0.0228535536551191, "grad_norm": 3.513291358947754, "learning_rate": 9.77146446344881e-07, "loss": 0.3769, "step": 473 }, { "epoch": 0.022901869836208146, "grad_norm": 6.837310314178467, "learning_rate": 9.770981301637917e-07, "loss": 0.353, "step": 474 }, { "epoch": 0.022950186017297192, "grad_norm": 3.0168793201446533, "learning_rate": 9.770498139827027e-07, "loss": 0.397, "step": 475 }, { "epoch": 0.02299850219838624, "grad_norm": 2.9581501483917236, "learning_rate": 9.770014978016137e-07, "loss": 0.3131, "step": 476 }, { "epoch": 0.023046818379475285, "grad_norm": 3.4254112243652344, "learning_rate": 9.769531816205247e-07, "loss": 0.4506, "step": 477 }, { "epoch": 0.023095134560564334, "grad_norm": 4.728109836578369, "learning_rate": 9.769048654394357e-07, "loss": 0.3619, "step": 478 }, { "epoch": 0.02314345074165338, "grad_norm": 4.072128772735596, "learning_rate": 9.768565492583467e-07, "loss": 0.4191, "step": 479 }, { "epoch": 0.023191766922742427, "grad_norm": 5.0161452293396, "learning_rate": 9.768082330772575e-07, "loss": 0.2139, "step": 480 }, { "epoch": 0.023240083103831473, "grad_norm": 10.605693817138672, "learning_rate": 9.767599168961685e-07, "loss": 0.3079, "step": 481 }, { "epoch": 0.02328839928492052, "grad_norm": 2.788853645324707, "learning_rate": 9.767116007150795e-07, "loss": 0.3628, "step": 482 }, { "epoch": 0.023336715466009565, "grad_norm": 4.859884738922119, "learning_rate": 9.766632845339904e-07, "loss": 0.4161, "step": 483 }, { "epoch": 0.023385031647098615, "grad_norm": 3.4943976402282715, "learning_rate": 9.766149683529014e-07, "loss": 0.4823, "step": 484 }, { "epoch": 0.02343334782818766, "grad_norm": 2.7220962047576904, "learning_rate": 9.765666521718124e-07, "loss": 0.3886, "step": 485 }, { "epoch": 0.023481664009276707, "grad_norm": 4.176887512207031, "learning_rate": 9.765183359907232e-07, "loss": 0.2793, "step": 486 }, { "epoch": 0.023529980190365753, "grad_norm": 2.4157021045684814, "learning_rate": 9.764700198096342e-07, "loss": 0.2366, "step": 487 }, { "epoch": 0.0235782963714548, "grad_norm": 4.784462928771973, "learning_rate": 9.764217036285452e-07, "loss": 0.3512, "step": 488 }, { "epoch": 0.023626612552543846, "grad_norm": 2.8248331546783447, "learning_rate": 9.763733874474562e-07, "loss": 0.2613, "step": 489 }, { "epoch": 0.023674928733632895, "grad_norm": 2.555285930633545, "learning_rate": 9.763250712663672e-07, "loss": 0.2959, "step": 490 }, { "epoch": 0.02372324491472194, "grad_norm": 3.190674066543579, "learning_rate": 9.762767550852782e-07, "loss": 0.4428, "step": 491 }, { "epoch": 0.023771561095810988, "grad_norm": 6.292142868041992, "learning_rate": 9.76228438904189e-07, "loss": 0.322, "step": 492 }, { "epoch": 0.023819877276900034, "grad_norm": 4.204588413238525, "learning_rate": 9.761801227231e-07, "loss": 0.4386, "step": 493 }, { "epoch": 0.02386819345798908, "grad_norm": 3.1012721061706543, "learning_rate": 9.76131806542011e-07, "loss": 0.4592, "step": 494 }, { "epoch": 0.023916509639078126, "grad_norm": 2.4598255157470703, "learning_rate": 9.76083490360922e-07, "loss": 0.2561, "step": 495 }, { "epoch": 0.023964825820167172, "grad_norm": 3.5692546367645264, "learning_rate": 9.760351741798329e-07, "loss": 0.4042, "step": 496 }, { "epoch": 0.024013142001256222, "grad_norm": 9.71478271484375, "learning_rate": 9.759868579987437e-07, "loss": 0.2703, "step": 497 }, { "epoch": 0.024061458182345268, "grad_norm": 2.3937363624572754, "learning_rate": 9.759385418176547e-07, "loss": 0.2364, "step": 498 }, { "epoch": 0.024109774363434314, "grad_norm": 3.701051950454712, "learning_rate": 9.758902256365656e-07, "loss": 0.3953, "step": 499 }, { "epoch": 0.02415809054452336, "grad_norm": 25.053916931152344, "learning_rate": 9.758419094554766e-07, "loss": 0.3231, "step": 500 }, { "epoch": 0.024206406725612407, "grad_norm": 3.0199577808380127, "learning_rate": 9.757935932743876e-07, "loss": 0.292, "step": 501 }, { "epoch": 0.024254722906701453, "grad_norm": 4.03092098236084, "learning_rate": 9.757452770932984e-07, "loss": 0.5021, "step": 502 }, { "epoch": 0.024303039087790503, "grad_norm": 3.5100362300872803, "learning_rate": 9.756969609122094e-07, "loss": 0.2864, "step": 503 }, { "epoch": 0.02435135526887955, "grad_norm": 3.3808250427246094, "learning_rate": 9.756486447311204e-07, "loss": 0.3184, "step": 504 }, { "epoch": 0.024399671449968595, "grad_norm": 2.4669837951660156, "learning_rate": 9.756003285500314e-07, "loss": 0.2344, "step": 505 }, { "epoch": 0.02444798763105764, "grad_norm": 6.6219353675842285, "learning_rate": 9.755520123689424e-07, "loss": 0.2986, "step": 506 }, { "epoch": 0.024496303812146687, "grad_norm": 3.54085373878479, "learning_rate": 9.755036961878531e-07, "loss": 0.392, "step": 507 }, { "epoch": 0.024544619993235733, "grad_norm": 2.3859856128692627, "learning_rate": 9.754553800067641e-07, "loss": 0.2179, "step": 508 }, { "epoch": 0.024592936174324783, "grad_norm": 9.23926067352295, "learning_rate": 9.754070638256751e-07, "loss": 0.3434, "step": 509 }, { "epoch": 0.02464125235541383, "grad_norm": 2.936267375946045, "learning_rate": 9.753587476445861e-07, "loss": 0.4425, "step": 510 }, { "epoch": 0.024689568536502875, "grad_norm": 3.397599697113037, "learning_rate": 9.753104314634971e-07, "loss": 0.3439, "step": 511 }, { "epoch": 0.02473788471759192, "grad_norm": 2.6518771648406982, "learning_rate": 9.752621152824079e-07, "loss": 0.2883, "step": 512 }, { "epoch": 0.024786200898680968, "grad_norm": 2.4209346771240234, "learning_rate": 9.752137991013189e-07, "loss": 0.2886, "step": 513 }, { "epoch": 0.024834517079770014, "grad_norm": 2.5146892070770264, "learning_rate": 9.751654829202299e-07, "loss": 0.2862, "step": 514 }, { "epoch": 0.02488283326085906, "grad_norm": 2.590160846710205, "learning_rate": 9.751171667391409e-07, "loss": 0.2587, "step": 515 }, { "epoch": 0.02493114944194811, "grad_norm": 8.160828590393066, "learning_rate": 9.750688505580518e-07, "loss": 0.3262, "step": 516 }, { "epoch": 0.024979465623037156, "grad_norm": 6.713265419006348, "learning_rate": 9.750205343769628e-07, "loss": 0.437, "step": 517 }, { "epoch": 0.025027781804126202, "grad_norm": 2.5147197246551514, "learning_rate": 9.749722181958736e-07, "loss": 0.2419, "step": 518 }, { "epoch": 0.025076097985215248, "grad_norm": 3.209458589553833, "learning_rate": 9.749239020147846e-07, "loss": 0.4003, "step": 519 }, { "epoch": 0.025124414166304294, "grad_norm": 3.292206048965454, "learning_rate": 9.748755858336956e-07, "loss": 0.4171, "step": 520 }, { "epoch": 0.02517273034739334, "grad_norm": 4.197911262512207, "learning_rate": 9.748272696526066e-07, "loss": 0.4272, "step": 521 }, { "epoch": 0.02522104652848239, "grad_norm": 6.616693019866943, "learning_rate": 9.747789534715176e-07, "loss": 0.3632, "step": 522 }, { "epoch": 0.025269362709571436, "grad_norm": 5.939690113067627, "learning_rate": 9.747306372904286e-07, "loss": 0.3168, "step": 523 }, { "epoch": 0.025317678890660483, "grad_norm": 2.459395170211792, "learning_rate": 9.746823211093396e-07, "loss": 0.2799, "step": 524 }, { "epoch": 0.02536599507174953, "grad_norm": 3.001983165740967, "learning_rate": 9.746340049282503e-07, "loss": 0.3163, "step": 525 }, { "epoch": 0.025414311252838575, "grad_norm": 2.660701274871826, "learning_rate": 9.745856887471613e-07, "loss": 0.3334, "step": 526 }, { "epoch": 0.02546262743392762, "grad_norm": 5.07377290725708, "learning_rate": 9.745373725660723e-07, "loss": 0.5727, "step": 527 }, { "epoch": 0.02551094361501667, "grad_norm": 8.080005645751953, "learning_rate": 9.744890563849833e-07, "loss": 0.2935, "step": 528 }, { "epoch": 0.025559259796105717, "grad_norm": 4.334054946899414, "learning_rate": 9.744407402038943e-07, "loss": 0.2846, "step": 529 }, { "epoch": 0.025607575977194763, "grad_norm": 2.4584858417510986, "learning_rate": 9.743924240228053e-07, "loss": 0.3082, "step": 530 }, { "epoch": 0.02565589215828381, "grad_norm": 3.116365432739258, "learning_rate": 9.74344107841716e-07, "loss": 0.4152, "step": 531 }, { "epoch": 0.025704208339372855, "grad_norm": 8.42453384399414, "learning_rate": 9.74295791660627e-07, "loss": 0.3803, "step": 532 }, { "epoch": 0.0257525245204619, "grad_norm": 2.6850357055664062, "learning_rate": 9.74247475479538e-07, "loss": 0.337, "step": 533 }, { "epoch": 0.025800840701550948, "grad_norm": 3.4855525493621826, "learning_rate": 9.74199159298449e-07, "loss": 0.335, "step": 534 }, { "epoch": 0.025849156882639997, "grad_norm": 16.58172035217285, "learning_rate": 9.7415084311736e-07, "loss": 0.3402, "step": 535 }, { "epoch": 0.025897473063729044, "grad_norm": 3.4395294189453125, "learning_rate": 9.74102526936271e-07, "loss": 0.435, "step": 536 }, { "epoch": 0.02594578924481809, "grad_norm": 2.157045364379883, "learning_rate": 9.74054210755182e-07, "loss": 0.2825, "step": 537 }, { "epoch": 0.025994105425907136, "grad_norm": 3.293933391571045, "learning_rate": 9.740058945740928e-07, "loss": 0.4144, "step": 538 }, { "epoch": 0.026042421606996182, "grad_norm": 2.6696367263793945, "learning_rate": 9.739575783930038e-07, "loss": 0.3172, "step": 539 }, { "epoch": 0.02609073778808523, "grad_norm": 5.949459075927734, "learning_rate": 9.739092622119148e-07, "loss": 0.2436, "step": 540 }, { "epoch": 0.026139053969174278, "grad_norm": 2.6209537982940674, "learning_rate": 9.738609460308258e-07, "loss": 0.2923, "step": 541 }, { "epoch": 0.026187370150263324, "grad_norm": 3.4263617992401123, "learning_rate": 9.738126298497367e-07, "loss": 0.2414, "step": 542 }, { "epoch": 0.02623568633135237, "grad_norm": 2.3965213298797607, "learning_rate": 9.737643136686477e-07, "loss": 0.2367, "step": 543 }, { "epoch": 0.026284002512441416, "grad_norm": 3.7071211338043213, "learning_rate": 9.737159974875585e-07, "loss": 0.2956, "step": 544 }, { "epoch": 0.026332318693530463, "grad_norm": 2.981661319732666, "learning_rate": 9.736676813064695e-07, "loss": 0.3502, "step": 545 }, { "epoch": 0.02638063487461951, "grad_norm": 3.028587579727173, "learning_rate": 9.736193651253805e-07, "loss": 0.4428, "step": 546 }, { "epoch": 0.02642895105570856, "grad_norm": 2.691013813018799, "learning_rate": 9.735710489442915e-07, "loss": 0.3423, "step": 547 }, { "epoch": 0.026477267236797605, "grad_norm": 5.37606143951416, "learning_rate": 9.735227327632025e-07, "loss": 0.1814, "step": 548 }, { "epoch": 0.02652558341788665, "grad_norm": 4.053155422210693, "learning_rate": 9.734744165821133e-07, "loss": 0.3456, "step": 549 }, { "epoch": 0.026573899598975697, "grad_norm": 3.9348766803741455, "learning_rate": 9.734261004010242e-07, "loss": 0.4682, "step": 550 }, { "epoch": 0.026622215780064743, "grad_norm": 2.758354663848877, "learning_rate": 9.733777842199352e-07, "loss": 0.3366, "step": 551 }, { "epoch": 0.02667053196115379, "grad_norm": 2.226593255996704, "learning_rate": 9.733294680388462e-07, "loss": 0.2267, "step": 552 }, { "epoch": 0.026718848142242835, "grad_norm": 2.8209543228149414, "learning_rate": 9.732811518577572e-07, "loss": 0.3192, "step": 553 }, { "epoch": 0.026767164323331885, "grad_norm": 2.719883680343628, "learning_rate": 9.73232835676668e-07, "loss": 0.3243, "step": 554 }, { "epoch": 0.02681548050442093, "grad_norm": 4.331324100494385, "learning_rate": 9.73184519495579e-07, "loss": 0.3762, "step": 555 }, { "epoch": 0.026863796685509977, "grad_norm": 2.4079926013946533, "learning_rate": 9.7313620331449e-07, "loss": 0.2443, "step": 556 }, { "epoch": 0.026912112866599024, "grad_norm": 3.388885498046875, "learning_rate": 9.73087887133401e-07, "loss": 0.3061, "step": 557 }, { "epoch": 0.02696042904768807, "grad_norm": 2.9894495010375977, "learning_rate": 9.73039570952312e-07, "loss": 0.271, "step": 558 }, { "epoch": 0.027008745228777116, "grad_norm": 3.8011906147003174, "learning_rate": 9.729912547712227e-07, "loss": 0.3587, "step": 559 }, { "epoch": 0.027057061409866166, "grad_norm": 3.1871228218078613, "learning_rate": 9.729429385901337e-07, "loss": 0.311, "step": 560 }, { "epoch": 0.027105377590955212, "grad_norm": 3.0439743995666504, "learning_rate": 9.728946224090447e-07, "loss": 0.42, "step": 561 }, { "epoch": 0.027153693772044258, "grad_norm": 2.1560635566711426, "learning_rate": 9.728463062279557e-07, "loss": 0.2806, "step": 562 }, { "epoch": 0.027202009953133304, "grad_norm": 3.0309536457061768, "learning_rate": 9.727979900468667e-07, "loss": 0.4001, "step": 563 }, { "epoch": 0.02725032613422235, "grad_norm": 5.979130268096924, "learning_rate": 9.727496738657775e-07, "loss": 0.2754, "step": 564 }, { "epoch": 0.027298642315311396, "grad_norm": 2.224682092666626, "learning_rate": 9.727013576846885e-07, "loss": 0.2301, "step": 565 }, { "epoch": 0.027346958496400446, "grad_norm": 5.0141377449035645, "learning_rate": 9.726530415035995e-07, "loss": 0.3527, "step": 566 }, { "epoch": 0.027395274677489492, "grad_norm": 3.5677707195281982, "learning_rate": 9.726047253225104e-07, "loss": 0.4364, "step": 567 }, { "epoch": 0.02744359085857854, "grad_norm": 2.5082967281341553, "learning_rate": 9.725564091414214e-07, "loss": 0.2876, "step": 568 }, { "epoch": 0.027491907039667585, "grad_norm": 2.7494168281555176, "learning_rate": 9.725080929603322e-07, "loss": 0.3046, "step": 569 }, { "epoch": 0.02754022322075663, "grad_norm": 2.6459145545959473, "learning_rate": 9.724597767792432e-07, "loss": 0.1856, "step": 570 }, { "epoch": 0.027588539401845677, "grad_norm": 4.256002426147461, "learning_rate": 9.724114605981542e-07, "loss": 0.319, "step": 571 }, { "epoch": 0.027636855582934723, "grad_norm": 2.8036997318267822, "learning_rate": 9.723631444170652e-07, "loss": 0.3787, "step": 572 }, { "epoch": 0.027685171764023773, "grad_norm": 3.4025449752807617, "learning_rate": 9.723148282359762e-07, "loss": 0.4473, "step": 573 }, { "epoch": 0.02773348794511282, "grad_norm": 3.047724485397339, "learning_rate": 9.722665120548872e-07, "loss": 0.4541, "step": 574 }, { "epoch": 0.027781804126201865, "grad_norm": 5.265444755554199, "learning_rate": 9.722181958737982e-07, "loss": 0.3114, "step": 575 }, { "epoch": 0.02783012030729091, "grad_norm": 3.179608106613159, "learning_rate": 9.72169879692709e-07, "loss": 0.4378, "step": 576 }, { "epoch": 0.027878436488379957, "grad_norm": 2.2824347019195557, "learning_rate": 9.7212156351162e-07, "loss": 0.299, "step": 577 }, { "epoch": 0.027926752669469004, "grad_norm": 2.5449953079223633, "learning_rate": 9.72073247330531e-07, "loss": 0.3041, "step": 578 }, { "epoch": 0.027975068850558053, "grad_norm": 3.1304712295532227, "learning_rate": 9.72024931149442e-07, "loss": 0.3148, "step": 579 }, { "epoch": 0.0280233850316471, "grad_norm": 2.42651104927063, "learning_rate": 9.719766149683529e-07, "loss": 0.3302, "step": 580 }, { "epoch": 0.028071701212736146, "grad_norm": 2.5854909420013428, "learning_rate": 9.719282987872639e-07, "loss": 0.2531, "step": 581 }, { "epoch": 0.028120017393825192, "grad_norm": 2.907717227935791, "learning_rate": 9.718799826061747e-07, "loss": 0.2877, "step": 582 }, { "epoch": 0.028168333574914238, "grad_norm": 2.262662172317505, "learning_rate": 9.718316664250857e-07, "loss": 0.2766, "step": 583 }, { "epoch": 0.028216649756003284, "grad_norm": 8.507390975952148, "learning_rate": 9.717833502439966e-07, "loss": 0.3086, "step": 584 }, { "epoch": 0.028264965937092334, "grad_norm": 2.5281872749328613, "learning_rate": 9.717350340629076e-07, "loss": 0.2464, "step": 585 }, { "epoch": 0.02831328211818138, "grad_norm": 3.073946475982666, "learning_rate": 9.716867178818186e-07, "loss": 0.4294, "step": 586 }, { "epoch": 0.028361598299270426, "grad_norm": 6.731721878051758, "learning_rate": 9.716384017007296e-07, "loss": 0.2934, "step": 587 }, { "epoch": 0.028409914480359472, "grad_norm": 2.7988250255584717, "learning_rate": 9.715900855196406e-07, "loss": 0.4379, "step": 588 }, { "epoch": 0.02845823066144852, "grad_norm": 3.283963203430176, "learning_rate": 9.715417693385514e-07, "loss": 0.3145, "step": 589 }, { "epoch": 0.028506546842537565, "grad_norm": 3.3558895587921143, "learning_rate": 9.714934531574624e-07, "loss": 0.2899, "step": 590 }, { "epoch": 0.02855486302362661, "grad_norm": 3.718153953552246, "learning_rate": 9.714451369763734e-07, "loss": 0.4327, "step": 591 }, { "epoch": 0.02860317920471566, "grad_norm": 3.735590934753418, "learning_rate": 9.713968207952844e-07, "loss": 0.4457, "step": 592 }, { "epoch": 0.028651495385804707, "grad_norm": 4.41007137298584, "learning_rate": 9.713485046141953e-07, "loss": 0.5772, "step": 593 }, { "epoch": 0.028699811566893753, "grad_norm": 2.2620174884796143, "learning_rate": 9.713001884331063e-07, "loss": 0.2772, "step": 594 }, { "epoch": 0.0287481277479828, "grad_norm": 4.886958122253418, "learning_rate": 9.712518722520171e-07, "loss": 0.2888, "step": 595 }, { "epoch": 0.028796443929071845, "grad_norm": 17.82718276977539, "learning_rate": 9.71203556070928e-07, "loss": 0.2638, "step": 596 }, { "epoch": 0.02884476011016089, "grad_norm": 2.693788766860962, "learning_rate": 9.71155239889839e-07, "loss": 0.2978, "step": 597 }, { "epoch": 0.02889307629124994, "grad_norm": 5.669571876525879, "learning_rate": 9.7110692370875e-07, "loss": 0.5441, "step": 598 }, { "epoch": 0.028941392472338987, "grad_norm": 9.912436485290527, "learning_rate": 9.71058607527661e-07, "loss": 0.3451, "step": 599 }, { "epoch": 0.028989708653428033, "grad_norm": 7.132877826690674, "learning_rate": 9.71010291346572e-07, "loss": 0.3427, "step": 600 }, { "epoch": 0.02903802483451708, "grad_norm": 2.9619877338409424, "learning_rate": 9.709619751654828e-07, "loss": 0.3947, "step": 601 }, { "epoch": 0.029086341015606126, "grad_norm": 2.182756185531616, "learning_rate": 9.709136589843938e-07, "loss": 0.2652, "step": 602 }, { "epoch": 0.029134657196695172, "grad_norm": 7.363553524017334, "learning_rate": 9.708653428033048e-07, "loss": 0.4474, "step": 603 }, { "epoch": 0.02918297337778422, "grad_norm": 2.9206721782684326, "learning_rate": 9.708170266222158e-07, "loss": 0.421, "step": 604 }, { "epoch": 0.029231289558873268, "grad_norm": 3.177523374557495, "learning_rate": 9.707687104411268e-07, "loss": 0.3699, "step": 605 }, { "epoch": 0.029279605739962314, "grad_norm": 3.383510112762451, "learning_rate": 9.707203942600376e-07, "loss": 0.399, "step": 606 }, { "epoch": 0.02932792192105136, "grad_norm": 3.285050868988037, "learning_rate": 9.706720780789486e-07, "loss": 0.3441, "step": 607 }, { "epoch": 0.029376238102140406, "grad_norm": 3.815667152404785, "learning_rate": 9.706237618978596e-07, "loss": 0.2855, "step": 608 }, { "epoch": 0.029424554283229452, "grad_norm": 3.1998891830444336, "learning_rate": 9.705754457167705e-07, "loss": 0.2318, "step": 609 }, { "epoch": 0.0294728704643185, "grad_norm": 3.7855653762817383, "learning_rate": 9.705271295356815e-07, "loss": 0.3717, "step": 610 }, { "epoch": 0.029521186645407548, "grad_norm": 8.577168464660645, "learning_rate": 9.704788133545923e-07, "loss": 0.4208, "step": 611 }, { "epoch": 0.029569502826496594, "grad_norm": 2.7360544204711914, "learning_rate": 9.704304971735033e-07, "loss": 0.3269, "step": 612 }, { "epoch": 0.02961781900758564, "grad_norm": 5.695843696594238, "learning_rate": 9.703821809924143e-07, "loss": 0.293, "step": 613 }, { "epoch": 0.029666135188674687, "grad_norm": 3.4484143257141113, "learning_rate": 9.703338648113253e-07, "loss": 0.3239, "step": 614 }, { "epoch": 0.029714451369763733, "grad_norm": 11.299336433410645, "learning_rate": 9.702855486302363e-07, "loss": 0.4773, "step": 615 }, { "epoch": 0.02976276755085278, "grad_norm": 16.118555068969727, "learning_rate": 9.70237232449147e-07, "loss": 0.4007, "step": 616 }, { "epoch": 0.02981108373194183, "grad_norm": 2.231184959411621, "learning_rate": 9.70188916268058e-07, "loss": 0.3239, "step": 617 }, { "epoch": 0.029859399913030875, "grad_norm": 7.3491411209106445, "learning_rate": 9.70140600086969e-07, "loss": 0.2567, "step": 618 }, { "epoch": 0.02990771609411992, "grad_norm": 1.915977954864502, "learning_rate": 9.7009228390588e-07, "loss": 0.2023, "step": 619 }, { "epoch": 0.029956032275208967, "grad_norm": 2.6875531673431396, "learning_rate": 9.70043967724791e-07, "loss": 0.3275, "step": 620 }, { "epoch": 0.030004348456298013, "grad_norm": 4.039107799530029, "learning_rate": 9.699956515437018e-07, "loss": 0.3874, "step": 621 }, { "epoch": 0.03005266463738706, "grad_norm": 2.2694528102874756, "learning_rate": 9.699473353626128e-07, "loss": 0.2243, "step": 622 }, { "epoch": 0.03010098081847611, "grad_norm": 3.0361855030059814, "learning_rate": 9.698990191815238e-07, "loss": 0.2234, "step": 623 }, { "epoch": 0.030149296999565155, "grad_norm": 3.340183734893799, "learning_rate": 9.698507030004348e-07, "loss": 0.4191, "step": 624 }, { "epoch": 0.0301976131806542, "grad_norm": 3.375115394592285, "learning_rate": 9.698023868193458e-07, "loss": 0.4521, "step": 625 }, { "epoch": 0.030245929361743248, "grad_norm": 4.115942478179932, "learning_rate": 9.697540706382567e-07, "loss": 0.3548, "step": 626 }, { "epoch": 0.030294245542832294, "grad_norm": 2.3006739616394043, "learning_rate": 9.697057544571675e-07, "loss": 0.2271, "step": 627 }, { "epoch": 0.03034256172392134, "grad_norm": 2.4183804988861084, "learning_rate": 9.696574382760785e-07, "loss": 0.3323, "step": 628 }, { "epoch": 0.030390877905010386, "grad_norm": 2.924668788909912, "learning_rate": 9.696091220949895e-07, "loss": 0.3331, "step": 629 }, { "epoch": 0.030439194086099436, "grad_norm": 2.7858636379241943, "learning_rate": 9.695608059139005e-07, "loss": 0.3008, "step": 630 }, { "epoch": 0.030487510267188482, "grad_norm": 2.1964569091796875, "learning_rate": 9.695124897328115e-07, "loss": 0.217, "step": 631 }, { "epoch": 0.030535826448277528, "grad_norm": 2.923983335494995, "learning_rate": 9.694641735517225e-07, "loss": 0.2701, "step": 632 }, { "epoch": 0.030584142629366574, "grad_norm": 3.67930006980896, "learning_rate": 9.694158573706333e-07, "loss": 0.3093, "step": 633 }, { "epoch": 0.03063245881045562, "grad_norm": 3.0369973182678223, "learning_rate": 9.693675411895442e-07, "loss": 0.308, "step": 634 }, { "epoch": 0.030680774991544667, "grad_norm": 2.67716908454895, "learning_rate": 9.693192250084552e-07, "loss": 0.1767, "step": 635 }, { "epoch": 0.030729091172633716, "grad_norm": 2.7881274223327637, "learning_rate": 9.692709088273662e-07, "loss": 0.3033, "step": 636 }, { "epoch": 0.030777407353722763, "grad_norm": 8.521295547485352, "learning_rate": 9.692225926462772e-07, "loss": 0.278, "step": 637 }, { "epoch": 0.03082572353481181, "grad_norm": 5.439621925354004, "learning_rate": 9.691742764651882e-07, "loss": 0.3376, "step": 638 }, { "epoch": 0.030874039715900855, "grad_norm": 3.609224319458008, "learning_rate": 9.691259602840992e-07, "loss": 0.4946, "step": 639 }, { "epoch": 0.0309223558969899, "grad_norm": 2.0390069484710693, "learning_rate": 9.6907764410301e-07, "loss": 0.2433, "step": 640 }, { "epoch": 0.030970672078078947, "grad_norm": 3.648671865463257, "learning_rate": 9.69029327921921e-07, "loss": 0.3851, "step": 641 }, { "epoch": 0.031018988259167997, "grad_norm": 10.72965145111084, "learning_rate": 9.68981011740832e-07, "loss": 0.2444, "step": 642 }, { "epoch": 0.031067304440257043, "grad_norm": 4.0619988441467285, "learning_rate": 9.68932695559743e-07, "loss": 0.4797, "step": 643 }, { "epoch": 0.03111562062134609, "grad_norm": 5.241813659667969, "learning_rate": 9.68884379378654e-07, "loss": 0.4234, "step": 644 }, { "epoch": 0.031163936802435135, "grad_norm": 2.916358470916748, "learning_rate": 9.68836063197565e-07, "loss": 0.3621, "step": 645 }, { "epoch": 0.03121225298352418, "grad_norm": 3.9421608448028564, "learning_rate": 9.687877470164757e-07, "loss": 0.4371, "step": 646 }, { "epoch": 0.03126056916461323, "grad_norm": 2.7943029403686523, "learning_rate": 9.687394308353867e-07, "loss": 0.2978, "step": 647 }, { "epoch": 0.031308885345702274, "grad_norm": 1.9576640129089355, "learning_rate": 9.686911146542977e-07, "loss": 0.2141, "step": 648 }, { "epoch": 0.03135720152679132, "grad_norm": 2.8642351627349854, "learning_rate": 9.686427984732087e-07, "loss": 0.2797, "step": 649 }, { "epoch": 0.031405517707880366, "grad_norm": 3.170063018798828, "learning_rate": 9.685944822921197e-07, "loss": 0.3215, "step": 650 }, { "epoch": 0.03145383388896941, "grad_norm": 2.481196641921997, "learning_rate": 9.685461661110307e-07, "loss": 0.2718, "step": 651 }, { "epoch": 0.031502150070058466, "grad_norm": 7.786694526672363, "learning_rate": 9.684978499299416e-07, "loss": 0.4296, "step": 652 }, { "epoch": 0.03155046625114751, "grad_norm": 2.204488754272461, "learning_rate": 9.684495337488524e-07, "loss": 0.2709, "step": 653 }, { "epoch": 0.03159878243223656, "grad_norm": 2.064192533493042, "learning_rate": 9.684012175677634e-07, "loss": 0.1975, "step": 654 }, { "epoch": 0.031647098613325604, "grad_norm": 2.427579879760742, "learning_rate": 9.683529013866744e-07, "loss": 0.2377, "step": 655 }, { "epoch": 0.03169541479441465, "grad_norm": 3.3490638732910156, "learning_rate": 9.683045852055854e-07, "loss": 0.2724, "step": 656 }, { "epoch": 0.031743730975503696, "grad_norm": 2.8287205696105957, "learning_rate": 9.682562690244964e-07, "loss": 0.2922, "step": 657 }, { "epoch": 0.03179204715659274, "grad_norm": 3.1594858169555664, "learning_rate": 9.682079528434072e-07, "loss": 0.2779, "step": 658 }, { "epoch": 0.03184036333768179, "grad_norm": 2.791775941848755, "learning_rate": 9.681596366623182e-07, "loss": 0.3125, "step": 659 }, { "epoch": 0.031888679518770835, "grad_norm": 3.427776575088501, "learning_rate": 9.681113204812291e-07, "loss": 0.3543, "step": 660 }, { "epoch": 0.03193699569985988, "grad_norm": 7.335475444793701, "learning_rate": 9.680630043001401e-07, "loss": 0.2603, "step": 661 }, { "epoch": 0.03198531188094893, "grad_norm": 7.30718994140625, "learning_rate": 9.680146881190511e-07, "loss": 0.4501, "step": 662 }, { "epoch": 0.032033628062037973, "grad_norm": 2.7457573413848877, "learning_rate": 9.67966371937962e-07, "loss": 0.3003, "step": 663 }, { "epoch": 0.03208194424312703, "grad_norm": 2.1687111854553223, "learning_rate": 9.679180557568729e-07, "loss": 0.2746, "step": 664 }, { "epoch": 0.03213026042421607, "grad_norm": 3.9406967163085938, "learning_rate": 9.678697395757839e-07, "loss": 0.4387, "step": 665 }, { "epoch": 0.03217857660530512, "grad_norm": 2.670217514038086, "learning_rate": 9.678214233946949e-07, "loss": 0.2202, "step": 666 }, { "epoch": 0.032226892786394165, "grad_norm": 2.962779998779297, "learning_rate": 9.677731072136059e-07, "loss": 0.3382, "step": 667 }, { "epoch": 0.03227520896748321, "grad_norm": 2.581692934036255, "learning_rate": 9.677247910325166e-07, "loss": 0.3513, "step": 668 }, { "epoch": 0.03232352514857226, "grad_norm": 3.6033802032470703, "learning_rate": 9.676764748514276e-07, "loss": 0.3713, "step": 669 }, { "epoch": 0.032371841329661304, "grad_norm": 6.731000900268555, "learning_rate": 9.676281586703386e-07, "loss": 0.2598, "step": 670 }, { "epoch": 0.03242015751075035, "grad_norm": 2.4316720962524414, "learning_rate": 9.675798424892496e-07, "loss": 0.2782, "step": 671 }, { "epoch": 0.032468473691839396, "grad_norm": 2.046900987625122, "learning_rate": 9.675315263081606e-07, "loss": 0.1927, "step": 672 }, { "epoch": 0.03251678987292844, "grad_norm": 2.631699323654175, "learning_rate": 9.674832101270714e-07, "loss": 0.3189, "step": 673 }, { "epoch": 0.03256510605401749, "grad_norm": 2.870434284210205, "learning_rate": 9.674348939459824e-07, "loss": 0.3915, "step": 674 }, { "epoch": 0.032613422235106534, "grad_norm": 1.8693537712097168, "learning_rate": 9.673865777648934e-07, "loss": 0.2323, "step": 675 }, { "epoch": 0.03266173841619558, "grad_norm": 3.167118787765503, "learning_rate": 9.673382615838044e-07, "loss": 0.3017, "step": 676 }, { "epoch": 0.032710054597284634, "grad_norm": 2.720151662826538, "learning_rate": 9.672899454027153e-07, "loss": 0.3016, "step": 677 }, { "epoch": 0.03275837077837368, "grad_norm": 3.1792073249816895, "learning_rate": 9.672416292216263e-07, "loss": 0.3628, "step": 678 }, { "epoch": 0.032806686959462726, "grad_norm": 3.1895651817321777, "learning_rate": 9.671933130405371e-07, "loss": 0.3615, "step": 679 }, { "epoch": 0.03285500314055177, "grad_norm": 2.806121587753296, "learning_rate": 9.67144996859448e-07, "loss": 0.3406, "step": 680 }, { "epoch": 0.03290331932164082, "grad_norm": 1.819555640220642, "learning_rate": 9.67096680678359e-07, "loss": 0.1926, "step": 681 }, { "epoch": 0.032951635502729865, "grad_norm": 3.0646157264709473, "learning_rate": 9.6704836449727e-07, "loss": 0.3457, "step": 682 }, { "epoch": 0.03299995168381891, "grad_norm": 18.114837646484375, "learning_rate": 9.67000048316181e-07, "loss": 0.3114, "step": 683 }, { "epoch": 0.03304826786490796, "grad_norm": 5.398266315460205, "learning_rate": 9.66951732135092e-07, "loss": 0.3642, "step": 684 }, { "epoch": 0.033096584045997, "grad_norm": 2.7439205646514893, "learning_rate": 9.669034159540028e-07, "loss": 0.3351, "step": 685 }, { "epoch": 0.03314490022708605, "grad_norm": 5.931520462036133, "learning_rate": 9.668550997729138e-07, "loss": 0.2891, "step": 686 }, { "epoch": 0.033193216408175095, "grad_norm": 2.9962668418884277, "learning_rate": 9.668067835918248e-07, "loss": 0.2632, "step": 687 }, { "epoch": 0.03324153258926414, "grad_norm": 2.044340133666992, "learning_rate": 9.667584674107358e-07, "loss": 0.2155, "step": 688 }, { "epoch": 0.03328984877035319, "grad_norm": 2.221890449523926, "learning_rate": 9.667101512296468e-07, "loss": 0.2747, "step": 689 }, { "epoch": 0.03333816495144224, "grad_norm": 5.20952033996582, "learning_rate": 9.666618350485578e-07, "loss": 0.4273, "step": 690 }, { "epoch": 0.03338648113253129, "grad_norm": 2.3790087699890137, "learning_rate": 9.666135188674686e-07, "loss": 0.2816, "step": 691 }, { "epoch": 0.03343479731362033, "grad_norm": 3.8167483806610107, "learning_rate": 9.665652026863796e-07, "loss": 0.3639, "step": 692 }, { "epoch": 0.03348311349470938, "grad_norm": 2.7205986976623535, "learning_rate": 9.665168865052906e-07, "loss": 0.3351, "step": 693 }, { "epoch": 0.033531429675798426, "grad_norm": 3.3541195392608643, "learning_rate": 9.664685703242015e-07, "loss": 0.2937, "step": 694 }, { "epoch": 0.03357974585688747, "grad_norm": 6.774521827697754, "learning_rate": 9.664202541431125e-07, "loss": 0.4786, "step": 695 }, { "epoch": 0.03362806203797652, "grad_norm": 2.7702877521514893, "learning_rate": 9.663719379620235e-07, "loss": 0.3904, "step": 696 }, { "epoch": 0.033676378219065564, "grad_norm": 2.873828887939453, "learning_rate": 9.663236217809343e-07, "loss": 0.3485, "step": 697 }, { "epoch": 0.03372469440015461, "grad_norm": 2.7960498332977295, "learning_rate": 9.662753055998453e-07, "loss": 0.3931, "step": 698 }, { "epoch": 0.033773010581243657, "grad_norm": 4.186445713043213, "learning_rate": 9.662269894187563e-07, "loss": 0.2792, "step": 699 }, { "epoch": 0.0338213267623327, "grad_norm": 3.8963377475738525, "learning_rate": 9.661786732376673e-07, "loss": 0.4393, "step": 700 }, { "epoch": 0.03386964294342175, "grad_norm": 2.6296656131744385, "learning_rate": 9.661303570565783e-07, "loss": 0.3095, "step": 701 }, { "epoch": 0.0339179591245108, "grad_norm": 2.0172007083892822, "learning_rate": 9.660820408754893e-07, "loss": 0.2751, "step": 702 }, { "epoch": 0.03396627530559985, "grad_norm": 2.3075778484344482, "learning_rate": 9.660337246944002e-07, "loss": 0.2285, "step": 703 }, { "epoch": 0.034014591486688894, "grad_norm": 5.639332294464111, "learning_rate": 9.65985408513311e-07, "loss": 0.4045, "step": 704 }, { "epoch": 0.03406290766777794, "grad_norm": 3.8391594886779785, "learning_rate": 9.65937092332222e-07, "loss": 0.3999, "step": 705 }, { "epoch": 0.03411122384886699, "grad_norm": 2.393343687057495, "learning_rate": 9.65888776151133e-07, "loss": 0.2645, "step": 706 }, { "epoch": 0.03415954002995603, "grad_norm": 2.245304584503174, "learning_rate": 9.65840459970044e-07, "loss": 0.249, "step": 707 }, { "epoch": 0.03420785621104508, "grad_norm": 4.202280521392822, "learning_rate": 9.65792143788955e-07, "loss": 0.403, "step": 708 }, { "epoch": 0.034256172392134125, "grad_norm": 2.3152623176574707, "learning_rate": 9.65743827607866e-07, "loss": 0.2434, "step": 709 }, { "epoch": 0.03430448857322317, "grad_norm": 2.8823063373565674, "learning_rate": 9.656955114267767e-07, "loss": 0.3406, "step": 710 }, { "epoch": 0.03435280475431222, "grad_norm": 2.7265141010284424, "learning_rate": 9.656471952456877e-07, "loss": 0.1859, "step": 711 }, { "epoch": 0.034401120935401264, "grad_norm": 4.71783447265625, "learning_rate": 9.655988790645987e-07, "loss": 0.4975, "step": 712 }, { "epoch": 0.03444943711649031, "grad_norm": 3.428546905517578, "learning_rate": 9.655505628835097e-07, "loss": 0.2865, "step": 713 }, { "epoch": 0.034497753297579356, "grad_norm": 2.2557876110076904, "learning_rate": 9.655022467024207e-07, "loss": 0.216, "step": 714 }, { "epoch": 0.03454606947866841, "grad_norm": 1.9437402486801147, "learning_rate": 9.654539305213315e-07, "loss": 0.2478, "step": 715 }, { "epoch": 0.034594385659757455, "grad_norm": 3.1772494316101074, "learning_rate": 9.654056143402425e-07, "loss": 0.3854, "step": 716 }, { "epoch": 0.0346427018408465, "grad_norm": 4.322741508483887, "learning_rate": 9.653572981591535e-07, "loss": 0.4201, "step": 717 }, { "epoch": 0.03469101802193555, "grad_norm": 4.4286417961120605, "learning_rate": 9.653089819780645e-07, "loss": 0.1978, "step": 718 }, { "epoch": 0.034739334203024594, "grad_norm": 2.5103838443756104, "learning_rate": 9.652606657969754e-07, "loss": 0.3152, "step": 719 }, { "epoch": 0.03478765038411364, "grad_norm": 3.2371184825897217, "learning_rate": 9.652123496158862e-07, "loss": 0.3508, "step": 720 }, { "epoch": 0.034835966565202686, "grad_norm": 1.839545488357544, "learning_rate": 9.651640334347972e-07, "loss": 0.2168, "step": 721 }, { "epoch": 0.03488428274629173, "grad_norm": 3.1407887935638428, "learning_rate": 9.651157172537082e-07, "loss": 0.4005, "step": 722 }, { "epoch": 0.03493259892738078, "grad_norm": 2.941208839416504, "learning_rate": 9.650674010726192e-07, "loss": 0.3514, "step": 723 }, { "epoch": 0.034980915108469825, "grad_norm": 3.6326675415039062, "learning_rate": 9.650190848915302e-07, "loss": 0.4546, "step": 724 }, { "epoch": 0.03502923128955887, "grad_norm": 2.872123956680298, "learning_rate": 9.64970768710441e-07, "loss": 0.343, "step": 725 }, { "epoch": 0.03507754747064792, "grad_norm": 2.8143813610076904, "learning_rate": 9.64922452529352e-07, "loss": 0.2618, "step": 726 }, { "epoch": 0.03512586365173697, "grad_norm": 2.0726375579833984, "learning_rate": 9.64874136348263e-07, "loss": 0.2853, "step": 727 }, { "epoch": 0.035174179832826016, "grad_norm": 3.872140407562256, "learning_rate": 9.64825820167174e-07, "loss": 0.3247, "step": 728 }, { "epoch": 0.03522249601391506, "grad_norm": 3.424999237060547, "learning_rate": 9.64777503986085e-07, "loss": 0.4549, "step": 729 }, { "epoch": 0.03527081219500411, "grad_norm": 3.2120094299316406, "learning_rate": 9.64729187804996e-07, "loss": 0.3026, "step": 730 }, { "epoch": 0.035319128376093155, "grad_norm": 3.3864188194274902, "learning_rate": 9.646808716239067e-07, "loss": 0.3058, "step": 731 }, { "epoch": 0.0353674445571822, "grad_norm": 2.5511679649353027, "learning_rate": 9.646325554428177e-07, "loss": 0.3067, "step": 732 }, { "epoch": 0.03541576073827125, "grad_norm": 3.6355721950531006, "learning_rate": 9.645842392617287e-07, "loss": 0.3974, "step": 733 }, { "epoch": 0.03546407691936029, "grad_norm": 5.45479679107666, "learning_rate": 9.645359230806397e-07, "loss": 0.3242, "step": 734 }, { "epoch": 0.03551239310044934, "grad_norm": 55.65563201904297, "learning_rate": 9.644876068995507e-07, "loss": 0.4441, "step": 735 }, { "epoch": 0.035560709281538386, "grad_norm": 2.672968864440918, "learning_rate": 9.644392907184614e-07, "loss": 0.3033, "step": 736 }, { "epoch": 0.03560902546262743, "grad_norm": 2.4356508255004883, "learning_rate": 9.643909745373724e-07, "loss": 0.3705, "step": 737 }, { "epoch": 0.03565734164371648, "grad_norm": 7.36446475982666, "learning_rate": 9.643426583562834e-07, "loss": 0.2727, "step": 738 }, { "epoch": 0.035705657824805524, "grad_norm": 3.0732693672180176, "learning_rate": 9.642943421751944e-07, "loss": 0.4103, "step": 739 }, { "epoch": 0.03575397400589458, "grad_norm": 3.7809252738952637, "learning_rate": 9.642460259941054e-07, "loss": 0.4772, "step": 740 }, { "epoch": 0.035802290186983624, "grad_norm": 2.272250175476074, "learning_rate": 9.641977098130164e-07, "loss": 0.3133, "step": 741 }, { "epoch": 0.03585060636807267, "grad_norm": 2.2534632682800293, "learning_rate": 9.641493936319272e-07, "loss": 0.276, "step": 742 }, { "epoch": 0.035898922549161716, "grad_norm": 3.817821979522705, "learning_rate": 9.641010774508382e-07, "loss": 0.2787, "step": 743 }, { "epoch": 0.03594723873025076, "grad_norm": 3.846781015396118, "learning_rate": 9.640527612697491e-07, "loss": 0.4234, "step": 744 }, { "epoch": 0.03599555491133981, "grad_norm": 2.880277395248413, "learning_rate": 9.640044450886601e-07, "loss": 0.3191, "step": 745 }, { "epoch": 0.036043871092428854, "grad_norm": 2.993455648422241, "learning_rate": 9.639561289075711e-07, "loss": 0.3745, "step": 746 }, { "epoch": 0.0360921872735179, "grad_norm": 11.307567596435547, "learning_rate": 9.639078127264821e-07, "loss": 0.2893, "step": 747 }, { "epoch": 0.03614050345460695, "grad_norm": 2.4407143592834473, "learning_rate": 9.638594965453931e-07, "loss": 0.3066, "step": 748 }, { "epoch": 0.03618881963569599, "grad_norm": 1.8715107440948486, "learning_rate": 9.638111803643039e-07, "loss": 0.2432, "step": 749 }, { "epoch": 0.03623713581678504, "grad_norm": 28.039852142333984, "learning_rate": 9.637628641832149e-07, "loss": 0.5074, "step": 750 }, { "epoch": 0.036285451997874085, "grad_norm": 2.225130081176758, "learning_rate": 9.637145480021259e-07, "loss": 0.2418, "step": 751 }, { "epoch": 0.03633376817896313, "grad_norm": 10.149792671203613, "learning_rate": 9.636662318210369e-07, "loss": 0.2069, "step": 752 }, { "epoch": 0.036382084360052185, "grad_norm": 2.394840717315674, "learning_rate": 9.636179156399478e-07, "loss": 0.2692, "step": 753 }, { "epoch": 0.03643040054114123, "grad_norm": 1.5771054029464722, "learning_rate": 9.635695994588588e-07, "loss": 0.1688, "step": 754 }, { "epoch": 0.03647871672223028, "grad_norm": 2.742532253265381, "learning_rate": 9.635212832777696e-07, "loss": 0.3733, "step": 755 }, { "epoch": 0.03652703290331932, "grad_norm": 2.922597646713257, "learning_rate": 9.634729670966806e-07, "loss": 0.3966, "step": 756 }, { "epoch": 0.03657534908440837, "grad_norm": 2.005326986312866, "learning_rate": 9.634246509155916e-07, "loss": 0.2604, "step": 757 }, { "epoch": 0.036623665265497415, "grad_norm": 4.235630035400391, "learning_rate": 9.633763347345026e-07, "loss": 0.2763, "step": 758 }, { "epoch": 0.03667198144658646, "grad_norm": 2.263528347015381, "learning_rate": 9.633280185534136e-07, "loss": 0.235, "step": 759 }, { "epoch": 0.03672029762767551, "grad_norm": 5.334431171417236, "learning_rate": 9.632797023723246e-07, "loss": 0.5265, "step": 760 }, { "epoch": 0.036768613808764554, "grad_norm": 2.98772931098938, "learning_rate": 9.632313861912356e-07, "loss": 0.3219, "step": 761 }, { "epoch": 0.0368169299898536, "grad_norm": 6.68079948425293, "learning_rate": 9.631830700101463e-07, "loss": 0.2987, "step": 762 }, { "epoch": 0.036865246170942646, "grad_norm": 5.25956916809082, "learning_rate": 9.631347538290573e-07, "loss": 0.3993, "step": 763 }, { "epoch": 0.03691356235203169, "grad_norm": 19.735628128051758, "learning_rate": 9.630864376479683e-07, "loss": 0.2847, "step": 764 }, { "epoch": 0.036961878533120746, "grad_norm": 3.519477605819702, "learning_rate": 9.630381214668793e-07, "loss": 0.375, "step": 765 }, { "epoch": 0.03701019471420979, "grad_norm": 2.271327018737793, "learning_rate": 9.629898052857903e-07, "loss": 0.2514, "step": 766 }, { "epoch": 0.03705851089529884, "grad_norm": 2.827082872390747, "learning_rate": 9.62941489104701e-07, "loss": 0.2689, "step": 767 }, { "epoch": 0.037106827076387884, "grad_norm": 2.82212233543396, "learning_rate": 9.62893172923612e-07, "loss": 0.2996, "step": 768 }, { "epoch": 0.03715514325747693, "grad_norm": 4.082566261291504, "learning_rate": 9.62844856742523e-07, "loss": 0.371, "step": 769 }, { "epoch": 0.037203459438565976, "grad_norm": 3.360553503036499, "learning_rate": 9.62796540561434e-07, "loss": 0.2924, "step": 770 }, { "epoch": 0.03725177561965502, "grad_norm": 3.5665061473846436, "learning_rate": 9.62748224380345e-07, "loss": 0.319, "step": 771 }, { "epoch": 0.03730009180074407, "grad_norm": 2.086672067642212, "learning_rate": 9.626999081992558e-07, "loss": 0.2637, "step": 772 }, { "epoch": 0.037348407981833115, "grad_norm": 2.8856422901153564, "learning_rate": 9.626515920181668e-07, "loss": 0.2434, "step": 773 }, { "epoch": 0.03739672416292216, "grad_norm": 2.625272750854492, "learning_rate": 9.626032758370778e-07, "loss": 0.3724, "step": 774 }, { "epoch": 0.03744504034401121, "grad_norm": 2.7457752227783203, "learning_rate": 9.625549596559888e-07, "loss": 0.355, "step": 775 }, { "epoch": 0.03749335652510025, "grad_norm": 3.1708507537841797, "learning_rate": 9.625066434748998e-07, "loss": 0.324, "step": 776 }, { "epoch": 0.0375416727061893, "grad_norm": 17.219507217407227, "learning_rate": 9.624583272938106e-07, "loss": 0.4215, "step": 777 }, { "epoch": 0.03758998888727835, "grad_norm": 2.7296996116638184, "learning_rate": 9.624100111127215e-07, "loss": 0.2674, "step": 778 }, { "epoch": 0.0376383050683674, "grad_norm": 2.7711498737335205, "learning_rate": 9.623616949316325e-07, "loss": 0.3082, "step": 779 }, { "epoch": 0.037686621249456445, "grad_norm": 2.5798723697662354, "learning_rate": 9.623133787505435e-07, "loss": 0.2756, "step": 780 }, { "epoch": 0.03773493743054549, "grad_norm": 1.9716187715530396, "learning_rate": 9.622650625694545e-07, "loss": 0.2316, "step": 781 }, { "epoch": 0.03778325361163454, "grad_norm": 3.0952157974243164, "learning_rate": 9.622167463883655e-07, "loss": 0.4976, "step": 782 }, { "epoch": 0.037831569792723584, "grad_norm": 2.6784889698028564, "learning_rate": 9.621684302072763e-07, "loss": 0.3508, "step": 783 }, { "epoch": 0.03787988597381263, "grad_norm": 2.7355098724365234, "learning_rate": 9.621201140261873e-07, "loss": 0.3613, "step": 784 }, { "epoch": 0.037928202154901676, "grad_norm": 14.744927406311035, "learning_rate": 9.620717978450983e-07, "loss": 0.2315, "step": 785 }, { "epoch": 0.03797651833599072, "grad_norm": 4.616762638092041, "learning_rate": 9.620234816640093e-07, "loss": 0.3513, "step": 786 }, { "epoch": 0.03802483451707977, "grad_norm": 2.7342519760131836, "learning_rate": 9.619751654829202e-07, "loss": 0.2912, "step": 787 }, { "epoch": 0.038073150698168814, "grad_norm": 6.314999103546143, "learning_rate": 9.61926849301831e-07, "loss": 0.2556, "step": 788 }, { "epoch": 0.03812146687925786, "grad_norm": 2.1891493797302246, "learning_rate": 9.61878533120742e-07, "loss": 0.2195, "step": 789 }, { "epoch": 0.03816978306034691, "grad_norm": 2.565852165222168, "learning_rate": 9.61830216939653e-07, "loss": 0.3197, "step": 790 }, { "epoch": 0.03821809924143596, "grad_norm": 3.0472331047058105, "learning_rate": 9.61781900758564e-07, "loss": 0.3188, "step": 791 }, { "epoch": 0.038266415422525006, "grad_norm": 3.6639175415039062, "learning_rate": 9.61733584577475e-07, "loss": 0.2775, "step": 792 }, { "epoch": 0.03831473160361405, "grad_norm": 15.987265586853027, "learning_rate": 9.616852683963858e-07, "loss": 0.4034, "step": 793 }, { "epoch": 0.0383630477847031, "grad_norm": 3.320580244064331, "learning_rate": 9.616369522152968e-07, "loss": 0.3383, "step": 794 }, { "epoch": 0.038411363965792145, "grad_norm": 4.953262805938721, "learning_rate": 9.615886360342077e-07, "loss": 0.4075, "step": 795 }, { "epoch": 0.03845968014688119, "grad_norm": 2.7397308349609375, "learning_rate": 9.615403198531187e-07, "loss": 0.2484, "step": 796 }, { "epoch": 0.03850799632797024, "grad_norm": 2.546454668045044, "learning_rate": 9.614920036720297e-07, "loss": 0.2545, "step": 797 }, { "epoch": 0.03855631250905928, "grad_norm": 2.5093300342559814, "learning_rate": 9.614436874909407e-07, "loss": 0.3344, "step": 798 }, { "epoch": 0.03860462869014833, "grad_norm": 2.54134202003479, "learning_rate": 9.613953713098517e-07, "loss": 0.347, "step": 799 }, { "epoch": 0.038652944871237375, "grad_norm": 3.619943380355835, "learning_rate": 9.613470551287625e-07, "loss": 0.3342, "step": 800 }, { "epoch": 0.03870126105232642, "grad_norm": 3.8335988521575928, "learning_rate": 9.612987389476735e-07, "loss": 0.256, "step": 801 }, { "epoch": 0.03874957723341547, "grad_norm": 3.282363176345825, "learning_rate": 9.612504227665845e-07, "loss": 0.3605, "step": 802 }, { "epoch": 0.03879789341450452, "grad_norm": 12.377533912658691, "learning_rate": 9.612021065854955e-07, "loss": 0.4218, "step": 803 }, { "epoch": 0.03884620959559357, "grad_norm": 2.9516608715057373, "learning_rate": 9.611537904044064e-07, "loss": 0.332, "step": 804 }, { "epoch": 0.03889452577668261, "grad_norm": 3.7454957962036133, "learning_rate": 9.611054742233174e-07, "loss": 0.4902, "step": 805 }, { "epoch": 0.03894284195777166, "grad_norm": 3.620548963546753, "learning_rate": 9.610571580422282e-07, "loss": 0.409, "step": 806 }, { "epoch": 0.038991158138860706, "grad_norm": 2.8134942054748535, "learning_rate": 9.610088418611392e-07, "loss": 0.3731, "step": 807 }, { "epoch": 0.03903947431994975, "grad_norm": 2.5567047595977783, "learning_rate": 9.609605256800502e-07, "loss": 0.3734, "step": 808 }, { "epoch": 0.0390877905010388, "grad_norm": 2.737046241760254, "learning_rate": 9.609122094989612e-07, "loss": 0.5284, "step": 809 }, { "epoch": 0.039136106682127844, "grad_norm": 2.75722074508667, "learning_rate": 9.608638933178722e-07, "loss": 0.2497, "step": 810 }, { "epoch": 0.03918442286321689, "grad_norm": 4.001081943511963, "learning_rate": 9.608155771367832e-07, "loss": 0.3793, "step": 811 }, { "epoch": 0.039232739044305937, "grad_norm": 2.558701753616333, "learning_rate": 9.607672609556942e-07, "loss": 0.3064, "step": 812 }, { "epoch": 0.03928105522539498, "grad_norm": 3.8188998699188232, "learning_rate": 9.60718944774605e-07, "loss": 0.2959, "step": 813 }, { "epoch": 0.03932937140648403, "grad_norm": 2.7177352905273438, "learning_rate": 9.60670628593516e-07, "loss": 0.3236, "step": 814 }, { "epoch": 0.039377687587573075, "grad_norm": 2.5647966861724854, "learning_rate": 9.60622312412427e-07, "loss": 0.326, "step": 815 }, { "epoch": 0.03942600376866213, "grad_norm": 2.335801601409912, "learning_rate": 9.60573996231338e-07, "loss": 0.3155, "step": 816 }, { "epoch": 0.039474319949751174, "grad_norm": 6.537277698516846, "learning_rate": 9.605256800502489e-07, "loss": 0.3162, "step": 817 }, { "epoch": 0.03952263613084022, "grad_norm": 3.0280470848083496, "learning_rate": 9.604773638691599e-07, "loss": 0.3242, "step": 818 }, { "epoch": 0.03957095231192927, "grad_norm": 2.6606733798980713, "learning_rate": 9.604290476880707e-07, "loss": 0.3279, "step": 819 }, { "epoch": 0.03961926849301831, "grad_norm": 1.920761227607727, "learning_rate": 9.603807315069817e-07, "loss": 0.1736, "step": 820 }, { "epoch": 0.03966758467410736, "grad_norm": 2.8427574634552, "learning_rate": 9.603324153258926e-07, "loss": 0.3329, "step": 821 }, { "epoch": 0.039715900855196405, "grad_norm": 2.3724896907806396, "learning_rate": 9.602840991448036e-07, "loss": 0.204, "step": 822 }, { "epoch": 0.03976421703628545, "grad_norm": 3.380692958831787, "learning_rate": 9.602357829637146e-07, "loss": 0.2934, "step": 823 }, { "epoch": 0.0398125332173745, "grad_norm": 2.715608835220337, "learning_rate": 9.601874667826254e-07, "loss": 0.3732, "step": 824 }, { "epoch": 0.039860849398463544, "grad_norm": 4.650603294372559, "learning_rate": 9.601391506015364e-07, "loss": 0.375, "step": 825 }, { "epoch": 0.03990916557955259, "grad_norm": 1.8358869552612305, "learning_rate": 9.600908344204474e-07, "loss": 0.2022, "step": 826 }, { "epoch": 0.039957481760641636, "grad_norm": 2.679014205932617, "learning_rate": 9.600425182393584e-07, "loss": 0.382, "step": 827 }, { "epoch": 0.04000579794173068, "grad_norm": 3.5788986682891846, "learning_rate": 9.599942020582694e-07, "loss": 0.4065, "step": 828 }, { "epoch": 0.040054114122819735, "grad_norm": 3.2322170734405518, "learning_rate": 9.599458858771801e-07, "loss": 0.3276, "step": 829 }, { "epoch": 0.04010243030390878, "grad_norm": 3.85836124420166, "learning_rate": 9.598975696960911e-07, "loss": 0.2413, "step": 830 }, { "epoch": 0.04015074648499783, "grad_norm": 3.090538501739502, "learning_rate": 9.598492535150021e-07, "loss": 0.2987, "step": 831 }, { "epoch": 0.040199062666086874, "grad_norm": 3.5621345043182373, "learning_rate": 9.598009373339131e-07, "loss": 0.5053, "step": 832 }, { "epoch": 0.04024737884717592, "grad_norm": 3.162865400314331, "learning_rate": 9.59752621152824e-07, "loss": 0.2774, "step": 833 }, { "epoch": 0.040295695028264966, "grad_norm": 2.982217311859131, "learning_rate": 9.59704304971735e-07, "loss": 0.4422, "step": 834 }, { "epoch": 0.04034401120935401, "grad_norm": 3.6462817192077637, "learning_rate": 9.596559887906459e-07, "loss": 0.3243, "step": 835 }, { "epoch": 0.04039232739044306, "grad_norm": 3.1078290939331055, "learning_rate": 9.596076726095569e-07, "loss": 0.3173, "step": 836 }, { "epoch": 0.040440643571532105, "grad_norm": 3.206345558166504, "learning_rate": 9.595593564284678e-07, "loss": 0.4418, "step": 837 }, { "epoch": 0.04048895975262115, "grad_norm": 3.0001590251922607, "learning_rate": 9.595110402473788e-07, "loss": 0.268, "step": 838 }, { "epoch": 0.0405372759337102, "grad_norm": 3.7342207431793213, "learning_rate": 9.594627240662898e-07, "loss": 0.3438, "step": 839 }, { "epoch": 0.04058559211479924, "grad_norm": 3.523948907852173, "learning_rate": 9.594144078852006e-07, "loss": 0.3238, "step": 840 }, { "epoch": 0.040633908295888296, "grad_norm": 2.3180932998657227, "learning_rate": 9.593660917041116e-07, "loss": 0.2928, "step": 841 }, { "epoch": 0.04068222447697734, "grad_norm": 2.5104551315307617, "learning_rate": 9.593177755230226e-07, "loss": 0.2546, "step": 842 }, { "epoch": 0.04073054065806639, "grad_norm": 2.7483670711517334, "learning_rate": 9.592694593419336e-07, "loss": 0.2551, "step": 843 }, { "epoch": 0.040778856839155435, "grad_norm": 2.7940673828125, "learning_rate": 9.592211431608446e-07, "loss": 0.3838, "step": 844 }, { "epoch": 0.04082717302024448, "grad_norm": 3.9217476844787598, "learning_rate": 9.591728269797553e-07, "loss": 0.3871, "step": 845 }, { "epoch": 0.04087548920133353, "grad_norm": 8.282997131347656, "learning_rate": 9.591245107986663e-07, "loss": 0.4306, "step": 846 }, { "epoch": 0.04092380538242257, "grad_norm": 2.071523666381836, "learning_rate": 9.590761946175773e-07, "loss": 0.2098, "step": 847 }, { "epoch": 0.04097212156351162, "grad_norm": 3.1474063396453857, "learning_rate": 9.590278784364883e-07, "loss": 0.2502, "step": 848 }, { "epoch": 0.041020437744600666, "grad_norm": 5.632136344909668, "learning_rate": 9.589795622553993e-07, "loss": 0.3167, "step": 849 }, { "epoch": 0.04106875392568971, "grad_norm": 2.7507591247558594, "learning_rate": 9.589312460743103e-07, "loss": 0.2792, "step": 850 }, { "epoch": 0.04111707010677876, "grad_norm": 2.8612396717071533, "learning_rate": 9.58882929893221e-07, "loss": 0.2499, "step": 851 }, { "epoch": 0.041165386287867804, "grad_norm": 3.7012195587158203, "learning_rate": 9.58834613712132e-07, "loss": 0.5501, "step": 852 }, { "epoch": 0.04121370246895685, "grad_norm": 4.291708946228027, "learning_rate": 9.58786297531043e-07, "loss": 0.3424, "step": 853 }, { "epoch": 0.041262018650045904, "grad_norm": 2.8191072940826416, "learning_rate": 9.58737981349954e-07, "loss": 0.2603, "step": 854 }, { "epoch": 0.04131033483113495, "grad_norm": 2.5506625175476074, "learning_rate": 9.58689665168865e-07, "loss": 0.2826, "step": 855 }, { "epoch": 0.041358651012223996, "grad_norm": 4.562040328979492, "learning_rate": 9.58641348987776e-07, "loss": 0.3596, "step": 856 }, { "epoch": 0.04140696719331304, "grad_norm": 2.778534173965454, "learning_rate": 9.585930328066868e-07, "loss": 0.3391, "step": 857 }, { "epoch": 0.04145528337440209, "grad_norm": 3.42668080329895, "learning_rate": 9.585447166255978e-07, "loss": 0.357, "step": 858 }, { "epoch": 0.041503599555491134, "grad_norm": 1.9271173477172852, "learning_rate": 9.584964004445088e-07, "loss": 0.231, "step": 859 }, { "epoch": 0.04155191573658018, "grad_norm": 3.6661202907562256, "learning_rate": 9.584480842634198e-07, "loss": 0.3758, "step": 860 }, { "epoch": 0.04160023191766923, "grad_norm": 2.7438457012176514, "learning_rate": 9.583997680823308e-07, "loss": 0.3423, "step": 861 }, { "epoch": 0.04164854809875827, "grad_norm": 2.3941543102264404, "learning_rate": 9.583514519012418e-07, "loss": 0.2981, "step": 862 }, { "epoch": 0.04169686427984732, "grad_norm": 2.362802505493164, "learning_rate": 9.583031357201527e-07, "loss": 0.2821, "step": 863 }, { "epoch": 0.041745180460936365, "grad_norm": 3.9215943813323975, "learning_rate": 9.582548195390635e-07, "loss": 0.3664, "step": 864 }, { "epoch": 0.04179349664202541, "grad_norm": 2.7432456016540527, "learning_rate": 9.582065033579745e-07, "loss": 0.3467, "step": 865 }, { "epoch": 0.04184181282311446, "grad_norm": 2.6976096630096436, "learning_rate": 9.581581871768855e-07, "loss": 0.2917, "step": 866 }, { "epoch": 0.04189012900420351, "grad_norm": 4.955928802490234, "learning_rate": 9.581098709957965e-07, "loss": 0.27, "step": 867 }, { "epoch": 0.04193844518529256, "grad_norm": 3.7848010063171387, "learning_rate": 9.580615548147075e-07, "loss": 0.3477, "step": 868 }, { "epoch": 0.0419867613663816, "grad_norm": 3.0207583904266357, "learning_rate": 9.580132386336185e-07, "loss": 0.3524, "step": 869 }, { "epoch": 0.04203507754747065, "grad_norm": 2.3326191902160645, "learning_rate": 9.579649224525293e-07, "loss": 0.2658, "step": 870 }, { "epoch": 0.042083393728559695, "grad_norm": 2.3068952560424805, "learning_rate": 9.579166062714402e-07, "loss": 0.3057, "step": 871 }, { "epoch": 0.04213170990964874, "grad_norm": 1.934598684310913, "learning_rate": 9.578682900903512e-07, "loss": 0.2246, "step": 872 }, { "epoch": 0.04218002609073779, "grad_norm": 2.3199474811553955, "learning_rate": 9.578199739092622e-07, "loss": 0.317, "step": 873 }, { "epoch": 0.042228342271826834, "grad_norm": 3.1297526359558105, "learning_rate": 9.577716577281732e-07, "loss": 0.4229, "step": 874 }, { "epoch": 0.04227665845291588, "grad_norm": 5.415986061096191, "learning_rate": 9.577233415470842e-07, "loss": 0.3353, "step": 875 }, { "epoch": 0.042324974634004926, "grad_norm": 1.8478981256484985, "learning_rate": 9.57675025365995e-07, "loss": 0.1678, "step": 876 }, { "epoch": 0.04237329081509397, "grad_norm": 2.8100881576538086, "learning_rate": 9.57626709184906e-07, "loss": 0.3769, "step": 877 }, { "epoch": 0.04242160699618302, "grad_norm": 3.6920955181121826, "learning_rate": 9.57578393003817e-07, "loss": 0.3381, "step": 878 }, { "epoch": 0.04246992317727207, "grad_norm": 11.334879875183105, "learning_rate": 9.57530076822728e-07, "loss": 0.3585, "step": 879 }, { "epoch": 0.04251823935836112, "grad_norm": 2.017467975616455, "learning_rate": 9.57481760641639e-07, "loss": 0.2185, "step": 880 }, { "epoch": 0.042566555539450164, "grad_norm": 3.241368293762207, "learning_rate": 9.574334444605497e-07, "loss": 0.4827, "step": 881 }, { "epoch": 0.04261487172053921, "grad_norm": 2.4720325469970703, "learning_rate": 9.573851282794607e-07, "loss": 0.2871, "step": 882 }, { "epoch": 0.042663187901628256, "grad_norm": 2.315983772277832, "learning_rate": 9.573368120983717e-07, "loss": 0.2606, "step": 883 }, { "epoch": 0.0427115040827173, "grad_norm": 2.3238308429718018, "learning_rate": 9.572884959172827e-07, "loss": 0.2227, "step": 884 }, { "epoch": 0.04275982026380635, "grad_norm": 2.9749701023101807, "learning_rate": 9.572401797361937e-07, "loss": 0.3537, "step": 885 }, { "epoch": 0.042808136444895395, "grad_norm": 4.636014938354492, "learning_rate": 9.571918635551045e-07, "loss": 0.3297, "step": 886 }, { "epoch": 0.04285645262598444, "grad_norm": 3.139751672744751, "learning_rate": 9.571435473740155e-07, "loss": 0.2898, "step": 887 }, { "epoch": 0.04290476880707349, "grad_norm": 2.463207483291626, "learning_rate": 9.570952311929264e-07, "loss": 0.2726, "step": 888 }, { "epoch": 0.04295308498816253, "grad_norm": 10.11752986907959, "learning_rate": 9.570469150118374e-07, "loss": 0.2741, "step": 889 }, { "epoch": 0.04300140116925158, "grad_norm": 2.143429756164551, "learning_rate": 9.569985988307484e-07, "loss": 0.2201, "step": 890 }, { "epoch": 0.043049717350340626, "grad_norm": 2.6065492630004883, "learning_rate": 9.569502826496594e-07, "loss": 0.325, "step": 891 }, { "epoch": 0.04309803353142968, "grad_norm": 3.4626972675323486, "learning_rate": 9.569019664685702e-07, "loss": 0.4434, "step": 892 }, { "epoch": 0.043146349712518725, "grad_norm": 3.1573870182037354, "learning_rate": 9.568536502874812e-07, "loss": 0.3565, "step": 893 }, { "epoch": 0.04319466589360777, "grad_norm": 3.149827718734741, "learning_rate": 9.568053341063922e-07, "loss": 0.3482, "step": 894 }, { "epoch": 0.04324298207469682, "grad_norm": 2.2789392471313477, "learning_rate": 9.567570179253032e-07, "loss": 0.2665, "step": 895 }, { "epoch": 0.043291298255785864, "grad_norm": 2.498807668685913, "learning_rate": 9.567087017442142e-07, "loss": 0.3757, "step": 896 }, { "epoch": 0.04333961443687491, "grad_norm": 1.962924599647522, "learning_rate": 9.56660385563125e-07, "loss": 0.213, "step": 897 }, { "epoch": 0.043387930617963956, "grad_norm": 2.58791446685791, "learning_rate": 9.56612069382036e-07, "loss": 0.272, "step": 898 }, { "epoch": 0.043436246799053, "grad_norm": 2.036062240600586, "learning_rate": 9.56563753200947e-07, "loss": 0.2104, "step": 899 }, { "epoch": 0.04348456298014205, "grad_norm": 3.7194464206695557, "learning_rate": 9.56515437019858e-07, "loss": 0.3358, "step": 900 }, { "epoch": 0.043532879161231094, "grad_norm": 5.325500965118408, "learning_rate": 9.564671208387689e-07, "loss": 0.3394, "step": 901 }, { "epoch": 0.04358119534232014, "grad_norm": 7.014562129974365, "learning_rate": 9.564188046576797e-07, "loss": 0.3887, "step": 902 }, { "epoch": 0.04362951152340919, "grad_norm": 2.0807862281799316, "learning_rate": 9.563704884765907e-07, "loss": 0.2347, "step": 903 }, { "epoch": 0.04367782770449824, "grad_norm": 2.5052661895751953, "learning_rate": 9.563221722955017e-07, "loss": 0.253, "step": 904 }, { "epoch": 0.043726143885587286, "grad_norm": 1.9443480968475342, "learning_rate": 9.562738561144126e-07, "loss": 0.1831, "step": 905 }, { "epoch": 0.04377446006667633, "grad_norm": 9.588532447814941, "learning_rate": 9.562255399333236e-07, "loss": 0.4792, "step": 906 }, { "epoch": 0.04382277624776538, "grad_norm": 3.443746566772461, "learning_rate": 9.561772237522346e-07, "loss": 0.2884, "step": 907 }, { "epoch": 0.043871092428854425, "grad_norm": 1.7691890001296997, "learning_rate": 9.561289075711454e-07, "loss": 0.1677, "step": 908 }, { "epoch": 0.04391940860994347, "grad_norm": 2.8564531803131104, "learning_rate": 9.560805913900564e-07, "loss": 0.2615, "step": 909 }, { "epoch": 0.04396772479103252, "grad_norm": 2.889796733856201, "learning_rate": 9.560322752089674e-07, "loss": 0.3961, "step": 910 }, { "epoch": 0.04401604097212156, "grad_norm": 2.1146414279937744, "learning_rate": 9.559839590278784e-07, "loss": 0.2759, "step": 911 }, { "epoch": 0.04406435715321061, "grad_norm": 2.5653271675109863, "learning_rate": 9.559356428467894e-07, "loss": 0.2661, "step": 912 }, { "epoch": 0.044112673334299655, "grad_norm": 148.30332946777344, "learning_rate": 9.558873266657004e-07, "loss": 0.3536, "step": 913 }, { "epoch": 0.0441609895153887, "grad_norm": 2.5112051963806152, "learning_rate": 9.558390104846113e-07, "loss": 0.3299, "step": 914 }, { "epoch": 0.04420930569647775, "grad_norm": 4.433218955993652, "learning_rate": 9.557906943035221e-07, "loss": 0.4205, "step": 915 }, { "epoch": 0.044257621877566794, "grad_norm": 2.3132307529449463, "learning_rate": 9.557423781224331e-07, "loss": 0.2682, "step": 916 }, { "epoch": 0.04430593805865585, "grad_norm": 2.718397855758667, "learning_rate": 9.55694061941344e-07, "loss": 0.3711, "step": 917 }, { "epoch": 0.04435425423974489, "grad_norm": 2.6153500080108643, "learning_rate": 9.55645745760255e-07, "loss": 0.3334, "step": 918 }, { "epoch": 0.04440257042083394, "grad_norm": 2.505779504776001, "learning_rate": 9.55597429579166e-07, "loss": 0.2981, "step": 919 }, { "epoch": 0.044450886601922986, "grad_norm": 2.129704475402832, "learning_rate": 9.55549113398077e-07, "loss": 0.2964, "step": 920 }, { "epoch": 0.04449920278301203, "grad_norm": 3.1016926765441895, "learning_rate": 9.555007972169879e-07, "loss": 0.3281, "step": 921 }, { "epoch": 0.04454751896410108, "grad_norm": 2.2945704460144043, "learning_rate": 9.554524810358988e-07, "loss": 0.2708, "step": 922 }, { "epoch": 0.044595835145190124, "grad_norm": 2.173520565032959, "learning_rate": 9.554041648548098e-07, "loss": 0.2297, "step": 923 }, { "epoch": 0.04464415132627917, "grad_norm": 2.885246753692627, "learning_rate": 9.553558486737208e-07, "loss": 0.2104, "step": 924 }, { "epoch": 0.044692467507368216, "grad_norm": 3.1733310222625732, "learning_rate": 9.553075324926318e-07, "loss": 0.5083, "step": 925 }, { "epoch": 0.04474078368845726, "grad_norm": 3.3408517837524414, "learning_rate": 9.552592163115428e-07, "loss": 0.4232, "step": 926 }, { "epoch": 0.04478909986954631, "grad_norm": 130.99716186523438, "learning_rate": 9.552109001304538e-07, "loss": 0.4394, "step": 927 }, { "epoch": 0.044837416050635355, "grad_norm": 5.833311080932617, "learning_rate": 9.551625839493646e-07, "loss": 0.3739, "step": 928 }, { "epoch": 0.0448857322317244, "grad_norm": 3.279695749282837, "learning_rate": 9.551142677682756e-07, "loss": 0.3387, "step": 929 }, { "epoch": 0.044934048412813454, "grad_norm": 5.333140850067139, "learning_rate": 9.550659515871866e-07, "loss": 0.2937, "step": 930 }, { "epoch": 0.0449823645939025, "grad_norm": 3.4568593502044678, "learning_rate": 9.550176354060975e-07, "loss": 0.4565, "step": 931 }, { "epoch": 0.04503068077499155, "grad_norm": 2.826213836669922, "learning_rate": 9.549693192250085e-07, "loss": 0.3595, "step": 932 }, { "epoch": 0.04507899695608059, "grad_norm": 2.4957845211029053, "learning_rate": 9.549210030439193e-07, "loss": 0.2844, "step": 933 }, { "epoch": 0.04512731313716964, "grad_norm": 3.138941764831543, "learning_rate": 9.548726868628303e-07, "loss": 0.4197, "step": 934 }, { "epoch": 0.045175629318258685, "grad_norm": 5.127030372619629, "learning_rate": 9.548243706817413e-07, "loss": 0.371, "step": 935 }, { "epoch": 0.04522394549934773, "grad_norm": 2.6110386848449707, "learning_rate": 9.547760545006523e-07, "loss": 0.2771, "step": 936 }, { "epoch": 0.04527226168043678, "grad_norm": 2.9168949127197266, "learning_rate": 9.547277383195633e-07, "loss": 0.3913, "step": 937 }, { "epoch": 0.045320577861525824, "grad_norm": 3.352189302444458, "learning_rate": 9.54679422138474e-07, "loss": 0.401, "step": 938 }, { "epoch": 0.04536889404261487, "grad_norm": 3.0441555976867676, "learning_rate": 9.54631105957385e-07, "loss": 0.2099, "step": 939 }, { "epoch": 0.045417210223703916, "grad_norm": 3.647207260131836, "learning_rate": 9.54582789776296e-07, "loss": 0.5028, "step": 940 }, { "epoch": 0.04546552640479296, "grad_norm": 2.5725295543670654, "learning_rate": 9.54534473595207e-07, "loss": 0.3101, "step": 941 }, { "epoch": 0.045513842585882015, "grad_norm": 3.1928110122680664, "learning_rate": 9.54486157414118e-07, "loss": 0.4304, "step": 942 }, { "epoch": 0.04556215876697106, "grad_norm": 2.2320728302001953, "learning_rate": 9.54437841233029e-07, "loss": 0.2339, "step": 943 }, { "epoch": 0.04561047494806011, "grad_norm": 2.100806713104248, "learning_rate": 9.543895250519398e-07, "loss": 0.3223, "step": 944 }, { "epoch": 0.045658791129149154, "grad_norm": 3.295776128768921, "learning_rate": 9.543412088708508e-07, "loss": 0.3982, "step": 945 }, { "epoch": 0.0457071073102382, "grad_norm": 3.633723020553589, "learning_rate": 9.542928926897618e-07, "loss": 0.3902, "step": 946 }, { "epoch": 0.045755423491327246, "grad_norm": 3.798828363418579, "learning_rate": 9.542445765086727e-07, "loss": 0.2804, "step": 947 }, { "epoch": 0.04580373967241629, "grad_norm": 2.526458740234375, "learning_rate": 9.541962603275837e-07, "loss": 0.2953, "step": 948 }, { "epoch": 0.04585205585350534, "grad_norm": 4.0092926025390625, "learning_rate": 9.541479441464945e-07, "loss": 0.2292, "step": 949 }, { "epoch": 0.045900372034594385, "grad_norm": 3.0744879245758057, "learning_rate": 9.540996279654055e-07, "loss": 0.3481, "step": 950 }, { "epoch": 0.04594868821568343, "grad_norm": 4.24916410446167, "learning_rate": 9.540513117843165e-07, "loss": 0.3749, "step": 951 }, { "epoch": 0.04599700439677248, "grad_norm": 2.6412923336029053, "learning_rate": 9.540029956032275e-07, "loss": 0.3453, "step": 952 }, { "epoch": 0.04604532057786152, "grad_norm": 1.9807217121124268, "learning_rate": 9.539546794221385e-07, "loss": 0.2098, "step": 953 }, { "epoch": 0.04609363675895057, "grad_norm": 2.3359012603759766, "learning_rate": 9.539063632410493e-07, "loss": 0.2312, "step": 954 }, { "epoch": 0.04614195294003962, "grad_norm": 7.0597825050354, "learning_rate": 9.538580470599602e-07, "loss": 0.4678, "step": 955 }, { "epoch": 0.04619026912112867, "grad_norm": 2.6010308265686035, "learning_rate": 9.538097308788712e-07, "loss": 0.3179, "step": 956 }, { "epoch": 0.046238585302217715, "grad_norm": 2.641362190246582, "learning_rate": 9.537614146977822e-07, "loss": 0.2693, "step": 957 }, { "epoch": 0.04628690148330676, "grad_norm": 5.135815143585205, "learning_rate": 9.537130985166932e-07, "loss": 0.4137, "step": 958 }, { "epoch": 0.04633521766439581, "grad_norm": 3.2346837520599365, "learning_rate": 9.536647823356041e-07, "loss": 0.3014, "step": 959 }, { "epoch": 0.04638353384548485, "grad_norm": 1.3545476198196411, "learning_rate": 9.536164661545151e-07, "loss": 0.1563, "step": 960 }, { "epoch": 0.0464318500265739, "grad_norm": 4.914168834686279, "learning_rate": 9.535681499734261e-07, "loss": 0.2443, "step": 961 }, { "epoch": 0.046480166207662946, "grad_norm": 10.57476806640625, "learning_rate": 9.535198337923371e-07, "loss": 0.3012, "step": 962 }, { "epoch": 0.04652848238875199, "grad_norm": 3.344247817993164, "learning_rate": 9.53471517611248e-07, "loss": 0.2932, "step": 963 }, { "epoch": 0.04657679856984104, "grad_norm": 11.3355131149292, "learning_rate": 9.534232014301588e-07, "loss": 0.2536, "step": 964 }, { "epoch": 0.046625114750930084, "grad_norm": 4.337341785430908, "learning_rate": 9.533748852490698e-07, "loss": 0.4019, "step": 965 }, { "epoch": 0.04667343093201913, "grad_norm": 4.339783191680908, "learning_rate": 9.533265690679808e-07, "loss": 0.3663, "step": 966 }, { "epoch": 0.04672174711310818, "grad_norm": 3.373011827468872, "learning_rate": 9.532782528868918e-07, "loss": 0.3344, "step": 967 }, { "epoch": 0.04677006329419723, "grad_norm": 2.608823537826538, "learning_rate": 9.532299367058028e-07, "loss": 0.3464, "step": 968 }, { "epoch": 0.046818379475286276, "grad_norm": 1.9661189317703247, "learning_rate": 9.531816205247137e-07, "loss": 0.2183, "step": 969 }, { "epoch": 0.04686669565637532, "grad_norm": 2.4765679836273193, "learning_rate": 9.531333043436246e-07, "loss": 0.3095, "step": 970 }, { "epoch": 0.04691501183746437, "grad_norm": 2.1946523189544678, "learning_rate": 9.530849881625356e-07, "loss": 0.2368, "step": 971 }, { "epoch": 0.046963328018553414, "grad_norm": 3.1186821460723877, "learning_rate": 9.530366719814466e-07, "loss": 0.3468, "step": 972 }, { "epoch": 0.04701164419964246, "grad_norm": 2.713933229446411, "learning_rate": 9.529883558003575e-07, "loss": 0.3977, "step": 973 }, { "epoch": 0.04705996038073151, "grad_norm": 3.9048733711242676, "learning_rate": 9.529400396192685e-07, "loss": 0.3538, "step": 974 }, { "epoch": 0.04710827656182055, "grad_norm": 2.9545390605926514, "learning_rate": 9.528917234381793e-07, "loss": 0.3513, "step": 975 }, { "epoch": 0.0471565927429096, "grad_norm": 3.3526690006256104, "learning_rate": 9.528434072570903e-07, "loss": 0.4564, "step": 976 }, { "epoch": 0.047204908923998645, "grad_norm": 1.8734793663024902, "learning_rate": 9.527950910760013e-07, "loss": 0.194, "step": 977 }, { "epoch": 0.04725322510508769, "grad_norm": 4.632877826690674, "learning_rate": 9.527467748949123e-07, "loss": 0.3958, "step": 978 }, { "epoch": 0.04730154128617674, "grad_norm": 3.0012693405151367, "learning_rate": 9.526984587138233e-07, "loss": 0.3477, "step": 979 }, { "epoch": 0.04734985746726579, "grad_norm": 4.127896308898926, "learning_rate": 9.526501425327342e-07, "loss": 0.3231, "step": 980 }, { "epoch": 0.04739817364835484, "grad_norm": 6.797053337097168, "learning_rate": 9.526018263516451e-07, "loss": 0.3391, "step": 981 }, { "epoch": 0.04744648982944388, "grad_norm": 2.210146188735962, "learning_rate": 9.52553510170556e-07, "loss": 0.2492, "step": 982 }, { "epoch": 0.04749480601053293, "grad_norm": 2.1261250972747803, "learning_rate": 9.52505193989467e-07, "loss": 0.2251, "step": 983 }, { "epoch": 0.047543122191621975, "grad_norm": 2.551539421081543, "learning_rate": 9.52456877808378e-07, "loss": 0.3129, "step": 984 }, { "epoch": 0.04759143837271102, "grad_norm": 2.4223666191101074, "learning_rate": 9.524085616272889e-07, "loss": 0.3194, "step": 985 }, { "epoch": 0.04763975455380007, "grad_norm": 3.017664670944214, "learning_rate": 9.523602454461999e-07, "loss": 0.3152, "step": 986 }, { "epoch": 0.047688070734889114, "grad_norm": 2.6998560428619385, "learning_rate": 9.523119292651109e-07, "loss": 0.2938, "step": 987 }, { "epoch": 0.04773638691597816, "grad_norm": 2.875171422958374, "learning_rate": 9.522636130840218e-07, "loss": 0.315, "step": 988 }, { "epoch": 0.047784703097067206, "grad_norm": 3.10270094871521, "learning_rate": 9.522152969029327e-07, "loss": 0.3924, "step": 989 }, { "epoch": 0.04783301927815625, "grad_norm": 2.0871827602386475, "learning_rate": 9.521669807218436e-07, "loss": 0.1903, "step": 990 }, { "epoch": 0.0478813354592453, "grad_norm": 3.1666765213012695, "learning_rate": 9.521186645407546e-07, "loss": 0.3697, "step": 991 }, { "epoch": 0.047929651640334345, "grad_norm": 3.4778225421905518, "learning_rate": 9.520703483596656e-07, "loss": 0.3381, "step": 992 }, { "epoch": 0.0479779678214234, "grad_norm": 4.098067283630371, "learning_rate": 9.520220321785766e-07, "loss": 0.4205, "step": 993 }, { "epoch": 0.048026284002512444, "grad_norm": 2.11040997505188, "learning_rate": 9.519737159974876e-07, "loss": 0.2399, "step": 994 }, { "epoch": 0.04807460018360149, "grad_norm": 4.3017072677612305, "learning_rate": 9.519253998163985e-07, "loss": 0.3615, "step": 995 }, { "epoch": 0.048122916364690536, "grad_norm": 3.328434467315674, "learning_rate": 9.518770836353094e-07, "loss": 0.2326, "step": 996 }, { "epoch": 0.04817123254577958, "grad_norm": 3.2140438556671143, "learning_rate": 9.518287674542204e-07, "loss": 0.5153, "step": 997 }, { "epoch": 0.04821954872686863, "grad_norm": 2.5136265754699707, "learning_rate": 9.517804512731313e-07, "loss": 0.2512, "step": 998 }, { "epoch": 0.048267864907957675, "grad_norm": 3.2730233669281006, "learning_rate": 9.517321350920423e-07, "loss": 0.3487, "step": 999 }, { "epoch": 0.04831618108904672, "grad_norm": 1.8586987257003784, "learning_rate": 9.516838189109533e-07, "loss": 0.2241, "step": 1000 }, { "epoch": 0.04836449727013577, "grad_norm": 2.9830849170684814, "learning_rate": 9.516355027298641e-07, "loss": 0.2761, "step": 1001 }, { "epoch": 0.04841281345122481, "grad_norm": 1.7682881355285645, "learning_rate": 9.515871865487751e-07, "loss": 0.215, "step": 1002 }, { "epoch": 0.04846112963231386, "grad_norm": 4.65143346786499, "learning_rate": 9.515388703676861e-07, "loss": 0.3289, "step": 1003 }, { "epoch": 0.048509445813402906, "grad_norm": 3.797586679458618, "learning_rate": 9.514905541865971e-07, "loss": 0.37, "step": 1004 }, { "epoch": 0.04855776199449195, "grad_norm": 3.047983169555664, "learning_rate": 9.514422380055081e-07, "loss": 0.3959, "step": 1005 }, { "epoch": 0.048606078175581005, "grad_norm": 2.862759828567505, "learning_rate": 9.51393921824419e-07, "loss": 0.3864, "step": 1006 }, { "epoch": 0.04865439435667005, "grad_norm": 4.1610331535339355, "learning_rate": 9.513456056433298e-07, "loss": 0.1902, "step": 1007 }, { "epoch": 0.0487027105377591, "grad_norm": 2.2927870750427246, "learning_rate": 9.512972894622408e-07, "loss": 0.2733, "step": 1008 }, { "epoch": 0.048751026718848144, "grad_norm": 2.920994281768799, "learning_rate": 9.512489732811518e-07, "loss": 0.3499, "step": 1009 }, { "epoch": 0.04879934289993719, "grad_norm": 2.100424289703369, "learning_rate": 9.512006571000628e-07, "loss": 0.2782, "step": 1010 }, { "epoch": 0.048847659081026236, "grad_norm": 4.554656505584717, "learning_rate": 9.511523409189737e-07, "loss": 0.4009, "step": 1011 }, { "epoch": 0.04889597526211528, "grad_norm": 2.1318283081054688, "learning_rate": 9.511040247378847e-07, "loss": 0.29, "step": 1012 }, { "epoch": 0.04894429144320433, "grad_norm": 3.0832417011260986, "learning_rate": 9.510557085567957e-07, "loss": 0.3101, "step": 1013 }, { "epoch": 0.048992607624293374, "grad_norm": 1.8186042308807373, "learning_rate": 9.510073923757066e-07, "loss": 0.2185, "step": 1014 }, { "epoch": 0.04904092380538242, "grad_norm": 3.332581043243408, "learning_rate": 9.509590761946175e-07, "loss": 0.374, "step": 1015 }, { "epoch": 0.04908923998647147, "grad_norm": 2.7436962127685547, "learning_rate": 9.509107600135284e-07, "loss": 0.2585, "step": 1016 }, { "epoch": 0.04913755616756051, "grad_norm": 2.2046310901641846, "learning_rate": 9.508624438324394e-07, "loss": 0.2571, "step": 1017 }, { "epoch": 0.049185872348649566, "grad_norm": 3.194162130355835, "learning_rate": 9.508141276513504e-07, "loss": 0.4368, "step": 1018 }, { "epoch": 0.04923418852973861, "grad_norm": 2.9215378761291504, "learning_rate": 9.507658114702614e-07, "loss": 0.3637, "step": 1019 }, { "epoch": 0.04928250471082766, "grad_norm": 2.1318652629852295, "learning_rate": 9.507174952891723e-07, "loss": 0.2958, "step": 1020 }, { "epoch": 0.049330820891916705, "grad_norm": 3.043807029724121, "learning_rate": 9.506691791080833e-07, "loss": 0.3265, "step": 1021 }, { "epoch": 0.04937913707300575, "grad_norm": 2.427046298980713, "learning_rate": 9.506208629269942e-07, "loss": 0.2482, "step": 1022 }, { "epoch": 0.0494274532540948, "grad_norm": 2.9995391368865967, "learning_rate": 9.505725467459051e-07, "loss": 0.3399, "step": 1023 }, { "epoch": 0.04947576943518384, "grad_norm": 3.0346145629882812, "learning_rate": 9.505242305648161e-07, "loss": 0.2526, "step": 1024 }, { "epoch": 0.04952408561627289, "grad_norm": 2.3615124225616455, "learning_rate": 9.504759143837271e-07, "loss": 0.3474, "step": 1025 }, { "epoch": 0.049572401797361935, "grad_norm": 2.0601894855499268, "learning_rate": 9.504275982026381e-07, "loss": 0.2684, "step": 1026 }, { "epoch": 0.04962071797845098, "grad_norm": 3.7824594974517822, "learning_rate": 9.503792820215489e-07, "loss": 0.5151, "step": 1027 }, { "epoch": 0.04966903415954003, "grad_norm": 2.6680850982666016, "learning_rate": 9.503309658404599e-07, "loss": 0.381, "step": 1028 }, { "epoch": 0.049717350340629074, "grad_norm": 3.420194387435913, "learning_rate": 9.502826496593709e-07, "loss": 0.4028, "step": 1029 }, { "epoch": 0.04976566652171812, "grad_norm": 3.417851209640503, "learning_rate": 9.502343334782819e-07, "loss": 0.4306, "step": 1030 }, { "epoch": 0.04981398270280717, "grad_norm": 2.6576688289642334, "learning_rate": 9.501860172971929e-07, "loss": 0.3948, "step": 1031 }, { "epoch": 0.04986229888389622, "grad_norm": 3.592198133468628, "learning_rate": 9.501377011161037e-07, "loss": 0.4169, "step": 1032 }, { "epoch": 0.049910615064985266, "grad_norm": 2.1187565326690674, "learning_rate": 9.500893849350146e-07, "loss": 0.293, "step": 1033 }, { "epoch": 0.04995893124607431, "grad_norm": 4.63816499710083, "learning_rate": 9.500410687539256e-07, "loss": 0.3425, "step": 1034 }, { "epoch": 0.05000724742716336, "grad_norm": 2.163350820541382, "learning_rate": 9.499927525728366e-07, "loss": 0.2235, "step": 1035 }, { "epoch": 0.050055563608252404, "grad_norm": 3.6903955936431885, "learning_rate": 9.499444363917476e-07, "loss": 0.3756, "step": 1036 }, { "epoch": 0.05010387978934145, "grad_norm": 3.8453893661499023, "learning_rate": 9.498961202106585e-07, "loss": 0.4051, "step": 1037 }, { "epoch": 0.050152195970430496, "grad_norm": 2.5116820335388184, "learning_rate": 9.498478040295695e-07, "loss": 0.3524, "step": 1038 }, { "epoch": 0.05020051215151954, "grad_norm": 3.5704259872436523, "learning_rate": 9.497994878484804e-07, "loss": 0.3991, "step": 1039 }, { "epoch": 0.05024882833260859, "grad_norm": 34.85274124145508, "learning_rate": 9.497511716673913e-07, "loss": 0.4678, "step": 1040 }, { "epoch": 0.050297144513697635, "grad_norm": 2.365920066833496, "learning_rate": 9.497028554863023e-07, "loss": 0.2809, "step": 1041 }, { "epoch": 0.05034546069478668, "grad_norm": 3.3874285221099854, "learning_rate": 9.496545393052132e-07, "loss": 0.3396, "step": 1042 }, { "epoch": 0.05039377687587573, "grad_norm": 15.316631317138672, "learning_rate": 9.496062231241242e-07, "loss": 0.3862, "step": 1043 }, { "epoch": 0.05044209305696478, "grad_norm": 4.791275501251221, "learning_rate": 9.495579069430352e-07, "loss": 0.3371, "step": 1044 }, { "epoch": 0.05049040923805383, "grad_norm": 2.9645423889160156, "learning_rate": 9.495095907619462e-07, "loss": 0.3076, "step": 1045 }, { "epoch": 0.05053872541914287, "grad_norm": 3.482140302658081, "learning_rate": 9.494612745808571e-07, "loss": 0.2853, "step": 1046 }, { "epoch": 0.05058704160023192, "grad_norm": 2.110987424850464, "learning_rate": 9.494129583997681e-07, "loss": 0.1679, "step": 1047 }, { "epoch": 0.050635357781320965, "grad_norm": 3.170422077178955, "learning_rate": 9.49364642218679e-07, "loss": 0.449, "step": 1048 }, { "epoch": 0.05068367396241001, "grad_norm": 2.934610366821289, "learning_rate": 9.493163260375899e-07, "loss": 0.4025, "step": 1049 }, { "epoch": 0.05073199014349906, "grad_norm": 3.7996134757995605, "learning_rate": 9.492680098565009e-07, "loss": 0.318, "step": 1050 }, { "epoch": 0.050780306324588104, "grad_norm": 2.1610794067382812, "learning_rate": 9.492196936754119e-07, "loss": 0.2585, "step": 1051 }, { "epoch": 0.05082862250567715, "grad_norm": 3.252986431121826, "learning_rate": 9.491713774943228e-07, "loss": 0.3884, "step": 1052 }, { "epoch": 0.050876938686766196, "grad_norm": 4.869869709014893, "learning_rate": 9.491230613132337e-07, "loss": 0.3629, "step": 1053 }, { "epoch": 0.05092525486785524, "grad_norm": 2.331963300704956, "learning_rate": 9.490747451321447e-07, "loss": 0.3419, "step": 1054 }, { "epoch": 0.05097357104894429, "grad_norm": 4.215184688568115, "learning_rate": 9.490264289510557e-07, "loss": 0.3207, "step": 1055 }, { "epoch": 0.05102188723003334, "grad_norm": 4.407707691192627, "learning_rate": 9.489781127699667e-07, "loss": 0.2527, "step": 1056 }, { "epoch": 0.05107020341112239, "grad_norm": 2.3558850288391113, "learning_rate": 9.489297965888776e-07, "loss": 0.2111, "step": 1057 }, { "epoch": 0.051118519592211434, "grad_norm": 3.2267181873321533, "learning_rate": 9.488814804077885e-07, "loss": 0.2721, "step": 1058 }, { "epoch": 0.05116683577330048, "grad_norm": 19.28703498840332, "learning_rate": 9.488331642266994e-07, "loss": 0.2374, "step": 1059 }, { "epoch": 0.051215151954389526, "grad_norm": 3.6030113697052, "learning_rate": 9.487848480456104e-07, "loss": 0.434, "step": 1060 }, { "epoch": 0.05126346813547857, "grad_norm": 2.7695975303649902, "learning_rate": 9.487365318645214e-07, "loss": 0.339, "step": 1061 }, { "epoch": 0.05131178431656762, "grad_norm": 9.177644729614258, "learning_rate": 9.486882156834324e-07, "loss": 0.3748, "step": 1062 }, { "epoch": 0.051360100497656665, "grad_norm": 2.9846818447113037, "learning_rate": 9.486398995023433e-07, "loss": 0.1754, "step": 1063 }, { "epoch": 0.05140841667874571, "grad_norm": 13.273388862609863, "learning_rate": 9.485915833212543e-07, "loss": 0.2211, "step": 1064 }, { "epoch": 0.05145673285983476, "grad_norm": 3.920680522918701, "learning_rate": 9.485432671401651e-07, "loss": 0.2166, "step": 1065 }, { "epoch": 0.0515050490409238, "grad_norm": 3.6388769149780273, "learning_rate": 9.484949509590761e-07, "loss": 0.4729, "step": 1066 }, { "epoch": 0.05155336522201285, "grad_norm": 5.642730712890625, "learning_rate": 9.484466347779871e-07, "loss": 0.5008, "step": 1067 }, { "epoch": 0.051601681403101896, "grad_norm": 68.6778335571289, "learning_rate": 9.48398318596898e-07, "loss": 0.2739, "step": 1068 }, { "epoch": 0.05164999758419095, "grad_norm": 3.171175718307495, "learning_rate": 9.48350002415809e-07, "loss": 0.3749, "step": 1069 }, { "epoch": 0.051698313765279995, "grad_norm": 4.79764461517334, "learning_rate": 9.4830168623472e-07, "loss": 0.2805, "step": 1070 }, { "epoch": 0.05174662994636904, "grad_norm": 2.559927225112915, "learning_rate": 9.48253370053631e-07, "loss": 0.3271, "step": 1071 }, { "epoch": 0.05179494612745809, "grad_norm": 3.3201234340667725, "learning_rate": 9.482050538725419e-07, "loss": 0.4339, "step": 1072 }, { "epoch": 0.05184326230854713, "grad_norm": 2.941551685333252, "learning_rate": 9.481567376914529e-07, "loss": 0.397, "step": 1073 }, { "epoch": 0.05189157848963618, "grad_norm": 3.1088247299194336, "learning_rate": 9.481084215103637e-07, "loss": 0.3804, "step": 1074 }, { "epoch": 0.051939894670725226, "grad_norm": 2.8831989765167236, "learning_rate": 9.480601053292747e-07, "loss": 0.3512, "step": 1075 }, { "epoch": 0.05198821085181427, "grad_norm": 3.2470459938049316, "learning_rate": 9.480117891481857e-07, "loss": 0.4043, "step": 1076 }, { "epoch": 0.05203652703290332, "grad_norm": 3.3286774158477783, "learning_rate": 9.479634729670967e-07, "loss": 0.3614, "step": 1077 }, { "epoch": 0.052084843213992364, "grad_norm": 3.031449556350708, "learning_rate": 9.479151567860076e-07, "loss": 0.341, "step": 1078 }, { "epoch": 0.05213315939508141, "grad_norm": 3.700310230255127, "learning_rate": 9.478668406049185e-07, "loss": 0.2629, "step": 1079 }, { "epoch": 0.05218147557617046, "grad_norm": 2.6303210258483887, "learning_rate": 9.478185244238295e-07, "loss": 0.3071, "step": 1080 }, { "epoch": 0.05222979175725951, "grad_norm": 3.0201148986816406, "learning_rate": 9.477702082427405e-07, "loss": 0.3438, "step": 1081 }, { "epoch": 0.052278107938348556, "grad_norm": 2.243715524673462, "learning_rate": 9.477218920616515e-07, "loss": 0.2938, "step": 1082 }, { "epoch": 0.0523264241194376, "grad_norm": 3.2305572032928467, "learning_rate": 9.476735758805624e-07, "loss": 0.3883, "step": 1083 }, { "epoch": 0.05237474030052665, "grad_norm": 56.298912048339844, "learning_rate": 9.476252596994732e-07, "loss": 0.3185, "step": 1084 }, { "epoch": 0.052423056481615694, "grad_norm": 4.1641740798950195, "learning_rate": 9.475769435183842e-07, "loss": 0.3101, "step": 1085 }, { "epoch": 0.05247137266270474, "grad_norm": 2.7609975337982178, "learning_rate": 9.475286273372952e-07, "loss": 0.2601, "step": 1086 }, { "epoch": 0.05251968884379379, "grad_norm": 2.4985194206237793, "learning_rate": 9.474803111562062e-07, "loss": 0.2997, "step": 1087 }, { "epoch": 0.05256800502488283, "grad_norm": 9.152607917785645, "learning_rate": 9.474319949751172e-07, "loss": 0.5242, "step": 1088 }, { "epoch": 0.05261632120597188, "grad_norm": 1.691946268081665, "learning_rate": 9.473836787940281e-07, "loss": 0.2089, "step": 1089 }, { "epoch": 0.052664637387060925, "grad_norm": 3.1538381576538086, "learning_rate": 9.473353626129391e-07, "loss": 0.3152, "step": 1090 }, { "epoch": 0.05271295356814997, "grad_norm": 3.1954030990600586, "learning_rate": 9.472870464318499e-07, "loss": 0.3699, "step": 1091 }, { "epoch": 0.05276126974923902, "grad_norm": 2.205056667327881, "learning_rate": 9.472387302507609e-07, "loss": 0.2363, "step": 1092 }, { "epoch": 0.052809585930328064, "grad_norm": 2.1848933696746826, "learning_rate": 9.471904140696719e-07, "loss": 0.2218, "step": 1093 }, { "epoch": 0.05285790211141712, "grad_norm": 3.2215776443481445, "learning_rate": 9.471420978885828e-07, "loss": 0.3344, "step": 1094 }, { "epoch": 0.05290621829250616, "grad_norm": 12.960456848144531, "learning_rate": 9.470937817074938e-07, "loss": 0.3976, "step": 1095 }, { "epoch": 0.05295453447359521, "grad_norm": 2.0757243633270264, "learning_rate": 9.470454655264048e-07, "loss": 0.1712, "step": 1096 }, { "epoch": 0.053002850654684255, "grad_norm": 21.069425582885742, "learning_rate": 9.469971493453157e-07, "loss": 0.352, "step": 1097 }, { "epoch": 0.0530511668357733, "grad_norm": 4.788760185241699, "learning_rate": 9.469488331642267e-07, "loss": 0.4479, "step": 1098 }, { "epoch": 0.05309948301686235, "grad_norm": 2.579162359237671, "learning_rate": 9.469005169831377e-07, "loss": 0.365, "step": 1099 }, { "epoch": 0.053147799197951394, "grad_norm": 3.173471689224243, "learning_rate": 9.468522008020485e-07, "loss": 0.4564, "step": 1100 }, { "epoch": 0.05319611537904044, "grad_norm": 2.6622607707977295, "learning_rate": 9.468038846209595e-07, "loss": 0.316, "step": 1101 }, { "epoch": 0.053244431560129486, "grad_norm": 1.75474214553833, "learning_rate": 9.467555684398705e-07, "loss": 0.2397, "step": 1102 }, { "epoch": 0.05329274774121853, "grad_norm": 2.1772494316101074, "learning_rate": 9.467072522587815e-07, "loss": 0.3029, "step": 1103 }, { "epoch": 0.05334106392230758, "grad_norm": 3.250051736831665, "learning_rate": 9.466589360776924e-07, "loss": 0.2699, "step": 1104 }, { "epoch": 0.053389380103396625, "grad_norm": 2.268733263015747, "learning_rate": 9.466106198966033e-07, "loss": 0.2392, "step": 1105 }, { "epoch": 0.05343769628448567, "grad_norm": 4.159872531890869, "learning_rate": 9.465623037155143e-07, "loss": 0.4089, "step": 1106 }, { "epoch": 0.053486012465574724, "grad_norm": 2.7650132179260254, "learning_rate": 9.465139875344253e-07, "loss": 0.3055, "step": 1107 }, { "epoch": 0.05353432864666377, "grad_norm": 5.499889850616455, "learning_rate": 9.464656713533362e-07, "loss": 0.34, "step": 1108 }, { "epoch": 0.053582644827752816, "grad_norm": 2.9966094493865967, "learning_rate": 9.464173551722472e-07, "loss": 0.4917, "step": 1109 }, { "epoch": 0.05363096100884186, "grad_norm": 1.882120966911316, "learning_rate": 9.46369038991158e-07, "loss": 0.2019, "step": 1110 }, { "epoch": 0.05367927718993091, "grad_norm": 5.884964942932129, "learning_rate": 9.46320722810069e-07, "loss": 0.2773, "step": 1111 }, { "epoch": 0.053727593371019955, "grad_norm": 2.160123825073242, "learning_rate": 9.4627240662898e-07, "loss": 0.2673, "step": 1112 }, { "epoch": 0.053775909552109, "grad_norm": 2.8405210971832275, "learning_rate": 9.46224090447891e-07, "loss": 0.2884, "step": 1113 }, { "epoch": 0.05382422573319805, "grad_norm": 2.8064942359924316, "learning_rate": 9.46175774266802e-07, "loss": 0.3271, "step": 1114 }, { "epoch": 0.05387254191428709, "grad_norm": 48.94305419921875, "learning_rate": 9.461274580857129e-07, "loss": 0.325, "step": 1115 }, { "epoch": 0.05392085809537614, "grad_norm": 2.679845094680786, "learning_rate": 9.460791419046237e-07, "loss": 0.2744, "step": 1116 }, { "epoch": 0.053969174276465186, "grad_norm": 2.653707504272461, "learning_rate": 9.460308257235347e-07, "loss": 0.2903, "step": 1117 }, { "epoch": 0.05401749045755423, "grad_norm": 3.4394524097442627, "learning_rate": 9.459825095424457e-07, "loss": 0.2851, "step": 1118 }, { "epoch": 0.054065806638643285, "grad_norm": 2.572312355041504, "learning_rate": 9.459341933613567e-07, "loss": 0.3302, "step": 1119 }, { "epoch": 0.05411412281973233, "grad_norm": 3.3196942806243896, "learning_rate": 9.458858771802676e-07, "loss": 0.2347, "step": 1120 }, { "epoch": 0.05416243900082138, "grad_norm": 2.892455816268921, "learning_rate": 9.458375609991786e-07, "loss": 0.3599, "step": 1121 }, { "epoch": 0.054210755181910424, "grad_norm": 2.494680881500244, "learning_rate": 9.457892448180896e-07, "loss": 0.2973, "step": 1122 }, { "epoch": 0.05425907136299947, "grad_norm": 3.636598825454712, "learning_rate": 9.457409286370005e-07, "loss": 0.2973, "step": 1123 }, { "epoch": 0.054307387544088516, "grad_norm": 3.339463710784912, "learning_rate": 9.456926124559115e-07, "loss": 0.3848, "step": 1124 }, { "epoch": 0.05435570372517756, "grad_norm": 2.8138628005981445, "learning_rate": 9.456442962748224e-07, "loss": 0.3461, "step": 1125 }, { "epoch": 0.05440401990626661, "grad_norm": 2.626446008682251, "learning_rate": 9.455959800937333e-07, "loss": 0.2733, "step": 1126 }, { "epoch": 0.054452336087355654, "grad_norm": 2.5260496139526367, "learning_rate": 9.455476639126443e-07, "loss": 0.2959, "step": 1127 }, { "epoch": 0.0545006522684447, "grad_norm": 2.797433376312256, "learning_rate": 9.454993477315553e-07, "loss": 0.3585, "step": 1128 }, { "epoch": 0.05454896844953375, "grad_norm": 2.3190298080444336, "learning_rate": 9.454510315504662e-07, "loss": 0.305, "step": 1129 }, { "epoch": 0.05459728463062279, "grad_norm": 3.8152873516082764, "learning_rate": 9.454027153693772e-07, "loss": 0.2189, "step": 1130 }, { "epoch": 0.05464560081171184, "grad_norm": 2.141791343688965, "learning_rate": 9.453543991882881e-07, "loss": 0.307, "step": 1131 }, { "epoch": 0.05469391699280089, "grad_norm": 3.3875133991241455, "learning_rate": 9.453060830071991e-07, "loss": 0.2358, "step": 1132 }, { "epoch": 0.05474223317388994, "grad_norm": 6.870141983032227, "learning_rate": 9.4525776682611e-07, "loss": 0.1859, "step": 1133 }, { "epoch": 0.054790549354978985, "grad_norm": 2.4977855682373047, "learning_rate": 9.45209450645021e-07, "loss": 0.2645, "step": 1134 }, { "epoch": 0.05483886553606803, "grad_norm": 2.6508867740631104, "learning_rate": 9.45161134463932e-07, "loss": 0.3343, "step": 1135 }, { "epoch": 0.05488718171715708, "grad_norm": 2.2775793075561523, "learning_rate": 9.451128182828428e-07, "loss": 0.2676, "step": 1136 }, { "epoch": 0.05493549789824612, "grad_norm": 2.69490385055542, "learning_rate": 9.450645021017538e-07, "loss": 0.3916, "step": 1137 }, { "epoch": 0.05498381407933517, "grad_norm": 6.688041687011719, "learning_rate": 9.450161859206648e-07, "loss": 0.3359, "step": 1138 }, { "epoch": 0.055032130260424215, "grad_norm": 2.0552642345428467, "learning_rate": 9.449678697395758e-07, "loss": 0.1946, "step": 1139 }, { "epoch": 0.05508044644151326, "grad_norm": 2.478245496749878, "learning_rate": 9.449195535584868e-07, "loss": 0.3235, "step": 1140 }, { "epoch": 0.05512876262260231, "grad_norm": 2.2308387756347656, "learning_rate": 9.448712373773977e-07, "loss": 0.2477, "step": 1141 }, { "epoch": 0.055177078803691354, "grad_norm": 4.233534812927246, "learning_rate": 9.448229211963085e-07, "loss": 0.4797, "step": 1142 }, { "epoch": 0.0552253949847804, "grad_norm": 2.7182281017303467, "learning_rate": 9.447746050152195e-07, "loss": 0.3671, "step": 1143 }, { "epoch": 0.055273711165869446, "grad_norm": 4.037023067474365, "learning_rate": 9.447262888341305e-07, "loss": 0.396, "step": 1144 }, { "epoch": 0.0553220273469585, "grad_norm": 4.159581661224365, "learning_rate": 9.446779726530415e-07, "loss": 0.4059, "step": 1145 }, { "epoch": 0.055370343528047546, "grad_norm": 3.283700942993164, "learning_rate": 9.446296564719524e-07, "loss": 0.2691, "step": 1146 }, { "epoch": 0.05541865970913659, "grad_norm": 2.1753664016723633, "learning_rate": 9.445813402908634e-07, "loss": 0.3187, "step": 1147 }, { "epoch": 0.05546697589022564, "grad_norm": 3.277237892150879, "learning_rate": 9.445330241097743e-07, "loss": 0.492, "step": 1148 }, { "epoch": 0.055515292071314684, "grad_norm": 2.1732544898986816, "learning_rate": 9.444847079286853e-07, "loss": 0.2397, "step": 1149 }, { "epoch": 0.05556360825240373, "grad_norm": 2.601799249649048, "learning_rate": 9.444363917475962e-07, "loss": 0.3105, "step": 1150 }, { "epoch": 0.055611924433492776, "grad_norm": 3.122332811355591, "learning_rate": 9.443880755665071e-07, "loss": 0.3566, "step": 1151 }, { "epoch": 0.05566024061458182, "grad_norm": 2.538532018661499, "learning_rate": 9.443397593854181e-07, "loss": 0.3625, "step": 1152 }, { "epoch": 0.05570855679567087, "grad_norm": 9.584085464477539, "learning_rate": 9.442914432043291e-07, "loss": 0.3119, "step": 1153 }, { "epoch": 0.055756872976759915, "grad_norm": 2.460735559463501, "learning_rate": 9.442431270232401e-07, "loss": 0.3131, "step": 1154 }, { "epoch": 0.05580518915784896, "grad_norm": 1.948341965675354, "learning_rate": 9.44194810842151e-07, "loss": 0.1632, "step": 1155 }, { "epoch": 0.05585350533893801, "grad_norm": 2.0574700832366943, "learning_rate": 9.44146494661062e-07, "loss": 0.2542, "step": 1156 }, { "epoch": 0.05590182152002706, "grad_norm": 3.6841585636138916, "learning_rate": 9.440981784799729e-07, "loss": 0.3255, "step": 1157 }, { "epoch": 0.05595013770111611, "grad_norm": 19.03251075744629, "learning_rate": 9.440498622988838e-07, "loss": 0.3962, "step": 1158 }, { "epoch": 0.05599845388220515, "grad_norm": 2.7428853511810303, "learning_rate": 9.440015461177948e-07, "loss": 0.3165, "step": 1159 }, { "epoch": 0.0560467700632942, "grad_norm": 3.3432791233062744, "learning_rate": 9.439532299367058e-07, "loss": 0.3651, "step": 1160 }, { "epoch": 0.056095086244383245, "grad_norm": 3.324613094329834, "learning_rate": 9.439049137556167e-07, "loss": 0.4084, "step": 1161 }, { "epoch": 0.05614340242547229, "grad_norm": 6.452321529388428, "learning_rate": 9.438565975745276e-07, "loss": 0.3357, "step": 1162 }, { "epoch": 0.05619171860656134, "grad_norm": 2.2843358516693115, "learning_rate": 9.438082813934386e-07, "loss": 0.3549, "step": 1163 }, { "epoch": 0.056240034787650384, "grad_norm": 2.3711583614349365, "learning_rate": 9.437599652123496e-07, "loss": 0.3529, "step": 1164 }, { "epoch": 0.05628835096873943, "grad_norm": 3.2680017948150635, "learning_rate": 9.437116490312606e-07, "loss": 0.3844, "step": 1165 }, { "epoch": 0.056336667149828476, "grad_norm": 2.6877522468566895, "learning_rate": 9.436633328501716e-07, "loss": 0.29, "step": 1166 }, { "epoch": 0.05638498333091752, "grad_norm": 3.9627912044525146, "learning_rate": 9.436150166690823e-07, "loss": 0.2594, "step": 1167 }, { "epoch": 0.05643329951200657, "grad_norm": 2.9544410705566406, "learning_rate": 9.435667004879933e-07, "loss": 0.2992, "step": 1168 }, { "epoch": 0.056481615693095615, "grad_norm": 2.767146348953247, "learning_rate": 9.435183843069043e-07, "loss": 0.3064, "step": 1169 }, { "epoch": 0.05652993187418467, "grad_norm": 2.118685722351074, "learning_rate": 9.434700681258153e-07, "loss": 0.2575, "step": 1170 }, { "epoch": 0.056578248055273714, "grad_norm": 4.529419898986816, "learning_rate": 9.434217519447263e-07, "loss": 0.3105, "step": 1171 }, { "epoch": 0.05662656423636276, "grad_norm": 2.222116231918335, "learning_rate": 9.433734357636372e-07, "loss": 0.2533, "step": 1172 }, { "epoch": 0.056674880417451806, "grad_norm": 2.0656962394714355, "learning_rate": 9.433251195825482e-07, "loss": 0.2925, "step": 1173 }, { "epoch": 0.05672319659854085, "grad_norm": 2.61915922164917, "learning_rate": 9.432768034014591e-07, "loss": 0.2348, "step": 1174 }, { "epoch": 0.0567715127796299, "grad_norm": 3.3191897869110107, "learning_rate": 9.4322848722037e-07, "loss": 0.4525, "step": 1175 }, { "epoch": 0.056819828960718945, "grad_norm": 3.109387159347534, "learning_rate": 9.43180171039281e-07, "loss": 0.2663, "step": 1176 }, { "epoch": 0.05686814514180799, "grad_norm": 2.5516295433044434, "learning_rate": 9.431318548581919e-07, "loss": 0.2925, "step": 1177 }, { "epoch": 0.05691646132289704, "grad_norm": 1.944778561592102, "learning_rate": 9.430835386771029e-07, "loss": 0.2421, "step": 1178 }, { "epoch": 0.05696477750398608, "grad_norm": 1.6827077865600586, "learning_rate": 9.430352224960139e-07, "loss": 0.2172, "step": 1179 }, { "epoch": 0.05701309368507513, "grad_norm": 4.498042106628418, "learning_rate": 9.429869063149248e-07, "loss": 0.3833, "step": 1180 }, { "epoch": 0.057061409866164176, "grad_norm": 3.1954259872436523, "learning_rate": 9.429385901338358e-07, "loss": 0.3872, "step": 1181 }, { "epoch": 0.05710972604725322, "grad_norm": 2.6915791034698486, "learning_rate": 9.428902739527468e-07, "loss": 0.3196, "step": 1182 }, { "epoch": 0.057158042228342275, "grad_norm": 2.605175733566284, "learning_rate": 9.428419577716577e-07, "loss": 0.283, "step": 1183 }, { "epoch": 0.05720635840943132, "grad_norm": 11.341469764709473, "learning_rate": 9.427936415905686e-07, "loss": 0.3606, "step": 1184 }, { "epoch": 0.05725467459052037, "grad_norm": 4.443129062652588, "learning_rate": 9.427453254094796e-07, "loss": 0.3616, "step": 1185 }, { "epoch": 0.05730299077160941, "grad_norm": 2.2530994415283203, "learning_rate": 9.426970092283906e-07, "loss": 0.2645, "step": 1186 }, { "epoch": 0.05735130695269846, "grad_norm": 3.991448163986206, "learning_rate": 9.426486930473015e-07, "loss": 0.3394, "step": 1187 }, { "epoch": 0.057399623133787506, "grad_norm": 3.23234224319458, "learning_rate": 9.426003768662124e-07, "loss": 0.2809, "step": 1188 }, { "epoch": 0.05744793931487655, "grad_norm": 2.3701908588409424, "learning_rate": 9.425520606851234e-07, "loss": 0.301, "step": 1189 }, { "epoch": 0.0574962554959656, "grad_norm": 2.2808287143707275, "learning_rate": 9.425037445040344e-07, "loss": 0.2652, "step": 1190 }, { "epoch": 0.057544571677054644, "grad_norm": 5.844300746917725, "learning_rate": 9.424554283229454e-07, "loss": 0.2779, "step": 1191 }, { "epoch": 0.05759288785814369, "grad_norm": 2.130577564239502, "learning_rate": 9.424071121418564e-07, "loss": 0.2207, "step": 1192 }, { "epoch": 0.05764120403923274, "grad_norm": 2.7507801055908203, "learning_rate": 9.423587959607671e-07, "loss": 0.3764, "step": 1193 }, { "epoch": 0.05768952022032178, "grad_norm": 2.187307596206665, "learning_rate": 9.423104797796781e-07, "loss": 0.3511, "step": 1194 }, { "epoch": 0.057737836401410836, "grad_norm": 2.93180775642395, "learning_rate": 9.422621635985891e-07, "loss": 0.3931, "step": 1195 }, { "epoch": 0.05778615258249988, "grad_norm": 2.9802744388580322, "learning_rate": 9.422138474175001e-07, "loss": 0.346, "step": 1196 }, { "epoch": 0.05783446876358893, "grad_norm": 2.8130438327789307, "learning_rate": 9.421655312364111e-07, "loss": 0.3559, "step": 1197 }, { "epoch": 0.057882784944677974, "grad_norm": 2.752802610397339, "learning_rate": 9.42117215055322e-07, "loss": 0.2891, "step": 1198 }, { "epoch": 0.05793110112576702, "grad_norm": 7.822057723999023, "learning_rate": 9.420688988742329e-07, "loss": 0.4083, "step": 1199 }, { "epoch": 0.05797941730685607, "grad_norm": 2.752265453338623, "learning_rate": 9.420205826931439e-07, "loss": 0.2987, "step": 1200 }, { "epoch": 0.05802773348794511, "grad_norm": 2.667672872543335, "learning_rate": 9.419722665120548e-07, "loss": 0.3056, "step": 1201 }, { "epoch": 0.05807604966903416, "grad_norm": 6.462130546569824, "learning_rate": 9.419239503309658e-07, "loss": 0.3914, "step": 1202 }, { "epoch": 0.058124365850123205, "grad_norm": 2.7368931770324707, "learning_rate": 9.418756341498767e-07, "loss": 0.3349, "step": 1203 }, { "epoch": 0.05817268203121225, "grad_norm": 3.81895112991333, "learning_rate": 9.418273179687877e-07, "loss": 0.4829, "step": 1204 }, { "epoch": 0.0582209982123013, "grad_norm": 3.494393825531006, "learning_rate": 9.417790017876987e-07, "loss": 0.3717, "step": 1205 }, { "epoch": 0.058269314393390344, "grad_norm": 2.9009852409362793, "learning_rate": 9.417306856066096e-07, "loss": 0.4054, "step": 1206 }, { "epoch": 0.05831763057447939, "grad_norm": 3.2101852893829346, "learning_rate": 9.416823694255206e-07, "loss": 0.1678, "step": 1207 }, { "epoch": 0.05836594675556844, "grad_norm": 4.4363298416137695, "learning_rate": 9.416340532444316e-07, "loss": 0.336, "step": 1208 }, { "epoch": 0.05841426293665749, "grad_norm": 2.4030871391296387, "learning_rate": 9.415857370633424e-07, "loss": 0.273, "step": 1209 }, { "epoch": 0.058462579117746535, "grad_norm": 5.025668621063232, "learning_rate": 9.415374208822534e-07, "loss": 0.2076, "step": 1210 }, { "epoch": 0.05851089529883558, "grad_norm": 3.193755865097046, "learning_rate": 9.414891047011644e-07, "loss": 0.1948, "step": 1211 }, { "epoch": 0.05855921147992463, "grad_norm": 2.9522087574005127, "learning_rate": 9.414407885200753e-07, "loss": 0.4003, "step": 1212 }, { "epoch": 0.058607527661013674, "grad_norm": 7.202801704406738, "learning_rate": 9.413924723389863e-07, "loss": 0.1588, "step": 1213 }, { "epoch": 0.05865584384210272, "grad_norm": 2.844188690185547, "learning_rate": 9.413441561578972e-07, "loss": 0.3848, "step": 1214 }, { "epoch": 0.058704160023191766, "grad_norm": 2.1544911861419678, "learning_rate": 9.412958399768082e-07, "loss": 0.256, "step": 1215 }, { "epoch": 0.05875247620428081, "grad_norm": 2.6313631534576416, "learning_rate": 9.412475237957192e-07, "loss": 0.362, "step": 1216 }, { "epoch": 0.05880079238536986, "grad_norm": 3.599205493927002, "learning_rate": 9.411992076146302e-07, "loss": 0.4299, "step": 1217 }, { "epoch": 0.058849108566458905, "grad_norm": 6.91365385055542, "learning_rate": 9.411508914335411e-07, "loss": 0.2288, "step": 1218 }, { "epoch": 0.05889742474754795, "grad_norm": 2.7104249000549316, "learning_rate": 9.411025752524519e-07, "loss": 0.3766, "step": 1219 }, { "epoch": 0.058945740928637, "grad_norm": 1.51010262966156, "learning_rate": 9.410542590713629e-07, "loss": 0.16, "step": 1220 }, { "epoch": 0.05899405710972605, "grad_norm": 2.712074041366577, "learning_rate": 9.410059428902739e-07, "loss": 0.3117, "step": 1221 }, { "epoch": 0.059042373290815096, "grad_norm": 4.812071323394775, "learning_rate": 9.409576267091849e-07, "loss": 0.359, "step": 1222 }, { "epoch": 0.05909068947190414, "grad_norm": 3.1415717601776123, "learning_rate": 9.409093105280959e-07, "loss": 0.3255, "step": 1223 }, { "epoch": 0.05913900565299319, "grad_norm": 3.01115345954895, "learning_rate": 9.408609943470068e-07, "loss": 0.3661, "step": 1224 }, { "epoch": 0.059187321834082235, "grad_norm": 2.450929641723633, "learning_rate": 9.408126781659177e-07, "loss": 0.2337, "step": 1225 }, { "epoch": 0.05923563801517128, "grad_norm": 5.283107280731201, "learning_rate": 9.407643619848286e-07, "loss": 0.322, "step": 1226 }, { "epoch": 0.05928395419626033, "grad_norm": 2.53908109664917, "learning_rate": 9.407160458037396e-07, "loss": 0.2724, "step": 1227 }, { "epoch": 0.05933227037734937, "grad_norm": 14.359009742736816, "learning_rate": 9.406677296226506e-07, "loss": 0.2626, "step": 1228 }, { "epoch": 0.05938058655843842, "grad_norm": 1.7916349172592163, "learning_rate": 9.406194134415615e-07, "loss": 0.1853, "step": 1229 }, { "epoch": 0.059428902739527466, "grad_norm": 5.014812469482422, "learning_rate": 9.405710972604725e-07, "loss": 0.4408, "step": 1230 }, { "epoch": 0.05947721892061651, "grad_norm": 2.893869638442993, "learning_rate": 9.405227810793834e-07, "loss": 0.467, "step": 1231 }, { "epoch": 0.05952553510170556, "grad_norm": 1.968237042427063, "learning_rate": 9.404744648982944e-07, "loss": 0.228, "step": 1232 }, { "epoch": 0.05957385128279461, "grad_norm": 2.4653475284576416, "learning_rate": 9.404261487172054e-07, "loss": 0.2535, "step": 1233 }, { "epoch": 0.05962216746388366, "grad_norm": 7.227341651916504, "learning_rate": 9.403778325361164e-07, "loss": 0.3363, "step": 1234 }, { "epoch": 0.059670483644972704, "grad_norm": 2.239016056060791, "learning_rate": 9.403295163550272e-07, "loss": 0.2805, "step": 1235 }, { "epoch": 0.05971879982606175, "grad_norm": 2.5013227462768555, "learning_rate": 9.402812001739382e-07, "loss": 0.2165, "step": 1236 }, { "epoch": 0.059767116007150796, "grad_norm": 4.034952163696289, "learning_rate": 9.402328839928492e-07, "loss": 0.2601, "step": 1237 }, { "epoch": 0.05981543218823984, "grad_norm": 6.271286487579346, "learning_rate": 9.401845678117601e-07, "loss": 0.3222, "step": 1238 }, { "epoch": 0.05986374836932889, "grad_norm": 3.9782462120056152, "learning_rate": 9.401362516306711e-07, "loss": 0.4299, "step": 1239 }, { "epoch": 0.059912064550417934, "grad_norm": 2.456904172897339, "learning_rate": 9.40087935449582e-07, "loss": 0.3489, "step": 1240 }, { "epoch": 0.05996038073150698, "grad_norm": 6.693804740905762, "learning_rate": 9.40039619268493e-07, "loss": 0.5022, "step": 1241 }, { "epoch": 0.06000869691259603, "grad_norm": 3.5928916931152344, "learning_rate": 9.39991303087404e-07, "loss": 0.5065, "step": 1242 }, { "epoch": 0.06005701309368507, "grad_norm": 3.265770196914673, "learning_rate": 9.39942986906315e-07, "loss": 0.3454, "step": 1243 }, { "epoch": 0.06010532927477412, "grad_norm": 3.601248264312744, "learning_rate": 9.398946707252258e-07, "loss": 0.4163, "step": 1244 }, { "epoch": 0.060153645455863165, "grad_norm": 12.607609748840332, "learning_rate": 9.398463545441367e-07, "loss": 0.3953, "step": 1245 }, { "epoch": 0.06020196163695222, "grad_norm": 2.1692004203796387, "learning_rate": 9.397980383630477e-07, "loss": 0.2982, "step": 1246 }, { "epoch": 0.060250277818041265, "grad_norm": 2.958791732788086, "learning_rate": 9.397497221819587e-07, "loss": 0.3424, "step": 1247 }, { "epoch": 0.06029859399913031, "grad_norm": 2.398871898651123, "learning_rate": 9.397014060008697e-07, "loss": 0.2997, "step": 1248 }, { "epoch": 0.06034691018021936, "grad_norm": 2.4467031955718994, "learning_rate": 9.396530898197807e-07, "loss": 0.2301, "step": 1249 }, { "epoch": 0.0603952263613084, "grad_norm": 2.1186108589172363, "learning_rate": 9.396047736386915e-07, "loss": 0.2204, "step": 1250 }, { "epoch": 0.06044354254239745, "grad_norm": 2.3549633026123047, "learning_rate": 9.395564574576024e-07, "loss": 0.2894, "step": 1251 }, { "epoch": 0.060491858723486495, "grad_norm": 2.902719259262085, "learning_rate": 9.395081412765134e-07, "loss": 0.1913, "step": 1252 }, { "epoch": 0.06054017490457554, "grad_norm": 4.401188373565674, "learning_rate": 9.394598250954244e-07, "loss": 0.4054, "step": 1253 }, { "epoch": 0.06058849108566459, "grad_norm": 2.7541754245758057, "learning_rate": 9.394115089143354e-07, "loss": 0.339, "step": 1254 }, { "epoch": 0.060636807266753634, "grad_norm": 2.9627158641815186, "learning_rate": 9.393631927332463e-07, "loss": 0.2557, "step": 1255 }, { "epoch": 0.06068512344784268, "grad_norm": 2.0703163146972656, "learning_rate": 9.393148765521573e-07, "loss": 0.1945, "step": 1256 }, { "epoch": 0.060733439628931726, "grad_norm": 1.9583301544189453, "learning_rate": 9.392665603710682e-07, "loss": 0.2464, "step": 1257 }, { "epoch": 0.06078175581002077, "grad_norm": 2.5779428482055664, "learning_rate": 9.392182441899792e-07, "loss": 0.3547, "step": 1258 }, { "epoch": 0.060830071991109826, "grad_norm": 3.8244524002075195, "learning_rate": 9.391699280088902e-07, "loss": 0.3944, "step": 1259 }, { "epoch": 0.06087838817219887, "grad_norm": 3.478963613510132, "learning_rate": 9.391216118278011e-07, "loss": 0.3112, "step": 1260 }, { "epoch": 0.06092670435328792, "grad_norm": 2.8117685317993164, "learning_rate": 9.39073295646712e-07, "loss": 0.3637, "step": 1261 }, { "epoch": 0.060975020534376964, "grad_norm": 6.86290168762207, "learning_rate": 9.39024979465623e-07, "loss": 0.2679, "step": 1262 }, { "epoch": 0.06102333671546601, "grad_norm": 2.841940402984619, "learning_rate": 9.389766632845339e-07, "loss": 0.3628, "step": 1263 }, { "epoch": 0.061071652896555056, "grad_norm": 2.2910869121551514, "learning_rate": 9.389283471034449e-07, "loss": 0.2777, "step": 1264 }, { "epoch": 0.0611199690776441, "grad_norm": 2.9550833702087402, "learning_rate": 9.388800309223559e-07, "loss": 0.3288, "step": 1265 }, { "epoch": 0.06116828525873315, "grad_norm": 3.4711737632751465, "learning_rate": 9.388317147412668e-07, "loss": 0.2782, "step": 1266 }, { "epoch": 0.061216601439822195, "grad_norm": 2.827951431274414, "learning_rate": 9.387833985601778e-07, "loss": 0.2973, "step": 1267 }, { "epoch": 0.06126491762091124, "grad_norm": 5.238794326782227, "learning_rate": 9.387350823790887e-07, "loss": 0.3046, "step": 1268 }, { "epoch": 0.06131323380200029, "grad_norm": 3.224486827850342, "learning_rate": 9.386867661979997e-07, "loss": 0.3296, "step": 1269 }, { "epoch": 0.061361549983089334, "grad_norm": 2.7170403003692627, "learning_rate": 9.386384500169106e-07, "loss": 0.3361, "step": 1270 }, { "epoch": 0.06140986616417839, "grad_norm": 2.641993284225464, "learning_rate": 9.385901338358215e-07, "loss": 0.3169, "step": 1271 }, { "epoch": 0.06145818234526743, "grad_norm": 3.3477442264556885, "learning_rate": 9.385418176547325e-07, "loss": 0.3493, "step": 1272 }, { "epoch": 0.06150649852635648, "grad_norm": 3.3549883365631104, "learning_rate": 9.384935014736435e-07, "loss": 0.3731, "step": 1273 }, { "epoch": 0.061554814707445525, "grad_norm": 20.345191955566406, "learning_rate": 9.384451852925545e-07, "loss": 0.3116, "step": 1274 }, { "epoch": 0.06160313088853457, "grad_norm": 11.227898597717285, "learning_rate": 9.383968691114655e-07, "loss": 0.2863, "step": 1275 }, { "epoch": 0.06165144706962362, "grad_norm": 4.7221455574035645, "learning_rate": 9.383485529303762e-07, "loss": 0.5616, "step": 1276 }, { "epoch": 0.061699763250712664, "grad_norm": 4.143837928771973, "learning_rate": 9.383002367492872e-07, "loss": 0.3742, "step": 1277 }, { "epoch": 0.06174807943180171, "grad_norm": 2.5341949462890625, "learning_rate": 9.382519205681982e-07, "loss": 0.3492, "step": 1278 }, { "epoch": 0.061796395612890756, "grad_norm": 2.855238437652588, "learning_rate": 9.382036043871092e-07, "loss": 0.3183, "step": 1279 }, { "epoch": 0.0618447117939798, "grad_norm": 2.496300220489502, "learning_rate": 9.381552882060202e-07, "loss": 0.2325, "step": 1280 }, { "epoch": 0.06189302797506885, "grad_norm": 1.7223347425460815, "learning_rate": 9.381069720249311e-07, "loss": 0.1721, "step": 1281 }, { "epoch": 0.061941344156157895, "grad_norm": 3.6020314693450928, "learning_rate": 9.38058655843842e-07, "loss": 0.4077, "step": 1282 }, { "epoch": 0.06198966033724694, "grad_norm": 2.065035343170166, "learning_rate": 9.38010339662753e-07, "loss": 0.226, "step": 1283 }, { "epoch": 0.062037976518335994, "grad_norm": 2.671407461166382, "learning_rate": 9.37962023481664e-07, "loss": 0.3105, "step": 1284 }, { "epoch": 0.06208629269942504, "grad_norm": 2.7462656497955322, "learning_rate": 9.37913707300575e-07, "loss": 0.3499, "step": 1285 }, { "epoch": 0.062134608880514086, "grad_norm": 3.181220769882202, "learning_rate": 9.378653911194859e-07, "loss": 0.3946, "step": 1286 }, { "epoch": 0.06218292506160313, "grad_norm": 3.0272538661956787, "learning_rate": 9.378170749383968e-07, "loss": 0.3483, "step": 1287 }, { "epoch": 0.06223124124269218, "grad_norm": 3.4753153324127197, "learning_rate": 9.377687587573078e-07, "loss": 0.3774, "step": 1288 }, { "epoch": 0.062279557423781225, "grad_norm": 2.3780977725982666, "learning_rate": 9.377204425762187e-07, "loss": 0.276, "step": 1289 }, { "epoch": 0.06232787360487027, "grad_norm": 6.864815711975098, "learning_rate": 9.376721263951297e-07, "loss": 0.3139, "step": 1290 }, { "epoch": 0.06237618978595932, "grad_norm": 2.1514830589294434, "learning_rate": 9.376238102140407e-07, "loss": 0.2372, "step": 1291 }, { "epoch": 0.06242450596704836, "grad_norm": 2.3259973526000977, "learning_rate": 9.375754940329516e-07, "loss": 0.2598, "step": 1292 }, { "epoch": 0.06247282214813741, "grad_norm": 2.9023706912994385, "learning_rate": 9.375271778518626e-07, "loss": 0.3967, "step": 1293 }, { "epoch": 0.06252113832922646, "grad_norm": 7.9026336669921875, "learning_rate": 9.374788616707735e-07, "loss": 0.3465, "step": 1294 }, { "epoch": 0.06256945451031551, "grad_norm": 3.9711010456085205, "learning_rate": 9.374305454896844e-07, "loss": 0.3857, "step": 1295 }, { "epoch": 0.06261777069140455, "grad_norm": 2.5417087078094482, "learning_rate": 9.373822293085954e-07, "loss": 0.1943, "step": 1296 }, { "epoch": 0.0626660868724936, "grad_norm": 1.676398754119873, "learning_rate": 9.373339131275063e-07, "loss": 0.1748, "step": 1297 }, { "epoch": 0.06271440305358264, "grad_norm": 3.3916189670562744, "learning_rate": 9.372855969464173e-07, "loss": 0.3721, "step": 1298 }, { "epoch": 0.0627627192346717, "grad_norm": 2.587162971496582, "learning_rate": 9.372372807653283e-07, "loss": 0.2828, "step": 1299 }, { "epoch": 0.06281103541576073, "grad_norm": 2.8293983936309814, "learning_rate": 9.371889645842393e-07, "loss": 0.3627, "step": 1300 }, { "epoch": 0.06285935159684979, "grad_norm": 2.3689169883728027, "learning_rate": 9.371406484031503e-07, "loss": 0.279, "step": 1301 }, { "epoch": 0.06290766777793882, "grad_norm": 2.653564453125, "learning_rate": 9.37092332222061e-07, "loss": 0.1913, "step": 1302 }, { "epoch": 0.06295598395902788, "grad_norm": 2.4146993160247803, "learning_rate": 9.37044016040972e-07, "loss": 0.2921, "step": 1303 }, { "epoch": 0.06300430014011693, "grad_norm": 2.1147022247314453, "learning_rate": 9.36995699859883e-07, "loss": 0.2817, "step": 1304 }, { "epoch": 0.06305261632120597, "grad_norm": 2.7919137477874756, "learning_rate": 9.36947383678794e-07, "loss": 0.3006, "step": 1305 }, { "epoch": 0.06310093250229502, "grad_norm": 2.0724904537200928, "learning_rate": 9.36899067497705e-07, "loss": 0.2639, "step": 1306 }, { "epoch": 0.06314924868338406, "grad_norm": 2.5200839042663574, "learning_rate": 9.368507513166159e-07, "loss": 0.2633, "step": 1307 }, { "epoch": 0.06319756486447312, "grad_norm": 3.2590994834899902, "learning_rate": 9.368024351355268e-07, "loss": 0.3354, "step": 1308 }, { "epoch": 0.06324588104556216, "grad_norm": 1.7462844848632812, "learning_rate": 9.367541189544378e-07, "loss": 0.2162, "step": 1309 }, { "epoch": 0.06329419722665121, "grad_norm": 2.4268882274627686, "learning_rate": 9.367058027733488e-07, "loss": 0.1847, "step": 1310 }, { "epoch": 0.06334251340774025, "grad_norm": 1.8300715684890747, "learning_rate": 9.366574865922597e-07, "loss": 0.1953, "step": 1311 }, { "epoch": 0.0633908295888293, "grad_norm": 2.8305211067199707, "learning_rate": 9.366091704111707e-07, "loss": 0.2911, "step": 1312 }, { "epoch": 0.06343914576991834, "grad_norm": 2.431725025177002, "learning_rate": 9.365608542300816e-07, "loss": 0.3473, "step": 1313 }, { "epoch": 0.06348746195100739, "grad_norm": 3.1657094955444336, "learning_rate": 9.365125380489925e-07, "loss": 0.3122, "step": 1314 }, { "epoch": 0.06353577813209645, "grad_norm": 1.8358983993530273, "learning_rate": 9.364642218679035e-07, "loss": 0.1706, "step": 1315 }, { "epoch": 0.06358409431318549, "grad_norm": 1.6721866130828857, "learning_rate": 9.364159056868145e-07, "loss": 0.1938, "step": 1316 }, { "epoch": 0.06363241049427454, "grad_norm": 2.7614667415618896, "learning_rate": 9.363675895057255e-07, "loss": 0.2903, "step": 1317 }, { "epoch": 0.06368072667536358, "grad_norm": 2.5204741954803467, "learning_rate": 9.363192733246364e-07, "loss": 0.2018, "step": 1318 }, { "epoch": 0.06372904285645263, "grad_norm": 2.3568778038024902, "learning_rate": 9.362709571435473e-07, "loss": 0.3155, "step": 1319 }, { "epoch": 0.06377735903754167, "grad_norm": 5.309528350830078, "learning_rate": 9.362226409624583e-07, "loss": 0.4262, "step": 1320 }, { "epoch": 0.06382567521863072, "grad_norm": 5.477842330932617, "learning_rate": 9.361743247813692e-07, "loss": 0.3626, "step": 1321 }, { "epoch": 0.06387399139971976, "grad_norm": 3.9330828189849854, "learning_rate": 9.361260086002802e-07, "loss": 0.3512, "step": 1322 }, { "epoch": 0.06392230758080882, "grad_norm": 3.6875996589660645, "learning_rate": 9.360776924191911e-07, "loss": 0.4641, "step": 1323 }, { "epoch": 0.06397062376189785, "grad_norm": 2.123924493789673, "learning_rate": 9.360293762381021e-07, "loss": 0.2209, "step": 1324 }, { "epoch": 0.06401893994298691, "grad_norm": 5.103702545166016, "learning_rate": 9.359810600570131e-07, "loss": 0.3951, "step": 1325 }, { "epoch": 0.06406725612407595, "grad_norm": 2.575565814971924, "learning_rate": 9.359327438759241e-07, "loss": 0.3328, "step": 1326 }, { "epoch": 0.064115572305165, "grad_norm": 3.8719685077667236, "learning_rate": 9.35884427694835e-07, "loss": 0.3152, "step": 1327 }, { "epoch": 0.06416388848625405, "grad_norm": 16.616931915283203, "learning_rate": 9.358361115137458e-07, "loss": 0.3414, "step": 1328 }, { "epoch": 0.06421220466734309, "grad_norm": 5.639588356018066, "learning_rate": 9.357877953326568e-07, "loss": 0.3518, "step": 1329 }, { "epoch": 0.06426052084843215, "grad_norm": 1.5435587167739868, "learning_rate": 9.357394791515678e-07, "loss": 0.1754, "step": 1330 }, { "epoch": 0.06430883702952118, "grad_norm": 3.9863712787628174, "learning_rate": 9.356911629704788e-07, "loss": 0.4299, "step": 1331 }, { "epoch": 0.06435715321061024, "grad_norm": 2.7827234268188477, "learning_rate": 9.356428467893898e-07, "loss": 0.3756, "step": 1332 }, { "epoch": 0.06440546939169928, "grad_norm": 2.7788591384887695, "learning_rate": 9.355945306083006e-07, "loss": 0.3938, "step": 1333 }, { "epoch": 0.06445378557278833, "grad_norm": 3.9156930446624756, "learning_rate": 9.355462144272116e-07, "loss": 0.2505, "step": 1334 }, { "epoch": 0.06450210175387737, "grad_norm": 3.6363577842712402, "learning_rate": 9.354978982461226e-07, "loss": 0.3973, "step": 1335 }, { "epoch": 0.06455041793496642, "grad_norm": 3.000303030014038, "learning_rate": 9.354495820650335e-07, "loss": 0.3873, "step": 1336 }, { "epoch": 0.06459873411605546, "grad_norm": 2.7241177558898926, "learning_rate": 9.354012658839445e-07, "loss": 0.3079, "step": 1337 }, { "epoch": 0.06464705029714451, "grad_norm": 1.9628853797912598, "learning_rate": 9.353529497028555e-07, "loss": 0.281, "step": 1338 }, { "epoch": 0.06469536647823355, "grad_norm": 3.6927170753479004, "learning_rate": 9.353046335217664e-07, "loss": 0.3706, "step": 1339 }, { "epoch": 0.06474368265932261, "grad_norm": 2.77606463432312, "learning_rate": 9.352563173406773e-07, "loss": 0.3413, "step": 1340 }, { "epoch": 0.06479199884041166, "grad_norm": 3.0300710201263428, "learning_rate": 9.352080011595883e-07, "loss": 0.3785, "step": 1341 }, { "epoch": 0.0648403150215007, "grad_norm": 4.299362659454346, "learning_rate": 9.351596849784993e-07, "loss": 0.3673, "step": 1342 }, { "epoch": 0.06488863120258975, "grad_norm": 2.7420077323913574, "learning_rate": 9.351113687974103e-07, "loss": 0.4384, "step": 1343 }, { "epoch": 0.06493694738367879, "grad_norm": 4.174781799316406, "learning_rate": 9.350630526163211e-07, "loss": 0.4106, "step": 1344 }, { "epoch": 0.06498526356476785, "grad_norm": 2.5538535118103027, "learning_rate": 9.350147364352321e-07, "loss": 0.2898, "step": 1345 }, { "epoch": 0.06503357974585688, "grad_norm": 4.080040454864502, "learning_rate": 9.34966420254143e-07, "loss": 0.3718, "step": 1346 }, { "epoch": 0.06508189592694594, "grad_norm": 3.992068290710449, "learning_rate": 9.34918104073054e-07, "loss": 0.2563, "step": 1347 }, { "epoch": 0.06513021210803498, "grad_norm": 3.0728437900543213, "learning_rate": 9.34869787891965e-07, "loss": 0.445, "step": 1348 }, { "epoch": 0.06517852828912403, "grad_norm": 2.5156476497650146, "learning_rate": 9.348214717108759e-07, "loss": 0.371, "step": 1349 }, { "epoch": 0.06522684447021307, "grad_norm": 3.0356180667877197, "learning_rate": 9.347731555297869e-07, "loss": 0.2585, "step": 1350 }, { "epoch": 0.06527516065130212, "grad_norm": 4.32282829284668, "learning_rate": 9.347248393486979e-07, "loss": 0.2287, "step": 1351 }, { "epoch": 0.06532347683239116, "grad_norm": 2.575045108795166, "learning_rate": 9.346765231676089e-07, "loss": 0.438, "step": 1352 }, { "epoch": 0.06537179301348021, "grad_norm": 3.3570291996002197, "learning_rate": 9.346282069865197e-07, "loss": 0.5607, "step": 1353 }, { "epoch": 0.06542010919456927, "grad_norm": 3.116194725036621, "learning_rate": 9.345798908054306e-07, "loss": 0.4271, "step": 1354 }, { "epoch": 0.0654684253756583, "grad_norm": 2.551483154296875, "learning_rate": 9.345315746243416e-07, "loss": 0.3229, "step": 1355 }, { "epoch": 0.06551674155674736, "grad_norm": 4.033331394195557, "learning_rate": 9.344832584432526e-07, "loss": 0.3101, "step": 1356 }, { "epoch": 0.0655650577378364, "grad_norm": 1.6534092426300049, "learning_rate": 9.344349422621636e-07, "loss": 0.2312, "step": 1357 }, { "epoch": 0.06561337391892545, "grad_norm": 10.450888633728027, "learning_rate": 9.343866260810746e-07, "loss": 0.3327, "step": 1358 }, { "epoch": 0.06566169010001449, "grad_norm": 3.9249773025512695, "learning_rate": 9.343383098999854e-07, "loss": 0.3496, "step": 1359 }, { "epoch": 0.06571000628110354, "grad_norm": 2.523193359375, "learning_rate": 9.342899937188964e-07, "loss": 0.2762, "step": 1360 }, { "epoch": 0.06575832246219258, "grad_norm": 93.64087677001953, "learning_rate": 9.342416775378073e-07, "loss": 0.3554, "step": 1361 }, { "epoch": 0.06580663864328164, "grad_norm": 2.6537036895751953, "learning_rate": 9.341933613567183e-07, "loss": 0.3214, "step": 1362 }, { "epoch": 0.06585495482437068, "grad_norm": 4.362783432006836, "learning_rate": 9.341450451756293e-07, "loss": 0.3938, "step": 1363 }, { "epoch": 0.06590327100545973, "grad_norm": 2.8332793712615967, "learning_rate": 9.340967289945403e-07, "loss": 0.2695, "step": 1364 }, { "epoch": 0.06595158718654877, "grad_norm": 3.1027867794036865, "learning_rate": 9.340484128134511e-07, "loss": 0.3734, "step": 1365 }, { "epoch": 0.06599990336763782, "grad_norm": 4.842773914337158, "learning_rate": 9.340000966323621e-07, "loss": 0.3461, "step": 1366 }, { "epoch": 0.06604821954872687, "grad_norm": 2.180973768234253, "learning_rate": 9.339517804512731e-07, "loss": 0.2882, "step": 1367 }, { "epoch": 0.06609653572981591, "grad_norm": 3.1583011150360107, "learning_rate": 9.339034642701841e-07, "loss": 0.4506, "step": 1368 }, { "epoch": 0.06614485191090497, "grad_norm": 2.7258388996124268, "learning_rate": 9.338551480890951e-07, "loss": 0.281, "step": 1369 }, { "epoch": 0.066193168091994, "grad_norm": 3.307974338531494, "learning_rate": 9.338068319080059e-07, "loss": 0.3975, "step": 1370 }, { "epoch": 0.06624148427308306, "grad_norm": 2.763746738433838, "learning_rate": 9.337585157269169e-07, "loss": 0.2614, "step": 1371 }, { "epoch": 0.0662898004541721, "grad_norm": 2.9058444499969482, "learning_rate": 9.337101995458278e-07, "loss": 0.361, "step": 1372 }, { "epoch": 0.06633811663526115, "grad_norm": 2.538198471069336, "learning_rate": 9.336618833647388e-07, "loss": 0.3101, "step": 1373 }, { "epoch": 0.06638643281635019, "grad_norm": 3.622342109680176, "learning_rate": 9.336135671836498e-07, "loss": 0.316, "step": 1374 }, { "epoch": 0.06643474899743924, "grad_norm": 3.4613168239593506, "learning_rate": 9.335652510025607e-07, "loss": 0.3796, "step": 1375 }, { "epoch": 0.06648306517852828, "grad_norm": 2.321755886077881, "learning_rate": 9.335169348214717e-07, "loss": 0.297, "step": 1376 }, { "epoch": 0.06653138135961734, "grad_norm": 3.7619519233703613, "learning_rate": 9.334686186403827e-07, "loss": 0.3244, "step": 1377 }, { "epoch": 0.06657969754070638, "grad_norm": 4.107840538024902, "learning_rate": 9.334203024592935e-07, "loss": 0.3247, "step": 1378 }, { "epoch": 0.06662801372179543, "grad_norm": 2.8555285930633545, "learning_rate": 9.333719862782045e-07, "loss": 0.4054, "step": 1379 }, { "epoch": 0.06667632990288448, "grad_norm": 3.118469476699829, "learning_rate": 9.333236700971154e-07, "loss": 0.3343, "step": 1380 }, { "epoch": 0.06672464608397352, "grad_norm": 3.1225616931915283, "learning_rate": 9.332753539160264e-07, "loss": 0.5199, "step": 1381 }, { "epoch": 0.06677296226506257, "grad_norm": 2.878957509994507, "learning_rate": 9.332270377349374e-07, "loss": 0.3809, "step": 1382 }, { "epoch": 0.06682127844615161, "grad_norm": 3.431981086730957, "learning_rate": 9.331787215538484e-07, "loss": 0.3134, "step": 1383 }, { "epoch": 0.06686959462724067, "grad_norm": 12.338619232177734, "learning_rate": 9.331304053727594e-07, "loss": 0.3525, "step": 1384 }, { "epoch": 0.0669179108083297, "grad_norm": 3.0083439350128174, "learning_rate": 9.330820891916702e-07, "loss": 0.2885, "step": 1385 }, { "epoch": 0.06696622698941876, "grad_norm": 2.4110162258148193, "learning_rate": 9.330337730105811e-07, "loss": 0.2694, "step": 1386 }, { "epoch": 0.0670145431705078, "grad_norm": 2.374333381652832, "learning_rate": 9.329854568294921e-07, "loss": 0.3218, "step": 1387 }, { "epoch": 0.06706285935159685, "grad_norm": 2.6977787017822266, "learning_rate": 9.329371406484031e-07, "loss": 0.2318, "step": 1388 }, { "epoch": 0.06711117553268589, "grad_norm": 2.4629178047180176, "learning_rate": 9.328888244673141e-07, "loss": 0.2791, "step": 1389 }, { "epoch": 0.06715949171377494, "grad_norm": 5.305070400238037, "learning_rate": 9.328405082862251e-07, "loss": 0.2228, "step": 1390 }, { "epoch": 0.067207807894864, "grad_norm": 3.3658039569854736, "learning_rate": 9.327921921051359e-07, "loss": 0.3356, "step": 1391 }, { "epoch": 0.06725612407595304, "grad_norm": 2.7772908210754395, "learning_rate": 9.327438759240469e-07, "loss": 0.3909, "step": 1392 }, { "epoch": 0.06730444025704209, "grad_norm": 3.594956398010254, "learning_rate": 9.326955597429579e-07, "loss": 0.2508, "step": 1393 }, { "epoch": 0.06735275643813113, "grad_norm": 4.754339218139648, "learning_rate": 9.326472435618689e-07, "loss": 0.2552, "step": 1394 }, { "epoch": 0.06740107261922018, "grad_norm": 2.793105125427246, "learning_rate": 9.325989273807798e-07, "loss": 0.2409, "step": 1395 }, { "epoch": 0.06744938880030922, "grad_norm": 2.9563655853271484, "learning_rate": 9.325506111996907e-07, "loss": 0.4015, "step": 1396 }, { "epoch": 0.06749770498139827, "grad_norm": 3.1326045989990234, "learning_rate": 9.325022950186017e-07, "loss": 0.3377, "step": 1397 }, { "epoch": 0.06754602116248731, "grad_norm": 2.4881751537323, "learning_rate": 9.324539788375126e-07, "loss": 0.3123, "step": 1398 }, { "epoch": 0.06759433734357637, "grad_norm": 1.7865458726882935, "learning_rate": 9.324056626564236e-07, "loss": 0.2409, "step": 1399 }, { "epoch": 0.0676426535246654, "grad_norm": 2.7411234378814697, "learning_rate": 9.323573464753346e-07, "loss": 0.214, "step": 1400 }, { "epoch": 0.06769096970575446, "grad_norm": 2.106342315673828, "learning_rate": 9.323090302942455e-07, "loss": 0.315, "step": 1401 }, { "epoch": 0.0677392858868435, "grad_norm": 2.6057846546173096, "learning_rate": 9.322607141131565e-07, "loss": 0.3701, "step": 1402 }, { "epoch": 0.06778760206793255, "grad_norm": 2.471220016479492, "learning_rate": 9.322123979320675e-07, "loss": 0.3316, "step": 1403 }, { "epoch": 0.0678359182490216, "grad_norm": 2.2074661254882812, "learning_rate": 9.321640817509783e-07, "loss": 0.3188, "step": 1404 }, { "epoch": 0.06788423443011064, "grad_norm": 2.736025810241699, "learning_rate": 9.321157655698893e-07, "loss": 0.3175, "step": 1405 }, { "epoch": 0.0679325506111997, "grad_norm": 2.649998664855957, "learning_rate": 9.320674493888002e-07, "loss": 0.3289, "step": 1406 }, { "epoch": 0.06798086679228874, "grad_norm": 2.873091220855713, "learning_rate": 9.320191332077112e-07, "loss": 0.3441, "step": 1407 }, { "epoch": 0.06802918297337779, "grad_norm": 4.787432670593262, "learning_rate": 9.319708170266222e-07, "loss": 0.2661, "step": 1408 }, { "epoch": 0.06807749915446683, "grad_norm": 3.8497579097747803, "learning_rate": 9.319225008455332e-07, "loss": 0.4359, "step": 1409 }, { "epoch": 0.06812581533555588, "grad_norm": 2.030745267868042, "learning_rate": 9.318741846644441e-07, "loss": 0.2494, "step": 1410 }, { "epoch": 0.06817413151664492, "grad_norm": 2.1693708896636963, "learning_rate": 9.31825868483355e-07, "loss": 0.1952, "step": 1411 }, { "epoch": 0.06822244769773397, "grad_norm": 2.336282253265381, "learning_rate": 9.317775523022659e-07, "loss": 0.2783, "step": 1412 }, { "epoch": 0.06827076387882301, "grad_norm": 6.650197505950928, "learning_rate": 9.317292361211769e-07, "loss": 0.4882, "step": 1413 }, { "epoch": 0.06831908005991207, "grad_norm": 4.534753799438477, "learning_rate": 9.316809199400879e-07, "loss": 0.1629, "step": 1414 }, { "epoch": 0.0683673962410011, "grad_norm": 3.115842819213867, "learning_rate": 9.316326037589989e-07, "loss": 0.1993, "step": 1415 }, { "epoch": 0.06841571242209016, "grad_norm": 2.38474702835083, "learning_rate": 9.315842875779099e-07, "loss": 0.3078, "step": 1416 }, { "epoch": 0.06846402860317921, "grad_norm": 4.295228481292725, "learning_rate": 9.315359713968207e-07, "loss": 0.3225, "step": 1417 }, { "epoch": 0.06851234478426825, "grad_norm": 2.6484057903289795, "learning_rate": 9.314876552157317e-07, "loss": 0.2728, "step": 1418 }, { "epoch": 0.0685606609653573, "grad_norm": 2.7159855365753174, "learning_rate": 9.314393390346427e-07, "loss": 0.1745, "step": 1419 }, { "epoch": 0.06860897714644634, "grad_norm": 2.1028800010681152, "learning_rate": 9.313910228535537e-07, "loss": 0.2486, "step": 1420 }, { "epoch": 0.0686572933275354, "grad_norm": 2.8557140827178955, "learning_rate": 9.313427066724646e-07, "loss": 0.2646, "step": 1421 }, { "epoch": 0.06870560950862444, "grad_norm": 3.2633259296417236, "learning_rate": 9.312943904913755e-07, "loss": 0.2738, "step": 1422 }, { "epoch": 0.06875392568971349, "grad_norm": 6.375986576080322, "learning_rate": 9.312460743102864e-07, "loss": 0.3171, "step": 1423 }, { "epoch": 0.06880224187080253, "grad_norm": 3.075988292694092, "learning_rate": 9.311977581291974e-07, "loss": 0.2449, "step": 1424 }, { "epoch": 0.06885055805189158, "grad_norm": 2.387127161026001, "learning_rate": 9.311494419481084e-07, "loss": 0.2906, "step": 1425 }, { "epoch": 0.06889887423298062, "grad_norm": 1.366134762763977, "learning_rate": 9.311011257670194e-07, "loss": 0.131, "step": 1426 }, { "epoch": 0.06894719041406967, "grad_norm": 3.0632238388061523, "learning_rate": 9.310528095859303e-07, "loss": 0.4051, "step": 1427 }, { "epoch": 0.06899550659515871, "grad_norm": 2.14186692237854, "learning_rate": 9.310044934048413e-07, "loss": 0.2437, "step": 1428 }, { "epoch": 0.06904382277624777, "grad_norm": 2.456123113632202, "learning_rate": 9.309561772237522e-07, "loss": 0.2463, "step": 1429 }, { "epoch": 0.06909213895733682, "grad_norm": 18.561872482299805, "learning_rate": 9.309078610426631e-07, "loss": 0.3385, "step": 1430 }, { "epoch": 0.06914045513842586, "grad_norm": 2.29241943359375, "learning_rate": 9.308595448615741e-07, "loss": 0.2965, "step": 1431 }, { "epoch": 0.06918877131951491, "grad_norm": 2.5048129558563232, "learning_rate": 9.30811228680485e-07, "loss": 0.2728, "step": 1432 }, { "epoch": 0.06923708750060395, "grad_norm": 3.247192144393921, "learning_rate": 9.30762912499396e-07, "loss": 0.4114, "step": 1433 }, { "epoch": 0.069285403681693, "grad_norm": 3.554046154022217, "learning_rate": 9.30714596318307e-07, "loss": 0.2662, "step": 1434 }, { "epoch": 0.06933371986278204, "grad_norm": 4.496579647064209, "learning_rate": 9.30666280137218e-07, "loss": 0.3224, "step": 1435 }, { "epoch": 0.0693820360438711, "grad_norm": 1.4256700277328491, "learning_rate": 9.306179639561289e-07, "loss": 0.1318, "step": 1436 }, { "epoch": 0.06943035222496013, "grad_norm": 4.415971755981445, "learning_rate": 9.305696477750397e-07, "loss": 0.3655, "step": 1437 }, { "epoch": 0.06947866840604919, "grad_norm": 2.2253286838531494, "learning_rate": 9.305213315939507e-07, "loss": 0.3013, "step": 1438 }, { "epoch": 0.06952698458713823, "grad_norm": 2.765554904937744, "learning_rate": 9.304730154128617e-07, "loss": 0.382, "step": 1439 }, { "epoch": 0.06957530076822728, "grad_norm": 2.7940404415130615, "learning_rate": 9.304246992317727e-07, "loss": 0.3045, "step": 1440 }, { "epoch": 0.06962361694931632, "grad_norm": 2.8914661407470703, "learning_rate": 9.303763830506837e-07, "loss": 0.4218, "step": 1441 }, { "epoch": 0.06967193313040537, "grad_norm": 1.5372263193130493, "learning_rate": 9.303280668695945e-07, "loss": 0.173, "step": 1442 }, { "epoch": 0.06972024931149443, "grad_norm": 2.246347427368164, "learning_rate": 9.302797506885055e-07, "loss": 0.2496, "step": 1443 }, { "epoch": 0.06976856549258346, "grad_norm": 3.0922961235046387, "learning_rate": 9.302314345074165e-07, "loss": 0.3954, "step": 1444 }, { "epoch": 0.06981688167367252, "grad_norm": 2.402269124984741, "learning_rate": 9.301831183263275e-07, "loss": 0.2848, "step": 1445 }, { "epoch": 0.06986519785476156, "grad_norm": 2.587334156036377, "learning_rate": 9.301348021452384e-07, "loss": 0.3278, "step": 1446 }, { "epoch": 0.06991351403585061, "grad_norm": 2.3166074752807617, "learning_rate": 9.300864859641494e-07, "loss": 0.3049, "step": 1447 }, { "epoch": 0.06996183021693965, "grad_norm": 3.503302574157715, "learning_rate": 9.300381697830603e-07, "loss": 0.3774, "step": 1448 }, { "epoch": 0.0700101463980287, "grad_norm": 3.608253002166748, "learning_rate": 9.299898536019712e-07, "loss": 0.3481, "step": 1449 }, { "epoch": 0.07005846257911774, "grad_norm": 3.784707546234131, "learning_rate": 9.299415374208822e-07, "loss": 0.352, "step": 1450 }, { "epoch": 0.0701067787602068, "grad_norm": 2.9013683795928955, "learning_rate": 9.298932212397932e-07, "loss": 0.439, "step": 1451 }, { "epoch": 0.07015509494129583, "grad_norm": 2.557074546813965, "learning_rate": 9.298449050587042e-07, "loss": 0.2782, "step": 1452 }, { "epoch": 0.07020341112238489, "grad_norm": 2.849531650543213, "learning_rate": 9.297965888776151e-07, "loss": 0.3355, "step": 1453 }, { "epoch": 0.07025172730347394, "grad_norm": 2.197490692138672, "learning_rate": 9.29748272696526e-07, "loss": 0.2606, "step": 1454 }, { "epoch": 0.07030004348456298, "grad_norm": 2.6356565952301025, "learning_rate": 9.296999565154369e-07, "loss": 0.2655, "step": 1455 }, { "epoch": 0.07034835966565203, "grad_norm": 2.3702571392059326, "learning_rate": 9.296516403343479e-07, "loss": 0.2828, "step": 1456 }, { "epoch": 0.07039667584674107, "grad_norm": 3.0964362621307373, "learning_rate": 9.296033241532589e-07, "loss": 0.2851, "step": 1457 }, { "epoch": 0.07044499202783013, "grad_norm": 9.190333366394043, "learning_rate": 9.295550079721698e-07, "loss": 0.647, "step": 1458 }, { "epoch": 0.07049330820891916, "grad_norm": 2.5452685356140137, "learning_rate": 9.295066917910808e-07, "loss": 0.3192, "step": 1459 }, { "epoch": 0.07054162439000822, "grad_norm": 3.377737045288086, "learning_rate": 9.294583756099918e-07, "loss": 0.3056, "step": 1460 }, { "epoch": 0.07058994057109726, "grad_norm": 1.3501533269882202, "learning_rate": 9.294100594289028e-07, "loss": 0.1444, "step": 1461 }, { "epoch": 0.07063825675218631, "grad_norm": 3.329496145248413, "learning_rate": 9.293617432478137e-07, "loss": 0.4668, "step": 1462 }, { "epoch": 0.07068657293327535, "grad_norm": 6.630670070648193, "learning_rate": 9.293134270667245e-07, "loss": 0.2837, "step": 1463 }, { "epoch": 0.0707348891143644, "grad_norm": 2.291301727294922, "learning_rate": 9.292651108856355e-07, "loss": 0.4047, "step": 1464 }, { "epoch": 0.07078320529545344, "grad_norm": 5.201442718505859, "learning_rate": 9.292167947045465e-07, "loss": 0.6123, "step": 1465 }, { "epoch": 0.0708315214765425, "grad_norm": 3.3825085163116455, "learning_rate": 9.291684785234575e-07, "loss": 0.4152, "step": 1466 }, { "epoch": 0.07087983765763155, "grad_norm": 3.201232671737671, "learning_rate": 9.291201623423685e-07, "loss": 0.5045, "step": 1467 }, { "epoch": 0.07092815383872059, "grad_norm": 1.867688536643982, "learning_rate": 9.290718461612793e-07, "loss": 0.1963, "step": 1468 }, { "epoch": 0.07097647001980964, "grad_norm": 2.4757635593414307, "learning_rate": 9.290235299801903e-07, "loss": 0.2784, "step": 1469 }, { "epoch": 0.07102478620089868, "grad_norm": 3.2434709072113037, "learning_rate": 9.289752137991013e-07, "loss": 0.4082, "step": 1470 }, { "epoch": 0.07107310238198773, "grad_norm": 2.5629630088806152, "learning_rate": 9.289268976180122e-07, "loss": 0.33, "step": 1471 }, { "epoch": 0.07112141856307677, "grad_norm": 4.160181045532227, "learning_rate": 9.288785814369232e-07, "loss": 0.3824, "step": 1472 }, { "epoch": 0.07116973474416582, "grad_norm": 3.0022802352905273, "learning_rate": 9.288302652558342e-07, "loss": 0.2982, "step": 1473 }, { "epoch": 0.07121805092525486, "grad_norm": 3.1693644523620605, "learning_rate": 9.28781949074745e-07, "loss": 0.3545, "step": 1474 }, { "epoch": 0.07126636710634392, "grad_norm": 2.595210075378418, "learning_rate": 9.28733632893656e-07, "loss": 0.3072, "step": 1475 }, { "epoch": 0.07131468328743296, "grad_norm": 2.4782955646514893, "learning_rate": 9.28685316712567e-07, "loss": 0.3103, "step": 1476 }, { "epoch": 0.07136299946852201, "grad_norm": 4.007885456085205, "learning_rate": 9.28637000531478e-07, "loss": 0.3063, "step": 1477 }, { "epoch": 0.07141131564961105, "grad_norm": 2.1829702854156494, "learning_rate": 9.28588684350389e-07, "loss": 0.2226, "step": 1478 }, { "epoch": 0.0714596318307001, "grad_norm": 3.951249837875366, "learning_rate": 9.285403681692999e-07, "loss": 0.3274, "step": 1479 }, { "epoch": 0.07150794801178915, "grad_norm": 2.692958354949951, "learning_rate": 9.284920519882108e-07, "loss": 0.2874, "step": 1480 }, { "epoch": 0.0715562641928782, "grad_norm": 2.6857237815856934, "learning_rate": 9.284437358071217e-07, "loss": 0.3212, "step": 1481 }, { "epoch": 0.07160458037396725, "grad_norm": 7.047379970550537, "learning_rate": 9.283954196260327e-07, "loss": 0.4403, "step": 1482 }, { "epoch": 0.07165289655505629, "grad_norm": 2.3326098918914795, "learning_rate": 9.283471034449437e-07, "loss": 0.2755, "step": 1483 }, { "epoch": 0.07170121273614534, "grad_norm": 4.174232006072998, "learning_rate": 9.282987872638546e-07, "loss": 0.3985, "step": 1484 }, { "epoch": 0.07174952891723438, "grad_norm": 2.681654930114746, "learning_rate": 9.282504710827656e-07, "loss": 0.268, "step": 1485 }, { "epoch": 0.07179784509832343, "grad_norm": 2.069392681121826, "learning_rate": 9.282021549016766e-07, "loss": 0.272, "step": 1486 }, { "epoch": 0.07184616127941247, "grad_norm": 2.4200687408447266, "learning_rate": 9.281538387205875e-07, "loss": 0.294, "step": 1487 }, { "epoch": 0.07189447746050152, "grad_norm": 3.5111498832702637, "learning_rate": 9.281055225394984e-07, "loss": 0.2226, "step": 1488 }, { "epoch": 0.07194279364159056, "grad_norm": 2.03844952583313, "learning_rate": 9.280572063584093e-07, "loss": 0.2173, "step": 1489 }, { "epoch": 0.07199110982267962, "grad_norm": 7.493427753448486, "learning_rate": 9.280088901773203e-07, "loss": 0.4204, "step": 1490 }, { "epoch": 0.07203942600376866, "grad_norm": 2.6796228885650635, "learning_rate": 9.279605739962313e-07, "loss": 0.3401, "step": 1491 }, { "epoch": 0.07208774218485771, "grad_norm": 1.9324405193328857, "learning_rate": 9.279122578151423e-07, "loss": 0.2095, "step": 1492 }, { "epoch": 0.07213605836594676, "grad_norm": 2.3077211380004883, "learning_rate": 9.278639416340533e-07, "loss": 0.2345, "step": 1493 }, { "epoch": 0.0721843745470358, "grad_norm": 3.5282578468322754, "learning_rate": 9.278156254529641e-07, "loss": 0.44, "step": 1494 }, { "epoch": 0.07223269072812485, "grad_norm": 2.6330435276031494, "learning_rate": 9.277673092718751e-07, "loss": 0.3405, "step": 1495 }, { "epoch": 0.0722810069092139, "grad_norm": 3.577789068222046, "learning_rate": 9.27718993090786e-07, "loss": 0.349, "step": 1496 }, { "epoch": 0.07232932309030295, "grad_norm": 2.335352897644043, "learning_rate": 9.27670676909697e-07, "loss": 0.3123, "step": 1497 }, { "epoch": 0.07237763927139199, "grad_norm": 2.5513720512390137, "learning_rate": 9.27622360728608e-07, "loss": 0.3718, "step": 1498 }, { "epoch": 0.07242595545248104, "grad_norm": 3.166050910949707, "learning_rate": 9.27574044547519e-07, "loss": 0.4122, "step": 1499 }, { "epoch": 0.07247427163357008, "grad_norm": 2.3469278812408447, "learning_rate": 9.275257283664298e-07, "loss": 0.3424, "step": 1500 }, { "epoch": 0.07252258781465913, "grad_norm": 2.0947930812835693, "learning_rate": 9.274774121853408e-07, "loss": 0.2102, "step": 1501 }, { "epoch": 0.07257090399574817, "grad_norm": 2.733910322189331, "learning_rate": 9.274290960042518e-07, "loss": 0.3164, "step": 1502 }, { "epoch": 0.07261922017683722, "grad_norm": 2.471822500228882, "learning_rate": 9.273807798231628e-07, "loss": 0.2075, "step": 1503 }, { "epoch": 0.07266753635792626, "grad_norm": 4.362372875213623, "learning_rate": 9.273324636420738e-07, "loss": 0.3456, "step": 1504 }, { "epoch": 0.07271585253901532, "grad_norm": 2.2552645206451416, "learning_rate": 9.272841474609846e-07, "loss": 0.2802, "step": 1505 }, { "epoch": 0.07276416872010437, "grad_norm": 4.046197891235352, "learning_rate": 9.272358312798955e-07, "loss": 0.3349, "step": 1506 }, { "epoch": 0.07281248490119341, "grad_norm": 2.719560146331787, "learning_rate": 9.271875150988065e-07, "loss": 0.2639, "step": 1507 }, { "epoch": 0.07286080108228246, "grad_norm": 3.060133695602417, "learning_rate": 9.271391989177175e-07, "loss": 0.3394, "step": 1508 }, { "epoch": 0.0729091172633715, "grad_norm": 2.4415743350982666, "learning_rate": 9.270908827366285e-07, "loss": 0.2744, "step": 1509 }, { "epoch": 0.07295743344446055, "grad_norm": 1.6933335065841675, "learning_rate": 9.270425665555394e-07, "loss": 0.2005, "step": 1510 }, { "epoch": 0.07300574962554959, "grad_norm": 1.9204075336456299, "learning_rate": 9.269942503744504e-07, "loss": 0.2332, "step": 1511 }, { "epoch": 0.07305406580663865, "grad_norm": 15.400178909301758, "learning_rate": 9.269459341933614e-07, "loss": 0.2967, "step": 1512 }, { "epoch": 0.07310238198772769, "grad_norm": 2.665790557861328, "learning_rate": 9.268976180122722e-07, "loss": 0.3074, "step": 1513 }, { "epoch": 0.07315069816881674, "grad_norm": 4.627558708190918, "learning_rate": 9.268493018311832e-07, "loss": 0.3132, "step": 1514 }, { "epoch": 0.07319901434990578, "grad_norm": 4.262248992919922, "learning_rate": 9.268009856500941e-07, "loss": 0.3645, "step": 1515 }, { "epoch": 0.07324733053099483, "grad_norm": 1.7012054920196533, "learning_rate": 9.267526694690051e-07, "loss": 0.2585, "step": 1516 }, { "epoch": 0.07329564671208387, "grad_norm": 4.117474555969238, "learning_rate": 9.267043532879161e-07, "loss": 0.3408, "step": 1517 }, { "epoch": 0.07334396289317292, "grad_norm": 3.7790989875793457, "learning_rate": 9.266560371068271e-07, "loss": 0.2794, "step": 1518 }, { "epoch": 0.07339227907426198, "grad_norm": 9.84658432006836, "learning_rate": 9.26607720925738e-07, "loss": 0.2659, "step": 1519 }, { "epoch": 0.07344059525535102, "grad_norm": 2.3518755435943604, "learning_rate": 9.265594047446489e-07, "loss": 0.3085, "step": 1520 }, { "epoch": 0.07348891143644007, "grad_norm": 4.523443698883057, "learning_rate": 9.265110885635599e-07, "loss": 0.3789, "step": 1521 }, { "epoch": 0.07353722761752911, "grad_norm": 3.115950345993042, "learning_rate": 9.264627723824708e-07, "loss": 0.3119, "step": 1522 }, { "epoch": 0.07358554379861816, "grad_norm": 30.387483596801758, "learning_rate": 9.264144562013818e-07, "loss": 0.4026, "step": 1523 }, { "epoch": 0.0736338599797072, "grad_norm": 3.1569173336029053, "learning_rate": 9.263661400202928e-07, "loss": 0.4112, "step": 1524 }, { "epoch": 0.07368217616079625, "grad_norm": 3.1076998710632324, "learning_rate": 9.263178238392038e-07, "loss": 0.4778, "step": 1525 }, { "epoch": 0.07373049234188529, "grad_norm": 2.874100685119629, "learning_rate": 9.262695076581146e-07, "loss": 0.1723, "step": 1526 }, { "epoch": 0.07377880852297435, "grad_norm": 3.2055981159210205, "learning_rate": 9.262211914770256e-07, "loss": 0.3483, "step": 1527 }, { "epoch": 0.07382712470406338, "grad_norm": 3.0432159900665283, "learning_rate": 9.261728752959366e-07, "loss": 0.3637, "step": 1528 }, { "epoch": 0.07387544088515244, "grad_norm": 4.023904323577881, "learning_rate": 9.261245591148476e-07, "loss": 0.4211, "step": 1529 }, { "epoch": 0.07392375706624149, "grad_norm": 3.0732502937316895, "learning_rate": 9.260762429337586e-07, "loss": 0.4104, "step": 1530 }, { "epoch": 0.07397207324733053, "grad_norm": 15.992677688598633, "learning_rate": 9.260279267526694e-07, "loss": 0.3474, "step": 1531 }, { "epoch": 0.07402038942841958, "grad_norm": 3.296464204788208, "learning_rate": 9.259796105715803e-07, "loss": 0.4371, "step": 1532 }, { "epoch": 0.07406870560950862, "grad_norm": 3.481952428817749, "learning_rate": 9.259312943904913e-07, "loss": 0.4377, "step": 1533 }, { "epoch": 0.07411702179059768, "grad_norm": 1.9247151613235474, "learning_rate": 9.258829782094023e-07, "loss": 0.219, "step": 1534 }, { "epoch": 0.07416533797168672, "grad_norm": 10.072395324707031, "learning_rate": 9.258346620283133e-07, "loss": 0.2038, "step": 1535 }, { "epoch": 0.07421365415277577, "grad_norm": 2.7939512729644775, "learning_rate": 9.257863458472242e-07, "loss": 0.3969, "step": 1536 }, { "epoch": 0.07426197033386481, "grad_norm": 5.160751819610596, "learning_rate": 9.257380296661352e-07, "loss": 0.345, "step": 1537 }, { "epoch": 0.07431028651495386, "grad_norm": 2.327108860015869, "learning_rate": 9.25689713485046e-07, "loss": 0.2948, "step": 1538 }, { "epoch": 0.0743586026960429, "grad_norm": 2.379762887954712, "learning_rate": 9.25641397303957e-07, "loss": 0.2761, "step": 1539 }, { "epoch": 0.07440691887713195, "grad_norm": 2.1298558712005615, "learning_rate": 9.25593081122868e-07, "loss": 0.1763, "step": 1540 }, { "epoch": 0.07445523505822099, "grad_norm": 2.0481207370758057, "learning_rate": 9.255447649417789e-07, "loss": 0.2055, "step": 1541 }, { "epoch": 0.07450355123931005, "grad_norm": 2.4113402366638184, "learning_rate": 9.254964487606899e-07, "loss": 0.3069, "step": 1542 }, { "epoch": 0.0745518674203991, "grad_norm": 4.349020481109619, "learning_rate": 9.254481325796009e-07, "loss": 0.3599, "step": 1543 }, { "epoch": 0.07460018360148814, "grad_norm": 1.906582236289978, "learning_rate": 9.253998163985119e-07, "loss": 0.1725, "step": 1544 }, { "epoch": 0.07464849978257719, "grad_norm": 2.6943891048431396, "learning_rate": 9.253515002174228e-07, "loss": 0.3073, "step": 1545 }, { "epoch": 0.07469681596366623, "grad_norm": 3.1946609020233154, "learning_rate": 9.253031840363337e-07, "loss": 0.3181, "step": 1546 }, { "epoch": 0.07474513214475528, "grad_norm": 1.7269747257232666, "learning_rate": 9.252548678552446e-07, "loss": 0.1538, "step": 1547 }, { "epoch": 0.07479344832584432, "grad_norm": 4.953693389892578, "learning_rate": 9.252065516741556e-07, "loss": 0.3187, "step": 1548 }, { "epoch": 0.07484176450693338, "grad_norm": 7.906215667724609, "learning_rate": 9.251582354930666e-07, "loss": 0.3051, "step": 1549 }, { "epoch": 0.07489008068802241, "grad_norm": 3.3497068881988525, "learning_rate": 9.251099193119776e-07, "loss": 0.1982, "step": 1550 }, { "epoch": 0.07493839686911147, "grad_norm": 2.9248013496398926, "learning_rate": 9.250616031308885e-07, "loss": 0.4492, "step": 1551 }, { "epoch": 0.0749867130502005, "grad_norm": 2.7935919761657715, "learning_rate": 9.250132869497994e-07, "loss": 0.2478, "step": 1552 }, { "epoch": 0.07503502923128956, "grad_norm": 3.534057855606079, "learning_rate": 9.249649707687104e-07, "loss": 0.3845, "step": 1553 }, { "epoch": 0.0750833454123786, "grad_norm": 5.519599437713623, "learning_rate": 9.249166545876214e-07, "loss": 0.424, "step": 1554 }, { "epoch": 0.07513166159346765, "grad_norm": 3.678257942199707, "learning_rate": 9.248683384065324e-07, "loss": 0.328, "step": 1555 }, { "epoch": 0.0751799777745567, "grad_norm": 2.423325538635254, "learning_rate": 9.248200222254433e-07, "loss": 0.37, "step": 1556 }, { "epoch": 0.07522829395564574, "grad_norm": 4.716227054595947, "learning_rate": 9.247717060443541e-07, "loss": 0.3785, "step": 1557 }, { "epoch": 0.0752766101367348, "grad_norm": 5.117674827575684, "learning_rate": 9.247233898632651e-07, "loss": 0.2682, "step": 1558 }, { "epoch": 0.07532492631782384, "grad_norm": 2.2128360271453857, "learning_rate": 9.246750736821761e-07, "loss": 0.2025, "step": 1559 }, { "epoch": 0.07537324249891289, "grad_norm": 2.4085981845855713, "learning_rate": 9.246267575010871e-07, "loss": 0.2703, "step": 1560 }, { "epoch": 0.07542155868000193, "grad_norm": 1.8871290683746338, "learning_rate": 9.245784413199981e-07, "loss": 0.2759, "step": 1561 }, { "epoch": 0.07546987486109098, "grad_norm": 1.9247668981552124, "learning_rate": 9.24530125138909e-07, "loss": 0.1625, "step": 1562 }, { "epoch": 0.07551819104218002, "grad_norm": 4.306339263916016, "learning_rate": 9.2448180895782e-07, "loss": 0.3673, "step": 1563 }, { "epoch": 0.07556650722326907, "grad_norm": 2.5574538707733154, "learning_rate": 9.244334927767308e-07, "loss": 0.3463, "step": 1564 }, { "epoch": 0.07561482340435811, "grad_norm": 2.418816328048706, "learning_rate": 9.243851765956418e-07, "loss": 0.253, "step": 1565 }, { "epoch": 0.07566313958544717, "grad_norm": 2.748357057571411, "learning_rate": 9.243368604145528e-07, "loss": 0.3553, "step": 1566 }, { "epoch": 0.0757114557665362, "grad_norm": 5.9185638427734375, "learning_rate": 9.242885442334637e-07, "loss": 0.4826, "step": 1567 }, { "epoch": 0.07575977194762526, "grad_norm": 2.6886422634124756, "learning_rate": 9.242402280523747e-07, "loss": 0.3685, "step": 1568 }, { "epoch": 0.07580808812871431, "grad_norm": 2.8483917713165283, "learning_rate": 9.241919118712857e-07, "loss": 0.346, "step": 1569 }, { "epoch": 0.07585640430980335, "grad_norm": 3.2630677223205566, "learning_rate": 9.241435956901966e-07, "loss": 0.3144, "step": 1570 }, { "epoch": 0.0759047204908924, "grad_norm": 4.097279071807861, "learning_rate": 9.240952795091076e-07, "loss": 0.3581, "step": 1571 }, { "epoch": 0.07595303667198144, "grad_norm": 2.9821882247924805, "learning_rate": 9.240469633280184e-07, "loss": 0.4302, "step": 1572 }, { "epoch": 0.0760013528530705, "grad_norm": 3.830089569091797, "learning_rate": 9.239986471469294e-07, "loss": 0.555, "step": 1573 }, { "epoch": 0.07604966903415954, "grad_norm": 2.7550599575042725, "learning_rate": 9.239503309658404e-07, "loss": 0.2623, "step": 1574 }, { "epoch": 0.07609798521524859, "grad_norm": 2.3917810916900635, "learning_rate": 9.239020147847514e-07, "loss": 0.2683, "step": 1575 }, { "epoch": 0.07614630139633763, "grad_norm": 2.5851261615753174, "learning_rate": 9.238536986036624e-07, "loss": 0.3792, "step": 1576 }, { "epoch": 0.07619461757742668, "grad_norm": 2.4792568683624268, "learning_rate": 9.238053824225733e-07, "loss": 0.3234, "step": 1577 }, { "epoch": 0.07624293375851572, "grad_norm": 6.166858673095703, "learning_rate": 9.237570662414842e-07, "loss": 0.35, "step": 1578 }, { "epoch": 0.07629124993960477, "grad_norm": 2.969191551208496, "learning_rate": 9.237087500603952e-07, "loss": 0.4526, "step": 1579 }, { "epoch": 0.07633956612069381, "grad_norm": 2.487967014312744, "learning_rate": 9.236604338793062e-07, "loss": 0.3651, "step": 1580 }, { "epoch": 0.07638788230178287, "grad_norm": 3.5468649864196777, "learning_rate": 9.236121176982171e-07, "loss": 0.3263, "step": 1581 }, { "epoch": 0.07643619848287192, "grad_norm": 1.9754202365875244, "learning_rate": 9.235638015171281e-07, "loss": 0.2483, "step": 1582 }, { "epoch": 0.07648451466396096, "grad_norm": 51.25514602661133, "learning_rate": 9.235154853360389e-07, "loss": 0.2714, "step": 1583 }, { "epoch": 0.07653283084505001, "grad_norm": 2.857532501220703, "learning_rate": 9.234671691549499e-07, "loss": 0.3665, "step": 1584 }, { "epoch": 0.07658114702613905, "grad_norm": 2.3859922885894775, "learning_rate": 9.234188529738609e-07, "loss": 0.2857, "step": 1585 }, { "epoch": 0.0766294632072281, "grad_norm": 2.4270312786102295, "learning_rate": 9.233705367927719e-07, "loss": 0.2083, "step": 1586 }, { "epoch": 0.07667777938831714, "grad_norm": 17.842958450317383, "learning_rate": 9.233222206116829e-07, "loss": 0.3141, "step": 1587 }, { "epoch": 0.0767260955694062, "grad_norm": 5.4971184730529785, "learning_rate": 9.232739044305938e-07, "loss": 0.3188, "step": 1588 }, { "epoch": 0.07677441175049524, "grad_norm": 27.408111572265625, "learning_rate": 9.232255882495046e-07, "loss": 0.2327, "step": 1589 }, { "epoch": 0.07682272793158429, "grad_norm": 2.9032061100006104, "learning_rate": 9.231772720684156e-07, "loss": 0.3929, "step": 1590 }, { "epoch": 0.07687104411267333, "grad_norm": 2.8034298419952393, "learning_rate": 9.231289558873266e-07, "loss": 0.2523, "step": 1591 }, { "epoch": 0.07691936029376238, "grad_norm": 1.8833537101745605, "learning_rate": 9.230806397062376e-07, "loss": 0.1832, "step": 1592 }, { "epoch": 0.07696767647485142, "grad_norm": 3.154266834259033, "learning_rate": 9.230323235251485e-07, "loss": 0.3404, "step": 1593 }, { "epoch": 0.07701599265594047, "grad_norm": 2.8188483715057373, "learning_rate": 9.229840073440595e-07, "loss": 0.3865, "step": 1594 }, { "epoch": 0.07706430883702953, "grad_norm": 2.199345350265503, "learning_rate": 9.229356911629705e-07, "loss": 0.3153, "step": 1595 }, { "epoch": 0.07711262501811857, "grad_norm": 2.9950978755950928, "learning_rate": 9.228873749818814e-07, "loss": 0.3114, "step": 1596 }, { "epoch": 0.07716094119920762, "grad_norm": 2.810275077819824, "learning_rate": 9.228390588007924e-07, "loss": 0.3458, "step": 1597 }, { "epoch": 0.07720925738029666, "grad_norm": 10.57754898071289, "learning_rate": 9.227907426197032e-07, "loss": 0.2477, "step": 1598 }, { "epoch": 0.07725757356138571, "grad_norm": 4.159126281738281, "learning_rate": 9.227424264386142e-07, "loss": 0.4135, "step": 1599 }, { "epoch": 0.07730588974247475, "grad_norm": 11.163140296936035, "learning_rate": 9.226941102575252e-07, "loss": 0.4572, "step": 1600 }, { "epoch": 0.0773542059235638, "grad_norm": 2.036761522293091, "learning_rate": 9.226457940764362e-07, "loss": 0.2398, "step": 1601 }, { "epoch": 0.07740252210465284, "grad_norm": 3.120577335357666, "learning_rate": 9.225974778953471e-07, "loss": 0.3009, "step": 1602 }, { "epoch": 0.0774508382857419, "grad_norm": 3.851069688796997, "learning_rate": 9.225491617142581e-07, "loss": 0.3909, "step": 1603 }, { "epoch": 0.07749915446683094, "grad_norm": 1.955848217010498, "learning_rate": 9.22500845533169e-07, "loss": 0.2368, "step": 1604 }, { "epoch": 0.07754747064791999, "grad_norm": 2.0901312828063965, "learning_rate": 9.2245252935208e-07, "loss": 0.2343, "step": 1605 }, { "epoch": 0.07759578682900904, "grad_norm": 2.7778847217559814, "learning_rate": 9.22404213170991e-07, "loss": 0.2848, "step": 1606 }, { "epoch": 0.07764410301009808, "grad_norm": 4.141808032989502, "learning_rate": 9.223558969899019e-07, "loss": 0.4595, "step": 1607 }, { "epoch": 0.07769241919118713, "grad_norm": 5.657296657562256, "learning_rate": 9.223075808088129e-07, "loss": 0.2545, "step": 1608 }, { "epoch": 0.07774073537227617, "grad_norm": 4.157136917114258, "learning_rate": 9.222592646277237e-07, "loss": 0.4021, "step": 1609 }, { "epoch": 0.07778905155336523, "grad_norm": 2.029824733734131, "learning_rate": 9.222109484466347e-07, "loss": 0.2573, "step": 1610 }, { "epoch": 0.07783736773445427, "grad_norm": 2.5665347576141357, "learning_rate": 9.221626322655457e-07, "loss": 0.2807, "step": 1611 }, { "epoch": 0.07788568391554332, "grad_norm": 3.5689356327056885, "learning_rate": 9.221143160844567e-07, "loss": 0.3739, "step": 1612 }, { "epoch": 0.07793400009663236, "grad_norm": 1.456050992012024, "learning_rate": 9.220659999033677e-07, "loss": 0.1714, "step": 1613 }, { "epoch": 0.07798231627772141, "grad_norm": 2.3189494609832764, "learning_rate": 9.220176837222786e-07, "loss": 0.2841, "step": 1614 }, { "epoch": 0.07803063245881045, "grad_norm": 2.2386562824249268, "learning_rate": 9.219693675411894e-07, "loss": 0.2754, "step": 1615 }, { "epoch": 0.0780789486398995, "grad_norm": 3.6242454051971436, "learning_rate": 9.219210513601004e-07, "loss": 0.3182, "step": 1616 }, { "epoch": 0.07812726482098854, "grad_norm": 3.6955058574676514, "learning_rate": 9.218727351790114e-07, "loss": 0.4305, "step": 1617 }, { "epoch": 0.0781755810020776, "grad_norm": 4.558778285980225, "learning_rate": 9.218244189979224e-07, "loss": 0.4695, "step": 1618 }, { "epoch": 0.07822389718316665, "grad_norm": 1.842185378074646, "learning_rate": 9.217761028168333e-07, "loss": 0.1905, "step": 1619 }, { "epoch": 0.07827221336425569, "grad_norm": 1.7599138021469116, "learning_rate": 9.217277866357443e-07, "loss": 0.1902, "step": 1620 }, { "epoch": 0.07832052954534474, "grad_norm": 3.487253189086914, "learning_rate": 9.216794704546552e-07, "loss": 0.3688, "step": 1621 }, { "epoch": 0.07836884572643378, "grad_norm": 2.445211410522461, "learning_rate": 9.216311542735662e-07, "loss": 0.2894, "step": 1622 }, { "epoch": 0.07841716190752283, "grad_norm": 2.6066830158233643, "learning_rate": 9.215828380924771e-07, "loss": 0.3081, "step": 1623 }, { "epoch": 0.07846547808861187, "grad_norm": 2.6077613830566406, "learning_rate": 9.21534521911388e-07, "loss": 0.231, "step": 1624 }, { "epoch": 0.07851379426970093, "grad_norm": 5.497106075286865, "learning_rate": 9.21486205730299e-07, "loss": 0.3898, "step": 1625 }, { "epoch": 0.07856211045078997, "grad_norm": 2.6043262481689453, "learning_rate": 9.2143788954921e-07, "loss": 0.2424, "step": 1626 }, { "epoch": 0.07861042663187902, "grad_norm": 1.704371452331543, "learning_rate": 9.21389573368121e-07, "loss": 0.1924, "step": 1627 }, { "epoch": 0.07865874281296806, "grad_norm": 2.7301809787750244, "learning_rate": 9.213412571870319e-07, "loss": 0.3996, "step": 1628 }, { "epoch": 0.07870705899405711, "grad_norm": 3.031681537628174, "learning_rate": 9.212929410059429e-07, "loss": 0.4313, "step": 1629 }, { "epoch": 0.07875537517514615, "grad_norm": 14.985649108886719, "learning_rate": 9.212446248248538e-07, "loss": 0.4081, "step": 1630 }, { "epoch": 0.0788036913562352, "grad_norm": 3.212003469467163, "learning_rate": 9.211963086437648e-07, "loss": 0.25, "step": 1631 }, { "epoch": 0.07885200753732426, "grad_norm": 9.49986743927002, "learning_rate": 9.211479924626757e-07, "loss": 0.3671, "step": 1632 }, { "epoch": 0.0789003237184133, "grad_norm": 1.5873724222183228, "learning_rate": 9.210996762815867e-07, "loss": 0.1663, "step": 1633 }, { "epoch": 0.07894863989950235, "grad_norm": 10.254700660705566, "learning_rate": 9.210513601004976e-07, "loss": 0.4339, "step": 1634 }, { "epoch": 0.07899695608059139, "grad_norm": 3.5574262142181396, "learning_rate": 9.210030439194085e-07, "loss": 0.2813, "step": 1635 }, { "epoch": 0.07904527226168044, "grad_norm": 4.0117011070251465, "learning_rate": 9.209547277383195e-07, "loss": 0.2925, "step": 1636 }, { "epoch": 0.07909358844276948, "grad_norm": 1.4267867803573608, "learning_rate": 9.209064115572305e-07, "loss": 0.1544, "step": 1637 }, { "epoch": 0.07914190462385853, "grad_norm": 5.571422576904297, "learning_rate": 9.208580953761415e-07, "loss": 0.4395, "step": 1638 }, { "epoch": 0.07919022080494757, "grad_norm": 2.280456066131592, "learning_rate": 9.208097791950525e-07, "loss": 0.2534, "step": 1639 }, { "epoch": 0.07923853698603663, "grad_norm": 2.215463161468506, "learning_rate": 9.207614630139632e-07, "loss": 0.2742, "step": 1640 }, { "epoch": 0.07928685316712566, "grad_norm": 2.970630168914795, "learning_rate": 9.207131468328742e-07, "loss": 0.3646, "step": 1641 }, { "epoch": 0.07933516934821472, "grad_norm": 6.772772789001465, "learning_rate": 9.206648306517852e-07, "loss": 0.2106, "step": 1642 }, { "epoch": 0.07938348552930376, "grad_norm": 4.3104248046875, "learning_rate": 9.206165144706962e-07, "loss": 0.3319, "step": 1643 }, { "epoch": 0.07943180171039281, "grad_norm": 2.189910888671875, "learning_rate": 9.205681982896072e-07, "loss": 0.2526, "step": 1644 }, { "epoch": 0.07948011789148186, "grad_norm": 4.349228858947754, "learning_rate": 9.205198821085181e-07, "loss": 0.4741, "step": 1645 }, { "epoch": 0.0795284340725709, "grad_norm": 4.180288314819336, "learning_rate": 9.204715659274291e-07, "loss": 0.3099, "step": 1646 }, { "epoch": 0.07957675025365996, "grad_norm": 3.0978338718414307, "learning_rate": 9.2042324974634e-07, "loss": 0.1837, "step": 1647 }, { "epoch": 0.079625066434749, "grad_norm": 2.4282288551330566, "learning_rate": 9.20374933565251e-07, "loss": 0.2693, "step": 1648 }, { "epoch": 0.07967338261583805, "grad_norm": 2.7620418071746826, "learning_rate": 9.203266173841619e-07, "loss": 0.1546, "step": 1649 }, { "epoch": 0.07972169879692709, "grad_norm": 2.3819284439086914, "learning_rate": 9.202783012030728e-07, "loss": 0.3147, "step": 1650 }, { "epoch": 0.07977001497801614, "grad_norm": 2.520232915878296, "learning_rate": 9.202299850219838e-07, "loss": 0.3156, "step": 1651 }, { "epoch": 0.07981833115910518, "grad_norm": 2.517709732055664, "learning_rate": 9.201816688408948e-07, "loss": 0.2852, "step": 1652 }, { "epoch": 0.07986664734019423, "grad_norm": 2.4514272212982178, "learning_rate": 9.201333526598057e-07, "loss": 0.3154, "step": 1653 }, { "epoch": 0.07991496352128327, "grad_norm": 1.993951678276062, "learning_rate": 9.200850364787167e-07, "loss": 0.2087, "step": 1654 }, { "epoch": 0.07996327970237233, "grad_norm": 13.969500541687012, "learning_rate": 9.200367202976277e-07, "loss": 0.2511, "step": 1655 }, { "epoch": 0.08001159588346136, "grad_norm": 2.831819534301758, "learning_rate": 9.199884041165386e-07, "loss": 0.3598, "step": 1656 }, { "epoch": 0.08005991206455042, "grad_norm": 2.836343765258789, "learning_rate": 9.199400879354495e-07, "loss": 0.348, "step": 1657 }, { "epoch": 0.08010822824563947, "grad_norm": 6.281289577484131, "learning_rate": 9.198917717543605e-07, "loss": 0.1906, "step": 1658 }, { "epoch": 0.08015654442672851, "grad_norm": 2.987612009048462, "learning_rate": 9.198434555732715e-07, "loss": 0.3211, "step": 1659 }, { "epoch": 0.08020486060781756, "grad_norm": 3.3205554485321045, "learning_rate": 9.197951393921824e-07, "loss": 0.4425, "step": 1660 }, { "epoch": 0.0802531767889066, "grad_norm": 2.6600027084350586, "learning_rate": 9.197468232110933e-07, "loss": 0.3379, "step": 1661 }, { "epoch": 0.08030149296999566, "grad_norm": 4.953793048858643, "learning_rate": 9.196985070300043e-07, "loss": 0.2671, "step": 1662 }, { "epoch": 0.0803498091510847, "grad_norm": 2.890367269515991, "learning_rate": 9.196501908489153e-07, "loss": 0.246, "step": 1663 }, { "epoch": 0.08039812533217375, "grad_norm": 2.6800432205200195, "learning_rate": 9.196018746678263e-07, "loss": 0.3057, "step": 1664 }, { "epoch": 0.08044644151326279, "grad_norm": 3.4028663635253906, "learning_rate": 9.195535584867373e-07, "loss": 0.4962, "step": 1665 }, { "epoch": 0.08049475769435184, "grad_norm": 5.956148624420166, "learning_rate": 9.19505242305648e-07, "loss": 0.4412, "step": 1666 }, { "epoch": 0.08054307387544088, "grad_norm": 3.1189188957214355, "learning_rate": 9.19456926124559e-07, "loss": 0.408, "step": 1667 }, { "epoch": 0.08059139005652993, "grad_norm": 3.3874218463897705, "learning_rate": 9.1940860994347e-07, "loss": 0.389, "step": 1668 }, { "epoch": 0.08063970623761899, "grad_norm": 2.7579410076141357, "learning_rate": 9.19360293762381e-07, "loss": 0.2529, "step": 1669 }, { "epoch": 0.08068802241870802, "grad_norm": 2.293642520904541, "learning_rate": 9.19311977581292e-07, "loss": 0.2964, "step": 1670 }, { "epoch": 0.08073633859979708, "grad_norm": 3.085860013961792, "learning_rate": 9.192636614002029e-07, "loss": 0.3316, "step": 1671 }, { "epoch": 0.08078465478088612, "grad_norm": 3.751539945602417, "learning_rate": 9.192153452191138e-07, "loss": 0.4616, "step": 1672 }, { "epoch": 0.08083297096197517, "grad_norm": 3.4414772987365723, "learning_rate": 9.191670290380248e-07, "loss": 0.3149, "step": 1673 }, { "epoch": 0.08088128714306421, "grad_norm": 4.389012336730957, "learning_rate": 9.191187128569357e-07, "loss": 0.3354, "step": 1674 }, { "epoch": 0.08092960332415326, "grad_norm": 2.0322530269622803, "learning_rate": 9.190703966758467e-07, "loss": 0.2278, "step": 1675 }, { "epoch": 0.0809779195052423, "grad_norm": 2.5122528076171875, "learning_rate": 9.190220804947576e-07, "loss": 0.2812, "step": 1676 }, { "epoch": 0.08102623568633135, "grad_norm": 2.990769386291504, "learning_rate": 9.189737643136686e-07, "loss": 0.409, "step": 1677 }, { "epoch": 0.0810745518674204, "grad_norm": 2.111717700958252, "learning_rate": 9.189254481325796e-07, "loss": 0.2407, "step": 1678 }, { "epoch": 0.08112286804850945, "grad_norm": 2.308453321456909, "learning_rate": 9.188771319514905e-07, "loss": 0.2304, "step": 1679 }, { "epoch": 0.08117118422959849, "grad_norm": 2.460838794708252, "learning_rate": 9.188288157704015e-07, "loss": 0.2885, "step": 1680 }, { "epoch": 0.08121950041068754, "grad_norm": 5.159487247467041, "learning_rate": 9.187804995893125e-07, "loss": 0.3653, "step": 1681 }, { "epoch": 0.08126781659177659, "grad_norm": 2.519768476486206, "learning_rate": 9.187321834082233e-07, "loss": 0.2761, "step": 1682 }, { "epoch": 0.08131613277286563, "grad_norm": 3.1842801570892334, "learning_rate": 9.186838672271343e-07, "loss": 0.2725, "step": 1683 }, { "epoch": 0.08136444895395469, "grad_norm": 3.275925397872925, "learning_rate": 9.186355510460453e-07, "loss": 0.2421, "step": 1684 }, { "epoch": 0.08141276513504372, "grad_norm": 3.4754624366760254, "learning_rate": 9.185872348649562e-07, "loss": 0.3343, "step": 1685 }, { "epoch": 0.08146108131613278, "grad_norm": 2.866245746612549, "learning_rate": 9.185389186838672e-07, "loss": 0.2755, "step": 1686 }, { "epoch": 0.08150939749722182, "grad_norm": 4.8204450607299805, "learning_rate": 9.184906025027781e-07, "loss": 0.5459, "step": 1687 }, { "epoch": 0.08155771367831087, "grad_norm": 3.3460564613342285, "learning_rate": 9.184422863216891e-07, "loss": 0.3627, "step": 1688 }, { "epoch": 0.08160602985939991, "grad_norm": 1.6827067136764526, "learning_rate": 9.183939701406001e-07, "loss": 0.1812, "step": 1689 }, { "epoch": 0.08165434604048896, "grad_norm": 2.0461692810058594, "learning_rate": 9.183456539595111e-07, "loss": 0.2191, "step": 1690 }, { "epoch": 0.081702662221578, "grad_norm": 15.070786476135254, "learning_rate": 9.18297337778422e-07, "loss": 0.3432, "step": 1691 }, { "epoch": 0.08175097840266705, "grad_norm": 4.86683464050293, "learning_rate": 9.182490215973328e-07, "loss": 0.3025, "step": 1692 }, { "epoch": 0.0817992945837561, "grad_norm": 2.9331085681915283, "learning_rate": 9.182007054162438e-07, "loss": 0.2916, "step": 1693 }, { "epoch": 0.08184761076484515, "grad_norm": 6.056687355041504, "learning_rate": 9.181523892351548e-07, "loss": 0.3508, "step": 1694 }, { "epoch": 0.0818959269459342, "grad_norm": 3.1937594413757324, "learning_rate": 9.181040730540658e-07, "loss": 0.2513, "step": 1695 }, { "epoch": 0.08194424312702324, "grad_norm": 1.6425142288208008, "learning_rate": 9.180557568729768e-07, "loss": 0.1876, "step": 1696 }, { "epoch": 0.08199255930811229, "grad_norm": 2.0322961807250977, "learning_rate": 9.180074406918877e-07, "loss": 0.1755, "step": 1697 }, { "epoch": 0.08204087548920133, "grad_norm": 3.7176313400268555, "learning_rate": 9.179591245107986e-07, "loss": 0.2756, "step": 1698 }, { "epoch": 0.08208919167029038, "grad_norm": 3.4455041885375977, "learning_rate": 9.179108083297095e-07, "loss": 0.3057, "step": 1699 }, { "epoch": 0.08213750785137942, "grad_norm": 2.611529588699341, "learning_rate": 9.178624921486205e-07, "loss": 0.3601, "step": 1700 }, { "epoch": 0.08218582403246848, "grad_norm": 2.617936849594116, "learning_rate": 9.178141759675315e-07, "loss": 0.259, "step": 1701 }, { "epoch": 0.08223414021355752, "grad_norm": 2.9022161960601807, "learning_rate": 9.177658597864424e-07, "loss": 0.3741, "step": 1702 }, { "epoch": 0.08228245639464657, "grad_norm": 5.289820671081543, "learning_rate": 9.177175436053534e-07, "loss": 0.3424, "step": 1703 }, { "epoch": 0.08233077257573561, "grad_norm": 3.160982847213745, "learning_rate": 9.176692274242643e-07, "loss": 0.2506, "step": 1704 }, { "epoch": 0.08237908875682466, "grad_norm": 2.410266160964966, "learning_rate": 9.176209112431753e-07, "loss": 0.2393, "step": 1705 }, { "epoch": 0.0824274049379137, "grad_norm": 1.8229962587356567, "learning_rate": 9.175725950620863e-07, "loss": 0.2235, "step": 1706 }, { "epoch": 0.08247572111900275, "grad_norm": 2.3891358375549316, "learning_rate": 9.175242788809971e-07, "loss": 0.3327, "step": 1707 }, { "epoch": 0.08252403730009181, "grad_norm": 2.5447945594787598, "learning_rate": 9.174759626999081e-07, "loss": 0.3185, "step": 1708 }, { "epoch": 0.08257235348118085, "grad_norm": 4.976142406463623, "learning_rate": 9.174276465188191e-07, "loss": 0.2347, "step": 1709 }, { "epoch": 0.0826206696622699, "grad_norm": 3.008171796798706, "learning_rate": 9.173793303377301e-07, "loss": 0.3392, "step": 1710 }, { "epoch": 0.08266898584335894, "grad_norm": 2.268181800842285, "learning_rate": 9.17331014156641e-07, "loss": 0.263, "step": 1711 }, { "epoch": 0.08271730202444799, "grad_norm": 3.7247049808502197, "learning_rate": 9.17282697975552e-07, "loss": 0.371, "step": 1712 }, { "epoch": 0.08276561820553703, "grad_norm": 2.965459108352661, "learning_rate": 9.172343817944629e-07, "loss": 0.4057, "step": 1713 }, { "epoch": 0.08281393438662608, "grad_norm": 5.768454074859619, "learning_rate": 9.171860656133739e-07, "loss": 0.3577, "step": 1714 }, { "epoch": 0.08286225056771512, "grad_norm": 4.694431781768799, "learning_rate": 9.171377494322849e-07, "loss": 0.238, "step": 1715 }, { "epoch": 0.08291056674880418, "grad_norm": 2.7634024620056152, "learning_rate": 9.170894332511958e-07, "loss": 0.3598, "step": 1716 }, { "epoch": 0.08295888292989322, "grad_norm": 2.167288064956665, "learning_rate": 9.170411170701067e-07, "loss": 0.2649, "step": 1717 }, { "epoch": 0.08300719911098227, "grad_norm": 3.1754512786865234, "learning_rate": 9.169928008890176e-07, "loss": 0.393, "step": 1718 }, { "epoch": 0.08305551529207131, "grad_norm": 3.5852270126342773, "learning_rate": 9.169444847079286e-07, "loss": 0.325, "step": 1719 }, { "epoch": 0.08310383147316036, "grad_norm": 1.6597851514816284, "learning_rate": 9.168961685268396e-07, "loss": 0.1645, "step": 1720 }, { "epoch": 0.08315214765424941, "grad_norm": 11.398736953735352, "learning_rate": 9.168478523457506e-07, "loss": 0.2115, "step": 1721 }, { "epoch": 0.08320046383533845, "grad_norm": 2.920686960220337, "learning_rate": 9.167995361646616e-07, "loss": 0.4292, "step": 1722 }, { "epoch": 0.0832487800164275, "grad_norm": 3.0026822090148926, "learning_rate": 9.167512199835725e-07, "loss": 0.3332, "step": 1723 }, { "epoch": 0.08329709619751655, "grad_norm": 2.928891897201538, "learning_rate": 9.167029038024833e-07, "loss": 0.4051, "step": 1724 }, { "epoch": 0.0833454123786056, "grad_norm": 9.709291458129883, "learning_rate": 9.166545876213943e-07, "loss": 0.3351, "step": 1725 }, { "epoch": 0.08339372855969464, "grad_norm": 4.174312591552734, "learning_rate": 9.166062714403053e-07, "loss": 0.4209, "step": 1726 }, { "epoch": 0.08344204474078369, "grad_norm": 21.51166534423828, "learning_rate": 9.165579552592163e-07, "loss": 0.3142, "step": 1727 }, { "epoch": 0.08349036092187273, "grad_norm": 1.9125391244888306, "learning_rate": 9.165096390781272e-07, "loss": 0.22, "step": 1728 }, { "epoch": 0.08353867710296178, "grad_norm": 3.395613193511963, "learning_rate": 9.164613228970382e-07, "loss": 0.4771, "step": 1729 }, { "epoch": 0.08358699328405082, "grad_norm": 2.669252872467041, "learning_rate": 9.164130067159491e-07, "loss": 0.336, "step": 1730 }, { "epoch": 0.08363530946513988, "grad_norm": 4.57705020904541, "learning_rate": 9.163646905348601e-07, "loss": 0.4248, "step": 1731 }, { "epoch": 0.08368362564622892, "grad_norm": 3.001120090484619, "learning_rate": 9.163163743537711e-07, "loss": 0.2951, "step": 1732 }, { "epoch": 0.08373194182731797, "grad_norm": 2.928929567337036, "learning_rate": 9.162680581726819e-07, "loss": 0.3836, "step": 1733 }, { "epoch": 0.08378025800840702, "grad_norm": 2.86153507232666, "learning_rate": 9.162197419915929e-07, "loss": 0.3037, "step": 1734 }, { "epoch": 0.08382857418949606, "grad_norm": 2.162092924118042, "learning_rate": 9.161714258105039e-07, "loss": 0.2379, "step": 1735 }, { "epoch": 0.08387689037058511, "grad_norm": 2.919792652130127, "learning_rate": 9.161231096294148e-07, "loss": 0.389, "step": 1736 }, { "epoch": 0.08392520655167415, "grad_norm": 3.9407663345336914, "learning_rate": 9.160747934483258e-07, "loss": 0.2395, "step": 1737 }, { "epoch": 0.0839735227327632, "grad_norm": 3.3471553325653076, "learning_rate": 9.160264772672368e-07, "loss": 0.3606, "step": 1738 }, { "epoch": 0.08402183891385225, "grad_norm": 1.8051508665084839, "learning_rate": 9.159781610861477e-07, "loss": 0.1948, "step": 1739 }, { "epoch": 0.0840701550949413, "grad_norm": 3.3058393001556396, "learning_rate": 9.159298449050587e-07, "loss": 0.3387, "step": 1740 }, { "epoch": 0.08411847127603034, "grad_norm": 3.8429558277130127, "learning_rate": 9.158815287239697e-07, "loss": 0.358, "step": 1741 }, { "epoch": 0.08416678745711939, "grad_norm": 2.1434943675994873, "learning_rate": 9.158332125428806e-07, "loss": 0.2053, "step": 1742 }, { "epoch": 0.08421510363820843, "grad_norm": 5.547663688659668, "learning_rate": 9.157848963617915e-07, "loss": 0.2385, "step": 1743 }, { "epoch": 0.08426341981929748, "grad_norm": 2.0106544494628906, "learning_rate": 9.157365801807024e-07, "loss": 0.2212, "step": 1744 }, { "epoch": 0.08431173600038654, "grad_norm": 2.112368106842041, "learning_rate": 9.156882639996134e-07, "loss": 0.2409, "step": 1745 }, { "epoch": 0.08436005218147558, "grad_norm": 3.730947971343994, "learning_rate": 9.156399478185244e-07, "loss": 0.4128, "step": 1746 }, { "epoch": 0.08440836836256463, "grad_norm": 3.163172721862793, "learning_rate": 9.155916316374354e-07, "loss": 0.3133, "step": 1747 }, { "epoch": 0.08445668454365367, "grad_norm": 3.0197932720184326, "learning_rate": 9.155433154563464e-07, "loss": 0.4222, "step": 1748 }, { "epoch": 0.08450500072474272, "grad_norm": 2.0511651039123535, "learning_rate": 9.154949992752571e-07, "loss": 0.2037, "step": 1749 }, { "epoch": 0.08455331690583176, "grad_norm": 1.720324158668518, "learning_rate": 9.154466830941681e-07, "loss": 0.1916, "step": 1750 }, { "epoch": 0.08460163308692081, "grad_norm": 8.502616882324219, "learning_rate": 9.153983669130791e-07, "loss": 0.3953, "step": 1751 }, { "epoch": 0.08464994926800985, "grad_norm": 1.3921210765838623, "learning_rate": 9.153500507319901e-07, "loss": 0.1531, "step": 1752 }, { "epoch": 0.0846982654490989, "grad_norm": 2.4657063484191895, "learning_rate": 9.153017345509011e-07, "loss": 0.2462, "step": 1753 }, { "epoch": 0.08474658163018794, "grad_norm": 3.9388022422790527, "learning_rate": 9.15253418369812e-07, "loss": 0.3883, "step": 1754 }, { "epoch": 0.084794897811277, "grad_norm": 4.030212879180908, "learning_rate": 9.15205102188723e-07, "loss": 0.2897, "step": 1755 }, { "epoch": 0.08484321399236604, "grad_norm": 2.3201708793640137, "learning_rate": 9.151567860076339e-07, "loss": 0.2158, "step": 1756 }, { "epoch": 0.08489153017345509, "grad_norm": 2.1582038402557373, "learning_rate": 9.151084698265449e-07, "loss": 0.2417, "step": 1757 }, { "epoch": 0.08493984635454414, "grad_norm": 2.5771403312683105, "learning_rate": 9.150601536454558e-07, "loss": 0.3119, "step": 1758 }, { "epoch": 0.08498816253563318, "grad_norm": 2.2885093688964844, "learning_rate": 9.150118374643667e-07, "loss": 0.2026, "step": 1759 }, { "epoch": 0.08503647871672224, "grad_norm": 2.6204206943511963, "learning_rate": 9.149635212832777e-07, "loss": 0.2735, "step": 1760 }, { "epoch": 0.08508479489781128, "grad_norm": 5.056484222412109, "learning_rate": 9.149152051021887e-07, "loss": 0.4552, "step": 1761 }, { "epoch": 0.08513311107890033, "grad_norm": 3.116462230682373, "learning_rate": 9.148668889210996e-07, "loss": 0.3126, "step": 1762 }, { "epoch": 0.08518142725998937, "grad_norm": 2.55409836769104, "learning_rate": 9.148185727400106e-07, "loss": 0.3791, "step": 1763 }, { "epoch": 0.08522974344107842, "grad_norm": 1.8165305852890015, "learning_rate": 9.147702565589216e-07, "loss": 0.2302, "step": 1764 }, { "epoch": 0.08527805962216746, "grad_norm": 2.3326938152313232, "learning_rate": 9.147219403778325e-07, "loss": 0.2659, "step": 1765 }, { "epoch": 0.08532637580325651, "grad_norm": 8.522701263427734, "learning_rate": 9.146736241967435e-07, "loss": 0.2446, "step": 1766 }, { "epoch": 0.08537469198434555, "grad_norm": 2.4997782707214355, "learning_rate": 9.146253080156544e-07, "loss": 0.2779, "step": 1767 }, { "epoch": 0.0854230081654346, "grad_norm": 2.407550811767578, "learning_rate": 9.145769918345653e-07, "loss": 0.2395, "step": 1768 }, { "epoch": 0.08547132434652364, "grad_norm": 2.2617807388305664, "learning_rate": 9.145286756534763e-07, "loss": 0.3227, "step": 1769 }, { "epoch": 0.0855196405276127, "grad_norm": 2.8487401008605957, "learning_rate": 9.144803594723872e-07, "loss": 0.381, "step": 1770 }, { "epoch": 0.08556795670870175, "grad_norm": 2.767685890197754, "learning_rate": 9.144320432912982e-07, "loss": 0.316, "step": 1771 }, { "epoch": 0.08561627288979079, "grad_norm": 2.781491279602051, "learning_rate": 9.143837271102092e-07, "loss": 0.2686, "step": 1772 }, { "epoch": 0.08566458907087984, "grad_norm": 2.6875765323638916, "learning_rate": 9.143354109291202e-07, "loss": 0.3683, "step": 1773 }, { "epoch": 0.08571290525196888, "grad_norm": 2.465045690536499, "learning_rate": 9.142870947480312e-07, "loss": 0.2989, "step": 1774 }, { "epoch": 0.08576122143305794, "grad_norm": 2.448700428009033, "learning_rate": 9.142387785669419e-07, "loss": 0.3009, "step": 1775 }, { "epoch": 0.08580953761414697, "grad_norm": 2.3354976177215576, "learning_rate": 9.141904623858529e-07, "loss": 0.327, "step": 1776 }, { "epoch": 0.08585785379523603, "grad_norm": 3.9858977794647217, "learning_rate": 9.141421462047639e-07, "loss": 0.5081, "step": 1777 }, { "epoch": 0.08590616997632507, "grad_norm": 2.163013458251953, "learning_rate": 9.140938300236749e-07, "loss": 0.2404, "step": 1778 }, { "epoch": 0.08595448615741412, "grad_norm": 3.5347373485565186, "learning_rate": 9.140455138425859e-07, "loss": 0.3459, "step": 1779 }, { "epoch": 0.08600280233850316, "grad_norm": 3.0673975944519043, "learning_rate": 9.139971976614968e-07, "loss": 0.3391, "step": 1780 }, { "epoch": 0.08605111851959221, "grad_norm": 4.378537178039551, "learning_rate": 9.139488814804077e-07, "loss": 0.3584, "step": 1781 }, { "epoch": 0.08609943470068125, "grad_norm": 2.3644096851348877, "learning_rate": 9.139005652993187e-07, "loss": 0.2284, "step": 1782 }, { "epoch": 0.0861477508817703, "grad_norm": 2.334385871887207, "learning_rate": 9.138522491182297e-07, "loss": 0.3446, "step": 1783 }, { "epoch": 0.08619606706285936, "grad_norm": 2.858001708984375, "learning_rate": 9.138039329371406e-07, "loss": 0.3114, "step": 1784 }, { "epoch": 0.0862443832439484, "grad_norm": 3.9410016536712646, "learning_rate": 9.137556167560515e-07, "loss": 0.4124, "step": 1785 }, { "epoch": 0.08629269942503745, "grad_norm": 2.800917387008667, "learning_rate": 9.137073005749625e-07, "loss": 0.2163, "step": 1786 }, { "epoch": 0.08634101560612649, "grad_norm": 2.671318292617798, "learning_rate": 9.136589843938735e-07, "loss": 0.3851, "step": 1787 }, { "epoch": 0.08638933178721554, "grad_norm": 2.2147042751312256, "learning_rate": 9.136106682127844e-07, "loss": 0.285, "step": 1788 }, { "epoch": 0.08643764796830458, "grad_norm": 2.628309965133667, "learning_rate": 9.135623520316954e-07, "loss": 0.3225, "step": 1789 }, { "epoch": 0.08648596414939363, "grad_norm": 188.9366455078125, "learning_rate": 9.135140358506064e-07, "loss": 0.2701, "step": 1790 }, { "epoch": 0.08653428033048267, "grad_norm": 5.902507305145264, "learning_rate": 9.134657196695173e-07, "loss": 0.3426, "step": 1791 }, { "epoch": 0.08658259651157173, "grad_norm": 3.5945956707000732, "learning_rate": 9.134174034884282e-07, "loss": 0.3136, "step": 1792 }, { "epoch": 0.08663091269266077, "grad_norm": 3.004770517349243, "learning_rate": 9.133690873073392e-07, "loss": 0.4443, "step": 1793 }, { "epoch": 0.08667922887374982, "grad_norm": 2.623481512069702, "learning_rate": 9.133207711262501e-07, "loss": 0.3328, "step": 1794 }, { "epoch": 0.08672754505483886, "grad_norm": 2.8673038482666016, "learning_rate": 9.132724549451611e-07, "loss": 0.436, "step": 1795 }, { "epoch": 0.08677586123592791, "grad_norm": 3.8851161003112793, "learning_rate": 9.13224138764072e-07, "loss": 0.4961, "step": 1796 }, { "epoch": 0.08682417741701697, "grad_norm": 3.1705589294433594, "learning_rate": 9.13175822582983e-07, "loss": 0.4364, "step": 1797 }, { "epoch": 0.086872493598106, "grad_norm": 1.900864839553833, "learning_rate": 9.13127506401894e-07, "loss": 0.2443, "step": 1798 }, { "epoch": 0.08692080977919506, "grad_norm": 2.816429376602173, "learning_rate": 9.13079190220805e-07, "loss": 0.2953, "step": 1799 }, { "epoch": 0.0869691259602841, "grad_norm": 1.53047513961792, "learning_rate": 9.13030874039716e-07, "loss": 0.127, "step": 1800 }, { "epoch": 0.08701744214137315, "grad_norm": 1.8590952157974243, "learning_rate": 9.129825578586267e-07, "loss": 0.2108, "step": 1801 }, { "epoch": 0.08706575832246219, "grad_norm": 2.6878483295440674, "learning_rate": 9.129342416775377e-07, "loss": 0.2602, "step": 1802 }, { "epoch": 0.08711407450355124, "grad_norm": 2.8076257705688477, "learning_rate": 9.128859254964487e-07, "loss": 0.2808, "step": 1803 }, { "epoch": 0.08716239068464028, "grad_norm": 2.945636034011841, "learning_rate": 9.128376093153597e-07, "loss": 0.3306, "step": 1804 }, { "epoch": 0.08721070686572933, "grad_norm": 2.6607065200805664, "learning_rate": 9.127892931342707e-07, "loss": 0.3637, "step": 1805 }, { "epoch": 0.08725902304681837, "grad_norm": 2.5442612171173096, "learning_rate": 9.127409769531816e-07, "loss": 0.2166, "step": 1806 }, { "epoch": 0.08730733922790743, "grad_norm": 2.734156847000122, "learning_rate": 9.126926607720925e-07, "loss": 0.3388, "step": 1807 }, { "epoch": 0.08735565540899648, "grad_norm": 12.409987449645996, "learning_rate": 9.126443445910035e-07, "loss": 0.2978, "step": 1808 }, { "epoch": 0.08740397159008552, "grad_norm": 2.732419490814209, "learning_rate": 9.125960284099144e-07, "loss": 0.2126, "step": 1809 }, { "epoch": 0.08745228777117457, "grad_norm": 4.343268871307373, "learning_rate": 9.125477122288254e-07, "loss": 0.3147, "step": 1810 }, { "epoch": 0.08750060395226361, "grad_norm": 5.2777838706970215, "learning_rate": 9.124993960477363e-07, "loss": 0.3089, "step": 1811 }, { "epoch": 0.08754892013335266, "grad_norm": 2.43009614944458, "learning_rate": 9.124510798666473e-07, "loss": 0.2684, "step": 1812 }, { "epoch": 0.0875972363144417, "grad_norm": 2.025761604309082, "learning_rate": 9.124027636855582e-07, "loss": 0.2453, "step": 1813 }, { "epoch": 0.08764555249553076, "grad_norm": 2.89691162109375, "learning_rate": 9.123544475044692e-07, "loss": 0.3067, "step": 1814 }, { "epoch": 0.0876938686766198, "grad_norm": 2.9245481491088867, "learning_rate": 9.123061313233802e-07, "loss": 0.3756, "step": 1815 }, { "epoch": 0.08774218485770885, "grad_norm": 2.249997615814209, "learning_rate": 9.122578151422912e-07, "loss": 0.2841, "step": 1816 }, { "epoch": 0.08779050103879789, "grad_norm": 2.134643793106079, "learning_rate": 9.12209498961202e-07, "loss": 0.2303, "step": 1817 }, { "epoch": 0.08783881721988694, "grad_norm": 3.213670015335083, "learning_rate": 9.12161182780113e-07, "loss": 0.4288, "step": 1818 }, { "epoch": 0.08788713340097598, "grad_norm": 2.3713016510009766, "learning_rate": 9.12112866599024e-07, "loss": 0.2261, "step": 1819 }, { "epoch": 0.08793544958206503, "grad_norm": 3.1036829948425293, "learning_rate": 9.120645504179349e-07, "loss": 0.4469, "step": 1820 }, { "epoch": 0.08798376576315409, "grad_norm": 7.089291572570801, "learning_rate": 9.120162342368459e-07, "loss": 0.479, "step": 1821 }, { "epoch": 0.08803208194424313, "grad_norm": 2.7680416107177734, "learning_rate": 9.119679180557568e-07, "loss": 0.3152, "step": 1822 }, { "epoch": 0.08808039812533218, "grad_norm": 2.854687213897705, "learning_rate": 9.119196018746678e-07, "loss": 0.2339, "step": 1823 }, { "epoch": 0.08812871430642122, "grad_norm": 38.51402282714844, "learning_rate": 9.118712856935788e-07, "loss": 0.2608, "step": 1824 }, { "epoch": 0.08817703048751027, "grad_norm": 2.214724063873291, "learning_rate": 9.118229695124898e-07, "loss": 0.3051, "step": 1825 }, { "epoch": 0.08822534666859931, "grad_norm": 2.1826250553131104, "learning_rate": 9.117746533314006e-07, "loss": 0.2456, "step": 1826 }, { "epoch": 0.08827366284968836, "grad_norm": 2.6645419597625732, "learning_rate": 9.117263371503115e-07, "loss": 0.2905, "step": 1827 }, { "epoch": 0.0883219790307774, "grad_norm": 3.722581386566162, "learning_rate": 9.116780209692225e-07, "loss": 0.3501, "step": 1828 }, { "epoch": 0.08837029521186646, "grad_norm": 10.988066673278809, "learning_rate": 9.116297047881335e-07, "loss": 0.3247, "step": 1829 }, { "epoch": 0.0884186113929555, "grad_norm": 2.9237759113311768, "learning_rate": 9.115813886070445e-07, "loss": 0.2054, "step": 1830 }, { "epoch": 0.08846692757404455, "grad_norm": 3.2879512310028076, "learning_rate": 9.115330724259555e-07, "loss": 0.4182, "step": 1831 }, { "epoch": 0.08851524375513359, "grad_norm": 2.931640863418579, "learning_rate": 9.114847562448663e-07, "loss": 0.3272, "step": 1832 }, { "epoch": 0.08856355993622264, "grad_norm": 2.296128273010254, "learning_rate": 9.114364400637773e-07, "loss": 0.1793, "step": 1833 }, { "epoch": 0.0886118761173117, "grad_norm": 3.2556188106536865, "learning_rate": 9.113881238826882e-07, "loss": 0.3061, "step": 1834 }, { "epoch": 0.08866019229840073, "grad_norm": 2.993260622024536, "learning_rate": 9.113398077015992e-07, "loss": 0.3661, "step": 1835 }, { "epoch": 0.08870850847948979, "grad_norm": 2.0203802585601807, "learning_rate": 9.112914915205102e-07, "loss": 0.2168, "step": 1836 }, { "epoch": 0.08875682466057883, "grad_norm": 3.676647901535034, "learning_rate": 9.112431753394211e-07, "loss": 0.1818, "step": 1837 }, { "epoch": 0.08880514084166788, "grad_norm": 2.8167452812194824, "learning_rate": 9.111948591583321e-07, "loss": 0.2881, "step": 1838 }, { "epoch": 0.08885345702275692, "grad_norm": 3.1638572216033936, "learning_rate": 9.11146542977243e-07, "loss": 0.3786, "step": 1839 }, { "epoch": 0.08890177320384597, "grad_norm": 2.2279672622680664, "learning_rate": 9.11098226796154e-07, "loss": 0.2003, "step": 1840 }, { "epoch": 0.08895008938493501, "grad_norm": 3.5285067558288574, "learning_rate": 9.11049910615065e-07, "loss": 0.3584, "step": 1841 }, { "epoch": 0.08899840556602406, "grad_norm": 8.544635772705078, "learning_rate": 9.11001594433976e-07, "loss": 0.2592, "step": 1842 }, { "epoch": 0.0890467217471131, "grad_norm": 2.1027779579162598, "learning_rate": 9.109532782528868e-07, "loss": 0.2461, "step": 1843 }, { "epoch": 0.08909503792820216, "grad_norm": 2.899881362915039, "learning_rate": 9.109049620717978e-07, "loss": 0.2555, "step": 1844 }, { "epoch": 0.0891433541092912, "grad_norm": 2.3687710762023926, "learning_rate": 9.108566458907087e-07, "loss": 0.2564, "step": 1845 }, { "epoch": 0.08919167029038025, "grad_norm": 4.897136211395264, "learning_rate": 9.108083297096197e-07, "loss": 0.3413, "step": 1846 }, { "epoch": 0.0892399864714693, "grad_norm": 2.03633713722229, "learning_rate": 9.107600135285307e-07, "loss": 0.2074, "step": 1847 }, { "epoch": 0.08928830265255834, "grad_norm": 2.735642910003662, "learning_rate": 9.107116973474416e-07, "loss": 0.2536, "step": 1848 }, { "epoch": 0.0893366188336474, "grad_norm": 4.193155288696289, "learning_rate": 9.106633811663526e-07, "loss": 0.4494, "step": 1849 }, { "epoch": 0.08938493501473643, "grad_norm": 3.082275390625, "learning_rate": 9.106150649852636e-07, "loss": 0.3688, "step": 1850 }, { "epoch": 0.08943325119582549, "grad_norm": 2.5095815658569336, "learning_rate": 9.105667488041746e-07, "loss": 0.2717, "step": 1851 }, { "epoch": 0.08948156737691453, "grad_norm": 2.8460917472839355, "learning_rate": 9.105184326230854e-07, "loss": 0.3615, "step": 1852 }, { "epoch": 0.08952988355800358, "grad_norm": 3.3404579162597656, "learning_rate": 9.104701164419963e-07, "loss": 0.2807, "step": 1853 }, { "epoch": 0.08957819973909262, "grad_norm": 3.846050262451172, "learning_rate": 9.104218002609073e-07, "loss": 0.3698, "step": 1854 }, { "epoch": 0.08962651592018167, "grad_norm": 2.431828737258911, "learning_rate": 9.103734840798183e-07, "loss": 0.1942, "step": 1855 }, { "epoch": 0.08967483210127071, "grad_norm": 8.792197227478027, "learning_rate": 9.103251678987293e-07, "loss": 0.3148, "step": 1856 }, { "epoch": 0.08972314828235976, "grad_norm": 2.5866951942443848, "learning_rate": 9.102768517176403e-07, "loss": 0.2835, "step": 1857 }, { "epoch": 0.0897714644634488, "grad_norm": 25.0540828704834, "learning_rate": 9.102285355365511e-07, "loss": 0.3439, "step": 1858 }, { "epoch": 0.08981978064453786, "grad_norm": 3.65584659576416, "learning_rate": 9.10180219355462e-07, "loss": 0.3131, "step": 1859 }, { "epoch": 0.08986809682562691, "grad_norm": 2.7831852436065674, "learning_rate": 9.10131903174373e-07, "loss": 0.3238, "step": 1860 }, { "epoch": 0.08991641300671595, "grad_norm": 2.704101324081421, "learning_rate": 9.10083586993284e-07, "loss": 0.2812, "step": 1861 }, { "epoch": 0.089964729187805, "grad_norm": 2.4264395236968994, "learning_rate": 9.10035270812195e-07, "loss": 0.2133, "step": 1862 }, { "epoch": 0.09001304536889404, "grad_norm": 28.869169235229492, "learning_rate": 9.099869546311059e-07, "loss": 0.3071, "step": 1863 }, { "epoch": 0.0900613615499831, "grad_norm": 2.3145501613616943, "learning_rate": 9.099386384500168e-07, "loss": 0.2245, "step": 1864 }, { "epoch": 0.09010967773107213, "grad_norm": 3.1707990169525146, "learning_rate": 9.098903222689278e-07, "loss": 0.432, "step": 1865 }, { "epoch": 0.09015799391216119, "grad_norm": 2.508213758468628, "learning_rate": 9.098420060878388e-07, "loss": 0.1978, "step": 1866 }, { "epoch": 0.09020631009325022, "grad_norm": 2.58559513092041, "learning_rate": 9.097936899067498e-07, "loss": 0.2923, "step": 1867 }, { "epoch": 0.09025462627433928, "grad_norm": 2.802870750427246, "learning_rate": 9.097453737256608e-07, "loss": 0.3021, "step": 1868 }, { "epoch": 0.09030294245542832, "grad_norm": 3.3854148387908936, "learning_rate": 9.096970575445716e-07, "loss": 0.3301, "step": 1869 }, { "epoch": 0.09035125863651737, "grad_norm": 3.0278191566467285, "learning_rate": 9.096487413634826e-07, "loss": 0.3416, "step": 1870 }, { "epoch": 0.09039957481760641, "grad_norm": 3.600583791732788, "learning_rate": 9.096004251823935e-07, "loss": 0.3728, "step": 1871 }, { "epoch": 0.09044789099869546, "grad_norm": 2.229626417160034, "learning_rate": 9.095521090013045e-07, "loss": 0.2653, "step": 1872 }, { "epoch": 0.09049620717978452, "grad_norm": 2.9406001567840576, "learning_rate": 9.095037928202155e-07, "loss": 0.3388, "step": 1873 }, { "epoch": 0.09054452336087356, "grad_norm": 2.489393711090088, "learning_rate": 9.094554766391264e-07, "loss": 0.2615, "step": 1874 }, { "epoch": 0.09059283954196261, "grad_norm": 1.7763772010803223, "learning_rate": 9.094071604580374e-07, "loss": 0.1989, "step": 1875 }, { "epoch": 0.09064115572305165, "grad_norm": 2.9239649772644043, "learning_rate": 9.093588442769484e-07, "loss": 0.4901, "step": 1876 }, { "epoch": 0.0906894719041407, "grad_norm": 2.5128297805786133, "learning_rate": 9.093105280958592e-07, "loss": 0.3794, "step": 1877 }, { "epoch": 0.09073778808522974, "grad_norm": 1.753893256187439, "learning_rate": 9.092622119147702e-07, "loss": 0.2098, "step": 1878 }, { "epoch": 0.09078610426631879, "grad_norm": 3.0974433422088623, "learning_rate": 9.092138957336811e-07, "loss": 0.356, "step": 1879 }, { "epoch": 0.09083442044740783, "grad_norm": 2.1207826137542725, "learning_rate": 9.091655795525921e-07, "loss": 0.2038, "step": 1880 }, { "epoch": 0.09088273662849689, "grad_norm": 3.149494171142578, "learning_rate": 9.091172633715031e-07, "loss": 0.3884, "step": 1881 }, { "epoch": 0.09093105280958592, "grad_norm": 6.640120029449463, "learning_rate": 9.090689471904141e-07, "loss": 0.295, "step": 1882 }, { "epoch": 0.09097936899067498, "grad_norm": 3.9858436584472656, "learning_rate": 9.090206310093251e-07, "loss": 0.4015, "step": 1883 }, { "epoch": 0.09102768517176403, "grad_norm": 3.3891727924346924, "learning_rate": 9.089723148282359e-07, "loss": 0.3176, "step": 1884 }, { "epoch": 0.09107600135285307, "grad_norm": 5.041940689086914, "learning_rate": 9.089239986471468e-07, "loss": 0.3431, "step": 1885 }, { "epoch": 0.09112431753394212, "grad_norm": 2.7829065322875977, "learning_rate": 9.088756824660578e-07, "loss": 0.3088, "step": 1886 }, { "epoch": 0.09117263371503116, "grad_norm": 2.2854976654052734, "learning_rate": 9.088273662849688e-07, "loss": 0.2638, "step": 1887 }, { "epoch": 0.09122094989612022, "grad_norm": 3.7235355377197266, "learning_rate": 9.087790501038798e-07, "loss": 0.4142, "step": 1888 }, { "epoch": 0.09126926607720925, "grad_norm": 7.248450756072998, "learning_rate": 9.087307339227907e-07, "loss": 0.2195, "step": 1889 }, { "epoch": 0.09131758225829831, "grad_norm": 1.9170722961425781, "learning_rate": 9.086824177417016e-07, "loss": 0.2282, "step": 1890 }, { "epoch": 0.09136589843938735, "grad_norm": 8.998387336730957, "learning_rate": 9.086341015606126e-07, "loss": 0.3894, "step": 1891 }, { "epoch": 0.0914142146204764, "grad_norm": 2.7258646488189697, "learning_rate": 9.085857853795236e-07, "loss": 0.4363, "step": 1892 }, { "epoch": 0.09146253080156544, "grad_norm": 3.0673835277557373, "learning_rate": 9.085374691984346e-07, "loss": 0.3302, "step": 1893 }, { "epoch": 0.09151084698265449, "grad_norm": 2.5982143878936768, "learning_rate": 9.084891530173455e-07, "loss": 0.2583, "step": 1894 }, { "epoch": 0.09155916316374353, "grad_norm": 9.527552604675293, "learning_rate": 9.084408368362564e-07, "loss": 0.3691, "step": 1895 }, { "epoch": 0.09160747934483258, "grad_norm": 2.74983549118042, "learning_rate": 9.083925206551673e-07, "loss": 0.3955, "step": 1896 }, { "epoch": 0.09165579552592164, "grad_norm": 3.26846981048584, "learning_rate": 9.083442044740783e-07, "loss": 0.3578, "step": 1897 }, { "epoch": 0.09170411170701068, "grad_norm": 2.7377514839172363, "learning_rate": 9.082958882929893e-07, "loss": 0.3862, "step": 1898 }, { "epoch": 0.09175242788809973, "grad_norm": 3.8782472610473633, "learning_rate": 9.082475721119003e-07, "loss": 0.3996, "step": 1899 }, { "epoch": 0.09180074406918877, "grad_norm": 5.638594627380371, "learning_rate": 9.081992559308112e-07, "loss": 0.3053, "step": 1900 }, { "epoch": 0.09184906025027782, "grad_norm": 2.0229389667510986, "learning_rate": 9.081509397497222e-07, "loss": 0.185, "step": 1901 }, { "epoch": 0.09189737643136686, "grad_norm": 3.503488779067993, "learning_rate": 9.081026235686331e-07, "loss": 0.321, "step": 1902 }, { "epoch": 0.09194569261245591, "grad_norm": 2.365880012512207, "learning_rate": 9.08054307387544e-07, "loss": 0.2082, "step": 1903 }, { "epoch": 0.09199400879354495, "grad_norm": 2.509592056274414, "learning_rate": 9.08005991206455e-07, "loss": 0.3, "step": 1904 }, { "epoch": 0.09204232497463401, "grad_norm": 3.2274720668792725, "learning_rate": 9.079576750253659e-07, "loss": 0.4829, "step": 1905 }, { "epoch": 0.09209064115572305, "grad_norm": 2.9735405445098877, "learning_rate": 9.079093588442769e-07, "loss": 0.3631, "step": 1906 }, { "epoch": 0.0921389573368121, "grad_norm": 2.791328191757202, "learning_rate": 9.078610426631879e-07, "loss": 0.3299, "step": 1907 }, { "epoch": 0.09218727351790114, "grad_norm": 2.9015884399414062, "learning_rate": 9.078127264820989e-07, "loss": 0.2786, "step": 1908 }, { "epoch": 0.09223558969899019, "grad_norm": 3.3424746990203857, "learning_rate": 9.077644103010098e-07, "loss": 0.3181, "step": 1909 }, { "epoch": 0.09228390588007924, "grad_norm": 2.833024501800537, "learning_rate": 9.077160941199206e-07, "loss": 0.2571, "step": 1910 }, { "epoch": 0.09233222206116828, "grad_norm": 3.0467660427093506, "learning_rate": 9.076677779388316e-07, "loss": 0.284, "step": 1911 }, { "epoch": 0.09238053824225734, "grad_norm": 2.986248016357422, "learning_rate": 9.076194617577426e-07, "loss": 0.4771, "step": 1912 }, { "epoch": 0.09242885442334638, "grad_norm": 2.4087705612182617, "learning_rate": 9.075711455766536e-07, "loss": 0.3054, "step": 1913 }, { "epoch": 0.09247717060443543, "grad_norm": 2.7093870639801025, "learning_rate": 9.075228293955646e-07, "loss": 0.351, "step": 1914 }, { "epoch": 0.09252548678552447, "grad_norm": 2.147782325744629, "learning_rate": 9.074745132144754e-07, "loss": 0.2862, "step": 1915 }, { "epoch": 0.09257380296661352, "grad_norm": 3.7367780208587646, "learning_rate": 9.074261970333864e-07, "loss": 0.4292, "step": 1916 }, { "epoch": 0.09262211914770256, "grad_norm": 2.227604627609253, "learning_rate": 9.073778808522974e-07, "loss": 0.2771, "step": 1917 }, { "epoch": 0.09267043532879161, "grad_norm": 3.5902678966522217, "learning_rate": 9.073295646712084e-07, "loss": 0.2923, "step": 1918 }, { "epoch": 0.09271875150988065, "grad_norm": 2.630686044692993, "learning_rate": 9.072812484901193e-07, "loss": 0.2622, "step": 1919 }, { "epoch": 0.0927670676909697, "grad_norm": 2.0411481857299805, "learning_rate": 9.072329323090303e-07, "loss": 0.2681, "step": 1920 }, { "epoch": 0.09281538387205875, "grad_norm": 3.8732354640960693, "learning_rate": 9.071846161279412e-07, "loss": 0.37, "step": 1921 }, { "epoch": 0.0928637000531478, "grad_norm": 8.107843399047852, "learning_rate": 9.071362999468521e-07, "loss": 0.3446, "step": 1922 }, { "epoch": 0.09291201623423685, "grad_norm": 2.8971595764160156, "learning_rate": 9.070879837657631e-07, "loss": 0.438, "step": 1923 }, { "epoch": 0.09296033241532589, "grad_norm": 4.337497711181641, "learning_rate": 9.070396675846741e-07, "loss": 0.3269, "step": 1924 }, { "epoch": 0.09300864859641494, "grad_norm": 2.3773884773254395, "learning_rate": 9.069913514035851e-07, "loss": 0.2239, "step": 1925 }, { "epoch": 0.09305696477750398, "grad_norm": 3.1790144443511963, "learning_rate": 9.06943035222496e-07, "loss": 0.4163, "step": 1926 }, { "epoch": 0.09310528095859304, "grad_norm": 3.395268440246582, "learning_rate": 9.06894719041407e-07, "loss": 0.3615, "step": 1927 }, { "epoch": 0.09315359713968208, "grad_norm": 3.678520679473877, "learning_rate": 9.068464028603178e-07, "loss": 0.3017, "step": 1928 }, { "epoch": 0.09320191332077113, "grad_norm": 2.704176902770996, "learning_rate": 9.067980866792288e-07, "loss": 0.2903, "step": 1929 }, { "epoch": 0.09325022950186017, "grad_norm": 2.935734748840332, "learning_rate": 9.067497704981398e-07, "loss": 0.4234, "step": 1930 }, { "epoch": 0.09329854568294922, "grad_norm": 2.284217119216919, "learning_rate": 9.067014543170507e-07, "loss": 0.2485, "step": 1931 }, { "epoch": 0.09334686186403826, "grad_norm": 4.412893772125244, "learning_rate": 9.066531381359617e-07, "loss": 0.4866, "step": 1932 }, { "epoch": 0.09339517804512731, "grad_norm": 4.359405517578125, "learning_rate": 9.066048219548727e-07, "loss": 0.4279, "step": 1933 }, { "epoch": 0.09344349422621635, "grad_norm": 3.5768985748291016, "learning_rate": 9.065565057737837e-07, "loss": 0.4002, "step": 1934 }, { "epoch": 0.0934918104073054, "grad_norm": 2.493072271347046, "learning_rate": 9.065081895926946e-07, "loss": 0.3355, "step": 1935 }, { "epoch": 0.09354012658839446, "grad_norm": 2.045365571975708, "learning_rate": 9.064598734116054e-07, "loss": 0.2418, "step": 1936 }, { "epoch": 0.0935884427694835, "grad_norm": 3.936802387237549, "learning_rate": 9.064115572305164e-07, "loss": 0.3694, "step": 1937 }, { "epoch": 0.09363675895057255, "grad_norm": 2.647029399871826, "learning_rate": 9.063632410494274e-07, "loss": 0.2961, "step": 1938 }, { "epoch": 0.09368507513166159, "grad_norm": 3.0275700092315674, "learning_rate": 9.063149248683384e-07, "loss": 0.3474, "step": 1939 }, { "epoch": 0.09373339131275064, "grad_norm": 1.711325764656067, "learning_rate": 9.062666086872494e-07, "loss": 0.1157, "step": 1940 }, { "epoch": 0.09378170749383968, "grad_norm": 2.951195240020752, "learning_rate": 9.062182925061602e-07, "loss": 0.3524, "step": 1941 }, { "epoch": 0.09383002367492874, "grad_norm": 9.792557716369629, "learning_rate": 9.061699763250712e-07, "loss": 0.3901, "step": 1942 }, { "epoch": 0.09387833985601778, "grad_norm": 3.6861727237701416, "learning_rate": 9.061216601439822e-07, "loss": 0.4045, "step": 1943 }, { "epoch": 0.09392665603710683, "grad_norm": 4.829576015472412, "learning_rate": 9.060733439628931e-07, "loss": 0.2932, "step": 1944 }, { "epoch": 0.09397497221819587, "grad_norm": 8.482109069824219, "learning_rate": 9.060250277818041e-07, "loss": 0.464, "step": 1945 }, { "epoch": 0.09402328839928492, "grad_norm": 2.369436264038086, "learning_rate": 9.059767116007151e-07, "loss": 0.3217, "step": 1946 }, { "epoch": 0.09407160458037396, "grad_norm": 2.2532029151916504, "learning_rate": 9.059283954196259e-07, "loss": 0.2168, "step": 1947 }, { "epoch": 0.09411992076146301, "grad_norm": 3.225268840789795, "learning_rate": 9.058800792385369e-07, "loss": 0.317, "step": 1948 }, { "epoch": 0.09416823694255207, "grad_norm": 1.7621676921844482, "learning_rate": 9.058317630574479e-07, "loss": 0.2224, "step": 1949 }, { "epoch": 0.0942165531236411, "grad_norm": 3.3819422721862793, "learning_rate": 9.057834468763589e-07, "loss": 0.4683, "step": 1950 }, { "epoch": 0.09426486930473016, "grad_norm": 2.144813060760498, "learning_rate": 9.057351306952699e-07, "loss": 0.2463, "step": 1951 }, { "epoch": 0.0943131854858192, "grad_norm": 4.454516410827637, "learning_rate": 9.056868145141808e-07, "loss": 0.2172, "step": 1952 }, { "epoch": 0.09436150166690825, "grad_norm": 2.711052417755127, "learning_rate": 9.056384983330917e-07, "loss": 0.2269, "step": 1953 }, { "epoch": 0.09440981784799729, "grad_norm": 2.57694149017334, "learning_rate": 9.055901821520026e-07, "loss": 0.2953, "step": 1954 }, { "epoch": 0.09445813402908634, "grad_norm": 2.1826493740081787, "learning_rate": 9.055418659709136e-07, "loss": 0.2604, "step": 1955 }, { "epoch": 0.09450645021017538, "grad_norm": 9.678312301635742, "learning_rate": 9.054935497898246e-07, "loss": 0.3579, "step": 1956 }, { "epoch": 0.09455476639126444, "grad_norm": 2.6240248680114746, "learning_rate": 9.054452336087355e-07, "loss": 0.2819, "step": 1957 }, { "epoch": 0.09460308257235348, "grad_norm": 1.4599888324737549, "learning_rate": 9.053969174276465e-07, "loss": 0.1622, "step": 1958 }, { "epoch": 0.09465139875344253, "grad_norm": 3.380450487136841, "learning_rate": 9.053486012465575e-07, "loss": 0.3183, "step": 1959 }, { "epoch": 0.09469971493453158, "grad_norm": 5.732748508453369, "learning_rate": 9.053002850654684e-07, "loss": 0.2916, "step": 1960 }, { "epoch": 0.09474803111562062, "grad_norm": 2.5662693977355957, "learning_rate": 9.052519688843793e-07, "loss": 0.2935, "step": 1961 }, { "epoch": 0.09479634729670967, "grad_norm": 3.184957981109619, "learning_rate": 9.052036527032902e-07, "loss": 0.3283, "step": 1962 }, { "epoch": 0.09484466347779871, "grad_norm": 2.441284418106079, "learning_rate": 9.051553365222012e-07, "loss": 0.2757, "step": 1963 }, { "epoch": 0.09489297965888777, "grad_norm": 3.1391420364379883, "learning_rate": 9.051070203411122e-07, "loss": 0.4135, "step": 1964 }, { "epoch": 0.0949412958399768, "grad_norm": 2.7465178966522217, "learning_rate": 9.050587041600232e-07, "loss": 0.4033, "step": 1965 }, { "epoch": 0.09498961202106586, "grad_norm": 3.225119113922119, "learning_rate": 9.050103879789342e-07, "loss": 0.4653, "step": 1966 }, { "epoch": 0.0950379282021549, "grad_norm": 3.362900733947754, "learning_rate": 9.04962071797845e-07, "loss": 0.3331, "step": 1967 }, { "epoch": 0.09508624438324395, "grad_norm": 3.540773391723633, "learning_rate": 9.04913755616756e-07, "loss": 0.3777, "step": 1968 }, { "epoch": 0.09513456056433299, "grad_norm": 2.516918182373047, "learning_rate": 9.04865439435667e-07, "loss": 0.3105, "step": 1969 }, { "epoch": 0.09518287674542204, "grad_norm": 3.2660598754882812, "learning_rate": 9.048171232545779e-07, "loss": 0.3635, "step": 1970 }, { "epoch": 0.09523119292651108, "grad_norm": 3.8953850269317627, "learning_rate": 9.047688070734889e-07, "loss": 0.5042, "step": 1971 }, { "epoch": 0.09527950910760014, "grad_norm": 2.953716993331909, "learning_rate": 9.047204908923999e-07, "loss": 0.3499, "step": 1972 }, { "epoch": 0.09532782528868919, "grad_norm": 3.1337263584136963, "learning_rate": 9.046721747113107e-07, "loss": 0.2964, "step": 1973 }, { "epoch": 0.09537614146977823, "grad_norm": 2.7631592750549316, "learning_rate": 9.046238585302217e-07, "loss": 0.3157, "step": 1974 }, { "epoch": 0.09542445765086728, "grad_norm": 2.932013511657715, "learning_rate": 9.045755423491327e-07, "loss": 0.2823, "step": 1975 }, { "epoch": 0.09547277383195632, "grad_norm": 3.0965754985809326, "learning_rate": 9.045272261680437e-07, "loss": 0.3976, "step": 1976 }, { "epoch": 0.09552109001304537, "grad_norm": 2.988297939300537, "learning_rate": 9.044789099869547e-07, "loss": 0.3844, "step": 1977 }, { "epoch": 0.09556940619413441, "grad_norm": 4.894415855407715, "learning_rate": 9.044305938058655e-07, "loss": 0.353, "step": 1978 }, { "epoch": 0.09561772237522347, "grad_norm": 3.740894079208374, "learning_rate": 9.043822776247764e-07, "loss": 0.2254, "step": 1979 }, { "epoch": 0.0956660385563125, "grad_norm": 2.7213802337646484, "learning_rate": 9.043339614436874e-07, "loss": 0.3711, "step": 1980 }, { "epoch": 0.09571435473740156, "grad_norm": 2.8813834190368652, "learning_rate": 9.042856452625984e-07, "loss": 0.3771, "step": 1981 }, { "epoch": 0.0957626709184906, "grad_norm": 4.985052108764648, "learning_rate": 9.042373290815094e-07, "loss": 0.3125, "step": 1982 }, { "epoch": 0.09581098709957965, "grad_norm": 2.576004981994629, "learning_rate": 9.041890129004203e-07, "loss": 0.3051, "step": 1983 }, { "epoch": 0.09585930328066869, "grad_norm": 1.9608681201934814, "learning_rate": 9.041406967193313e-07, "loss": 0.2207, "step": 1984 }, { "epoch": 0.09590761946175774, "grad_norm": 2.0583488941192627, "learning_rate": 9.040923805382423e-07, "loss": 0.2162, "step": 1985 }, { "epoch": 0.0959559356428468, "grad_norm": 2.7539844512939453, "learning_rate": 9.040440643571531e-07, "loss": 0.3256, "step": 1986 }, { "epoch": 0.09600425182393584, "grad_norm": 1.9220808744430542, "learning_rate": 9.039957481760641e-07, "loss": 0.1907, "step": 1987 }, { "epoch": 0.09605256800502489, "grad_norm": 2.0269486904144287, "learning_rate": 9.03947431994975e-07, "loss": 0.2462, "step": 1988 }, { "epoch": 0.09610088418611393, "grad_norm": 2.5805740356445312, "learning_rate": 9.03899115813886e-07, "loss": 0.2726, "step": 1989 }, { "epoch": 0.09614920036720298, "grad_norm": 3.1254665851593018, "learning_rate": 9.03850799632797e-07, "loss": 0.3694, "step": 1990 }, { "epoch": 0.09619751654829202, "grad_norm": 2.6303203105926514, "learning_rate": 9.03802483451708e-07, "loss": 0.3219, "step": 1991 }, { "epoch": 0.09624583272938107, "grad_norm": 2.874506950378418, "learning_rate": 9.037541672706189e-07, "loss": 0.2526, "step": 1992 }, { "epoch": 0.09629414891047011, "grad_norm": 3.139371156692505, "learning_rate": 9.037058510895298e-07, "loss": 0.3948, "step": 1993 }, { "epoch": 0.09634246509155917, "grad_norm": 5.809071063995361, "learning_rate": 9.036575349084408e-07, "loss": 0.2669, "step": 1994 }, { "epoch": 0.0963907812726482, "grad_norm": 2.4990084171295166, "learning_rate": 9.036092187273517e-07, "loss": 0.2248, "step": 1995 }, { "epoch": 0.09643909745373726, "grad_norm": 4.986257553100586, "learning_rate": 9.035609025462627e-07, "loss": 0.2445, "step": 1996 }, { "epoch": 0.0964874136348263, "grad_norm": 2.1681599617004395, "learning_rate": 9.035125863651737e-07, "loss": 0.182, "step": 1997 }, { "epoch": 0.09653572981591535, "grad_norm": 2.7740774154663086, "learning_rate": 9.034642701840845e-07, "loss": 0.3537, "step": 1998 }, { "epoch": 0.0965840459970044, "grad_norm": 3.0771613121032715, "learning_rate": 9.034159540029955e-07, "loss": 0.343, "step": 1999 }, { "epoch": 0.09663236217809344, "grad_norm": 4.286243438720703, "learning_rate": 9.033676378219065e-07, "loss": 0.403, "step": 2000 }, { "epoch": 0.0966806783591825, "grad_norm": 3.115795850753784, "learning_rate": 9.033193216408175e-07, "loss": 0.3269, "step": 2001 }, { "epoch": 0.09672899454027153, "grad_norm": 3.4361958503723145, "learning_rate": 9.032710054597285e-07, "loss": 0.3538, "step": 2002 }, { "epoch": 0.09677731072136059, "grad_norm": 2.1335289478302, "learning_rate": 9.032226892786395e-07, "loss": 0.2403, "step": 2003 }, { "epoch": 0.09682562690244963, "grad_norm": 2.305490016937256, "learning_rate": 9.031743730975503e-07, "loss": 0.2577, "step": 2004 }, { "epoch": 0.09687394308353868, "grad_norm": 3.036410331726074, "learning_rate": 9.031260569164612e-07, "loss": 0.281, "step": 2005 }, { "epoch": 0.09692225926462772, "grad_norm": 1.7593985795974731, "learning_rate": 9.030777407353722e-07, "loss": 0.2175, "step": 2006 }, { "epoch": 0.09697057544571677, "grad_norm": 4.5254058837890625, "learning_rate": 9.030294245542832e-07, "loss": 0.382, "step": 2007 }, { "epoch": 0.09701889162680581, "grad_norm": 5.374584674835205, "learning_rate": 9.029811083731942e-07, "loss": 0.2025, "step": 2008 }, { "epoch": 0.09706720780789486, "grad_norm": 3.9597103595733643, "learning_rate": 9.029327921921051e-07, "loss": 0.1483, "step": 2009 }, { "epoch": 0.0971155239889839, "grad_norm": 2.3143341541290283, "learning_rate": 9.028844760110161e-07, "loss": 0.2648, "step": 2010 }, { "epoch": 0.09716384017007296, "grad_norm": 2.0817794799804688, "learning_rate": 9.02836159829927e-07, "loss": 0.2271, "step": 2011 }, { "epoch": 0.09721215635116201, "grad_norm": 3.523977041244507, "learning_rate": 9.027878436488379e-07, "loss": 0.2672, "step": 2012 }, { "epoch": 0.09726047253225105, "grad_norm": 3.049550771713257, "learning_rate": 9.027395274677489e-07, "loss": 0.2435, "step": 2013 }, { "epoch": 0.0973087887133401, "grad_norm": 2.6956160068511963, "learning_rate": 9.026912112866598e-07, "loss": 0.3656, "step": 2014 }, { "epoch": 0.09735710489442914, "grad_norm": 2.9974822998046875, "learning_rate": 9.026428951055708e-07, "loss": 0.2279, "step": 2015 }, { "epoch": 0.0974054210755182, "grad_norm": 3.636842727661133, "learning_rate": 9.025945789244818e-07, "loss": 0.2435, "step": 2016 }, { "epoch": 0.09745373725660723, "grad_norm": 3.035362482070923, "learning_rate": 9.025462627433928e-07, "loss": 0.2671, "step": 2017 }, { "epoch": 0.09750205343769629, "grad_norm": 3.1618778705596924, "learning_rate": 9.024979465623037e-07, "loss": 0.4003, "step": 2018 }, { "epoch": 0.09755036961878533, "grad_norm": 3.657309055328369, "learning_rate": 9.024496303812146e-07, "loss": 0.3784, "step": 2019 }, { "epoch": 0.09759868579987438, "grad_norm": 2.836439609527588, "learning_rate": 9.024013142001255e-07, "loss": 0.3953, "step": 2020 }, { "epoch": 0.09764700198096342, "grad_norm": 7.020338535308838, "learning_rate": 9.023529980190365e-07, "loss": 0.4658, "step": 2021 }, { "epoch": 0.09769531816205247, "grad_norm": 2.398254156112671, "learning_rate": 9.023046818379475e-07, "loss": 0.3481, "step": 2022 }, { "epoch": 0.09774363434314152, "grad_norm": 3.6928415298461914, "learning_rate": 9.022563656568585e-07, "loss": 0.348, "step": 2023 }, { "epoch": 0.09779195052423056, "grad_norm": 3.7494304180145264, "learning_rate": 9.022080494757693e-07, "loss": 0.4015, "step": 2024 }, { "epoch": 0.09784026670531962, "grad_norm": 1.7191082239151, "learning_rate": 9.021597332946803e-07, "loss": 0.1958, "step": 2025 }, { "epoch": 0.09788858288640866, "grad_norm": 2.877939462661743, "learning_rate": 9.021114171135913e-07, "loss": 0.249, "step": 2026 }, { "epoch": 0.09793689906749771, "grad_norm": 2.7167768478393555, "learning_rate": 9.020631009325023e-07, "loss": 0.3092, "step": 2027 }, { "epoch": 0.09798521524858675, "grad_norm": 2.594468593597412, "learning_rate": 9.020147847514133e-07, "loss": 0.2803, "step": 2028 }, { "epoch": 0.0980335314296758, "grad_norm": 2.3701229095458984, "learning_rate": 9.019664685703242e-07, "loss": 0.3314, "step": 2029 }, { "epoch": 0.09808184761076484, "grad_norm": 1.4610453844070435, "learning_rate": 9.01918152389235e-07, "loss": 0.1817, "step": 2030 }, { "epoch": 0.0981301637918539, "grad_norm": 3.9509103298187256, "learning_rate": 9.01869836208146e-07, "loss": 0.4554, "step": 2031 }, { "epoch": 0.09817847997294293, "grad_norm": 4.05551815032959, "learning_rate": 9.01821520027057e-07, "loss": 0.3202, "step": 2032 }, { "epoch": 0.09822679615403199, "grad_norm": 3.1688578128814697, "learning_rate": 9.01773203845968e-07, "loss": 0.224, "step": 2033 }, { "epoch": 0.09827511233512103, "grad_norm": 3.3861045837402344, "learning_rate": 9.01724887664879e-07, "loss": 0.3362, "step": 2034 }, { "epoch": 0.09832342851621008, "grad_norm": 1.7617874145507812, "learning_rate": 9.016765714837899e-07, "loss": 0.219, "step": 2035 }, { "epoch": 0.09837174469729913, "grad_norm": 15.998021125793457, "learning_rate": 9.016282553027009e-07, "loss": 0.2588, "step": 2036 }, { "epoch": 0.09842006087838817, "grad_norm": 2.4427719116210938, "learning_rate": 9.015799391216117e-07, "loss": 0.2715, "step": 2037 }, { "epoch": 0.09846837705947722, "grad_norm": 2.6961467266082764, "learning_rate": 9.015316229405227e-07, "loss": 0.3891, "step": 2038 }, { "epoch": 0.09851669324056626, "grad_norm": 2.821957588195801, "learning_rate": 9.014833067594337e-07, "loss": 0.3657, "step": 2039 }, { "epoch": 0.09856500942165532, "grad_norm": 2.3229353427886963, "learning_rate": 9.014349905783446e-07, "loss": 0.2655, "step": 2040 }, { "epoch": 0.09861332560274436, "grad_norm": 2.76574444770813, "learning_rate": 9.013866743972556e-07, "loss": 0.3848, "step": 2041 }, { "epoch": 0.09866164178383341, "grad_norm": 1.5542869567871094, "learning_rate": 9.013383582161666e-07, "loss": 0.1626, "step": 2042 }, { "epoch": 0.09870995796492245, "grad_norm": 3.8704535961151123, "learning_rate": 9.012900420350775e-07, "loss": 0.2881, "step": 2043 }, { "epoch": 0.0987582741460115, "grad_norm": 2.9349143505096436, "learning_rate": 9.012417258539885e-07, "loss": 0.3618, "step": 2044 }, { "epoch": 0.09880659032710054, "grad_norm": 3.506648063659668, "learning_rate": 9.011934096728993e-07, "loss": 0.2696, "step": 2045 }, { "epoch": 0.0988549065081896, "grad_norm": 47.769248962402344, "learning_rate": 9.011450934918103e-07, "loss": 0.2361, "step": 2046 }, { "epoch": 0.09890322268927863, "grad_norm": 2.4636263847351074, "learning_rate": 9.010967773107213e-07, "loss": 0.2773, "step": 2047 }, { "epoch": 0.09895153887036769, "grad_norm": 2.1053647994995117, "learning_rate": 9.010484611296323e-07, "loss": 0.225, "step": 2048 }, { "epoch": 0.09899985505145674, "grad_norm": 3.5468027591705322, "learning_rate": 9.010001449485433e-07, "loss": 0.4283, "step": 2049 }, { "epoch": 0.09904817123254578, "grad_norm": 4.186867713928223, "learning_rate": 9.009518287674541e-07, "loss": 0.3263, "step": 2050 }, { "epoch": 0.09909648741363483, "grad_norm": 2.314023017883301, "learning_rate": 9.009035125863651e-07, "loss": 0.2273, "step": 2051 }, { "epoch": 0.09914480359472387, "grad_norm": 4.846542835235596, "learning_rate": 9.008551964052761e-07, "loss": 0.4781, "step": 2052 }, { "epoch": 0.09919311977581292, "grad_norm": 3.059718608856201, "learning_rate": 9.008068802241871e-07, "loss": 0.1996, "step": 2053 }, { "epoch": 0.09924143595690196, "grad_norm": 2.945788860321045, "learning_rate": 9.00758564043098e-07, "loss": 0.357, "step": 2054 }, { "epoch": 0.09928975213799102, "grad_norm": 2.3742735385894775, "learning_rate": 9.00710247862009e-07, "loss": 0.3308, "step": 2055 }, { "epoch": 0.09933806831908006, "grad_norm": 2.8924546241760254, "learning_rate": 9.006619316809198e-07, "loss": 0.2949, "step": 2056 }, { "epoch": 0.09938638450016911, "grad_norm": 23.797285079956055, "learning_rate": 9.006136154998308e-07, "loss": 0.2691, "step": 2057 }, { "epoch": 0.09943470068125815, "grad_norm": 2.9178848266601562, "learning_rate": 9.005652993187418e-07, "loss": 0.3405, "step": 2058 }, { "epoch": 0.0994830168623472, "grad_norm": 5.819261074066162, "learning_rate": 9.005169831376528e-07, "loss": 0.2809, "step": 2059 }, { "epoch": 0.09953133304343624, "grad_norm": 5.672137260437012, "learning_rate": 9.004686669565638e-07, "loss": 0.2841, "step": 2060 }, { "epoch": 0.0995796492245253, "grad_norm": 2.2103354930877686, "learning_rate": 9.004203507754747e-07, "loss": 0.2565, "step": 2061 }, { "epoch": 0.09962796540561435, "grad_norm": 3.9584550857543945, "learning_rate": 9.003720345943855e-07, "loss": 0.2945, "step": 2062 }, { "epoch": 0.09967628158670339, "grad_norm": 5.58083438873291, "learning_rate": 9.003237184132965e-07, "loss": 0.3582, "step": 2063 }, { "epoch": 0.09972459776779244, "grad_norm": 2.432062864303589, "learning_rate": 9.002754022322075e-07, "loss": 0.2275, "step": 2064 }, { "epoch": 0.09977291394888148, "grad_norm": 2.615751266479492, "learning_rate": 9.002270860511185e-07, "loss": 0.2146, "step": 2065 }, { "epoch": 0.09982123012997053, "grad_norm": 3.7347018718719482, "learning_rate": 9.001787698700294e-07, "loss": 0.3912, "step": 2066 }, { "epoch": 0.09986954631105957, "grad_norm": 2.965787887573242, "learning_rate": 9.001304536889404e-07, "loss": 0.3393, "step": 2067 }, { "epoch": 0.09991786249214862, "grad_norm": 2.6138765811920166, "learning_rate": 9.000821375078514e-07, "loss": 0.2744, "step": 2068 }, { "epoch": 0.09996617867323766, "grad_norm": 2.460181951522827, "learning_rate": 9.000338213267623e-07, "loss": 0.2218, "step": 2069 }, { "epoch": 0.10001449485432672, "grad_norm": 2.274941921234131, "learning_rate": 8.999855051456733e-07, "loss": 0.285, "step": 2070 }, { "epoch": 0.10006281103541576, "grad_norm": 13.749554634094238, "learning_rate": 8.999371889645841e-07, "loss": 0.3112, "step": 2071 }, { "epoch": 0.10011112721650481, "grad_norm": 2.2594714164733887, "learning_rate": 8.998888727834951e-07, "loss": 0.2746, "step": 2072 }, { "epoch": 0.10015944339759385, "grad_norm": 2.8994250297546387, "learning_rate": 8.998405566024061e-07, "loss": 0.2941, "step": 2073 }, { "epoch": 0.1002077595786829, "grad_norm": 2.5757486820220947, "learning_rate": 8.997922404213171e-07, "loss": 0.294, "step": 2074 }, { "epoch": 0.10025607575977195, "grad_norm": 3.6754724979400635, "learning_rate": 8.99743924240228e-07, "loss": 0.4096, "step": 2075 }, { "epoch": 0.10030439194086099, "grad_norm": 1.9425725936889648, "learning_rate": 8.996956080591389e-07, "loss": 0.2511, "step": 2076 }, { "epoch": 0.10035270812195005, "grad_norm": 1.8415666818618774, "learning_rate": 8.996472918780499e-07, "loss": 0.1911, "step": 2077 }, { "epoch": 0.10040102430303909, "grad_norm": 13.019009590148926, "learning_rate": 8.995989756969609e-07, "loss": 0.4098, "step": 2078 }, { "epoch": 0.10044934048412814, "grad_norm": 4.828608512878418, "learning_rate": 8.995506595158719e-07, "loss": 0.2445, "step": 2079 }, { "epoch": 0.10049765666521718, "grad_norm": 8.2654447555542, "learning_rate": 8.995023433347828e-07, "loss": 0.3565, "step": 2080 }, { "epoch": 0.10054597284630623, "grad_norm": 3.0050244331359863, "learning_rate": 8.994540271536938e-07, "loss": 0.3875, "step": 2081 }, { "epoch": 0.10059428902739527, "grad_norm": 3.0656678676605225, "learning_rate": 8.994057109726046e-07, "loss": 0.3006, "step": 2082 }, { "epoch": 0.10064260520848432, "grad_norm": 2.7072317600250244, "learning_rate": 8.993573947915156e-07, "loss": 0.39, "step": 2083 }, { "epoch": 0.10069092138957336, "grad_norm": 3.183384656906128, "learning_rate": 8.993090786104266e-07, "loss": 0.4449, "step": 2084 }, { "epoch": 0.10073923757066242, "grad_norm": 2.4090425968170166, "learning_rate": 8.992607624293376e-07, "loss": 0.2836, "step": 2085 }, { "epoch": 0.10078755375175145, "grad_norm": 4.497722148895264, "learning_rate": 8.992124462482486e-07, "loss": 0.3103, "step": 2086 }, { "epoch": 0.10083586993284051, "grad_norm": 3.4344265460968018, "learning_rate": 8.991641300671595e-07, "loss": 0.422, "step": 2087 }, { "epoch": 0.10088418611392956, "grad_norm": 2.8087987899780273, "learning_rate": 8.991158138860703e-07, "loss": 0.3565, "step": 2088 }, { "epoch": 0.1009325022950186, "grad_norm": 7.743268966674805, "learning_rate": 8.990674977049813e-07, "loss": 0.2185, "step": 2089 }, { "epoch": 0.10098081847610765, "grad_norm": 2.817513942718506, "learning_rate": 8.990191815238923e-07, "loss": 0.4686, "step": 2090 }, { "epoch": 0.10102913465719669, "grad_norm": 2.2689499855041504, "learning_rate": 8.989708653428033e-07, "loss": 0.2698, "step": 2091 }, { "epoch": 0.10107745083828575, "grad_norm": 2.5973947048187256, "learning_rate": 8.989225491617142e-07, "loss": 0.2778, "step": 2092 }, { "epoch": 0.10112576701937478, "grad_norm": 2.9430694580078125, "learning_rate": 8.988742329806252e-07, "loss": 0.3161, "step": 2093 }, { "epoch": 0.10117408320046384, "grad_norm": 2.657142400741577, "learning_rate": 8.988259167995361e-07, "loss": 0.4052, "step": 2094 }, { "epoch": 0.10122239938155288, "grad_norm": 2.4832763671875, "learning_rate": 8.987776006184471e-07, "loss": 0.2624, "step": 2095 }, { "epoch": 0.10127071556264193, "grad_norm": 6.537056922912598, "learning_rate": 8.98729284437358e-07, "loss": 0.3358, "step": 2096 }, { "epoch": 0.10131903174373097, "grad_norm": 2.8760573863983154, "learning_rate": 8.986809682562689e-07, "loss": 0.2823, "step": 2097 }, { "epoch": 0.10136734792482002, "grad_norm": 2.4513540267944336, "learning_rate": 8.986326520751799e-07, "loss": 0.2781, "step": 2098 }, { "epoch": 0.10141566410590908, "grad_norm": 2.8923327922821045, "learning_rate": 8.985843358940909e-07, "loss": 0.3758, "step": 2099 }, { "epoch": 0.10146398028699811, "grad_norm": 5.530440807342529, "learning_rate": 8.985360197130019e-07, "loss": 0.2347, "step": 2100 }, { "epoch": 0.10151229646808717, "grad_norm": 6.8280439376831055, "learning_rate": 8.984877035319128e-07, "loss": 0.3669, "step": 2101 }, { "epoch": 0.10156061264917621, "grad_norm": 2.27070951461792, "learning_rate": 8.984393873508237e-07, "loss": 0.2197, "step": 2102 }, { "epoch": 0.10160892883026526, "grad_norm": 2.660783290863037, "learning_rate": 8.983910711697347e-07, "loss": 0.3528, "step": 2103 }, { "epoch": 0.1016572450113543, "grad_norm": 5.056457996368408, "learning_rate": 8.983427549886457e-07, "loss": 0.4968, "step": 2104 }, { "epoch": 0.10170556119244335, "grad_norm": 4.297393798828125, "learning_rate": 8.982944388075566e-07, "loss": 0.3175, "step": 2105 }, { "epoch": 0.10175387737353239, "grad_norm": 1.9879947900772095, "learning_rate": 8.982461226264676e-07, "loss": 0.2376, "step": 2106 }, { "epoch": 0.10180219355462145, "grad_norm": 2.7138209342956543, "learning_rate": 8.981978064453785e-07, "loss": 0.3105, "step": 2107 }, { "epoch": 0.10185050973571048, "grad_norm": 2.151346206665039, "learning_rate": 8.981494902642894e-07, "loss": 0.2876, "step": 2108 }, { "epoch": 0.10189882591679954, "grad_norm": 2.856383800506592, "learning_rate": 8.981011740832004e-07, "loss": 0.3815, "step": 2109 }, { "epoch": 0.10194714209788858, "grad_norm": 2.182598114013672, "learning_rate": 8.980528579021114e-07, "loss": 0.309, "step": 2110 }, { "epoch": 0.10199545827897763, "grad_norm": 4.248740196228027, "learning_rate": 8.980045417210224e-07, "loss": 0.1671, "step": 2111 }, { "epoch": 0.10204377446006668, "grad_norm": 3.277848243713379, "learning_rate": 8.979562255399334e-07, "loss": 0.4139, "step": 2112 }, { "epoch": 0.10209209064115572, "grad_norm": 2.638153314590454, "learning_rate": 8.979079093588442e-07, "loss": 0.2748, "step": 2113 }, { "epoch": 0.10214040682224478, "grad_norm": 2.250993013381958, "learning_rate": 8.978595931777551e-07, "loss": 0.3054, "step": 2114 }, { "epoch": 0.10218872300333381, "grad_norm": 2.041616439819336, "learning_rate": 8.978112769966661e-07, "loss": 0.2382, "step": 2115 }, { "epoch": 0.10223703918442287, "grad_norm": 3.0803351402282715, "learning_rate": 8.977629608155771e-07, "loss": 0.2624, "step": 2116 }, { "epoch": 0.1022853553655119, "grad_norm": 2.515774726867676, "learning_rate": 8.977146446344881e-07, "loss": 0.308, "step": 2117 }, { "epoch": 0.10233367154660096, "grad_norm": 3.8773868083953857, "learning_rate": 8.97666328453399e-07, "loss": 0.3814, "step": 2118 }, { "epoch": 0.10238198772769, "grad_norm": 2.1420302391052246, "learning_rate": 8.9761801227231e-07, "loss": 0.2773, "step": 2119 }, { "epoch": 0.10243030390877905, "grad_norm": 2.2260851860046387, "learning_rate": 8.975696960912209e-07, "loss": 0.2445, "step": 2120 }, { "epoch": 0.10247862008986809, "grad_norm": 3.6063942909240723, "learning_rate": 8.975213799101319e-07, "loss": 0.3018, "step": 2121 }, { "epoch": 0.10252693627095714, "grad_norm": 2.214129686355591, "learning_rate": 8.974730637290428e-07, "loss": 0.2399, "step": 2122 }, { "epoch": 0.10257525245204618, "grad_norm": 3.6317265033721924, "learning_rate": 8.974247475479537e-07, "loss": 0.3199, "step": 2123 }, { "epoch": 0.10262356863313524, "grad_norm": 2.7071735858917236, "learning_rate": 8.973764313668647e-07, "loss": 0.2779, "step": 2124 }, { "epoch": 0.10267188481422429, "grad_norm": 2.3041980266571045, "learning_rate": 8.973281151857757e-07, "loss": 0.2473, "step": 2125 }, { "epoch": 0.10272020099531333, "grad_norm": 2.2950892448425293, "learning_rate": 8.972797990046867e-07, "loss": 0.2053, "step": 2126 }, { "epoch": 0.10276851717640238, "grad_norm": 1.9825448989868164, "learning_rate": 8.972314828235976e-07, "loss": 0.2289, "step": 2127 }, { "epoch": 0.10281683335749142, "grad_norm": 2.8628365993499756, "learning_rate": 8.971831666425085e-07, "loss": 0.3379, "step": 2128 }, { "epoch": 0.10286514953858047, "grad_norm": 1.7588238716125488, "learning_rate": 8.971348504614195e-07, "loss": 0.2031, "step": 2129 }, { "epoch": 0.10291346571966951, "grad_norm": 2.262899875640869, "learning_rate": 8.970865342803304e-07, "loss": 0.1768, "step": 2130 }, { "epoch": 0.10296178190075857, "grad_norm": 2.4389824867248535, "learning_rate": 8.970382180992414e-07, "loss": 0.3387, "step": 2131 }, { "epoch": 0.1030100980818476, "grad_norm": 2.4022560119628906, "learning_rate": 8.969899019181524e-07, "loss": 0.2227, "step": 2132 }, { "epoch": 0.10305841426293666, "grad_norm": 3.8602800369262695, "learning_rate": 8.969415857370633e-07, "loss": 0.3619, "step": 2133 }, { "epoch": 0.1031067304440257, "grad_norm": 9.168136596679688, "learning_rate": 8.968932695559742e-07, "loss": 0.2656, "step": 2134 }, { "epoch": 0.10315504662511475, "grad_norm": 2.527881383895874, "learning_rate": 8.968449533748852e-07, "loss": 0.3503, "step": 2135 }, { "epoch": 0.10320336280620379, "grad_norm": 2.1483476161956787, "learning_rate": 8.967966371937962e-07, "loss": 0.2383, "step": 2136 }, { "epoch": 0.10325167898729284, "grad_norm": 3.4670004844665527, "learning_rate": 8.967483210127072e-07, "loss": 0.3173, "step": 2137 }, { "epoch": 0.1032999951683819, "grad_norm": 68.69830322265625, "learning_rate": 8.967000048316182e-07, "loss": 0.4604, "step": 2138 }, { "epoch": 0.10334831134947094, "grad_norm": 1.9779716730117798, "learning_rate": 8.966516886505289e-07, "loss": 0.2065, "step": 2139 }, { "epoch": 0.10339662753055999, "grad_norm": 2.7423033714294434, "learning_rate": 8.966033724694399e-07, "loss": 0.3319, "step": 2140 }, { "epoch": 0.10344494371164903, "grad_norm": 1.6486225128173828, "learning_rate": 8.965550562883509e-07, "loss": 0.1893, "step": 2141 }, { "epoch": 0.10349325989273808, "grad_norm": 2.5143988132476807, "learning_rate": 8.965067401072619e-07, "loss": 0.2685, "step": 2142 }, { "epoch": 0.10354157607382712, "grad_norm": 2.195507049560547, "learning_rate": 8.964584239261729e-07, "loss": 0.2436, "step": 2143 }, { "epoch": 0.10358989225491617, "grad_norm": 6.64070463180542, "learning_rate": 8.964101077450838e-07, "loss": 0.2103, "step": 2144 }, { "epoch": 0.10363820843600521, "grad_norm": 1.9417744874954224, "learning_rate": 8.963617915639948e-07, "loss": 0.2341, "step": 2145 }, { "epoch": 0.10368652461709427, "grad_norm": 2.7034895420074463, "learning_rate": 8.963134753829057e-07, "loss": 0.2852, "step": 2146 }, { "epoch": 0.1037348407981833, "grad_norm": 1.9794765710830688, "learning_rate": 8.962651592018166e-07, "loss": 0.2117, "step": 2147 }, { "epoch": 0.10378315697927236, "grad_norm": 4.163464546203613, "learning_rate": 8.962168430207276e-07, "loss": 0.3782, "step": 2148 }, { "epoch": 0.1038314731603614, "grad_norm": 2.2557332515716553, "learning_rate": 8.961685268396385e-07, "loss": 0.2586, "step": 2149 }, { "epoch": 0.10387978934145045, "grad_norm": 19.35873031616211, "learning_rate": 8.961202106585495e-07, "loss": 0.3121, "step": 2150 }, { "epoch": 0.1039281055225395, "grad_norm": 4.103061199188232, "learning_rate": 8.960718944774605e-07, "loss": 0.3548, "step": 2151 }, { "epoch": 0.10397642170362854, "grad_norm": 6.62208366394043, "learning_rate": 8.960235782963714e-07, "loss": 0.2525, "step": 2152 }, { "epoch": 0.1040247378847176, "grad_norm": 2.5488176345825195, "learning_rate": 8.959752621152824e-07, "loss": 0.3258, "step": 2153 }, { "epoch": 0.10407305406580664, "grad_norm": 3.77744460105896, "learning_rate": 8.959269459341933e-07, "loss": 0.2978, "step": 2154 }, { "epoch": 0.10412137024689569, "grad_norm": 1.9949442148208618, "learning_rate": 8.958786297531042e-07, "loss": 0.2445, "step": 2155 }, { "epoch": 0.10416968642798473, "grad_norm": 1.8409831523895264, "learning_rate": 8.958303135720152e-07, "loss": 0.2209, "step": 2156 }, { "epoch": 0.10421800260907378, "grad_norm": 2.6930530071258545, "learning_rate": 8.957819973909262e-07, "loss": 0.3692, "step": 2157 }, { "epoch": 0.10426631879016282, "grad_norm": 3.2724497318267822, "learning_rate": 8.957336812098372e-07, "loss": 0.3268, "step": 2158 }, { "epoch": 0.10431463497125187, "grad_norm": 2.285844564437866, "learning_rate": 8.956853650287481e-07, "loss": 0.2834, "step": 2159 }, { "epoch": 0.10436295115234091, "grad_norm": 3.781240463256836, "learning_rate": 8.95637048847659e-07, "loss": 0.2239, "step": 2160 }, { "epoch": 0.10441126733342997, "grad_norm": 3.7441093921661377, "learning_rate": 8.9558873266657e-07, "loss": 0.2923, "step": 2161 }, { "epoch": 0.10445958351451902, "grad_norm": 1.958918809890747, "learning_rate": 8.95540416485481e-07, "loss": 0.2234, "step": 2162 }, { "epoch": 0.10450789969560806, "grad_norm": 1.9985851049423218, "learning_rate": 8.95492100304392e-07, "loss": 0.2076, "step": 2163 }, { "epoch": 0.10455621587669711, "grad_norm": 2.65450119972229, "learning_rate": 8.95443784123303e-07, "loss": 0.3705, "step": 2164 }, { "epoch": 0.10460453205778615, "grad_norm": 2.984942674636841, "learning_rate": 8.953954679422137e-07, "loss": 0.4516, "step": 2165 }, { "epoch": 0.1046528482388752, "grad_norm": 7.241626262664795, "learning_rate": 8.953471517611247e-07, "loss": 0.2426, "step": 2166 }, { "epoch": 0.10470116441996424, "grad_norm": 2.3015499114990234, "learning_rate": 8.952988355800357e-07, "loss": 0.2624, "step": 2167 }, { "epoch": 0.1047494806010533, "grad_norm": 3.780425786972046, "learning_rate": 8.952505193989467e-07, "loss": 0.3516, "step": 2168 }, { "epoch": 0.10479779678214234, "grad_norm": 2.7040698528289795, "learning_rate": 8.952022032178577e-07, "loss": 0.2452, "step": 2169 }, { "epoch": 0.10484611296323139, "grad_norm": 2.7410643100738525, "learning_rate": 8.951538870367686e-07, "loss": 0.3997, "step": 2170 }, { "epoch": 0.10489442914432043, "grad_norm": 2.5838847160339355, "learning_rate": 8.951055708556795e-07, "loss": 0.3183, "step": 2171 }, { "epoch": 0.10494274532540948, "grad_norm": 3.2354984283447266, "learning_rate": 8.950572546745904e-07, "loss": 0.3506, "step": 2172 }, { "epoch": 0.10499106150649852, "grad_norm": 3.323241949081421, "learning_rate": 8.950089384935014e-07, "loss": 0.3782, "step": 2173 }, { "epoch": 0.10503937768758757, "grad_norm": 2.5543408393859863, "learning_rate": 8.949606223124124e-07, "loss": 0.3397, "step": 2174 }, { "epoch": 0.10508769386867663, "grad_norm": 2.94439959526062, "learning_rate": 8.949123061313233e-07, "loss": 0.238, "step": 2175 }, { "epoch": 0.10513601004976567, "grad_norm": 3.9746556282043457, "learning_rate": 8.948639899502343e-07, "loss": 0.3296, "step": 2176 }, { "epoch": 0.10518432623085472, "grad_norm": 2.3081398010253906, "learning_rate": 8.948156737691453e-07, "loss": 0.3057, "step": 2177 }, { "epoch": 0.10523264241194376, "grad_norm": 3.1721596717834473, "learning_rate": 8.947673575880562e-07, "loss": 0.2996, "step": 2178 }, { "epoch": 0.10528095859303281, "grad_norm": 3.287053346633911, "learning_rate": 8.947190414069672e-07, "loss": 0.2239, "step": 2179 }, { "epoch": 0.10532927477412185, "grad_norm": 2.480109453201294, "learning_rate": 8.94670725225878e-07, "loss": 0.2508, "step": 2180 }, { "epoch": 0.1053775909552109, "grad_norm": 56.2392463684082, "learning_rate": 8.94622409044789e-07, "loss": 0.2752, "step": 2181 }, { "epoch": 0.10542590713629994, "grad_norm": 3.6548900604248047, "learning_rate": 8.945740928637e-07, "loss": 0.3743, "step": 2182 }, { "epoch": 0.105474223317389, "grad_norm": 2.556485176086426, "learning_rate": 8.94525776682611e-07, "loss": 0.1795, "step": 2183 }, { "epoch": 0.10552253949847804, "grad_norm": 7.721895694732666, "learning_rate": 8.944774605015219e-07, "loss": 0.4306, "step": 2184 }, { "epoch": 0.10557085567956709, "grad_norm": 10.7454833984375, "learning_rate": 8.944291443204329e-07, "loss": 0.3901, "step": 2185 }, { "epoch": 0.10561917186065613, "grad_norm": 2.9836270809173584, "learning_rate": 8.943808281393438e-07, "loss": 0.3381, "step": 2186 }, { "epoch": 0.10566748804174518, "grad_norm": 3.174015522003174, "learning_rate": 8.943325119582548e-07, "loss": 0.336, "step": 2187 }, { "epoch": 0.10571580422283423, "grad_norm": 3.5464725494384766, "learning_rate": 8.942841957771658e-07, "loss": 0.2598, "step": 2188 }, { "epoch": 0.10576412040392327, "grad_norm": 2.1627073287963867, "learning_rate": 8.942358795960768e-07, "loss": 0.2661, "step": 2189 }, { "epoch": 0.10581243658501233, "grad_norm": 3.215627431869507, "learning_rate": 8.941875634149877e-07, "loss": 0.3522, "step": 2190 }, { "epoch": 0.10586075276610137, "grad_norm": 3.5621681213378906, "learning_rate": 8.941392472338985e-07, "loss": 0.3208, "step": 2191 }, { "epoch": 0.10590906894719042, "grad_norm": 2.881049633026123, "learning_rate": 8.940909310528095e-07, "loss": 0.3384, "step": 2192 }, { "epoch": 0.10595738512827946, "grad_norm": 3.8071322441101074, "learning_rate": 8.940426148717205e-07, "loss": 0.38, "step": 2193 }, { "epoch": 0.10600570130936851, "grad_norm": 1.9618401527404785, "learning_rate": 8.939942986906315e-07, "loss": 0.2088, "step": 2194 }, { "epoch": 0.10605401749045755, "grad_norm": 3.0575449466705322, "learning_rate": 8.939459825095425e-07, "loss": 0.3503, "step": 2195 }, { "epoch": 0.1061023336715466, "grad_norm": 1.995991826057434, "learning_rate": 8.938976663284534e-07, "loss": 0.2236, "step": 2196 }, { "epoch": 0.10615064985263564, "grad_norm": 2.5521695613861084, "learning_rate": 8.938493501473642e-07, "loss": 0.3033, "step": 2197 }, { "epoch": 0.1061989660337247, "grad_norm": 5.567178726196289, "learning_rate": 8.938010339662752e-07, "loss": 0.1892, "step": 2198 }, { "epoch": 0.10624728221481373, "grad_norm": 2.23665452003479, "learning_rate": 8.937527177851862e-07, "loss": 0.2755, "step": 2199 }, { "epoch": 0.10629559839590279, "grad_norm": 2.68774151802063, "learning_rate": 8.937044016040972e-07, "loss": 0.3249, "step": 2200 }, { "epoch": 0.10634391457699184, "grad_norm": 2.815326690673828, "learning_rate": 8.936560854230081e-07, "loss": 0.2531, "step": 2201 }, { "epoch": 0.10639223075808088, "grad_norm": 3.3588056564331055, "learning_rate": 8.936077692419191e-07, "loss": 0.3821, "step": 2202 }, { "epoch": 0.10644054693916993, "grad_norm": 3.375361919403076, "learning_rate": 8.9355945306083e-07, "loss": 0.4742, "step": 2203 }, { "epoch": 0.10648886312025897, "grad_norm": 4.598830699920654, "learning_rate": 8.93511136879741e-07, "loss": 0.5283, "step": 2204 }, { "epoch": 0.10653717930134803, "grad_norm": 2.8901631832122803, "learning_rate": 8.93462820698652e-07, "loss": 0.3683, "step": 2205 }, { "epoch": 0.10658549548243706, "grad_norm": 3.3162429332733154, "learning_rate": 8.934145045175628e-07, "loss": 0.3296, "step": 2206 }, { "epoch": 0.10663381166352612, "grad_norm": 3.49159836769104, "learning_rate": 8.933661883364738e-07, "loss": 0.24, "step": 2207 }, { "epoch": 0.10668212784461516, "grad_norm": 2.12903094291687, "learning_rate": 8.933178721553848e-07, "loss": 0.2417, "step": 2208 }, { "epoch": 0.10673044402570421, "grad_norm": 2.3379738330841064, "learning_rate": 8.932695559742958e-07, "loss": 0.359, "step": 2209 }, { "epoch": 0.10677876020679325, "grad_norm": 3.458548069000244, "learning_rate": 8.932212397932067e-07, "loss": 0.3276, "step": 2210 }, { "epoch": 0.1068270763878823, "grad_norm": 4.661439895629883, "learning_rate": 8.931729236121177e-07, "loss": 0.4151, "step": 2211 }, { "epoch": 0.10687539256897134, "grad_norm": 3.464662551879883, "learning_rate": 8.931246074310286e-07, "loss": 0.4336, "step": 2212 }, { "epoch": 0.1069237087500604, "grad_norm": 5.011883735656738, "learning_rate": 8.930762912499396e-07, "loss": 0.166, "step": 2213 }, { "epoch": 0.10697202493114945, "grad_norm": 1.9643514156341553, "learning_rate": 8.930279750688506e-07, "loss": 0.2581, "step": 2214 }, { "epoch": 0.10702034111223849, "grad_norm": 3.493879795074463, "learning_rate": 8.929796588877615e-07, "loss": 0.4643, "step": 2215 }, { "epoch": 0.10706865729332754, "grad_norm": 11.523862838745117, "learning_rate": 8.929313427066724e-07, "loss": 0.295, "step": 2216 }, { "epoch": 0.10711697347441658, "grad_norm": 2.5793349742889404, "learning_rate": 8.928830265255833e-07, "loss": 0.3096, "step": 2217 }, { "epoch": 0.10716528965550563, "grad_norm": 3.3918426036834717, "learning_rate": 8.928347103444943e-07, "loss": 0.4499, "step": 2218 }, { "epoch": 0.10721360583659467, "grad_norm": 5.79988956451416, "learning_rate": 8.927863941634053e-07, "loss": 0.3476, "step": 2219 }, { "epoch": 0.10726192201768373, "grad_norm": 3.289604902267456, "learning_rate": 8.927380779823163e-07, "loss": 0.4361, "step": 2220 }, { "epoch": 0.10731023819877276, "grad_norm": 4.5193376541137695, "learning_rate": 8.926897618012273e-07, "loss": 0.4766, "step": 2221 }, { "epoch": 0.10735855437986182, "grad_norm": 3.183986186981201, "learning_rate": 8.92641445620138e-07, "loss": 0.4779, "step": 2222 }, { "epoch": 0.10740687056095086, "grad_norm": 2.009829044342041, "learning_rate": 8.92593129439049e-07, "loss": 0.2483, "step": 2223 }, { "epoch": 0.10745518674203991, "grad_norm": 10.633787155151367, "learning_rate": 8.9254481325796e-07, "loss": 0.2677, "step": 2224 }, { "epoch": 0.10750350292312895, "grad_norm": 2.4328229427337646, "learning_rate": 8.92496497076871e-07, "loss": 0.1946, "step": 2225 }, { "epoch": 0.107551819104218, "grad_norm": 2.2689640522003174, "learning_rate": 8.92448180895782e-07, "loss": 0.2661, "step": 2226 }, { "epoch": 0.10760013528530706, "grad_norm": 1.6182478666305542, "learning_rate": 8.923998647146929e-07, "loss": 0.1894, "step": 2227 }, { "epoch": 0.1076484514663961, "grad_norm": 3.055809736251831, "learning_rate": 8.923515485336039e-07, "loss": 0.2967, "step": 2228 }, { "epoch": 0.10769676764748515, "grad_norm": 2.242182731628418, "learning_rate": 8.923032323525148e-07, "loss": 0.2883, "step": 2229 }, { "epoch": 0.10774508382857419, "grad_norm": 3.604044198989868, "learning_rate": 8.922549161714258e-07, "loss": 0.3683, "step": 2230 }, { "epoch": 0.10779340000966324, "grad_norm": 4.167545318603516, "learning_rate": 8.922065999903368e-07, "loss": 0.3222, "step": 2231 }, { "epoch": 0.10784171619075228, "grad_norm": 8.501245498657227, "learning_rate": 8.921582838092476e-07, "loss": 0.432, "step": 2232 }, { "epoch": 0.10789003237184133, "grad_norm": 1.3484899997711182, "learning_rate": 8.921099676281586e-07, "loss": 0.1527, "step": 2233 }, { "epoch": 0.10793834855293037, "grad_norm": 4.453649044036865, "learning_rate": 8.920616514470696e-07, "loss": 0.2498, "step": 2234 }, { "epoch": 0.10798666473401942, "grad_norm": 6.447925567626953, "learning_rate": 8.920133352659805e-07, "loss": 0.3622, "step": 2235 }, { "epoch": 0.10803498091510846, "grad_norm": 1.7969517707824707, "learning_rate": 8.919650190848915e-07, "loss": 0.2252, "step": 2236 }, { "epoch": 0.10808329709619752, "grad_norm": 2.894183874130249, "learning_rate": 8.919167029038025e-07, "loss": 0.3385, "step": 2237 }, { "epoch": 0.10813161327728657, "grad_norm": 2.09486985206604, "learning_rate": 8.918683867227134e-07, "loss": 0.2462, "step": 2238 }, { "epoch": 0.10817992945837561, "grad_norm": 3.00317120552063, "learning_rate": 8.918200705416244e-07, "loss": 0.3655, "step": 2239 }, { "epoch": 0.10822824563946466, "grad_norm": 5.75484037399292, "learning_rate": 8.917717543605353e-07, "loss": 0.3844, "step": 2240 }, { "epoch": 0.1082765618205537, "grad_norm": 5.898135662078857, "learning_rate": 8.917234381794463e-07, "loss": 0.264, "step": 2241 }, { "epoch": 0.10832487800164275, "grad_norm": 2.786630392074585, "learning_rate": 8.916751219983572e-07, "loss": 0.3851, "step": 2242 }, { "epoch": 0.1083731941827318, "grad_norm": 2.602822780609131, "learning_rate": 8.916268058172681e-07, "loss": 0.3379, "step": 2243 }, { "epoch": 0.10842151036382085, "grad_norm": 4.7682085037231445, "learning_rate": 8.915784896361791e-07, "loss": 0.3972, "step": 2244 }, { "epoch": 0.10846982654490989, "grad_norm": 2.670694351196289, "learning_rate": 8.915301734550901e-07, "loss": 0.3048, "step": 2245 }, { "epoch": 0.10851814272599894, "grad_norm": 2.7883081436157227, "learning_rate": 8.914818572740011e-07, "loss": 0.3512, "step": 2246 }, { "epoch": 0.10856645890708798, "grad_norm": 2.3147225379943848, "learning_rate": 8.914335410929121e-07, "loss": 0.3336, "step": 2247 }, { "epoch": 0.10861477508817703, "grad_norm": 1.9286030530929565, "learning_rate": 8.913852249118228e-07, "loss": 0.2373, "step": 2248 }, { "epoch": 0.10866309126926607, "grad_norm": 3.060581922531128, "learning_rate": 8.913369087307338e-07, "loss": 0.4039, "step": 2249 }, { "epoch": 0.10871140745035512, "grad_norm": 3.3678808212280273, "learning_rate": 8.912885925496448e-07, "loss": 0.409, "step": 2250 }, { "epoch": 0.10875972363144418, "grad_norm": 5.687150478363037, "learning_rate": 8.912402763685558e-07, "loss": 0.2975, "step": 2251 }, { "epoch": 0.10880803981253322, "grad_norm": 3.37654972076416, "learning_rate": 8.911919601874668e-07, "loss": 0.2563, "step": 2252 }, { "epoch": 0.10885635599362227, "grad_norm": 2.4519083499908447, "learning_rate": 8.911436440063777e-07, "loss": 0.2692, "step": 2253 }, { "epoch": 0.10890467217471131, "grad_norm": 2.7589244842529297, "learning_rate": 8.910953278252886e-07, "loss": 0.318, "step": 2254 }, { "epoch": 0.10895298835580036, "grad_norm": 2.2814931869506836, "learning_rate": 8.910470116441996e-07, "loss": 0.2042, "step": 2255 }, { "epoch": 0.1090013045368894, "grad_norm": 2.2262539863586426, "learning_rate": 8.909986954631106e-07, "loss": 0.2617, "step": 2256 }, { "epoch": 0.10904962071797845, "grad_norm": 1.866943359375, "learning_rate": 8.909503792820215e-07, "loss": 0.2268, "step": 2257 }, { "epoch": 0.1090979368990675, "grad_norm": 9.151062965393066, "learning_rate": 8.909020631009324e-07, "loss": 0.4132, "step": 2258 }, { "epoch": 0.10914625308015655, "grad_norm": 2.641087770462036, "learning_rate": 8.908537469198434e-07, "loss": 0.2698, "step": 2259 }, { "epoch": 0.10919456926124559, "grad_norm": 3.290808916091919, "learning_rate": 8.908054307387544e-07, "loss": 0.3031, "step": 2260 }, { "epoch": 0.10924288544233464, "grad_norm": 3.4467523097991943, "learning_rate": 8.907571145576653e-07, "loss": 0.335, "step": 2261 }, { "epoch": 0.10929120162342368, "grad_norm": 3.4156105518341064, "learning_rate": 8.907087983765763e-07, "loss": 0.3566, "step": 2262 }, { "epoch": 0.10933951780451273, "grad_norm": 2.0157923698425293, "learning_rate": 8.906604821954873e-07, "loss": 0.2062, "step": 2263 }, { "epoch": 0.10938783398560178, "grad_norm": 4.400396347045898, "learning_rate": 8.906121660143982e-07, "loss": 0.1952, "step": 2264 }, { "epoch": 0.10943615016669082, "grad_norm": 2.0078585147857666, "learning_rate": 8.905638498333091e-07, "loss": 0.2229, "step": 2265 }, { "epoch": 0.10948446634777988, "grad_norm": 3.0428876876831055, "learning_rate": 8.905155336522201e-07, "loss": 0.2984, "step": 2266 }, { "epoch": 0.10953278252886892, "grad_norm": 2.463991403579712, "learning_rate": 8.90467217471131e-07, "loss": 0.3391, "step": 2267 }, { "epoch": 0.10958109870995797, "grad_norm": 5.382617950439453, "learning_rate": 8.90418901290042e-07, "loss": 0.5409, "step": 2268 }, { "epoch": 0.10962941489104701, "grad_norm": 2.2300448417663574, "learning_rate": 8.903705851089529e-07, "loss": 0.3004, "step": 2269 }, { "epoch": 0.10967773107213606, "grad_norm": 1.8323408365249634, "learning_rate": 8.903222689278639e-07, "loss": 0.2448, "step": 2270 }, { "epoch": 0.1097260472532251, "grad_norm": 2.3692963123321533, "learning_rate": 8.902739527467749e-07, "loss": 0.3178, "step": 2271 }, { "epoch": 0.10977436343431415, "grad_norm": 3.3708128929138184, "learning_rate": 8.902256365656859e-07, "loss": 0.2366, "step": 2272 }, { "epoch": 0.1098226796154032, "grad_norm": 1.6731040477752686, "learning_rate": 8.901773203845969e-07, "loss": 0.227, "step": 2273 }, { "epoch": 0.10987099579649225, "grad_norm": 2.3829593658447266, "learning_rate": 8.901290042035076e-07, "loss": 0.3053, "step": 2274 }, { "epoch": 0.10991931197758129, "grad_norm": 3.2462637424468994, "learning_rate": 8.900806880224186e-07, "loss": 0.4177, "step": 2275 }, { "epoch": 0.10996762815867034, "grad_norm": 4.190408229827881, "learning_rate": 8.900323718413296e-07, "loss": 0.3906, "step": 2276 }, { "epoch": 0.11001594433975939, "grad_norm": 2.3981268405914307, "learning_rate": 8.899840556602406e-07, "loss": 0.2952, "step": 2277 }, { "epoch": 0.11006426052084843, "grad_norm": 5.236783981323242, "learning_rate": 8.899357394791516e-07, "loss": 0.4661, "step": 2278 }, { "epoch": 0.11011257670193748, "grad_norm": 3.9470324516296387, "learning_rate": 8.898874232980625e-07, "loss": 0.3, "step": 2279 }, { "epoch": 0.11016089288302652, "grad_norm": 1.9102953672409058, "learning_rate": 8.898391071169734e-07, "loss": 0.2274, "step": 2280 }, { "epoch": 0.11020920906411558, "grad_norm": 2.6413393020629883, "learning_rate": 8.897907909358844e-07, "loss": 0.3702, "step": 2281 }, { "epoch": 0.11025752524520462, "grad_norm": 2.854405164718628, "learning_rate": 8.897424747547953e-07, "loss": 0.2945, "step": 2282 }, { "epoch": 0.11030584142629367, "grad_norm": 3.4881198406219482, "learning_rate": 8.896941585737063e-07, "loss": 0.4415, "step": 2283 }, { "epoch": 0.11035415760738271, "grad_norm": 2.2171571254730225, "learning_rate": 8.896458423926172e-07, "loss": 0.2737, "step": 2284 }, { "epoch": 0.11040247378847176, "grad_norm": 3.896859645843506, "learning_rate": 8.895975262115282e-07, "loss": 0.5069, "step": 2285 }, { "epoch": 0.1104507899695608, "grad_norm": 2.801162004470825, "learning_rate": 8.895492100304391e-07, "loss": 0.4158, "step": 2286 }, { "epoch": 0.11049910615064985, "grad_norm": 2.1087687015533447, "learning_rate": 8.895008938493501e-07, "loss": 0.2414, "step": 2287 }, { "epoch": 0.11054742233173889, "grad_norm": 2.359124183654785, "learning_rate": 8.894525776682611e-07, "loss": 0.3074, "step": 2288 }, { "epoch": 0.11059573851282795, "grad_norm": 6.38983154296875, "learning_rate": 8.89404261487172e-07, "loss": 0.3157, "step": 2289 }, { "epoch": 0.110644054693917, "grad_norm": 2.461786985397339, "learning_rate": 8.89355945306083e-07, "loss": 0.3492, "step": 2290 }, { "epoch": 0.11069237087500604, "grad_norm": 2.4494075775146484, "learning_rate": 8.893076291249939e-07, "loss": 0.3202, "step": 2291 }, { "epoch": 0.11074068705609509, "grad_norm": 2.4880669116973877, "learning_rate": 8.892593129439049e-07, "loss": 0.3278, "step": 2292 }, { "epoch": 0.11078900323718413, "grad_norm": 2.6816513538360596, "learning_rate": 8.892109967628158e-07, "loss": 0.227, "step": 2293 }, { "epoch": 0.11083731941827318, "grad_norm": 2.4144327640533447, "learning_rate": 8.891626805817268e-07, "loss": 0.285, "step": 2294 }, { "epoch": 0.11088563559936222, "grad_norm": 3.467725992202759, "learning_rate": 8.891143644006377e-07, "loss": 0.3659, "step": 2295 }, { "epoch": 0.11093395178045128, "grad_norm": 1.9983172416687012, "learning_rate": 8.890660482195487e-07, "loss": 0.2175, "step": 2296 }, { "epoch": 0.11098226796154032, "grad_norm": 3.9915223121643066, "learning_rate": 8.890177320384597e-07, "loss": 0.2431, "step": 2297 }, { "epoch": 0.11103058414262937, "grad_norm": 2.3049838542938232, "learning_rate": 8.889694158573707e-07, "loss": 0.1997, "step": 2298 }, { "epoch": 0.11107890032371841, "grad_norm": 2.8112905025482178, "learning_rate": 8.889210996762815e-07, "loss": 0.327, "step": 2299 }, { "epoch": 0.11112721650480746, "grad_norm": 1.7629181146621704, "learning_rate": 8.888727834951924e-07, "loss": 0.2064, "step": 2300 }, { "epoch": 0.1111755326858965, "grad_norm": 2.753833055496216, "learning_rate": 8.888244673141034e-07, "loss": 0.3648, "step": 2301 }, { "epoch": 0.11122384886698555, "grad_norm": 4.485295295715332, "learning_rate": 8.887761511330144e-07, "loss": 0.2866, "step": 2302 }, { "epoch": 0.1112721650480746, "grad_norm": 2.4821078777313232, "learning_rate": 8.887278349519254e-07, "loss": 0.2792, "step": 2303 }, { "epoch": 0.11132048122916365, "grad_norm": 2.9062254428863525, "learning_rate": 8.886795187708364e-07, "loss": 0.3068, "step": 2304 }, { "epoch": 0.1113687974102527, "grad_norm": 3.616426944732666, "learning_rate": 8.886312025897472e-07, "loss": 0.5479, "step": 2305 }, { "epoch": 0.11141711359134174, "grad_norm": 4.198230266571045, "learning_rate": 8.885828864086582e-07, "loss": 0.3201, "step": 2306 }, { "epoch": 0.11146542977243079, "grad_norm": 2.7184805870056152, "learning_rate": 8.885345702275691e-07, "loss": 0.2159, "step": 2307 }, { "epoch": 0.11151374595351983, "grad_norm": 3.5905258655548096, "learning_rate": 8.884862540464801e-07, "loss": 0.2973, "step": 2308 }, { "epoch": 0.11156206213460888, "grad_norm": 8.367807388305664, "learning_rate": 8.884379378653911e-07, "loss": 0.289, "step": 2309 }, { "epoch": 0.11161037831569792, "grad_norm": 2.821143388748169, "learning_rate": 8.88389621684302e-07, "loss": 0.3715, "step": 2310 }, { "epoch": 0.11165869449678698, "grad_norm": 2.414926052093506, "learning_rate": 8.88341305503213e-07, "loss": 0.2549, "step": 2311 }, { "epoch": 0.11170701067787601, "grad_norm": 4.338918209075928, "learning_rate": 8.882929893221239e-07, "loss": 0.3592, "step": 2312 }, { "epoch": 0.11175532685896507, "grad_norm": 6.435703754425049, "learning_rate": 8.882446731410349e-07, "loss": 0.3696, "step": 2313 }, { "epoch": 0.11180364304005412, "grad_norm": 3.4755468368530273, "learning_rate": 8.881963569599459e-07, "loss": 0.3065, "step": 2314 }, { "epoch": 0.11185195922114316, "grad_norm": 1.6141188144683838, "learning_rate": 8.881480407788568e-07, "loss": 0.1715, "step": 2315 }, { "epoch": 0.11190027540223221, "grad_norm": 2.9625933170318604, "learning_rate": 8.880997245977677e-07, "loss": 0.3111, "step": 2316 }, { "epoch": 0.11194859158332125, "grad_norm": 2.48313570022583, "learning_rate": 8.880514084166787e-07, "loss": 0.3667, "step": 2317 }, { "epoch": 0.1119969077644103, "grad_norm": 2.6255671977996826, "learning_rate": 8.880030922355896e-07, "loss": 0.3479, "step": 2318 }, { "epoch": 0.11204522394549934, "grad_norm": 2.132138729095459, "learning_rate": 8.879547760545006e-07, "loss": 0.3074, "step": 2319 }, { "epoch": 0.1120935401265884, "grad_norm": 3.243239402770996, "learning_rate": 8.879064598734116e-07, "loss": 0.3262, "step": 2320 }, { "epoch": 0.11214185630767744, "grad_norm": 2.4168570041656494, "learning_rate": 8.878581436923225e-07, "loss": 0.2108, "step": 2321 }, { "epoch": 0.11219017248876649, "grad_norm": 1.9218902587890625, "learning_rate": 8.878098275112335e-07, "loss": 0.2202, "step": 2322 }, { "epoch": 0.11223848866985553, "grad_norm": 2.3096582889556885, "learning_rate": 8.877615113301445e-07, "loss": 0.2636, "step": 2323 }, { "epoch": 0.11228680485094458, "grad_norm": 5.950868129730225, "learning_rate": 8.877131951490555e-07, "loss": 0.3044, "step": 2324 }, { "epoch": 0.11233512103203362, "grad_norm": 4.6220502853393555, "learning_rate": 8.876648789679663e-07, "loss": 0.4104, "step": 2325 }, { "epoch": 0.11238343721312267, "grad_norm": 2.7376811504364014, "learning_rate": 8.876165627868772e-07, "loss": 0.39, "step": 2326 }, { "epoch": 0.11243175339421173, "grad_norm": 3.8928184509277344, "learning_rate": 8.875682466057882e-07, "loss": 0.3879, "step": 2327 }, { "epoch": 0.11248006957530077, "grad_norm": 2.056002616882324, "learning_rate": 8.875199304246992e-07, "loss": 0.2112, "step": 2328 }, { "epoch": 0.11252838575638982, "grad_norm": 3.355496406555176, "learning_rate": 8.874716142436102e-07, "loss": 0.3561, "step": 2329 }, { "epoch": 0.11257670193747886, "grad_norm": 2.896383047103882, "learning_rate": 8.874232980625212e-07, "loss": 0.4397, "step": 2330 }, { "epoch": 0.11262501811856791, "grad_norm": 48.36296844482422, "learning_rate": 8.87374981881432e-07, "loss": 0.3416, "step": 2331 }, { "epoch": 0.11267333429965695, "grad_norm": 3.261357069015503, "learning_rate": 8.87326665700343e-07, "loss": 0.4766, "step": 2332 }, { "epoch": 0.112721650480746, "grad_norm": 3.0706284046173096, "learning_rate": 8.872783495192539e-07, "loss": 0.3377, "step": 2333 }, { "epoch": 0.11276996666183504, "grad_norm": 7.278879165649414, "learning_rate": 8.872300333381649e-07, "loss": 0.3735, "step": 2334 }, { "epoch": 0.1128182828429241, "grad_norm": 3.6102335453033447, "learning_rate": 8.871817171570759e-07, "loss": 0.4006, "step": 2335 }, { "epoch": 0.11286659902401314, "grad_norm": 7.407965183258057, "learning_rate": 8.871334009759868e-07, "loss": 0.3557, "step": 2336 }, { "epoch": 0.11291491520510219, "grad_norm": 2.7187602519989014, "learning_rate": 8.870850847948977e-07, "loss": 0.3433, "step": 2337 }, { "epoch": 0.11296323138619123, "grad_norm": 2.2363638877868652, "learning_rate": 8.870367686138087e-07, "loss": 0.3216, "step": 2338 }, { "epoch": 0.11301154756728028, "grad_norm": 2.6964402198791504, "learning_rate": 8.869884524327197e-07, "loss": 0.3755, "step": 2339 }, { "epoch": 0.11305986374836934, "grad_norm": 7.485875129699707, "learning_rate": 8.869401362516307e-07, "loss": 0.2748, "step": 2340 }, { "epoch": 0.11310817992945837, "grad_norm": 2.343470573425293, "learning_rate": 8.868918200705415e-07, "loss": 0.2849, "step": 2341 }, { "epoch": 0.11315649611054743, "grad_norm": 4.694440841674805, "learning_rate": 8.868435038894525e-07, "loss": 0.3739, "step": 2342 }, { "epoch": 0.11320481229163647, "grad_norm": 2.7848260402679443, "learning_rate": 8.867951877083635e-07, "loss": 0.2409, "step": 2343 }, { "epoch": 0.11325312847272552, "grad_norm": 3.505979061126709, "learning_rate": 8.867468715272744e-07, "loss": 0.2311, "step": 2344 }, { "epoch": 0.11330144465381456, "grad_norm": 10.735234260559082, "learning_rate": 8.866985553461854e-07, "loss": 0.1961, "step": 2345 }, { "epoch": 0.11334976083490361, "grad_norm": 3.0283734798431396, "learning_rate": 8.866502391650964e-07, "loss": 0.399, "step": 2346 }, { "epoch": 0.11339807701599265, "grad_norm": 9.01949691772461, "learning_rate": 8.866019229840073e-07, "loss": 0.3452, "step": 2347 }, { "epoch": 0.1134463931970817, "grad_norm": 3.0008351802825928, "learning_rate": 8.865536068029183e-07, "loss": 0.247, "step": 2348 }, { "epoch": 0.11349470937817074, "grad_norm": 2.4398739337921143, "learning_rate": 8.865052906218293e-07, "loss": 0.272, "step": 2349 }, { "epoch": 0.1135430255592598, "grad_norm": 2.6949737071990967, "learning_rate": 8.864569744407401e-07, "loss": 0.3203, "step": 2350 }, { "epoch": 0.11359134174034884, "grad_norm": 3.7553768157958984, "learning_rate": 8.864086582596511e-07, "loss": 0.4304, "step": 2351 }, { "epoch": 0.11363965792143789, "grad_norm": 3.5602362155914307, "learning_rate": 8.86360342078562e-07, "loss": 0.3902, "step": 2352 }, { "epoch": 0.11368797410252694, "grad_norm": 1.7067855596542358, "learning_rate": 8.86312025897473e-07, "loss": 0.1937, "step": 2353 }, { "epoch": 0.11373629028361598, "grad_norm": 2.993488311767578, "learning_rate": 8.86263709716384e-07, "loss": 0.2637, "step": 2354 }, { "epoch": 0.11378460646470503, "grad_norm": 2.9298317432403564, "learning_rate": 8.86215393535295e-07, "loss": 0.2448, "step": 2355 }, { "epoch": 0.11383292264579407, "grad_norm": 3.3407206535339355, "learning_rate": 8.86167077354206e-07, "loss": 0.5417, "step": 2356 }, { "epoch": 0.11388123882688313, "grad_norm": 3.299375295639038, "learning_rate": 8.861187611731168e-07, "loss": 0.3578, "step": 2357 }, { "epoch": 0.11392955500797217, "grad_norm": 4.937368869781494, "learning_rate": 8.860704449920277e-07, "loss": 0.2764, "step": 2358 }, { "epoch": 0.11397787118906122, "grad_norm": 2.439800977706909, "learning_rate": 8.860221288109387e-07, "loss": 0.3291, "step": 2359 }, { "epoch": 0.11402618737015026, "grad_norm": 1.4321991205215454, "learning_rate": 8.859738126298497e-07, "loss": 0.1699, "step": 2360 }, { "epoch": 0.11407450355123931, "grad_norm": 2.779258966445923, "learning_rate": 8.859254964487607e-07, "loss": 0.2747, "step": 2361 }, { "epoch": 0.11412281973232835, "grad_norm": 2.951371669769287, "learning_rate": 8.858771802676716e-07, "loss": 0.2527, "step": 2362 }, { "epoch": 0.1141711359134174, "grad_norm": 2.7675797939300537, "learning_rate": 8.858288640865825e-07, "loss": 0.3469, "step": 2363 }, { "epoch": 0.11421945209450644, "grad_norm": 2.697047472000122, "learning_rate": 8.857805479054935e-07, "loss": 0.3289, "step": 2364 }, { "epoch": 0.1142677682755955, "grad_norm": 3.9887540340423584, "learning_rate": 8.857322317244045e-07, "loss": 0.2659, "step": 2365 }, { "epoch": 0.11431608445668455, "grad_norm": 3.23703932762146, "learning_rate": 8.856839155433155e-07, "loss": 0.4627, "step": 2366 }, { "epoch": 0.11436440063777359, "grad_norm": 2.499464511871338, "learning_rate": 8.856355993622263e-07, "loss": 0.2484, "step": 2367 }, { "epoch": 0.11441271681886264, "grad_norm": 2.55655574798584, "learning_rate": 8.855872831811373e-07, "loss": 0.2915, "step": 2368 }, { "epoch": 0.11446103299995168, "grad_norm": 2.316171646118164, "learning_rate": 8.855389670000482e-07, "loss": 0.3097, "step": 2369 }, { "epoch": 0.11450934918104073, "grad_norm": 3.7153239250183105, "learning_rate": 8.854906508189592e-07, "loss": 0.4358, "step": 2370 }, { "epoch": 0.11455766536212977, "grad_norm": 2.6086275577545166, "learning_rate": 8.854423346378702e-07, "loss": 0.3206, "step": 2371 }, { "epoch": 0.11460598154321883, "grad_norm": 3.010984420776367, "learning_rate": 8.853940184567812e-07, "loss": 0.2818, "step": 2372 }, { "epoch": 0.11465429772430787, "grad_norm": 2.1932199001312256, "learning_rate": 8.853457022756921e-07, "loss": 0.2812, "step": 2373 }, { "epoch": 0.11470261390539692, "grad_norm": 1.9898569583892822, "learning_rate": 8.852973860946031e-07, "loss": 0.2363, "step": 2374 }, { "epoch": 0.11475093008648596, "grad_norm": 4.390642166137695, "learning_rate": 8.85249069913514e-07, "loss": 0.3931, "step": 2375 }, { "epoch": 0.11479924626757501, "grad_norm": 2.5255653858184814, "learning_rate": 8.852007537324249e-07, "loss": 0.2686, "step": 2376 }, { "epoch": 0.11484756244866406, "grad_norm": 1.8379058837890625, "learning_rate": 8.851524375513359e-07, "loss": 0.1726, "step": 2377 }, { "epoch": 0.1148958786297531, "grad_norm": 3.1672537326812744, "learning_rate": 8.851041213702468e-07, "loss": 0.4057, "step": 2378 }, { "epoch": 0.11494419481084216, "grad_norm": 2.739715337753296, "learning_rate": 8.850558051891578e-07, "loss": 0.343, "step": 2379 }, { "epoch": 0.1149925109919312, "grad_norm": 2.986252546310425, "learning_rate": 8.850074890080688e-07, "loss": 0.4861, "step": 2380 }, { "epoch": 0.11504082717302025, "grad_norm": 11.144835472106934, "learning_rate": 8.849591728269798e-07, "loss": 0.2218, "step": 2381 }, { "epoch": 0.11508914335410929, "grad_norm": 2.5901243686676025, "learning_rate": 8.849108566458907e-07, "loss": 0.3288, "step": 2382 }, { "epoch": 0.11513745953519834, "grad_norm": 2.891780376434326, "learning_rate": 8.848625404648015e-07, "loss": 0.3349, "step": 2383 }, { "epoch": 0.11518577571628738, "grad_norm": 3.0814898014068604, "learning_rate": 8.848142242837125e-07, "loss": 0.3798, "step": 2384 }, { "epoch": 0.11523409189737643, "grad_norm": 2.5672852993011475, "learning_rate": 8.847659081026235e-07, "loss": 0.2954, "step": 2385 }, { "epoch": 0.11528240807846547, "grad_norm": 2.1405439376831055, "learning_rate": 8.847175919215345e-07, "loss": 0.216, "step": 2386 }, { "epoch": 0.11533072425955453, "grad_norm": 2.9414358139038086, "learning_rate": 8.846692757404455e-07, "loss": 0.2291, "step": 2387 }, { "epoch": 0.11537904044064357, "grad_norm": 3.7194809913635254, "learning_rate": 8.846209595593563e-07, "loss": 0.4312, "step": 2388 }, { "epoch": 0.11542735662173262, "grad_norm": 2.168020486831665, "learning_rate": 8.845726433782673e-07, "loss": 0.1796, "step": 2389 }, { "epoch": 0.11547567280282167, "grad_norm": 5.9350810050964355, "learning_rate": 8.845243271971783e-07, "loss": 0.3092, "step": 2390 }, { "epoch": 0.11552398898391071, "grad_norm": 3.2129645347595215, "learning_rate": 8.844760110160893e-07, "loss": 0.4068, "step": 2391 }, { "epoch": 0.11557230516499976, "grad_norm": 3.1020262241363525, "learning_rate": 8.844276948350002e-07, "loss": 0.3455, "step": 2392 }, { "epoch": 0.1156206213460888, "grad_norm": 2.5077061653137207, "learning_rate": 8.843793786539111e-07, "loss": 0.2713, "step": 2393 }, { "epoch": 0.11566893752717786, "grad_norm": 2.93621826171875, "learning_rate": 8.843310624728221e-07, "loss": 0.2439, "step": 2394 }, { "epoch": 0.1157172537082669, "grad_norm": 2.3193631172180176, "learning_rate": 8.84282746291733e-07, "loss": 0.2867, "step": 2395 }, { "epoch": 0.11576556988935595, "grad_norm": 1.6743865013122559, "learning_rate": 8.84234430110644e-07, "loss": 0.1723, "step": 2396 }, { "epoch": 0.11581388607044499, "grad_norm": 4.396361351013184, "learning_rate": 8.84186113929555e-07, "loss": 0.2576, "step": 2397 }, { "epoch": 0.11586220225153404, "grad_norm": 3.365178346633911, "learning_rate": 8.84137797748466e-07, "loss": 0.3525, "step": 2398 }, { "epoch": 0.11591051843262308, "grad_norm": 4.41165828704834, "learning_rate": 8.840894815673769e-07, "loss": 0.2521, "step": 2399 }, { "epoch": 0.11595883461371213, "grad_norm": 2.794438123703003, "learning_rate": 8.840411653862879e-07, "loss": 0.3926, "step": 2400 }, { "epoch": 0.11600715079480117, "grad_norm": 3.6103811264038086, "learning_rate": 8.839928492051987e-07, "loss": 0.3, "step": 2401 }, { "epoch": 0.11605546697589023, "grad_norm": 2.230647087097168, "learning_rate": 8.839445330241097e-07, "loss": 0.2787, "step": 2402 }, { "epoch": 0.11610378315697928, "grad_norm": 1.9676058292388916, "learning_rate": 8.838962168430207e-07, "loss": 0.1472, "step": 2403 }, { "epoch": 0.11615209933806832, "grad_norm": 3.1583685874938965, "learning_rate": 8.838479006619316e-07, "loss": 0.4026, "step": 2404 }, { "epoch": 0.11620041551915737, "grad_norm": 1.9236167669296265, "learning_rate": 8.837995844808426e-07, "loss": 0.2666, "step": 2405 }, { "epoch": 0.11624873170024641, "grad_norm": 2.9798810482025146, "learning_rate": 8.837512682997536e-07, "loss": 0.424, "step": 2406 }, { "epoch": 0.11629704788133546, "grad_norm": 2.5727126598358154, "learning_rate": 8.837029521186646e-07, "loss": 0.3516, "step": 2407 }, { "epoch": 0.1163453640624245, "grad_norm": 4.148901462554932, "learning_rate": 8.836546359375755e-07, "loss": 0.2918, "step": 2408 }, { "epoch": 0.11639368024351356, "grad_norm": 3.043219566345215, "learning_rate": 8.836063197564863e-07, "loss": 0.451, "step": 2409 }, { "epoch": 0.1164419964246026, "grad_norm": 4.028036594390869, "learning_rate": 8.835580035753973e-07, "loss": 0.4195, "step": 2410 }, { "epoch": 0.11649031260569165, "grad_norm": 2.717317819595337, "learning_rate": 8.835096873943083e-07, "loss": 0.2507, "step": 2411 }, { "epoch": 0.11653862878678069, "grad_norm": 2.7639000415802, "learning_rate": 8.834613712132193e-07, "loss": 0.2455, "step": 2412 }, { "epoch": 0.11658694496786974, "grad_norm": 2.955925703048706, "learning_rate": 8.834130550321303e-07, "loss": 0.4078, "step": 2413 }, { "epoch": 0.11663526114895878, "grad_norm": 3.202963352203369, "learning_rate": 8.833647388510411e-07, "loss": 0.2992, "step": 2414 }, { "epoch": 0.11668357733004783, "grad_norm": 2.4016263484954834, "learning_rate": 8.833164226699521e-07, "loss": 0.2525, "step": 2415 }, { "epoch": 0.11673189351113689, "grad_norm": 2.606677293777466, "learning_rate": 8.832681064888631e-07, "loss": 0.3351, "step": 2416 }, { "epoch": 0.11678020969222593, "grad_norm": 3.254004716873169, "learning_rate": 8.83219790307774e-07, "loss": 0.3296, "step": 2417 }, { "epoch": 0.11682852587331498, "grad_norm": 3.287473201751709, "learning_rate": 8.83171474126685e-07, "loss": 0.3289, "step": 2418 }, { "epoch": 0.11687684205440402, "grad_norm": 4.16375732421875, "learning_rate": 8.831231579455959e-07, "loss": 0.3266, "step": 2419 }, { "epoch": 0.11692515823549307, "grad_norm": 2.540750503540039, "learning_rate": 8.830748417645068e-07, "loss": 0.3486, "step": 2420 }, { "epoch": 0.11697347441658211, "grad_norm": 3.171856164932251, "learning_rate": 8.830265255834178e-07, "loss": 0.3531, "step": 2421 }, { "epoch": 0.11702179059767116, "grad_norm": 2.838106632232666, "learning_rate": 8.829782094023288e-07, "loss": 0.2555, "step": 2422 }, { "epoch": 0.1170701067787602, "grad_norm": 2.3375613689422607, "learning_rate": 8.829298932212398e-07, "loss": 0.2037, "step": 2423 }, { "epoch": 0.11711842295984926, "grad_norm": 2.9006810188293457, "learning_rate": 8.828815770401508e-07, "loss": 0.3702, "step": 2424 }, { "epoch": 0.1171667391409383, "grad_norm": 1.7753831148147583, "learning_rate": 8.828332608590617e-07, "loss": 0.1624, "step": 2425 }, { "epoch": 0.11721505532202735, "grad_norm": 2.457486629486084, "learning_rate": 8.827849446779726e-07, "loss": 0.2831, "step": 2426 }, { "epoch": 0.11726337150311639, "grad_norm": 2.3373546600341797, "learning_rate": 8.827366284968835e-07, "loss": 0.2635, "step": 2427 }, { "epoch": 0.11731168768420544, "grad_norm": 2.4407846927642822, "learning_rate": 8.826883123157945e-07, "loss": 0.3109, "step": 2428 }, { "epoch": 0.1173600038652945, "grad_norm": 1.6213868856430054, "learning_rate": 8.826399961347055e-07, "loss": 0.1676, "step": 2429 }, { "epoch": 0.11740832004638353, "grad_norm": 2.6908516883850098, "learning_rate": 8.825916799536164e-07, "loss": 0.3013, "step": 2430 }, { "epoch": 0.11745663622747259, "grad_norm": 3.208487033843994, "learning_rate": 8.825433637725274e-07, "loss": 0.3759, "step": 2431 }, { "epoch": 0.11750495240856162, "grad_norm": 2.6886940002441406, "learning_rate": 8.824950475914384e-07, "loss": 0.3745, "step": 2432 }, { "epoch": 0.11755326858965068, "grad_norm": 2.4133427143096924, "learning_rate": 8.824467314103493e-07, "loss": 0.2959, "step": 2433 }, { "epoch": 0.11760158477073972, "grad_norm": 3.6512629985809326, "learning_rate": 8.823984152292602e-07, "loss": 0.4127, "step": 2434 }, { "epoch": 0.11764990095182877, "grad_norm": 2.572117805480957, "learning_rate": 8.823500990481711e-07, "loss": 0.3283, "step": 2435 }, { "epoch": 0.11769821713291781, "grad_norm": 5.99781608581543, "learning_rate": 8.823017828670821e-07, "loss": 0.2865, "step": 2436 }, { "epoch": 0.11774653331400686, "grad_norm": 2.304506301879883, "learning_rate": 8.822534666859931e-07, "loss": 0.2358, "step": 2437 }, { "epoch": 0.1177948494950959, "grad_norm": 3.530296564102173, "learning_rate": 8.822051505049041e-07, "loss": 0.2779, "step": 2438 }, { "epoch": 0.11784316567618495, "grad_norm": 2.756014347076416, "learning_rate": 8.821568343238151e-07, "loss": 0.3005, "step": 2439 }, { "epoch": 0.117891481857274, "grad_norm": 3.2264063358306885, "learning_rate": 8.821085181427259e-07, "loss": 0.285, "step": 2440 }, { "epoch": 0.11793979803836305, "grad_norm": 8.489005088806152, "learning_rate": 8.820602019616369e-07, "loss": 0.2995, "step": 2441 }, { "epoch": 0.1179881142194521, "grad_norm": 3.22530198097229, "learning_rate": 8.820118857805479e-07, "loss": 0.3951, "step": 2442 }, { "epoch": 0.11803643040054114, "grad_norm": 2.940227508544922, "learning_rate": 8.819635695994588e-07, "loss": 0.3348, "step": 2443 }, { "epoch": 0.11808474658163019, "grad_norm": 3.0723822116851807, "learning_rate": 8.819152534183698e-07, "loss": 0.3297, "step": 2444 }, { "epoch": 0.11813306276271923, "grad_norm": 2.2141854763031006, "learning_rate": 8.818669372372807e-07, "loss": 0.2256, "step": 2445 }, { "epoch": 0.11818137894380829, "grad_norm": 4.206521987915039, "learning_rate": 8.818186210561916e-07, "loss": 0.302, "step": 2446 }, { "epoch": 0.11822969512489732, "grad_norm": 2.549488067626953, "learning_rate": 8.817703048751026e-07, "loss": 0.3247, "step": 2447 }, { "epoch": 0.11827801130598638, "grad_norm": 2.4486517906188965, "learning_rate": 8.817219886940136e-07, "loss": 0.2862, "step": 2448 }, { "epoch": 0.11832632748707542, "grad_norm": 1.9822276830673218, "learning_rate": 8.816736725129246e-07, "loss": 0.2413, "step": 2449 }, { "epoch": 0.11837464366816447, "grad_norm": 6.586766719818115, "learning_rate": 8.816253563318356e-07, "loss": 0.3446, "step": 2450 }, { "epoch": 0.11842295984925351, "grad_norm": 2.862079381942749, "learning_rate": 8.815770401507464e-07, "loss": 0.3094, "step": 2451 }, { "epoch": 0.11847127603034256, "grad_norm": 3.0604329109191895, "learning_rate": 8.815287239696574e-07, "loss": 0.2112, "step": 2452 }, { "epoch": 0.11851959221143162, "grad_norm": 2.547900915145874, "learning_rate": 8.814804077885683e-07, "loss": 0.3272, "step": 2453 }, { "epoch": 0.11856790839252065, "grad_norm": 2.278581380844116, "learning_rate": 8.814320916074793e-07, "loss": 0.2244, "step": 2454 }, { "epoch": 0.11861622457360971, "grad_norm": 2.6859819889068604, "learning_rate": 8.813837754263903e-07, "loss": 0.3501, "step": 2455 }, { "epoch": 0.11866454075469875, "grad_norm": 2.5064711570739746, "learning_rate": 8.813354592453012e-07, "loss": 0.32, "step": 2456 }, { "epoch": 0.1187128569357878, "grad_norm": 2.6905598640441895, "learning_rate": 8.812871430642122e-07, "loss": 0.3483, "step": 2457 }, { "epoch": 0.11876117311687684, "grad_norm": 3.2649691104888916, "learning_rate": 8.812388268831232e-07, "loss": 0.424, "step": 2458 }, { "epoch": 0.11880948929796589, "grad_norm": 2.9363675117492676, "learning_rate": 8.81190510702034e-07, "loss": 0.4104, "step": 2459 }, { "epoch": 0.11885780547905493, "grad_norm": 3.961817502975464, "learning_rate": 8.81142194520945e-07, "loss": 0.2523, "step": 2460 }, { "epoch": 0.11890612166014398, "grad_norm": 3.494259834289551, "learning_rate": 8.810938783398559e-07, "loss": 0.2974, "step": 2461 }, { "epoch": 0.11895443784123302, "grad_norm": 2.7921218872070312, "learning_rate": 8.810455621587669e-07, "loss": 0.4255, "step": 2462 }, { "epoch": 0.11900275402232208, "grad_norm": 11.446711540222168, "learning_rate": 8.809972459776779e-07, "loss": 0.2428, "step": 2463 }, { "epoch": 0.11905107020341112, "grad_norm": 2.5375256538391113, "learning_rate": 8.809489297965889e-07, "loss": 0.2792, "step": 2464 }, { "epoch": 0.11909938638450017, "grad_norm": 2.0773630142211914, "learning_rate": 8.809006136154998e-07, "loss": 0.21, "step": 2465 }, { "epoch": 0.11914770256558922, "grad_norm": 2.5341954231262207, "learning_rate": 8.808522974344107e-07, "loss": 0.3802, "step": 2466 }, { "epoch": 0.11919601874667826, "grad_norm": 4.044219970703125, "learning_rate": 8.808039812533217e-07, "loss": 0.3473, "step": 2467 }, { "epoch": 0.11924433492776731, "grad_norm": 3.3963589668273926, "learning_rate": 8.807556650722326e-07, "loss": 0.2833, "step": 2468 }, { "epoch": 0.11929265110885635, "grad_norm": 2.5694923400878906, "learning_rate": 8.807073488911436e-07, "loss": 0.2172, "step": 2469 }, { "epoch": 0.11934096728994541, "grad_norm": 4.37272310256958, "learning_rate": 8.806590327100546e-07, "loss": 0.3851, "step": 2470 }, { "epoch": 0.11938928347103445, "grad_norm": 2.4811413288116455, "learning_rate": 8.806107165289655e-07, "loss": 0.3191, "step": 2471 }, { "epoch": 0.1194375996521235, "grad_norm": 3.3198466300964355, "learning_rate": 8.805624003478764e-07, "loss": 0.3981, "step": 2472 }, { "epoch": 0.11948591583321254, "grad_norm": 1.8267532587051392, "learning_rate": 8.805140841667874e-07, "loss": 0.2159, "step": 2473 }, { "epoch": 0.11953423201430159, "grad_norm": 6.322642803192139, "learning_rate": 8.804657679856984e-07, "loss": 0.3915, "step": 2474 }, { "epoch": 0.11958254819539063, "grad_norm": 8.520735740661621, "learning_rate": 8.804174518046094e-07, "loss": 0.3416, "step": 2475 }, { "epoch": 0.11963086437647968, "grad_norm": 2.5578644275665283, "learning_rate": 8.803691356235204e-07, "loss": 0.2315, "step": 2476 }, { "epoch": 0.11967918055756872, "grad_norm": 6.002416610717773, "learning_rate": 8.803208194424312e-07, "loss": 0.246, "step": 2477 }, { "epoch": 0.11972749673865778, "grad_norm": 3.5677847862243652, "learning_rate": 8.802725032613421e-07, "loss": 0.252, "step": 2478 }, { "epoch": 0.11977581291974683, "grad_norm": 2.780714511871338, "learning_rate": 8.802241870802531e-07, "loss": 0.4061, "step": 2479 }, { "epoch": 0.11982412910083587, "grad_norm": 3.275657892227173, "learning_rate": 8.801758708991641e-07, "loss": 0.324, "step": 2480 }, { "epoch": 0.11987244528192492, "grad_norm": 2.2201530933380127, "learning_rate": 8.801275547180751e-07, "loss": 0.2738, "step": 2481 }, { "epoch": 0.11992076146301396, "grad_norm": 2.2595865726470947, "learning_rate": 8.80079238536986e-07, "loss": 0.2357, "step": 2482 }, { "epoch": 0.11996907764410301, "grad_norm": 4.0910115242004395, "learning_rate": 8.80030922355897e-07, "loss": 0.3506, "step": 2483 }, { "epoch": 0.12001739382519205, "grad_norm": 3.0581088066101074, "learning_rate": 8.79982606174808e-07, "loss": 0.4408, "step": 2484 }, { "epoch": 0.1200657100062811, "grad_norm": 2.699586868286133, "learning_rate": 8.799342899937188e-07, "loss": 0.3862, "step": 2485 }, { "epoch": 0.12011402618737015, "grad_norm": 2.7967491149902344, "learning_rate": 8.798859738126298e-07, "loss": 0.3677, "step": 2486 }, { "epoch": 0.1201623423684592, "grad_norm": 2.7507224082946777, "learning_rate": 8.798376576315407e-07, "loss": 0.3926, "step": 2487 }, { "epoch": 0.12021065854954824, "grad_norm": 2.2668004035949707, "learning_rate": 8.797893414504517e-07, "loss": 0.2199, "step": 2488 }, { "epoch": 0.12025897473063729, "grad_norm": 2.095376968383789, "learning_rate": 8.797410252693627e-07, "loss": 0.2162, "step": 2489 }, { "epoch": 0.12030729091172633, "grad_norm": 2.0303685665130615, "learning_rate": 8.796927090882737e-07, "loss": 0.2645, "step": 2490 }, { "epoch": 0.12035560709281538, "grad_norm": 2.04852557182312, "learning_rate": 8.796443929071846e-07, "loss": 0.1888, "step": 2491 }, { "epoch": 0.12040392327390444, "grad_norm": 2.473123550415039, "learning_rate": 8.795960767260955e-07, "loss": 0.2768, "step": 2492 }, { "epoch": 0.12045223945499348, "grad_norm": 3.289851665496826, "learning_rate": 8.795477605450064e-07, "loss": 0.3282, "step": 2493 }, { "epoch": 0.12050055563608253, "grad_norm": 3.1161177158355713, "learning_rate": 8.794994443639174e-07, "loss": 0.526, "step": 2494 }, { "epoch": 0.12054887181717157, "grad_norm": 2.826977014541626, "learning_rate": 8.794511281828284e-07, "loss": 0.3682, "step": 2495 }, { "epoch": 0.12059718799826062, "grad_norm": 3.1638195514678955, "learning_rate": 8.794028120017394e-07, "loss": 0.479, "step": 2496 }, { "epoch": 0.12064550417934966, "grad_norm": 8.709050178527832, "learning_rate": 8.793544958206502e-07, "loss": 0.5481, "step": 2497 }, { "epoch": 0.12069382036043871, "grad_norm": 3.38114070892334, "learning_rate": 8.793061796395612e-07, "loss": 0.47, "step": 2498 }, { "epoch": 0.12074213654152775, "grad_norm": 2.4078187942504883, "learning_rate": 8.792578634584722e-07, "loss": 0.2956, "step": 2499 }, { "epoch": 0.1207904527226168, "grad_norm": 10.760485649108887, "learning_rate": 8.792095472773832e-07, "loss": 0.2237, "step": 2500 }, { "epoch": 0.12083876890370585, "grad_norm": 3.2063772678375244, "learning_rate": 8.791612310962942e-07, "loss": 0.3663, "step": 2501 }, { "epoch": 0.1208870850847949, "grad_norm": 2.464689254760742, "learning_rate": 8.791129149152051e-07, "loss": 0.3544, "step": 2502 }, { "epoch": 0.12093540126588394, "grad_norm": 6.721715450286865, "learning_rate": 8.79064598734116e-07, "loss": 0.2257, "step": 2503 }, { "epoch": 0.12098371744697299, "grad_norm": 2.708156108856201, "learning_rate": 8.790162825530269e-07, "loss": 0.3939, "step": 2504 }, { "epoch": 0.12103203362806204, "grad_norm": 3.885408401489258, "learning_rate": 8.789679663719379e-07, "loss": 0.3044, "step": 2505 }, { "epoch": 0.12108034980915108, "grad_norm": 2.9514389038085938, "learning_rate": 8.789196501908489e-07, "loss": 0.3252, "step": 2506 }, { "epoch": 0.12112866599024014, "grad_norm": 2.7554867267608643, "learning_rate": 8.788713340097599e-07, "loss": 0.372, "step": 2507 }, { "epoch": 0.12117698217132918, "grad_norm": 3.2229323387145996, "learning_rate": 8.788230178286708e-07, "loss": 0.4073, "step": 2508 }, { "epoch": 0.12122529835241823, "grad_norm": 3.814176559448242, "learning_rate": 8.787747016475818e-07, "loss": 0.4552, "step": 2509 }, { "epoch": 0.12127361453350727, "grad_norm": 2.455376625061035, "learning_rate": 8.787263854664926e-07, "loss": 0.2992, "step": 2510 }, { "epoch": 0.12132193071459632, "grad_norm": 2.513532876968384, "learning_rate": 8.786780692854036e-07, "loss": 0.274, "step": 2511 }, { "epoch": 0.12137024689568536, "grad_norm": 3.1485443115234375, "learning_rate": 8.786297531043146e-07, "loss": 0.4163, "step": 2512 }, { "epoch": 0.12141856307677441, "grad_norm": 2.421776056289673, "learning_rate": 8.785814369232255e-07, "loss": 0.2708, "step": 2513 }, { "epoch": 0.12146687925786345, "grad_norm": 2.8491790294647217, "learning_rate": 8.785331207421365e-07, "loss": 0.3023, "step": 2514 }, { "epoch": 0.1215151954389525, "grad_norm": 2.749758005142212, "learning_rate": 8.784848045610475e-07, "loss": 0.3705, "step": 2515 }, { "epoch": 0.12156351162004154, "grad_norm": 3.206486463546753, "learning_rate": 8.784364883799585e-07, "loss": 0.3468, "step": 2516 }, { "epoch": 0.1216118278011306, "grad_norm": 2.7599503993988037, "learning_rate": 8.783881721988694e-07, "loss": 0.3875, "step": 2517 }, { "epoch": 0.12166014398221965, "grad_norm": 3.759039878845215, "learning_rate": 8.783398560177802e-07, "loss": 0.4152, "step": 2518 }, { "epoch": 0.12170846016330869, "grad_norm": 1.8437825441360474, "learning_rate": 8.782915398366912e-07, "loss": 0.2285, "step": 2519 }, { "epoch": 0.12175677634439774, "grad_norm": 2.516550064086914, "learning_rate": 8.782432236556022e-07, "loss": 0.3322, "step": 2520 }, { "epoch": 0.12180509252548678, "grad_norm": 3.0075740814208984, "learning_rate": 8.781949074745132e-07, "loss": 0.3668, "step": 2521 }, { "epoch": 0.12185340870657584, "grad_norm": 2.5683398246765137, "learning_rate": 8.781465912934242e-07, "loss": 0.2736, "step": 2522 }, { "epoch": 0.12190172488766488, "grad_norm": 3.716181516647339, "learning_rate": 8.78098275112335e-07, "loss": 0.3206, "step": 2523 }, { "epoch": 0.12195004106875393, "grad_norm": 2.380643844604492, "learning_rate": 8.78049958931246e-07, "loss": 0.2334, "step": 2524 }, { "epoch": 0.12199835724984297, "grad_norm": 2.0984225273132324, "learning_rate": 8.78001642750157e-07, "loss": 0.2015, "step": 2525 }, { "epoch": 0.12204667343093202, "grad_norm": 3.0142133235931396, "learning_rate": 8.77953326569068e-07, "loss": 0.3119, "step": 2526 }, { "epoch": 0.12209498961202106, "grad_norm": 2.1147379875183105, "learning_rate": 8.77905010387979e-07, "loss": 0.2572, "step": 2527 }, { "epoch": 0.12214330579311011, "grad_norm": 3.2102653980255127, "learning_rate": 8.778566942068899e-07, "loss": 0.4786, "step": 2528 }, { "epoch": 0.12219162197419917, "grad_norm": 2.402111291885376, "learning_rate": 8.778083780258007e-07, "loss": 0.2607, "step": 2529 }, { "epoch": 0.1222399381552882, "grad_norm": 2.5587329864501953, "learning_rate": 8.777600618447117e-07, "loss": 0.443, "step": 2530 }, { "epoch": 0.12228825433637726, "grad_norm": 1.7535089254379272, "learning_rate": 8.777117456636227e-07, "loss": 0.2212, "step": 2531 }, { "epoch": 0.1223365705174663, "grad_norm": 3.0532326698303223, "learning_rate": 8.776634294825337e-07, "loss": 0.3792, "step": 2532 }, { "epoch": 0.12238488669855535, "grad_norm": 4.07346773147583, "learning_rate": 8.776151133014447e-07, "loss": 0.3318, "step": 2533 }, { "epoch": 0.12243320287964439, "grad_norm": 3.7084763050079346, "learning_rate": 8.775667971203556e-07, "loss": 0.2898, "step": 2534 }, { "epoch": 0.12248151906073344, "grad_norm": 2.957853317260742, "learning_rate": 8.775184809392666e-07, "loss": 0.2876, "step": 2535 }, { "epoch": 0.12252983524182248, "grad_norm": 6.614701271057129, "learning_rate": 8.774701647581774e-07, "loss": 0.4995, "step": 2536 }, { "epoch": 0.12257815142291154, "grad_norm": 5.923469543457031, "learning_rate": 8.774218485770884e-07, "loss": 0.2777, "step": 2537 }, { "epoch": 0.12262646760400057, "grad_norm": 2.3593544960021973, "learning_rate": 8.773735323959994e-07, "loss": 0.3064, "step": 2538 }, { "epoch": 0.12267478378508963, "grad_norm": 3.251993417739868, "learning_rate": 8.773252162149103e-07, "loss": 0.2692, "step": 2539 }, { "epoch": 0.12272309996617867, "grad_norm": 2.3671727180480957, "learning_rate": 8.772769000338213e-07, "loss": 0.3378, "step": 2540 }, { "epoch": 0.12277141614726772, "grad_norm": 3.647601842880249, "learning_rate": 8.772285838527323e-07, "loss": 0.2928, "step": 2541 }, { "epoch": 0.12281973232835677, "grad_norm": 3.331488609313965, "learning_rate": 8.771802676716432e-07, "loss": 0.4144, "step": 2542 }, { "epoch": 0.12286804850944581, "grad_norm": 3.09098219871521, "learning_rate": 8.771319514905542e-07, "loss": 0.3881, "step": 2543 }, { "epoch": 0.12291636469053487, "grad_norm": 2.748107671737671, "learning_rate": 8.77083635309465e-07, "loss": 0.2872, "step": 2544 }, { "epoch": 0.1229646808716239, "grad_norm": 3.3320398330688477, "learning_rate": 8.77035319128376e-07, "loss": 0.3059, "step": 2545 }, { "epoch": 0.12301299705271296, "grad_norm": 5.198957920074463, "learning_rate": 8.76987002947287e-07, "loss": 0.3656, "step": 2546 }, { "epoch": 0.123061313233802, "grad_norm": 2.58319091796875, "learning_rate": 8.76938686766198e-07, "loss": 0.2354, "step": 2547 }, { "epoch": 0.12310962941489105, "grad_norm": 2.846827507019043, "learning_rate": 8.76890370585109e-07, "loss": 0.2494, "step": 2548 }, { "epoch": 0.12315794559598009, "grad_norm": 2.0916786193847656, "learning_rate": 8.768420544040198e-07, "loss": 0.2799, "step": 2549 }, { "epoch": 0.12320626177706914, "grad_norm": 2.1319549083709717, "learning_rate": 8.767937382229308e-07, "loss": 0.2295, "step": 2550 }, { "epoch": 0.12325457795815818, "grad_norm": 8.582210540771484, "learning_rate": 8.767454220418418e-07, "loss": 0.3532, "step": 2551 }, { "epoch": 0.12330289413924723, "grad_norm": 1.4600765705108643, "learning_rate": 8.766971058607528e-07, "loss": 0.1693, "step": 2552 }, { "epoch": 0.12335121032033627, "grad_norm": 1.938454031944275, "learning_rate": 8.766487896796637e-07, "loss": 0.2506, "step": 2553 }, { "epoch": 0.12339952650142533, "grad_norm": 2.5201425552368164, "learning_rate": 8.766004734985746e-07, "loss": 0.2223, "step": 2554 }, { "epoch": 0.12344784268251438, "grad_norm": 2.6018640995025635, "learning_rate": 8.765521573174855e-07, "loss": 0.2782, "step": 2555 }, { "epoch": 0.12349615886360342, "grad_norm": 6.759713649749756, "learning_rate": 8.765038411363965e-07, "loss": 0.4206, "step": 2556 }, { "epoch": 0.12354447504469247, "grad_norm": 2.202219247817993, "learning_rate": 8.764555249553075e-07, "loss": 0.2333, "step": 2557 }, { "epoch": 0.12359279122578151, "grad_norm": 2.8955860137939453, "learning_rate": 8.764072087742185e-07, "loss": 0.4256, "step": 2558 }, { "epoch": 0.12364110740687057, "grad_norm": 40.312076568603516, "learning_rate": 8.763588925931295e-07, "loss": 0.3182, "step": 2559 }, { "epoch": 0.1236894235879596, "grad_norm": 2.1844258308410645, "learning_rate": 8.763105764120404e-07, "loss": 0.2507, "step": 2560 }, { "epoch": 0.12373773976904866, "grad_norm": 2.6787257194519043, "learning_rate": 8.762622602309512e-07, "loss": 0.3094, "step": 2561 }, { "epoch": 0.1237860559501377, "grad_norm": 2.5868453979492188, "learning_rate": 8.762139440498622e-07, "loss": 0.2736, "step": 2562 }, { "epoch": 0.12383437213122675, "grad_norm": 1.6122422218322754, "learning_rate": 8.761656278687732e-07, "loss": 0.1711, "step": 2563 }, { "epoch": 0.12388268831231579, "grad_norm": 2.5644147396087646, "learning_rate": 8.761173116876842e-07, "loss": 0.3286, "step": 2564 }, { "epoch": 0.12393100449340484, "grad_norm": 2.8768553733825684, "learning_rate": 8.760689955065951e-07, "loss": 0.3231, "step": 2565 }, { "epoch": 0.12397932067449388, "grad_norm": 2.254188060760498, "learning_rate": 8.760206793255061e-07, "loss": 0.2874, "step": 2566 }, { "epoch": 0.12402763685558293, "grad_norm": 2.73983097076416, "learning_rate": 8.759723631444171e-07, "loss": 0.2319, "step": 2567 }, { "epoch": 0.12407595303667199, "grad_norm": 1.7424302101135254, "learning_rate": 8.75924046963328e-07, "loss": 0.1752, "step": 2568 }, { "epoch": 0.12412426921776103, "grad_norm": 5.096596717834473, "learning_rate": 8.75875730782239e-07, "loss": 0.4055, "step": 2569 }, { "epoch": 0.12417258539885008, "grad_norm": 3.1276795864105225, "learning_rate": 8.758274146011498e-07, "loss": 0.3875, "step": 2570 }, { "epoch": 0.12422090157993912, "grad_norm": 5.251519203186035, "learning_rate": 8.757790984200608e-07, "loss": 0.3483, "step": 2571 }, { "epoch": 0.12426921776102817, "grad_norm": 3.7606801986694336, "learning_rate": 8.757307822389718e-07, "loss": 0.2307, "step": 2572 }, { "epoch": 0.12431753394211721, "grad_norm": 2.578868865966797, "learning_rate": 8.756824660578828e-07, "loss": 0.2328, "step": 2573 }, { "epoch": 0.12436585012320626, "grad_norm": 2.3540945053100586, "learning_rate": 8.756341498767937e-07, "loss": 0.2285, "step": 2574 }, { "epoch": 0.1244141663042953, "grad_norm": 2.4326670169830322, "learning_rate": 8.755858336957046e-07, "loss": 0.2505, "step": 2575 }, { "epoch": 0.12446248248538436, "grad_norm": 3.2501680850982666, "learning_rate": 8.755375175146156e-07, "loss": 0.333, "step": 2576 }, { "epoch": 0.1245107986664734, "grad_norm": 2.7435712814331055, "learning_rate": 8.754892013335266e-07, "loss": 0.2369, "step": 2577 }, { "epoch": 0.12455911484756245, "grad_norm": 2.635615825653076, "learning_rate": 8.754408851524375e-07, "loss": 0.249, "step": 2578 }, { "epoch": 0.12460743102865149, "grad_norm": 2.6792678833007812, "learning_rate": 8.753925689713485e-07, "loss": 0.3679, "step": 2579 }, { "epoch": 0.12465574720974054, "grad_norm": 3.5366458892822266, "learning_rate": 8.753442527902593e-07, "loss": 0.2967, "step": 2580 }, { "epoch": 0.1247040633908296, "grad_norm": 2.3829944133758545, "learning_rate": 8.752959366091703e-07, "loss": 0.2587, "step": 2581 }, { "epoch": 0.12475237957191863, "grad_norm": 3.000523567199707, "learning_rate": 8.752476204280813e-07, "loss": 0.204, "step": 2582 }, { "epoch": 0.12480069575300769, "grad_norm": 2.6335535049438477, "learning_rate": 8.751993042469923e-07, "loss": 0.2669, "step": 2583 }, { "epoch": 0.12484901193409673, "grad_norm": 5.0025763511657715, "learning_rate": 8.751509880659033e-07, "loss": 0.3757, "step": 2584 }, { "epoch": 0.12489732811518578, "grad_norm": 3.504944086074829, "learning_rate": 8.751026718848143e-07, "loss": 0.4975, "step": 2585 }, { "epoch": 0.12494564429627482, "grad_norm": 1.9372817277908325, "learning_rate": 8.750543557037251e-07, "loss": 0.2428, "step": 2586 }, { "epoch": 0.12499396047736387, "grad_norm": 2.0724849700927734, "learning_rate": 8.75006039522636e-07, "loss": 0.1925, "step": 2587 }, { "epoch": 0.1250422766584529, "grad_norm": 1.8455744981765747, "learning_rate": 8.74957723341547e-07, "loss": 0.2147, "step": 2588 }, { "epoch": 0.12509059283954196, "grad_norm": 3.6028780937194824, "learning_rate": 8.74909407160458e-07, "loss": 0.5106, "step": 2589 }, { "epoch": 0.12513890902063102, "grad_norm": 4.898664474487305, "learning_rate": 8.74861090979369e-07, "loss": 0.3695, "step": 2590 }, { "epoch": 0.12518722520172004, "grad_norm": 3.707951068878174, "learning_rate": 8.748127747982799e-07, "loss": 0.3264, "step": 2591 }, { "epoch": 0.1252355413828091, "grad_norm": 2.6948535442352295, "learning_rate": 8.747644586171909e-07, "loss": 0.2198, "step": 2592 }, { "epoch": 0.12528385756389815, "grad_norm": 2.1505699157714844, "learning_rate": 8.747161424361018e-07, "loss": 0.3107, "step": 2593 }, { "epoch": 0.1253321737449872, "grad_norm": 2.6988325119018555, "learning_rate": 8.746678262550128e-07, "loss": 0.3669, "step": 2594 }, { "epoch": 0.12538048992607626, "grad_norm": 2.3571271896362305, "learning_rate": 8.746195100739237e-07, "loss": 0.2218, "step": 2595 }, { "epoch": 0.12542880610716528, "grad_norm": 2.9159436225891113, "learning_rate": 8.745711938928346e-07, "loss": 0.3433, "step": 2596 }, { "epoch": 0.12547712228825433, "grad_norm": 1.7353500127792358, "learning_rate": 8.745228777117456e-07, "loss": 0.1685, "step": 2597 }, { "epoch": 0.1255254384693434, "grad_norm": 4.112872123718262, "learning_rate": 8.744745615306566e-07, "loss": 0.1528, "step": 2598 }, { "epoch": 0.12557375465043244, "grad_norm": 2.2733447551727295, "learning_rate": 8.744262453495676e-07, "loss": 0.2987, "step": 2599 }, { "epoch": 0.12562207083152147, "grad_norm": 2.5628418922424316, "learning_rate": 8.743779291684785e-07, "loss": 0.2878, "step": 2600 }, { "epoch": 0.12567038701261052, "grad_norm": 2.1603522300720215, "learning_rate": 8.743296129873894e-07, "loss": 0.2852, "step": 2601 }, { "epoch": 0.12571870319369957, "grad_norm": 4.113892078399658, "learning_rate": 8.742812968063004e-07, "loss": 0.2102, "step": 2602 }, { "epoch": 0.12576701937478862, "grad_norm": 2.8916304111480713, "learning_rate": 8.742329806252113e-07, "loss": 0.3071, "step": 2603 }, { "epoch": 0.12581533555587765, "grad_norm": 3.4270899295806885, "learning_rate": 8.741846644441223e-07, "loss": 0.2829, "step": 2604 }, { "epoch": 0.1258636517369667, "grad_norm": 2.7290198802948, "learning_rate": 8.741363482630333e-07, "loss": 0.2834, "step": 2605 }, { "epoch": 0.12591196791805576, "grad_norm": 2.561495542526245, "learning_rate": 8.740880320819441e-07, "loss": 0.3119, "step": 2606 }, { "epoch": 0.1259602840991448, "grad_norm": 1.7310001850128174, "learning_rate": 8.740397159008551e-07, "loss": 0.2144, "step": 2607 }, { "epoch": 0.12600860028023386, "grad_norm": 1.9527513980865479, "learning_rate": 8.739913997197661e-07, "loss": 0.2117, "step": 2608 }, { "epoch": 0.1260569164613229, "grad_norm": 3.218355178833008, "learning_rate": 8.739430835386771e-07, "loss": 0.3205, "step": 2609 }, { "epoch": 0.12610523264241194, "grad_norm": 2.629610061645508, "learning_rate": 8.738947673575881e-07, "loss": 0.2475, "step": 2610 }, { "epoch": 0.126153548823501, "grad_norm": 2.878283977508545, "learning_rate": 8.738464511764991e-07, "loss": 0.3306, "step": 2611 }, { "epoch": 0.12620186500459005, "grad_norm": 2.3844642639160156, "learning_rate": 8.737981349954098e-07, "loss": 0.3675, "step": 2612 }, { "epoch": 0.12625018118567907, "grad_norm": 1.9592992067337036, "learning_rate": 8.737498188143208e-07, "loss": 0.2017, "step": 2613 }, { "epoch": 0.12629849736676813, "grad_norm": 3.5119807720184326, "learning_rate": 8.737015026332318e-07, "loss": 0.4104, "step": 2614 }, { "epoch": 0.12634681354785718, "grad_norm": 2.408578395843506, "learning_rate": 8.736531864521428e-07, "loss": 0.2885, "step": 2615 }, { "epoch": 0.12639512972894623, "grad_norm": 2.4164624214172363, "learning_rate": 8.736048702710538e-07, "loss": 0.2227, "step": 2616 }, { "epoch": 0.12644344591003526, "grad_norm": 45.51201629638672, "learning_rate": 8.735565540899647e-07, "loss": 0.281, "step": 2617 }, { "epoch": 0.1264917620911243, "grad_norm": 4.708984375, "learning_rate": 8.735082379088757e-07, "loss": 0.4137, "step": 2618 }, { "epoch": 0.12654007827221336, "grad_norm": 6.704923629760742, "learning_rate": 8.734599217277866e-07, "loss": 0.2317, "step": 2619 }, { "epoch": 0.12658839445330242, "grad_norm": 4.125304222106934, "learning_rate": 8.734116055466975e-07, "loss": 0.3714, "step": 2620 }, { "epoch": 0.12663671063439147, "grad_norm": 2.3046047687530518, "learning_rate": 8.733632893656085e-07, "loss": 0.2788, "step": 2621 }, { "epoch": 0.1266850268154805, "grad_norm": 1.9812064170837402, "learning_rate": 8.733149731845194e-07, "loss": 0.2572, "step": 2622 }, { "epoch": 0.12673334299656955, "grad_norm": 4.087294101715088, "learning_rate": 8.732666570034304e-07, "loss": 0.2956, "step": 2623 }, { "epoch": 0.1267816591776586, "grad_norm": 2.6639997959136963, "learning_rate": 8.732183408223414e-07, "loss": 0.222, "step": 2624 }, { "epoch": 0.12682997535874765, "grad_norm": 3.020033359527588, "learning_rate": 8.731700246412523e-07, "loss": 0.352, "step": 2625 }, { "epoch": 0.12687829153983668, "grad_norm": 14.26534652709961, "learning_rate": 8.731217084601633e-07, "loss": 0.3055, "step": 2626 }, { "epoch": 0.12692660772092573, "grad_norm": 2.137787103652954, "learning_rate": 8.730733922790742e-07, "loss": 0.2118, "step": 2627 }, { "epoch": 0.12697492390201479, "grad_norm": 3.7545247077941895, "learning_rate": 8.730250760979852e-07, "loss": 0.298, "step": 2628 }, { "epoch": 0.12702324008310384, "grad_norm": 26.090778350830078, "learning_rate": 8.729767599168961e-07, "loss": 0.3512, "step": 2629 }, { "epoch": 0.1270715562641929, "grad_norm": 2.9922566413879395, "learning_rate": 8.729284437358071e-07, "loss": 0.3319, "step": 2630 }, { "epoch": 0.12711987244528192, "grad_norm": 2.1092982292175293, "learning_rate": 8.728801275547181e-07, "loss": 0.2426, "step": 2631 }, { "epoch": 0.12716818862637097, "grad_norm": 2.453057289123535, "learning_rate": 8.728318113736289e-07, "loss": 0.3917, "step": 2632 }, { "epoch": 0.12721650480746002, "grad_norm": 2.051279067993164, "learning_rate": 8.727834951925399e-07, "loss": 0.2493, "step": 2633 }, { "epoch": 0.12726482098854908, "grad_norm": 3.2152793407440186, "learning_rate": 8.727351790114509e-07, "loss": 0.3658, "step": 2634 }, { "epoch": 0.1273131371696381, "grad_norm": 3.7355704307556152, "learning_rate": 8.726868628303619e-07, "loss": 0.4588, "step": 2635 }, { "epoch": 0.12736145335072716, "grad_norm": 2.5899839401245117, "learning_rate": 8.726385466492729e-07, "loss": 0.2751, "step": 2636 }, { "epoch": 0.1274097695318162, "grad_norm": 2.933659315109253, "learning_rate": 8.725902304681839e-07, "loss": 0.2039, "step": 2637 }, { "epoch": 0.12745808571290526, "grad_norm": 3.8357152938842773, "learning_rate": 8.725419142870946e-07, "loss": 0.2762, "step": 2638 }, { "epoch": 0.1275064018939943, "grad_norm": 4.71292781829834, "learning_rate": 8.724935981060056e-07, "loss": 0.2746, "step": 2639 }, { "epoch": 0.12755471807508334, "grad_norm": 3.9693312644958496, "learning_rate": 8.724452819249166e-07, "loss": 0.3707, "step": 2640 }, { "epoch": 0.1276030342561724, "grad_norm": 7.653265476226807, "learning_rate": 8.723969657438276e-07, "loss": 0.3856, "step": 2641 }, { "epoch": 0.12765135043726145, "grad_norm": 4.016003131866455, "learning_rate": 8.723486495627386e-07, "loss": 0.3837, "step": 2642 }, { "epoch": 0.1276996666183505, "grad_norm": 2.6322343349456787, "learning_rate": 8.723003333816495e-07, "loss": 0.3121, "step": 2643 }, { "epoch": 0.12774798279943952, "grad_norm": 4.866697788238525, "learning_rate": 8.722520172005604e-07, "loss": 0.2352, "step": 2644 }, { "epoch": 0.12779629898052858, "grad_norm": 3.146815299987793, "learning_rate": 8.722037010194713e-07, "loss": 0.3727, "step": 2645 }, { "epoch": 0.12784461516161763, "grad_norm": 5.165810585021973, "learning_rate": 8.721553848383823e-07, "loss": 0.385, "step": 2646 }, { "epoch": 0.12789293134270668, "grad_norm": 2.145902156829834, "learning_rate": 8.721070686572933e-07, "loss": 0.2331, "step": 2647 }, { "epoch": 0.1279412475237957, "grad_norm": 3.389313220977783, "learning_rate": 8.720587524762042e-07, "loss": 0.2708, "step": 2648 }, { "epoch": 0.12798956370488476, "grad_norm": 2.634920597076416, "learning_rate": 8.720104362951152e-07, "loss": 0.2938, "step": 2649 }, { "epoch": 0.12803787988597382, "grad_norm": 2.5020153522491455, "learning_rate": 8.719621201140262e-07, "loss": 0.2667, "step": 2650 }, { "epoch": 0.12808619606706287, "grad_norm": 3.813955307006836, "learning_rate": 8.719138039329371e-07, "loss": 0.5134, "step": 2651 }, { "epoch": 0.1281345122481519, "grad_norm": 2.713226795196533, "learning_rate": 8.718654877518481e-07, "loss": 0.272, "step": 2652 }, { "epoch": 0.12818282842924095, "grad_norm": 2.711160182952881, "learning_rate": 8.71817171570759e-07, "loss": 0.2829, "step": 2653 }, { "epoch": 0.12823114461033, "grad_norm": 2.6052560806274414, "learning_rate": 8.717688553896699e-07, "loss": 0.2808, "step": 2654 }, { "epoch": 0.12827946079141905, "grad_norm": 3.171461343765259, "learning_rate": 8.717205392085809e-07, "loss": 0.4626, "step": 2655 }, { "epoch": 0.1283277769725081, "grad_norm": 3.5980613231658936, "learning_rate": 8.716722230274919e-07, "loss": 0.3994, "step": 2656 }, { "epoch": 0.12837609315359713, "grad_norm": 3.2162985801696777, "learning_rate": 8.716239068464028e-07, "loss": 0.4004, "step": 2657 }, { "epoch": 0.12842440933468618, "grad_norm": 1.5570504665374756, "learning_rate": 8.715755906653137e-07, "loss": 0.1971, "step": 2658 }, { "epoch": 0.12847272551577524, "grad_norm": 3.532236337661743, "learning_rate": 8.715272744842247e-07, "loss": 0.4416, "step": 2659 }, { "epoch": 0.1285210416968643, "grad_norm": 2.857842445373535, "learning_rate": 8.714789583031357e-07, "loss": 0.4197, "step": 2660 }, { "epoch": 0.12856935787795332, "grad_norm": 3.511122941970825, "learning_rate": 8.714306421220467e-07, "loss": 0.3849, "step": 2661 }, { "epoch": 0.12861767405904237, "grad_norm": 9.866250991821289, "learning_rate": 8.713823259409577e-07, "loss": 0.3204, "step": 2662 }, { "epoch": 0.12866599024013142, "grad_norm": 2.8715288639068604, "learning_rate": 8.713340097598686e-07, "loss": 0.3655, "step": 2663 }, { "epoch": 0.12871430642122048, "grad_norm": 1.867766261100769, "learning_rate": 8.712856935787794e-07, "loss": 0.2182, "step": 2664 }, { "epoch": 0.1287626226023095, "grad_norm": 1.7959388494491577, "learning_rate": 8.712373773976904e-07, "loss": 0.2103, "step": 2665 }, { "epoch": 0.12881093878339855, "grad_norm": 2.9724326133728027, "learning_rate": 8.711890612166014e-07, "loss": 0.3085, "step": 2666 }, { "epoch": 0.1288592549644876, "grad_norm": 4.096614837646484, "learning_rate": 8.711407450355124e-07, "loss": 0.2556, "step": 2667 }, { "epoch": 0.12890757114557666, "grad_norm": 2.956155300140381, "learning_rate": 8.710924288544234e-07, "loss": 0.3856, "step": 2668 }, { "epoch": 0.1289558873266657, "grad_norm": 2.3662683963775635, "learning_rate": 8.710441126733343e-07, "loss": 0.2455, "step": 2669 }, { "epoch": 0.12900420350775474, "grad_norm": 2.5699033737182617, "learning_rate": 8.709957964922452e-07, "loss": 0.3134, "step": 2670 }, { "epoch": 0.1290525196888438, "grad_norm": 2.943434476852417, "learning_rate": 8.709474803111561e-07, "loss": 0.3487, "step": 2671 }, { "epoch": 0.12910083586993285, "grad_norm": 3.0773677825927734, "learning_rate": 8.708991641300671e-07, "loss": 0.2684, "step": 2672 }, { "epoch": 0.1291491520510219, "grad_norm": 2.722012758255005, "learning_rate": 8.708508479489781e-07, "loss": 0.3019, "step": 2673 }, { "epoch": 0.12919746823211092, "grad_norm": 2.372995138168335, "learning_rate": 8.70802531767889e-07, "loss": 0.3241, "step": 2674 }, { "epoch": 0.12924578441319998, "grad_norm": 3.1269848346710205, "learning_rate": 8.707542155868e-07, "loss": 0.303, "step": 2675 }, { "epoch": 0.12929410059428903, "grad_norm": 2.4479007720947266, "learning_rate": 8.707058994057109e-07, "loss": 0.2746, "step": 2676 }, { "epoch": 0.12934241677537808, "grad_norm": 16.02507209777832, "learning_rate": 8.706575832246219e-07, "loss": 0.5026, "step": 2677 }, { "epoch": 0.1293907329564671, "grad_norm": 5.1255598068237305, "learning_rate": 8.706092670435329e-07, "loss": 0.3863, "step": 2678 }, { "epoch": 0.12943904913755616, "grad_norm": 2.0827596187591553, "learning_rate": 8.705609508624437e-07, "loss": 0.2118, "step": 2679 }, { "epoch": 0.12948736531864521, "grad_norm": 2.4921677112579346, "learning_rate": 8.705126346813547e-07, "loss": 0.2863, "step": 2680 }, { "epoch": 0.12953568149973427, "grad_norm": 4.015111923217773, "learning_rate": 8.704643185002657e-07, "loss": 0.3925, "step": 2681 }, { "epoch": 0.12958399768082332, "grad_norm": 3.315593957901001, "learning_rate": 8.704160023191767e-07, "loss": 0.3155, "step": 2682 }, { "epoch": 0.12963231386191235, "grad_norm": 2.039189100265503, "learning_rate": 8.703676861380876e-07, "loss": 0.2178, "step": 2683 }, { "epoch": 0.1296806300430014, "grad_norm": 2.0357666015625, "learning_rate": 8.703193699569985e-07, "loss": 0.2231, "step": 2684 }, { "epoch": 0.12972894622409045, "grad_norm": 2.2522201538085938, "learning_rate": 8.702710537759095e-07, "loss": 0.2867, "step": 2685 }, { "epoch": 0.1297772624051795, "grad_norm": 3.6400156021118164, "learning_rate": 8.702227375948205e-07, "loss": 0.2574, "step": 2686 }, { "epoch": 0.12982557858626853, "grad_norm": 2.86960506439209, "learning_rate": 8.701744214137315e-07, "loss": 0.3723, "step": 2687 }, { "epoch": 0.12987389476735758, "grad_norm": 4.787465572357178, "learning_rate": 8.701261052326424e-07, "loss": 0.3293, "step": 2688 }, { "epoch": 0.12992221094844664, "grad_norm": 1.707351565361023, "learning_rate": 8.700777890515533e-07, "loss": 0.2094, "step": 2689 }, { "epoch": 0.1299705271295357, "grad_norm": 2.5777764320373535, "learning_rate": 8.700294728704642e-07, "loss": 0.3873, "step": 2690 }, { "epoch": 0.13001884331062472, "grad_norm": 2.4283432960510254, "learning_rate": 8.699811566893752e-07, "loss": 0.2074, "step": 2691 }, { "epoch": 0.13006715949171377, "grad_norm": 2.6417155265808105, "learning_rate": 8.699328405082862e-07, "loss": 0.2578, "step": 2692 }, { "epoch": 0.13011547567280282, "grad_norm": 2.5594751834869385, "learning_rate": 8.698845243271972e-07, "loss": 0.3243, "step": 2693 }, { "epoch": 0.13016379185389187, "grad_norm": 4.596257209777832, "learning_rate": 8.698362081461082e-07, "loss": 0.2871, "step": 2694 }, { "epoch": 0.13021210803498093, "grad_norm": 3.914537191390991, "learning_rate": 8.69787891965019e-07, "loss": 0.455, "step": 2695 }, { "epoch": 0.13026042421606995, "grad_norm": 2.2681548595428467, "learning_rate": 8.697395757839299e-07, "loss": 0.2113, "step": 2696 }, { "epoch": 0.130308740397159, "grad_norm": 2.6428470611572266, "learning_rate": 8.696912596028409e-07, "loss": 0.2811, "step": 2697 }, { "epoch": 0.13035705657824806, "grad_norm": 2.7680575847625732, "learning_rate": 8.696429434217519e-07, "loss": 0.3464, "step": 2698 }, { "epoch": 0.1304053727593371, "grad_norm": 2.65498948097229, "learning_rate": 8.695946272406629e-07, "loss": 0.3259, "step": 2699 }, { "epoch": 0.13045368894042614, "grad_norm": 1.7864612340927124, "learning_rate": 8.695463110595738e-07, "loss": 0.2186, "step": 2700 }, { "epoch": 0.1305020051215152, "grad_norm": 2.020329236984253, "learning_rate": 8.694979948784848e-07, "loss": 0.2406, "step": 2701 }, { "epoch": 0.13055032130260424, "grad_norm": 3.7622320652008057, "learning_rate": 8.694496786973957e-07, "loss": 0.5017, "step": 2702 }, { "epoch": 0.1305986374836933, "grad_norm": 7.177149772644043, "learning_rate": 8.694013625163067e-07, "loss": 0.2378, "step": 2703 }, { "epoch": 0.13064695366478232, "grad_norm": 2.545184373855591, "learning_rate": 8.693530463352177e-07, "loss": 0.2981, "step": 2704 }, { "epoch": 0.13069526984587138, "grad_norm": 3.0976972579956055, "learning_rate": 8.693047301541285e-07, "loss": 0.4577, "step": 2705 }, { "epoch": 0.13074358602696043, "grad_norm": 26.949602127075195, "learning_rate": 8.692564139730395e-07, "loss": 0.2869, "step": 2706 }, { "epoch": 0.13079190220804948, "grad_norm": 2.2856035232543945, "learning_rate": 8.692080977919505e-07, "loss": 0.1956, "step": 2707 }, { "epoch": 0.13084021838913854, "grad_norm": 2.461172580718994, "learning_rate": 8.691597816108614e-07, "loss": 0.3139, "step": 2708 }, { "epoch": 0.13088853457022756, "grad_norm": 3.646700859069824, "learning_rate": 8.691114654297724e-07, "loss": 0.2862, "step": 2709 }, { "epoch": 0.1309368507513166, "grad_norm": 3.984924793243408, "learning_rate": 8.690631492486833e-07, "loss": 0.2305, "step": 2710 }, { "epoch": 0.13098516693240567, "grad_norm": 3.256352186203003, "learning_rate": 8.690148330675943e-07, "loss": 0.3518, "step": 2711 }, { "epoch": 0.13103348311349472, "grad_norm": 2.723057508468628, "learning_rate": 8.689665168865053e-07, "loss": 0.3476, "step": 2712 }, { "epoch": 0.13108179929458375, "grad_norm": 2.5095434188842773, "learning_rate": 8.689182007054162e-07, "loss": 0.2496, "step": 2713 }, { "epoch": 0.1311301154756728, "grad_norm": 2.866333484649658, "learning_rate": 8.688698845243272e-07, "loss": 0.35, "step": 2714 }, { "epoch": 0.13117843165676185, "grad_norm": 2.1308693885803223, "learning_rate": 8.688215683432381e-07, "loss": 0.206, "step": 2715 }, { "epoch": 0.1312267478378509, "grad_norm": 3.352850914001465, "learning_rate": 8.68773252162149e-07, "loss": 0.5545, "step": 2716 }, { "epoch": 0.13127506401893993, "grad_norm": 1.9577662944793701, "learning_rate": 8.6872493598106e-07, "loss": 0.2147, "step": 2717 }, { "epoch": 0.13132338020002898, "grad_norm": 1.567244291305542, "learning_rate": 8.68676619799971e-07, "loss": 0.1608, "step": 2718 }, { "epoch": 0.13137169638111804, "grad_norm": 4.073480606079102, "learning_rate": 8.68628303618882e-07, "loss": 0.2734, "step": 2719 }, { "epoch": 0.1314200125622071, "grad_norm": 2.2599313259124756, "learning_rate": 8.68579987437793e-07, "loss": 0.2131, "step": 2720 }, { "epoch": 0.13146832874329614, "grad_norm": 2.164231538772583, "learning_rate": 8.685316712567037e-07, "loss": 0.2298, "step": 2721 }, { "epoch": 0.13151664492438517, "grad_norm": 2.9597439765930176, "learning_rate": 8.684833550756147e-07, "loss": 0.4038, "step": 2722 }, { "epoch": 0.13156496110547422, "grad_norm": 2.9747064113616943, "learning_rate": 8.684350388945257e-07, "loss": 0.3877, "step": 2723 }, { "epoch": 0.13161327728656327, "grad_norm": 18.238468170166016, "learning_rate": 8.683867227134367e-07, "loss": 0.3462, "step": 2724 }, { "epoch": 0.13166159346765233, "grad_norm": 2.012470245361328, "learning_rate": 8.683384065323477e-07, "loss": 0.2486, "step": 2725 }, { "epoch": 0.13170990964874135, "grad_norm": 3.0953078269958496, "learning_rate": 8.682900903512586e-07, "loss": 0.3726, "step": 2726 }, { "epoch": 0.1317582258298304, "grad_norm": 2.317595958709717, "learning_rate": 8.682417741701695e-07, "loss": 0.2759, "step": 2727 }, { "epoch": 0.13180654201091946, "grad_norm": 3.8096139430999756, "learning_rate": 8.681934579890805e-07, "loss": 0.2536, "step": 2728 }, { "epoch": 0.1318548581920085, "grad_norm": 2.3658506870269775, "learning_rate": 8.681451418079915e-07, "loss": 0.2413, "step": 2729 }, { "epoch": 0.13190317437309754, "grad_norm": 3.3731141090393066, "learning_rate": 8.680968256269024e-07, "loss": 0.2853, "step": 2730 }, { "epoch": 0.1319514905541866, "grad_norm": 2.4835128784179688, "learning_rate": 8.680485094458133e-07, "loss": 0.3402, "step": 2731 }, { "epoch": 0.13199980673527564, "grad_norm": 2.2618048191070557, "learning_rate": 8.680001932647243e-07, "loss": 0.2277, "step": 2732 }, { "epoch": 0.1320481229163647, "grad_norm": 1.6250916719436646, "learning_rate": 8.679518770836353e-07, "loss": 0.2062, "step": 2733 }, { "epoch": 0.13209643909745375, "grad_norm": 2.197390079498291, "learning_rate": 8.679035609025462e-07, "loss": 0.2779, "step": 2734 }, { "epoch": 0.13214475527854277, "grad_norm": 3.5875298976898193, "learning_rate": 8.678552447214572e-07, "loss": 0.4157, "step": 2735 }, { "epoch": 0.13219307145963183, "grad_norm": 2.0320518016815186, "learning_rate": 8.678069285403681e-07, "loss": 0.1787, "step": 2736 }, { "epoch": 0.13224138764072088, "grad_norm": 2.298431634902954, "learning_rate": 8.677586123592791e-07, "loss": 0.2795, "step": 2737 }, { "epoch": 0.13228970382180993, "grad_norm": 2.4006617069244385, "learning_rate": 8.6771029617819e-07, "loss": 0.3946, "step": 2738 }, { "epoch": 0.13233802000289896, "grad_norm": 3.7937674522399902, "learning_rate": 8.67661979997101e-07, "loss": 0.3679, "step": 2739 }, { "epoch": 0.132386336183988, "grad_norm": 3.097101926803589, "learning_rate": 8.676136638160119e-07, "loss": 0.4313, "step": 2740 }, { "epoch": 0.13243465236507707, "grad_norm": 2.070183038711548, "learning_rate": 8.675653476349229e-07, "loss": 0.2062, "step": 2741 }, { "epoch": 0.13248296854616612, "grad_norm": 2.450422525405884, "learning_rate": 8.675170314538338e-07, "loss": 0.2927, "step": 2742 }, { "epoch": 0.13253128472725514, "grad_norm": 2.492182493209839, "learning_rate": 8.674687152727448e-07, "loss": 0.2535, "step": 2743 }, { "epoch": 0.1325796009083442, "grad_norm": 3.5509262084960938, "learning_rate": 8.674203990916558e-07, "loss": 0.4162, "step": 2744 }, { "epoch": 0.13262791708943325, "grad_norm": 3.4175925254821777, "learning_rate": 8.673720829105668e-07, "loss": 0.3201, "step": 2745 }, { "epoch": 0.1326762332705223, "grad_norm": 4.304439067840576, "learning_rate": 8.673237667294778e-07, "loss": 0.3764, "step": 2746 }, { "epoch": 0.13272454945161136, "grad_norm": 2.503748655319214, "learning_rate": 8.672754505483885e-07, "loss": 0.2497, "step": 2747 }, { "epoch": 0.13277286563270038, "grad_norm": 2.301957845687866, "learning_rate": 8.672271343672995e-07, "loss": 0.2243, "step": 2748 }, { "epoch": 0.13282118181378944, "grad_norm": 2.0148274898529053, "learning_rate": 8.671788181862105e-07, "loss": 0.2508, "step": 2749 }, { "epoch": 0.1328694979948785, "grad_norm": 2.583555221557617, "learning_rate": 8.671305020051215e-07, "loss": 0.235, "step": 2750 }, { "epoch": 0.13291781417596754, "grad_norm": 3.189753293991089, "learning_rate": 8.670821858240325e-07, "loss": 0.3967, "step": 2751 }, { "epoch": 0.13296613035705657, "grad_norm": 2.5001299381256104, "learning_rate": 8.670338696429434e-07, "loss": 0.2751, "step": 2752 }, { "epoch": 0.13301444653814562, "grad_norm": 3.433039903640747, "learning_rate": 8.669855534618543e-07, "loss": 0.3914, "step": 2753 }, { "epoch": 0.13306276271923467, "grad_norm": 2.5124850273132324, "learning_rate": 8.669372372807653e-07, "loss": 0.3412, "step": 2754 }, { "epoch": 0.13311107890032373, "grad_norm": 3.4196388721466064, "learning_rate": 8.668889210996762e-07, "loss": 0.3562, "step": 2755 }, { "epoch": 0.13315939508141275, "grad_norm": 3.1418161392211914, "learning_rate": 8.668406049185872e-07, "loss": 0.3468, "step": 2756 }, { "epoch": 0.1332077112625018, "grad_norm": 2.5684525966644287, "learning_rate": 8.667922887374981e-07, "loss": 0.3225, "step": 2757 }, { "epoch": 0.13325602744359086, "grad_norm": 2.134718179702759, "learning_rate": 8.667439725564091e-07, "loss": 0.2171, "step": 2758 }, { "epoch": 0.1333043436246799, "grad_norm": 2.856775999069214, "learning_rate": 8.6669565637532e-07, "loss": 0.169, "step": 2759 }, { "epoch": 0.13335265980576896, "grad_norm": 2.3937647342681885, "learning_rate": 8.66647340194231e-07, "loss": 0.2982, "step": 2760 }, { "epoch": 0.133400975986858, "grad_norm": 2.429185390472412, "learning_rate": 8.66599024013142e-07, "loss": 0.2581, "step": 2761 }, { "epoch": 0.13344929216794704, "grad_norm": 2.2282590866088867, "learning_rate": 8.665507078320529e-07, "loss": 0.2356, "step": 2762 }, { "epoch": 0.1334976083490361, "grad_norm": 3.598954916000366, "learning_rate": 8.665023916509639e-07, "loss": 0.4113, "step": 2763 }, { "epoch": 0.13354592453012515, "grad_norm": 2.6360206604003906, "learning_rate": 8.664540754698748e-07, "loss": 0.2754, "step": 2764 }, { "epoch": 0.13359424071121417, "grad_norm": 2.431137800216675, "learning_rate": 8.664057592887858e-07, "loss": 0.211, "step": 2765 }, { "epoch": 0.13364255689230323, "grad_norm": 2.4090981483459473, "learning_rate": 8.663574431076967e-07, "loss": 0.2808, "step": 2766 }, { "epoch": 0.13369087307339228, "grad_norm": 2.1471920013427734, "learning_rate": 8.663091269266077e-07, "loss": 0.2288, "step": 2767 }, { "epoch": 0.13373918925448133, "grad_norm": 2.7051455974578857, "learning_rate": 8.662608107455186e-07, "loss": 0.3695, "step": 2768 }, { "epoch": 0.1337875054355704, "grad_norm": 1.8936009407043457, "learning_rate": 8.662124945644296e-07, "loss": 0.2283, "step": 2769 }, { "epoch": 0.1338358216166594, "grad_norm": 3.3107969760894775, "learning_rate": 8.661641783833406e-07, "loss": 0.2804, "step": 2770 }, { "epoch": 0.13388413779774846, "grad_norm": 18.445384979248047, "learning_rate": 8.661158622022516e-07, "loss": 0.2783, "step": 2771 }, { "epoch": 0.13393245397883752, "grad_norm": 2.6421961784362793, "learning_rate": 8.660675460211624e-07, "loss": 0.235, "step": 2772 }, { "epoch": 0.13398077015992657, "grad_norm": 1.971012830734253, "learning_rate": 8.660192298400733e-07, "loss": 0.24, "step": 2773 }, { "epoch": 0.1340290863410156, "grad_norm": 2.4800689220428467, "learning_rate": 8.659709136589843e-07, "loss": 0.2364, "step": 2774 }, { "epoch": 0.13407740252210465, "grad_norm": 4.262056350708008, "learning_rate": 8.659225974778953e-07, "loss": 0.3116, "step": 2775 }, { "epoch": 0.1341257187031937, "grad_norm": 4.2318243980407715, "learning_rate": 8.658742812968063e-07, "loss": 0.4145, "step": 2776 }, { "epoch": 0.13417403488428276, "grad_norm": 9.331177711486816, "learning_rate": 8.658259651157173e-07, "loss": 0.3256, "step": 2777 }, { "epoch": 0.13422235106537178, "grad_norm": 2.4109227657318115, "learning_rate": 8.657776489346282e-07, "loss": 0.2781, "step": 2778 }, { "epoch": 0.13427066724646083, "grad_norm": 2.253695011138916, "learning_rate": 8.657293327535391e-07, "loss": 0.2559, "step": 2779 }, { "epoch": 0.1343189834275499, "grad_norm": 3.0203263759613037, "learning_rate": 8.6568101657245e-07, "loss": 0.4527, "step": 2780 }, { "epoch": 0.13436729960863894, "grad_norm": 2.3190882205963135, "learning_rate": 8.65632700391361e-07, "loss": 0.2768, "step": 2781 }, { "epoch": 0.134415615789728, "grad_norm": 2.7902591228485107, "learning_rate": 8.65584384210272e-07, "loss": 0.2931, "step": 2782 }, { "epoch": 0.13446393197081702, "grad_norm": 5.929026126861572, "learning_rate": 8.655360680291829e-07, "loss": 0.3498, "step": 2783 }, { "epoch": 0.13451224815190607, "grad_norm": 2.6213579177856445, "learning_rate": 8.654877518480939e-07, "loss": 0.3203, "step": 2784 }, { "epoch": 0.13456056433299513, "grad_norm": 3.3182668685913086, "learning_rate": 8.654394356670048e-07, "loss": 0.2701, "step": 2785 }, { "epoch": 0.13460888051408418, "grad_norm": 2.6730844974517822, "learning_rate": 8.653911194859158e-07, "loss": 0.2817, "step": 2786 }, { "epoch": 0.1346571966951732, "grad_norm": 2.0540695190429688, "learning_rate": 8.653428033048268e-07, "loss": 0.2615, "step": 2787 }, { "epoch": 0.13470551287626226, "grad_norm": 3.1111133098602295, "learning_rate": 8.652944871237377e-07, "loss": 0.2982, "step": 2788 }, { "epoch": 0.1347538290573513, "grad_norm": 3.151456594467163, "learning_rate": 8.652461709426486e-07, "loss": 0.3585, "step": 2789 }, { "epoch": 0.13480214523844036, "grad_norm": 2.3376641273498535, "learning_rate": 8.651978547615596e-07, "loss": 0.2307, "step": 2790 }, { "epoch": 0.1348504614195294, "grad_norm": 2.534905433654785, "learning_rate": 8.651495385804705e-07, "loss": 0.3315, "step": 2791 }, { "epoch": 0.13489877760061844, "grad_norm": 3.324291229248047, "learning_rate": 8.651012223993815e-07, "loss": 0.3933, "step": 2792 }, { "epoch": 0.1349470937817075, "grad_norm": 4.121391296386719, "learning_rate": 8.650529062182925e-07, "loss": 0.3922, "step": 2793 }, { "epoch": 0.13499540996279655, "grad_norm": 6.1423115730285645, "learning_rate": 8.650045900372034e-07, "loss": 0.4065, "step": 2794 }, { "epoch": 0.1350437261438856, "grad_norm": 2.450237989425659, "learning_rate": 8.649562738561144e-07, "loss": 0.3616, "step": 2795 }, { "epoch": 0.13509204232497463, "grad_norm": 1.8327025175094604, "learning_rate": 8.649079576750254e-07, "loss": 0.241, "step": 2796 }, { "epoch": 0.13514035850606368, "grad_norm": 2.9639172554016113, "learning_rate": 8.648596414939364e-07, "loss": 0.3484, "step": 2797 }, { "epoch": 0.13518867468715273, "grad_norm": 2.1795427799224854, "learning_rate": 8.648113253128472e-07, "loss": 0.2618, "step": 2798 }, { "epoch": 0.13523699086824179, "grad_norm": 2.3050742149353027, "learning_rate": 8.647630091317581e-07, "loss": 0.242, "step": 2799 }, { "epoch": 0.1352853070493308, "grad_norm": 1.8201216459274292, "learning_rate": 8.647146929506691e-07, "loss": 0.1728, "step": 2800 }, { "epoch": 0.13533362323041986, "grad_norm": 2.8403420448303223, "learning_rate": 8.646663767695801e-07, "loss": 0.4598, "step": 2801 }, { "epoch": 0.13538193941150892, "grad_norm": 3.7401814460754395, "learning_rate": 8.646180605884911e-07, "loss": 0.3762, "step": 2802 }, { "epoch": 0.13543025559259797, "grad_norm": 4.2065815925598145, "learning_rate": 8.645697444074021e-07, "loss": 0.5026, "step": 2803 }, { "epoch": 0.135478571773687, "grad_norm": 3.2034525871276855, "learning_rate": 8.645214282263129e-07, "loss": 0.3465, "step": 2804 }, { "epoch": 0.13552688795477605, "grad_norm": 2.7890000343322754, "learning_rate": 8.644731120452239e-07, "loss": 0.2639, "step": 2805 }, { "epoch": 0.1355752041358651, "grad_norm": 3.4197709560394287, "learning_rate": 8.644247958641348e-07, "loss": 0.3856, "step": 2806 }, { "epoch": 0.13562352031695415, "grad_norm": 2.471961736679077, "learning_rate": 8.643764796830458e-07, "loss": 0.2767, "step": 2807 }, { "epoch": 0.1356718364980432, "grad_norm": 4.572588920593262, "learning_rate": 8.643281635019568e-07, "loss": 0.3663, "step": 2808 }, { "epoch": 0.13572015267913223, "grad_norm": 3.097940683364868, "learning_rate": 8.642798473208677e-07, "loss": 0.3332, "step": 2809 }, { "epoch": 0.1357684688602213, "grad_norm": 2.881223440170288, "learning_rate": 8.642315311397787e-07, "loss": 0.3119, "step": 2810 }, { "epoch": 0.13581678504131034, "grad_norm": 3.0123767852783203, "learning_rate": 8.641832149586896e-07, "loss": 0.3163, "step": 2811 }, { "epoch": 0.1358651012223994, "grad_norm": 3.031386137008667, "learning_rate": 8.641348987776006e-07, "loss": 0.3339, "step": 2812 }, { "epoch": 0.13591341740348842, "grad_norm": 2.309511423110962, "learning_rate": 8.640865825965116e-07, "loss": 0.3363, "step": 2813 }, { "epoch": 0.13596173358457747, "grad_norm": 2.7148897647857666, "learning_rate": 8.640382664154224e-07, "loss": 0.2793, "step": 2814 }, { "epoch": 0.13601004976566652, "grad_norm": 3.4769632816314697, "learning_rate": 8.639899502343334e-07, "loss": 0.3028, "step": 2815 }, { "epoch": 0.13605836594675558, "grad_norm": 3.684035301208496, "learning_rate": 8.639416340532444e-07, "loss": 0.4201, "step": 2816 }, { "epoch": 0.1361066821278446, "grad_norm": 3.350011110305786, "learning_rate": 8.638933178721553e-07, "loss": 0.4267, "step": 2817 }, { "epoch": 0.13615499830893366, "grad_norm": 3.474926710128784, "learning_rate": 8.638450016910663e-07, "loss": 0.3447, "step": 2818 }, { "epoch": 0.1362033144900227, "grad_norm": 3.69173264503479, "learning_rate": 8.637966855099773e-07, "loss": 0.4013, "step": 2819 }, { "epoch": 0.13625163067111176, "grad_norm": 21.386520385742188, "learning_rate": 8.637483693288882e-07, "loss": 0.4008, "step": 2820 }, { "epoch": 0.13629994685220082, "grad_norm": 5.232152938842773, "learning_rate": 8.637000531477992e-07, "loss": 0.3696, "step": 2821 }, { "epoch": 0.13634826303328984, "grad_norm": 3.2376925945281982, "learning_rate": 8.636517369667102e-07, "loss": 0.4417, "step": 2822 }, { "epoch": 0.1363965792143789, "grad_norm": 2.445472002029419, "learning_rate": 8.63603420785621e-07, "loss": 0.3069, "step": 2823 }, { "epoch": 0.13644489539546795, "grad_norm": 3.108502149581909, "learning_rate": 8.63555104604532e-07, "loss": 0.4284, "step": 2824 }, { "epoch": 0.136493211576557, "grad_norm": 2.446580648422241, "learning_rate": 8.635067884234429e-07, "loss": 0.2428, "step": 2825 }, { "epoch": 0.13654152775764603, "grad_norm": 2.7887191772460938, "learning_rate": 8.634584722423539e-07, "loss": 0.2546, "step": 2826 }, { "epoch": 0.13658984393873508, "grad_norm": 3.1870906352996826, "learning_rate": 8.634101560612649e-07, "loss": 0.335, "step": 2827 }, { "epoch": 0.13663816011982413, "grad_norm": 2.686492681503296, "learning_rate": 8.633618398801759e-07, "loss": 0.4567, "step": 2828 }, { "epoch": 0.13668647630091318, "grad_norm": 2.892908811569214, "learning_rate": 8.633135236990869e-07, "loss": 0.3108, "step": 2829 }, { "epoch": 0.1367347924820022, "grad_norm": 4.680367469787598, "learning_rate": 8.632652075179977e-07, "loss": 0.3287, "step": 2830 }, { "epoch": 0.13678310866309126, "grad_norm": 5.716301918029785, "learning_rate": 8.632168913369086e-07, "loss": 0.3578, "step": 2831 }, { "epoch": 0.13683142484418032, "grad_norm": 4.606706619262695, "learning_rate": 8.631685751558196e-07, "loss": 0.2159, "step": 2832 }, { "epoch": 0.13687974102526937, "grad_norm": 2.5991523265838623, "learning_rate": 8.631202589747306e-07, "loss": 0.2917, "step": 2833 }, { "epoch": 0.13692805720635842, "grad_norm": 2.467862606048584, "learning_rate": 8.630719427936416e-07, "loss": 0.3031, "step": 2834 }, { "epoch": 0.13697637338744745, "grad_norm": 3.740060806274414, "learning_rate": 8.630236266125525e-07, "loss": 0.3972, "step": 2835 }, { "epoch": 0.1370246895685365, "grad_norm": 2.8071093559265137, "learning_rate": 8.629753104314634e-07, "loss": 0.3321, "step": 2836 }, { "epoch": 0.13707300574962555, "grad_norm": 3.443934440612793, "learning_rate": 8.629269942503744e-07, "loss": 0.2861, "step": 2837 }, { "epoch": 0.1371213219307146, "grad_norm": 2.075927495956421, "learning_rate": 8.628786780692854e-07, "loss": 0.2542, "step": 2838 }, { "epoch": 0.13716963811180363, "grad_norm": 2.600611448287964, "learning_rate": 8.628303618881964e-07, "loss": 0.2391, "step": 2839 }, { "epoch": 0.13721795429289269, "grad_norm": 5.719698429107666, "learning_rate": 8.627820457071072e-07, "loss": 0.5578, "step": 2840 }, { "epoch": 0.13726627047398174, "grad_norm": 4.339354991912842, "learning_rate": 8.627337295260182e-07, "loss": 0.3143, "step": 2841 }, { "epoch": 0.1373145866550708, "grad_norm": 2.172921895980835, "learning_rate": 8.626854133449292e-07, "loss": 0.2914, "step": 2842 }, { "epoch": 0.13736290283615982, "grad_norm": 2.758901834487915, "learning_rate": 8.626370971638401e-07, "loss": 0.2719, "step": 2843 }, { "epoch": 0.13741121901724887, "grad_norm": 2.786878824234009, "learning_rate": 8.625887809827511e-07, "loss": 0.311, "step": 2844 }, { "epoch": 0.13745953519833792, "grad_norm": 2.992147445678711, "learning_rate": 8.62540464801662e-07, "loss": 0.3844, "step": 2845 }, { "epoch": 0.13750785137942698, "grad_norm": 2.4248898029327393, "learning_rate": 8.62492148620573e-07, "loss": 0.3373, "step": 2846 }, { "epoch": 0.13755616756051603, "grad_norm": 3.5243587493896484, "learning_rate": 8.62443832439484e-07, "loss": 0.4558, "step": 2847 }, { "epoch": 0.13760448374160505, "grad_norm": 3.034907579421997, "learning_rate": 8.62395516258395e-07, "loss": 0.3802, "step": 2848 }, { "epoch": 0.1376527999226941, "grad_norm": 2.5332143306732178, "learning_rate": 8.623472000773058e-07, "loss": 0.2431, "step": 2849 }, { "epoch": 0.13770111610378316, "grad_norm": 1.9361274242401123, "learning_rate": 8.622988838962168e-07, "loss": 0.2461, "step": 2850 }, { "epoch": 0.13774943228487221, "grad_norm": 3.80275297164917, "learning_rate": 8.622505677151277e-07, "loss": 0.4149, "step": 2851 }, { "epoch": 0.13779774846596124, "grad_norm": 7.945606231689453, "learning_rate": 8.622022515340387e-07, "loss": 0.3209, "step": 2852 }, { "epoch": 0.1378460646470503, "grad_norm": 2.7737488746643066, "learning_rate": 8.621539353529497e-07, "loss": 0.335, "step": 2853 }, { "epoch": 0.13789438082813935, "grad_norm": 2.9215047359466553, "learning_rate": 8.621056191718607e-07, "loss": 0.3531, "step": 2854 }, { "epoch": 0.1379426970092284, "grad_norm": 3.251089572906494, "learning_rate": 8.620573029907717e-07, "loss": 0.4345, "step": 2855 }, { "epoch": 0.13799101319031742, "grad_norm": 2.3662941455841064, "learning_rate": 8.620089868096824e-07, "loss": 0.2449, "step": 2856 }, { "epoch": 0.13803932937140648, "grad_norm": 5.998033046722412, "learning_rate": 8.619606706285934e-07, "loss": 0.2389, "step": 2857 }, { "epoch": 0.13808764555249553, "grad_norm": 2.668006181716919, "learning_rate": 8.619123544475044e-07, "loss": 0.3, "step": 2858 }, { "epoch": 0.13813596173358458, "grad_norm": 14.300079345703125, "learning_rate": 8.618640382664154e-07, "loss": 0.4374, "step": 2859 }, { "epoch": 0.13818427791467364, "grad_norm": 2.477583408355713, "learning_rate": 8.618157220853264e-07, "loss": 0.2742, "step": 2860 }, { "epoch": 0.13823259409576266, "grad_norm": 2.3735511302948, "learning_rate": 8.617674059042373e-07, "loss": 0.221, "step": 2861 }, { "epoch": 0.13828091027685172, "grad_norm": 10.022383689880371, "learning_rate": 8.617190897231482e-07, "loss": 0.2249, "step": 2862 }, { "epoch": 0.13832922645794077, "grad_norm": 2.208159923553467, "learning_rate": 8.616707735420592e-07, "loss": 0.2509, "step": 2863 }, { "epoch": 0.13837754263902982, "grad_norm": 2.940751314163208, "learning_rate": 8.616224573609702e-07, "loss": 0.3374, "step": 2864 }, { "epoch": 0.13842585882011885, "grad_norm": 3.8819639682769775, "learning_rate": 8.615741411798811e-07, "loss": 0.326, "step": 2865 }, { "epoch": 0.1384741750012079, "grad_norm": 2.2059624195098877, "learning_rate": 8.61525824998792e-07, "loss": 0.2649, "step": 2866 }, { "epoch": 0.13852249118229695, "grad_norm": 2.604382038116455, "learning_rate": 8.61477508817703e-07, "loss": 0.3034, "step": 2867 }, { "epoch": 0.138570807363386, "grad_norm": 2.2407920360565186, "learning_rate": 8.614291926366139e-07, "loss": 0.2996, "step": 2868 }, { "epoch": 0.13861912354447503, "grad_norm": 2.897902727127075, "learning_rate": 8.613808764555249e-07, "loss": 0.2759, "step": 2869 }, { "epoch": 0.13866743972556408, "grad_norm": 4.10504150390625, "learning_rate": 8.613325602744359e-07, "loss": 0.3862, "step": 2870 }, { "epoch": 0.13871575590665314, "grad_norm": 7.0749030113220215, "learning_rate": 8.612842440933468e-07, "loss": 0.3719, "step": 2871 }, { "epoch": 0.1387640720877422, "grad_norm": 7.61019229888916, "learning_rate": 8.612359279122578e-07, "loss": 0.2275, "step": 2872 }, { "epoch": 0.13881238826883124, "grad_norm": 2.7867209911346436, "learning_rate": 8.611876117311688e-07, "loss": 0.3411, "step": 2873 }, { "epoch": 0.13886070444992027, "grad_norm": 8.20546817779541, "learning_rate": 8.611392955500797e-07, "loss": 0.4966, "step": 2874 }, { "epoch": 0.13890902063100932, "grad_norm": 11.152477264404297, "learning_rate": 8.610909793689906e-07, "loss": 0.4235, "step": 2875 }, { "epoch": 0.13895733681209838, "grad_norm": 3.424384355545044, "learning_rate": 8.610426631879016e-07, "loss": 0.2923, "step": 2876 }, { "epoch": 0.13900565299318743, "grad_norm": 4.314743995666504, "learning_rate": 8.609943470068125e-07, "loss": 0.4004, "step": 2877 }, { "epoch": 0.13905396917427645, "grad_norm": 2.299966335296631, "learning_rate": 8.609460308257235e-07, "loss": 0.3019, "step": 2878 }, { "epoch": 0.1391022853553655, "grad_norm": 3.414584159851074, "learning_rate": 8.608977146446345e-07, "loss": 0.3946, "step": 2879 }, { "epoch": 0.13915060153645456, "grad_norm": 2.318065881729126, "learning_rate": 8.608493984635455e-07, "loss": 0.1791, "step": 2880 }, { "epoch": 0.1391989177175436, "grad_norm": 1.4312351942062378, "learning_rate": 8.608010822824564e-07, "loss": 0.1602, "step": 2881 }, { "epoch": 0.13924723389863264, "grad_norm": 1.4615516662597656, "learning_rate": 8.607527661013672e-07, "loss": 0.1369, "step": 2882 }, { "epoch": 0.1392955500797217, "grad_norm": 2.3866589069366455, "learning_rate": 8.607044499202782e-07, "loss": 0.3427, "step": 2883 }, { "epoch": 0.13934386626081074, "grad_norm": 3.107456922531128, "learning_rate": 8.606561337391892e-07, "loss": 0.3998, "step": 2884 }, { "epoch": 0.1393921824418998, "grad_norm": 4.113301753997803, "learning_rate": 8.606078175581002e-07, "loss": 0.2599, "step": 2885 }, { "epoch": 0.13944049862298885, "grad_norm": 3.6619985103607178, "learning_rate": 8.605595013770112e-07, "loss": 0.3454, "step": 2886 }, { "epoch": 0.13948881480407788, "grad_norm": 2.4755942821502686, "learning_rate": 8.60511185195922e-07, "loss": 0.2686, "step": 2887 }, { "epoch": 0.13953713098516693, "grad_norm": 4.149081707000732, "learning_rate": 8.60462869014833e-07, "loss": 0.2907, "step": 2888 }, { "epoch": 0.13958544716625598, "grad_norm": 2.7433881759643555, "learning_rate": 8.60414552833744e-07, "loss": 0.3065, "step": 2889 }, { "epoch": 0.13963376334734504, "grad_norm": 2.7642903327941895, "learning_rate": 8.60366236652655e-07, "loss": 0.2916, "step": 2890 }, { "epoch": 0.13968207952843406, "grad_norm": 3.2132678031921387, "learning_rate": 8.603179204715659e-07, "loss": 0.4481, "step": 2891 }, { "epoch": 0.13973039570952311, "grad_norm": 2.683901786804199, "learning_rate": 8.602696042904768e-07, "loss": 0.2504, "step": 2892 }, { "epoch": 0.13977871189061217, "grad_norm": 3.451190948486328, "learning_rate": 8.602212881093878e-07, "loss": 0.4017, "step": 2893 }, { "epoch": 0.13982702807170122, "grad_norm": 6.282255172729492, "learning_rate": 8.601729719282987e-07, "loss": 0.354, "step": 2894 }, { "epoch": 0.13987534425279025, "grad_norm": 2.1539227962493896, "learning_rate": 8.601246557472097e-07, "loss": 0.3181, "step": 2895 }, { "epoch": 0.1399236604338793, "grad_norm": 3.438478469848633, "learning_rate": 8.600763395661207e-07, "loss": 0.3262, "step": 2896 }, { "epoch": 0.13997197661496835, "grad_norm": 2.01627516746521, "learning_rate": 8.600280233850316e-07, "loss": 0.1955, "step": 2897 }, { "epoch": 0.1400202927960574, "grad_norm": 2.7887065410614014, "learning_rate": 8.599797072039426e-07, "loss": 0.2588, "step": 2898 }, { "epoch": 0.14006860897714646, "grad_norm": 2.1284961700439453, "learning_rate": 8.599313910228535e-07, "loss": 0.249, "step": 2899 }, { "epoch": 0.14011692515823548, "grad_norm": 2.7581851482391357, "learning_rate": 8.598830748417644e-07, "loss": 0.3029, "step": 2900 }, { "epoch": 0.14016524133932454, "grad_norm": 3.3154773712158203, "learning_rate": 8.598347586606754e-07, "loss": 0.3229, "step": 2901 }, { "epoch": 0.1402135575204136, "grad_norm": 3.209806442260742, "learning_rate": 8.597864424795864e-07, "loss": 0.3094, "step": 2902 }, { "epoch": 0.14026187370150264, "grad_norm": 4.248005390167236, "learning_rate": 8.597381262984973e-07, "loss": 0.2161, "step": 2903 }, { "epoch": 0.14031018988259167, "grad_norm": 2.9657084941864014, "learning_rate": 8.596898101174083e-07, "loss": 0.4588, "step": 2904 }, { "epoch": 0.14035850606368072, "grad_norm": 2.524529218673706, "learning_rate": 8.596414939363193e-07, "loss": 0.2598, "step": 2905 }, { "epoch": 0.14040682224476977, "grad_norm": 2.386502265930176, "learning_rate": 8.595931777552303e-07, "loss": 0.3233, "step": 2906 }, { "epoch": 0.14045513842585883, "grad_norm": 3.5787646770477295, "learning_rate": 8.595448615741412e-07, "loss": 0.4322, "step": 2907 }, { "epoch": 0.14050345460694788, "grad_norm": 5.6984076499938965, "learning_rate": 8.59496545393052e-07, "loss": 0.1443, "step": 2908 }, { "epoch": 0.1405517707880369, "grad_norm": 3.4562926292419434, "learning_rate": 8.59448229211963e-07, "loss": 0.4624, "step": 2909 }, { "epoch": 0.14060008696912596, "grad_norm": 5.61854362487793, "learning_rate": 8.59399913030874e-07, "loss": 0.3707, "step": 2910 }, { "epoch": 0.140648403150215, "grad_norm": 4.0797343254089355, "learning_rate": 8.59351596849785e-07, "loss": 0.4031, "step": 2911 }, { "epoch": 0.14069671933130407, "grad_norm": 2.676605701446533, "learning_rate": 8.59303280668696e-07, "loss": 0.1914, "step": 2912 }, { "epoch": 0.1407450355123931, "grad_norm": 2.603614091873169, "learning_rate": 8.592549644876068e-07, "loss": 0.3373, "step": 2913 }, { "epoch": 0.14079335169348214, "grad_norm": 2.6112303733825684, "learning_rate": 8.592066483065178e-07, "loss": 0.2526, "step": 2914 }, { "epoch": 0.1408416678745712, "grad_norm": 3.7554752826690674, "learning_rate": 8.591583321254288e-07, "loss": 0.2478, "step": 2915 }, { "epoch": 0.14088998405566025, "grad_norm": 12.124170303344727, "learning_rate": 8.591100159443397e-07, "loss": 0.3357, "step": 2916 }, { "epoch": 0.14093830023674928, "grad_norm": 2.3260107040405273, "learning_rate": 8.590616997632507e-07, "loss": 0.2804, "step": 2917 }, { "epoch": 0.14098661641783833, "grad_norm": 2.0728471279144287, "learning_rate": 8.590133835821616e-07, "loss": 0.2231, "step": 2918 }, { "epoch": 0.14103493259892738, "grad_norm": 5.720325469970703, "learning_rate": 8.589650674010725e-07, "loss": 0.5149, "step": 2919 }, { "epoch": 0.14108324878001643, "grad_norm": 2.1735212802886963, "learning_rate": 8.589167512199835e-07, "loss": 0.197, "step": 2920 }, { "epoch": 0.1411315649611055, "grad_norm": 12.03847599029541, "learning_rate": 8.588684350388945e-07, "loss": 0.3357, "step": 2921 }, { "epoch": 0.1411798811421945, "grad_norm": 4.792253017425537, "learning_rate": 8.588201188578055e-07, "loss": 0.3309, "step": 2922 }, { "epoch": 0.14122819732328357, "grad_norm": 3.232888698577881, "learning_rate": 8.587718026767164e-07, "loss": 0.3703, "step": 2923 }, { "epoch": 0.14127651350437262, "grad_norm": 2.2772104740142822, "learning_rate": 8.587234864956273e-07, "loss": 0.2621, "step": 2924 }, { "epoch": 0.14132482968546167, "grad_norm": 2.7119102478027344, "learning_rate": 8.586751703145383e-07, "loss": 0.3177, "step": 2925 }, { "epoch": 0.1413731458665507, "grad_norm": 2.9052693843841553, "learning_rate": 8.586268541334492e-07, "loss": 0.3502, "step": 2926 }, { "epoch": 0.14142146204763975, "grad_norm": 1.9247292280197144, "learning_rate": 8.585785379523602e-07, "loss": 0.2461, "step": 2927 }, { "epoch": 0.1414697782287288, "grad_norm": 5.015661239624023, "learning_rate": 8.585302217712712e-07, "loss": 0.4202, "step": 2928 }, { "epoch": 0.14151809440981786, "grad_norm": 10.906990051269531, "learning_rate": 8.584819055901821e-07, "loss": 0.3903, "step": 2929 }, { "epoch": 0.14156641059090688, "grad_norm": 3.0074572563171387, "learning_rate": 8.584335894090931e-07, "loss": 0.277, "step": 2930 }, { "epoch": 0.14161472677199594, "grad_norm": 13.397092819213867, "learning_rate": 8.583852732280041e-07, "loss": 0.2379, "step": 2931 }, { "epoch": 0.141663042953085, "grad_norm": 2.6903393268585205, "learning_rate": 8.58336957046915e-07, "loss": 0.3046, "step": 2932 }, { "epoch": 0.14171135913417404, "grad_norm": 2.2454020977020264, "learning_rate": 8.582886408658259e-07, "loss": 0.303, "step": 2933 }, { "epoch": 0.1417596753152631, "grad_norm": 3.9199676513671875, "learning_rate": 8.582403246847368e-07, "loss": 0.3039, "step": 2934 }, { "epoch": 0.14180799149635212, "grad_norm": 3.035966157913208, "learning_rate": 8.581920085036478e-07, "loss": 0.3661, "step": 2935 }, { "epoch": 0.14185630767744117, "grad_norm": 2.8262150287628174, "learning_rate": 8.581436923225588e-07, "loss": 0.4659, "step": 2936 }, { "epoch": 0.14190462385853023, "grad_norm": 2.3625574111938477, "learning_rate": 8.580953761414698e-07, "loss": 0.2456, "step": 2937 }, { "epoch": 0.14195294003961928, "grad_norm": 2.1966514587402344, "learning_rate": 8.580470599603808e-07, "loss": 0.2696, "step": 2938 }, { "epoch": 0.1420012562207083, "grad_norm": 1.782743215560913, "learning_rate": 8.579987437792916e-07, "loss": 0.2541, "step": 2939 }, { "epoch": 0.14204957240179736, "grad_norm": 2.894645929336548, "learning_rate": 8.579504275982026e-07, "loss": 0.3163, "step": 2940 }, { "epoch": 0.1420978885828864, "grad_norm": 3.5548691749572754, "learning_rate": 8.579021114171135e-07, "loss": 0.4467, "step": 2941 }, { "epoch": 0.14214620476397546, "grad_norm": 3.339848518371582, "learning_rate": 8.578537952360245e-07, "loss": 0.3642, "step": 2942 }, { "epoch": 0.1421945209450645, "grad_norm": 2.0691778659820557, "learning_rate": 8.578054790549355e-07, "loss": 0.1605, "step": 2943 }, { "epoch": 0.14224283712615354, "grad_norm": 3.2904067039489746, "learning_rate": 8.577571628738464e-07, "loss": 0.272, "step": 2944 }, { "epoch": 0.1422911533072426, "grad_norm": 4.314513683319092, "learning_rate": 8.577088466927573e-07, "loss": 0.4551, "step": 2945 }, { "epoch": 0.14233946948833165, "grad_norm": 2.5180156230926514, "learning_rate": 8.576605305116683e-07, "loss": 0.2253, "step": 2946 }, { "epoch": 0.1423877856694207, "grad_norm": 2.91902232170105, "learning_rate": 8.576122143305793e-07, "loss": 0.3835, "step": 2947 }, { "epoch": 0.14243610185050973, "grad_norm": 2.6655325889587402, "learning_rate": 8.575638981494903e-07, "loss": 0.3015, "step": 2948 }, { "epoch": 0.14248441803159878, "grad_norm": 4.231547832489014, "learning_rate": 8.575155819684012e-07, "loss": 0.4037, "step": 2949 }, { "epoch": 0.14253273421268783, "grad_norm": 6.635552883148193, "learning_rate": 8.574672657873121e-07, "loss": 0.2274, "step": 2950 }, { "epoch": 0.1425810503937769, "grad_norm": 2.461444616317749, "learning_rate": 8.57418949606223e-07, "loss": 0.2516, "step": 2951 }, { "epoch": 0.1426293665748659, "grad_norm": 4.052754878997803, "learning_rate": 8.57370633425134e-07, "loss": 0.2525, "step": 2952 }, { "epoch": 0.14267768275595497, "grad_norm": 3.1151976585388184, "learning_rate": 8.57322317244045e-07, "loss": 0.2993, "step": 2953 }, { "epoch": 0.14272599893704402, "grad_norm": 1.9970321655273438, "learning_rate": 8.57274001062956e-07, "loss": 0.3188, "step": 2954 }, { "epoch": 0.14277431511813307, "grad_norm": 3.962451457977295, "learning_rate": 8.572256848818669e-07, "loss": 0.2744, "step": 2955 }, { "epoch": 0.1428226312992221, "grad_norm": 2.661968231201172, "learning_rate": 8.571773687007779e-07, "loss": 0.2644, "step": 2956 }, { "epoch": 0.14287094748031115, "grad_norm": 2.9327661991119385, "learning_rate": 8.571290525196889e-07, "loss": 0.3714, "step": 2957 }, { "epoch": 0.1429192636614002, "grad_norm": 3.491875648498535, "learning_rate": 8.570807363385997e-07, "loss": 0.3057, "step": 2958 }, { "epoch": 0.14296757984248926, "grad_norm": 2.848797082901001, "learning_rate": 8.570324201575107e-07, "loss": 0.3584, "step": 2959 }, { "epoch": 0.1430158960235783, "grad_norm": 2.1349289417266846, "learning_rate": 8.569841039764216e-07, "loss": 0.2619, "step": 2960 }, { "epoch": 0.14306421220466733, "grad_norm": 2.2933883666992188, "learning_rate": 8.569357877953326e-07, "loss": 0.2697, "step": 2961 }, { "epoch": 0.1431125283857564, "grad_norm": 3.243405342102051, "learning_rate": 8.568874716142436e-07, "loss": 0.271, "step": 2962 }, { "epoch": 0.14316084456684544, "grad_norm": 2.7159464359283447, "learning_rate": 8.568391554331546e-07, "loss": 0.3606, "step": 2963 }, { "epoch": 0.1432091607479345, "grad_norm": 2.239623785018921, "learning_rate": 8.567908392520655e-07, "loss": 0.2465, "step": 2964 }, { "epoch": 0.14325747692902352, "grad_norm": 2.313690423965454, "learning_rate": 8.567425230709764e-07, "loss": 0.3314, "step": 2965 }, { "epoch": 0.14330579311011257, "grad_norm": 3.636547327041626, "learning_rate": 8.566942068898873e-07, "loss": 0.4532, "step": 2966 }, { "epoch": 0.14335410929120163, "grad_norm": 3.2547006607055664, "learning_rate": 8.566458907087983e-07, "loss": 0.379, "step": 2967 }, { "epoch": 0.14340242547229068, "grad_norm": 2.0220675468444824, "learning_rate": 8.565975745277093e-07, "loss": 0.1987, "step": 2968 }, { "epoch": 0.1434507416533797, "grad_norm": 2.1937859058380127, "learning_rate": 8.565492583466203e-07, "loss": 0.3214, "step": 2969 }, { "epoch": 0.14349905783446876, "grad_norm": 6.885488510131836, "learning_rate": 8.565009421655311e-07, "loss": 0.3579, "step": 2970 }, { "epoch": 0.1435473740155578, "grad_norm": 4.901652812957764, "learning_rate": 8.564526259844421e-07, "loss": 0.4256, "step": 2971 }, { "epoch": 0.14359569019664686, "grad_norm": 2.6162619590759277, "learning_rate": 8.564043098033531e-07, "loss": 0.3236, "step": 2972 }, { "epoch": 0.14364400637773592, "grad_norm": 4.256605625152588, "learning_rate": 8.563559936222641e-07, "loss": 0.3994, "step": 2973 }, { "epoch": 0.14369232255882494, "grad_norm": 1.851480484008789, "learning_rate": 8.563076774411751e-07, "loss": 0.2682, "step": 2974 }, { "epoch": 0.143740638739914, "grad_norm": 3.9931800365448, "learning_rate": 8.562593612600859e-07, "loss": 0.2678, "step": 2975 }, { "epoch": 0.14378895492100305, "grad_norm": 2.5097815990448, "learning_rate": 8.562110450789969e-07, "loss": 0.2185, "step": 2976 }, { "epoch": 0.1438372711020921, "grad_norm": 5.113909721374512, "learning_rate": 8.561627288979078e-07, "loss": 0.3778, "step": 2977 }, { "epoch": 0.14388558728318113, "grad_norm": 9.497940063476562, "learning_rate": 8.561144127168188e-07, "loss": 0.3067, "step": 2978 }, { "epoch": 0.14393390346427018, "grad_norm": 4.006482124328613, "learning_rate": 8.560660965357298e-07, "loss": 0.41, "step": 2979 }, { "epoch": 0.14398221964535923, "grad_norm": 1.754933476448059, "learning_rate": 8.560177803546408e-07, "loss": 0.165, "step": 2980 }, { "epoch": 0.14403053582644829, "grad_norm": 3.323789119720459, "learning_rate": 8.559694641735517e-07, "loss": 0.403, "step": 2981 }, { "epoch": 0.1440788520075373, "grad_norm": 2.512385368347168, "learning_rate": 8.559211479924627e-07, "loss": 0.333, "step": 2982 }, { "epoch": 0.14412716818862636, "grad_norm": 2.782658576965332, "learning_rate": 8.558728318113735e-07, "loss": 0.3044, "step": 2983 }, { "epoch": 0.14417548436971542, "grad_norm": 15.165172576904297, "learning_rate": 8.558245156302845e-07, "loss": 0.1983, "step": 2984 }, { "epoch": 0.14422380055080447, "grad_norm": 2.8945226669311523, "learning_rate": 8.557761994491955e-07, "loss": 0.2822, "step": 2985 }, { "epoch": 0.14427211673189352, "grad_norm": 7.779819011688232, "learning_rate": 8.557278832681064e-07, "loss": 0.4852, "step": 2986 }, { "epoch": 0.14432043291298255, "grad_norm": 2.960620164871216, "learning_rate": 8.556795670870174e-07, "loss": 0.2387, "step": 2987 }, { "epoch": 0.1443687490940716, "grad_norm": 5.7625732421875, "learning_rate": 8.556312509059284e-07, "loss": 0.3728, "step": 2988 }, { "epoch": 0.14441706527516066, "grad_norm": 2.1661791801452637, "learning_rate": 8.555829347248394e-07, "loss": 0.2522, "step": 2989 }, { "epoch": 0.1444653814562497, "grad_norm": 1.9456671476364136, "learning_rate": 8.555346185437503e-07, "loss": 0.1674, "step": 2990 }, { "epoch": 0.14451369763733873, "grad_norm": 13.065279960632324, "learning_rate": 8.554863023626612e-07, "loss": 0.3031, "step": 2991 }, { "epoch": 0.1445620138184278, "grad_norm": 3.5095651149749756, "learning_rate": 8.554379861815721e-07, "loss": 0.1453, "step": 2992 }, { "epoch": 0.14461032999951684, "grad_norm": 2.2703452110290527, "learning_rate": 8.553896700004831e-07, "loss": 0.221, "step": 2993 }, { "epoch": 0.1446586461806059, "grad_norm": 4.258889198303223, "learning_rate": 8.553413538193941e-07, "loss": 0.3579, "step": 2994 }, { "epoch": 0.14470696236169492, "grad_norm": 2.907355785369873, "learning_rate": 8.552930376383051e-07, "loss": 0.3663, "step": 2995 }, { "epoch": 0.14475527854278397, "grad_norm": 1.5982612371444702, "learning_rate": 8.552447214572159e-07, "loss": 0.1839, "step": 2996 }, { "epoch": 0.14480359472387302, "grad_norm": 2.3514773845672607, "learning_rate": 8.551964052761269e-07, "loss": 0.2416, "step": 2997 }, { "epoch": 0.14485191090496208, "grad_norm": 2.1799275875091553, "learning_rate": 8.551480890950379e-07, "loss": 0.2782, "step": 2998 }, { "epoch": 0.14490022708605113, "grad_norm": 2.0273702144622803, "learning_rate": 8.550997729139489e-07, "loss": 0.2302, "step": 2999 }, { "epoch": 0.14494854326714016, "grad_norm": 2.5249688625335693, "learning_rate": 8.550514567328599e-07, "loss": 0.2707, "step": 3000 }, { "epoch": 0.1449968594482292, "grad_norm": 7.546736240386963, "learning_rate": 8.550031405517707e-07, "loss": 0.2774, "step": 3001 }, { "epoch": 0.14504517562931826, "grad_norm": 3.2118194103240967, "learning_rate": 8.549548243706816e-07, "loss": 0.4413, "step": 3002 }, { "epoch": 0.14509349181040732, "grad_norm": 3.321634292602539, "learning_rate": 8.549065081895926e-07, "loss": 0.331, "step": 3003 }, { "epoch": 0.14514180799149634, "grad_norm": 3.2058026790618896, "learning_rate": 8.548581920085036e-07, "loss": 0.3245, "step": 3004 }, { "epoch": 0.1451901241725854, "grad_norm": 2.649212121963501, "learning_rate": 8.548098758274146e-07, "loss": 0.2859, "step": 3005 }, { "epoch": 0.14523844035367445, "grad_norm": 1.6938894987106323, "learning_rate": 8.547615596463256e-07, "loss": 0.2029, "step": 3006 }, { "epoch": 0.1452867565347635, "grad_norm": 3.4165918827056885, "learning_rate": 8.547132434652365e-07, "loss": 0.4341, "step": 3007 }, { "epoch": 0.14533507271585253, "grad_norm": 3.2463722229003906, "learning_rate": 8.546649272841475e-07, "loss": 0.3182, "step": 3008 }, { "epoch": 0.14538338889694158, "grad_norm": 2.5970845222473145, "learning_rate": 8.546166111030583e-07, "loss": 0.254, "step": 3009 }, { "epoch": 0.14543170507803063, "grad_norm": 2.678410530090332, "learning_rate": 8.545682949219693e-07, "loss": 0.3241, "step": 3010 }, { "epoch": 0.14548002125911969, "grad_norm": 2.43481707572937, "learning_rate": 8.545199787408803e-07, "loss": 0.3217, "step": 3011 }, { "epoch": 0.14552833744020874, "grad_norm": 2.9689786434173584, "learning_rate": 8.544716625597912e-07, "loss": 0.2932, "step": 3012 }, { "epoch": 0.14557665362129776, "grad_norm": 2.3547141551971436, "learning_rate": 8.544233463787022e-07, "loss": 0.2032, "step": 3013 }, { "epoch": 0.14562496980238682, "grad_norm": 3.2422678470611572, "learning_rate": 8.543750301976132e-07, "loss": 0.4211, "step": 3014 }, { "epoch": 0.14567328598347587, "grad_norm": 3.0827879905700684, "learning_rate": 8.543267140165241e-07, "loss": 0.3095, "step": 3015 }, { "epoch": 0.14572160216456492, "grad_norm": 17.844751358032227, "learning_rate": 8.542783978354351e-07, "loss": 0.3198, "step": 3016 }, { "epoch": 0.14576991834565395, "grad_norm": 2.5043554306030273, "learning_rate": 8.542300816543459e-07, "loss": 0.2829, "step": 3017 }, { "epoch": 0.145818234526743, "grad_norm": 3.021306276321411, "learning_rate": 8.541817654732569e-07, "loss": 0.372, "step": 3018 }, { "epoch": 0.14586655070783205, "grad_norm": 1.8724541664123535, "learning_rate": 8.541334492921679e-07, "loss": 0.1873, "step": 3019 }, { "epoch": 0.1459148668889211, "grad_norm": 2.0296523571014404, "learning_rate": 8.540851331110789e-07, "loss": 0.2407, "step": 3020 }, { "epoch": 0.14596318307001013, "grad_norm": 3.400813102722168, "learning_rate": 8.540368169299899e-07, "loss": 0.3854, "step": 3021 }, { "epoch": 0.14601149925109919, "grad_norm": 3.6622867584228516, "learning_rate": 8.539885007489007e-07, "loss": 0.2695, "step": 3022 }, { "epoch": 0.14605981543218824, "grad_norm": 2.858894109725952, "learning_rate": 8.539401845678117e-07, "loss": 0.1392, "step": 3023 }, { "epoch": 0.1461081316132773, "grad_norm": 3.3894145488739014, "learning_rate": 8.538918683867227e-07, "loss": 0.3826, "step": 3024 }, { "epoch": 0.14615644779436635, "grad_norm": 1.997817873954773, "learning_rate": 8.538435522056337e-07, "loss": 0.2197, "step": 3025 }, { "epoch": 0.14620476397545537, "grad_norm": 2.312211513519287, "learning_rate": 8.537952360245446e-07, "loss": 0.2707, "step": 3026 }, { "epoch": 0.14625308015654442, "grad_norm": 2.042017698287964, "learning_rate": 8.537469198434555e-07, "loss": 0.2317, "step": 3027 }, { "epoch": 0.14630139633763348, "grad_norm": 3.4323344230651855, "learning_rate": 8.536986036623664e-07, "loss": 0.4231, "step": 3028 }, { "epoch": 0.14634971251872253, "grad_norm": 2.4495089054107666, "learning_rate": 8.536502874812774e-07, "loss": 0.3097, "step": 3029 }, { "epoch": 0.14639802869981156, "grad_norm": 2.708085536956787, "learning_rate": 8.536019713001884e-07, "loss": 0.3228, "step": 3030 }, { "epoch": 0.1464463448809006, "grad_norm": 2.705324411392212, "learning_rate": 8.535536551190994e-07, "loss": 0.3205, "step": 3031 }, { "epoch": 0.14649466106198966, "grad_norm": 2.212402582168579, "learning_rate": 8.535053389380104e-07, "loss": 0.2619, "step": 3032 }, { "epoch": 0.14654297724307871, "grad_norm": 2.541133403778076, "learning_rate": 8.534570227569213e-07, "loss": 0.2957, "step": 3033 }, { "epoch": 0.14659129342416774, "grad_norm": 3.3082032203674316, "learning_rate": 8.534087065758321e-07, "loss": 0.2545, "step": 3034 }, { "epoch": 0.1466396096052568, "grad_norm": 2.942732095718384, "learning_rate": 8.533603903947431e-07, "loss": 0.3273, "step": 3035 }, { "epoch": 0.14668792578634585, "grad_norm": 1.9350529909133911, "learning_rate": 8.533120742136541e-07, "loss": 0.239, "step": 3036 }, { "epoch": 0.1467362419674349, "grad_norm": 2.587676525115967, "learning_rate": 8.532637580325651e-07, "loss": 0.3705, "step": 3037 }, { "epoch": 0.14678455814852395, "grad_norm": 2.3948519229888916, "learning_rate": 8.53215441851476e-07, "loss": 0.2705, "step": 3038 }, { "epoch": 0.14683287432961298, "grad_norm": 3.827592372894287, "learning_rate": 8.53167125670387e-07, "loss": 0.4009, "step": 3039 }, { "epoch": 0.14688119051070203, "grad_norm": 2.6290290355682373, "learning_rate": 8.53118809489298e-07, "loss": 0.265, "step": 3040 }, { "epoch": 0.14692950669179108, "grad_norm": 3.488199234008789, "learning_rate": 8.530704933082089e-07, "loss": 0.3112, "step": 3041 }, { "epoch": 0.14697782287288014, "grad_norm": 2.8264143466949463, "learning_rate": 8.530221771271199e-07, "loss": 0.3882, "step": 3042 }, { "epoch": 0.14702613905396916, "grad_norm": 3.3934061527252197, "learning_rate": 8.529738609460307e-07, "loss": 0.4878, "step": 3043 }, { "epoch": 0.14707445523505822, "grad_norm": 1.6182024478912354, "learning_rate": 8.529255447649417e-07, "loss": 0.1758, "step": 3044 }, { "epoch": 0.14712277141614727, "grad_norm": 9.982836723327637, "learning_rate": 8.528772285838527e-07, "loss": 0.3234, "step": 3045 }, { "epoch": 0.14717108759723632, "grad_norm": 2.0183463096618652, "learning_rate": 8.528289124027637e-07, "loss": 0.1825, "step": 3046 }, { "epoch": 0.14721940377832535, "grad_norm": 2.8594906330108643, "learning_rate": 8.527805962216746e-07, "loss": 0.3749, "step": 3047 }, { "epoch": 0.1472677199594144, "grad_norm": 2.9452476501464844, "learning_rate": 8.527322800405855e-07, "loss": 0.3693, "step": 3048 }, { "epoch": 0.14731603614050345, "grad_norm": 2.3841164112091064, "learning_rate": 8.526839638594965e-07, "loss": 0.3311, "step": 3049 }, { "epoch": 0.1473643523215925, "grad_norm": 2.889737129211426, "learning_rate": 8.526356476784075e-07, "loss": 0.2116, "step": 3050 }, { "epoch": 0.14741266850268156, "grad_norm": 2.5179781913757324, "learning_rate": 8.525873314973184e-07, "loss": 0.3183, "step": 3051 }, { "epoch": 0.14746098468377059, "grad_norm": 2.30798602104187, "learning_rate": 8.525390153162294e-07, "loss": 0.34, "step": 3052 }, { "epoch": 0.14750930086485964, "grad_norm": 2.067453622817993, "learning_rate": 8.524906991351402e-07, "loss": 0.269, "step": 3053 }, { "epoch": 0.1475576170459487, "grad_norm": 2.9971346855163574, "learning_rate": 8.524423829540512e-07, "loss": 0.3651, "step": 3054 }, { "epoch": 0.14760593322703774, "grad_norm": 2.6185269355773926, "learning_rate": 8.523940667729622e-07, "loss": 0.3287, "step": 3055 }, { "epoch": 0.14765424940812677, "grad_norm": 3.592601776123047, "learning_rate": 8.523457505918732e-07, "loss": 0.3876, "step": 3056 }, { "epoch": 0.14770256558921582, "grad_norm": 1.8242299556732178, "learning_rate": 8.522974344107842e-07, "loss": 0.2059, "step": 3057 }, { "epoch": 0.14775088177030488, "grad_norm": 20.91542625427246, "learning_rate": 8.522491182296952e-07, "loss": 0.2802, "step": 3058 }, { "epoch": 0.14779919795139393, "grad_norm": 1.7538113594055176, "learning_rate": 8.52200802048606e-07, "loss": 0.2153, "step": 3059 }, { "epoch": 0.14784751413248298, "grad_norm": 2.4605565071105957, "learning_rate": 8.521524858675169e-07, "loss": 0.309, "step": 3060 }, { "epoch": 0.147895830313572, "grad_norm": 3.0085341930389404, "learning_rate": 8.521041696864279e-07, "loss": 0.274, "step": 3061 }, { "epoch": 0.14794414649466106, "grad_norm": 2.4212584495544434, "learning_rate": 8.520558535053389e-07, "loss": 0.3048, "step": 3062 }, { "epoch": 0.1479924626757501, "grad_norm": 3.433354616165161, "learning_rate": 8.520075373242499e-07, "loss": 0.4019, "step": 3063 }, { "epoch": 0.14804077885683917, "grad_norm": 3.0305278301239014, "learning_rate": 8.519592211431608e-07, "loss": 0.4194, "step": 3064 }, { "epoch": 0.1480890950379282, "grad_norm": 3.8520758152008057, "learning_rate": 8.519109049620718e-07, "loss": 0.2899, "step": 3065 }, { "epoch": 0.14813741121901725, "grad_norm": 2.940211534500122, "learning_rate": 8.518625887809827e-07, "loss": 0.3672, "step": 3066 }, { "epoch": 0.1481857274001063, "grad_norm": 3.0207173824310303, "learning_rate": 8.518142725998937e-07, "loss": 0.214, "step": 3067 }, { "epoch": 0.14823404358119535, "grad_norm": 4.41666316986084, "learning_rate": 8.517659564188046e-07, "loss": 0.3424, "step": 3068 }, { "epoch": 0.14828235976228438, "grad_norm": 1.7041544914245605, "learning_rate": 8.517176402377155e-07, "loss": 0.1964, "step": 3069 }, { "epoch": 0.14833067594337343, "grad_norm": 4.065709590911865, "learning_rate": 8.516693240566265e-07, "loss": 0.3966, "step": 3070 }, { "epoch": 0.14837899212446248, "grad_norm": 2.4854838848114014, "learning_rate": 8.516210078755375e-07, "loss": 0.2509, "step": 3071 }, { "epoch": 0.14842730830555154, "grad_norm": 6.135496139526367, "learning_rate": 8.515726916944485e-07, "loss": 0.3782, "step": 3072 }, { "epoch": 0.1484756244866406, "grad_norm": 5.668338775634766, "learning_rate": 8.515243755133594e-07, "loss": 0.2141, "step": 3073 }, { "epoch": 0.14852394066772961, "grad_norm": 5.456298351287842, "learning_rate": 8.514760593322703e-07, "loss": 0.2673, "step": 3074 }, { "epoch": 0.14857225684881867, "grad_norm": 2.661926031112671, "learning_rate": 8.514277431511813e-07, "loss": 0.2106, "step": 3075 }, { "epoch": 0.14862057302990772, "grad_norm": 2.6154537200927734, "learning_rate": 8.513794269700922e-07, "loss": 0.2582, "step": 3076 }, { "epoch": 0.14866888921099677, "grad_norm": 2.652205228805542, "learning_rate": 8.513311107890032e-07, "loss": 0.319, "step": 3077 }, { "epoch": 0.1487172053920858, "grad_norm": 2.2503058910369873, "learning_rate": 8.512827946079142e-07, "loss": 0.2858, "step": 3078 }, { "epoch": 0.14876552157317485, "grad_norm": 3.346830368041992, "learning_rate": 8.51234478426825e-07, "loss": 0.2519, "step": 3079 }, { "epoch": 0.1488138377542639, "grad_norm": 2.6203413009643555, "learning_rate": 8.51186162245736e-07, "loss": 0.3669, "step": 3080 }, { "epoch": 0.14886215393535296, "grad_norm": 1.6981308460235596, "learning_rate": 8.51137846064647e-07, "loss": 0.1908, "step": 3081 }, { "epoch": 0.14891047011644198, "grad_norm": 2.731900453567505, "learning_rate": 8.51089529883558e-07, "loss": 0.2631, "step": 3082 }, { "epoch": 0.14895878629753104, "grad_norm": 3.0314273834228516, "learning_rate": 8.51041213702469e-07, "loss": 0.4025, "step": 3083 }, { "epoch": 0.1490071024786201, "grad_norm": 11.218711853027344, "learning_rate": 8.5099289752138e-07, "loss": 0.2715, "step": 3084 }, { "epoch": 0.14905541865970914, "grad_norm": 2.8220765590667725, "learning_rate": 8.509445813402907e-07, "loss": 0.3998, "step": 3085 }, { "epoch": 0.1491037348407982, "grad_norm": 2.8190178871154785, "learning_rate": 8.508962651592017e-07, "loss": 0.2941, "step": 3086 }, { "epoch": 0.14915205102188722, "grad_norm": 4.566400527954102, "learning_rate": 8.508479489781127e-07, "loss": 0.2547, "step": 3087 }, { "epoch": 0.14920036720297628, "grad_norm": 2.493858814239502, "learning_rate": 8.507996327970237e-07, "loss": 0.2926, "step": 3088 }, { "epoch": 0.14924868338406533, "grad_norm": 2.22031831741333, "learning_rate": 8.507513166159347e-07, "loss": 0.2279, "step": 3089 }, { "epoch": 0.14929699956515438, "grad_norm": 3.083660840988159, "learning_rate": 8.507030004348456e-07, "loss": 0.3578, "step": 3090 }, { "epoch": 0.1493453157462434, "grad_norm": 2.8838891983032227, "learning_rate": 8.506546842537566e-07, "loss": 0.3135, "step": 3091 }, { "epoch": 0.14939363192733246, "grad_norm": 2.7855498790740967, "learning_rate": 8.506063680726675e-07, "loss": 0.3626, "step": 3092 }, { "epoch": 0.1494419481084215, "grad_norm": 32.14374542236328, "learning_rate": 8.505580518915784e-07, "loss": 0.2795, "step": 3093 }, { "epoch": 0.14949026428951057, "grad_norm": 2.8273544311523438, "learning_rate": 8.505097357104894e-07, "loss": 0.2871, "step": 3094 }, { "epoch": 0.1495385804705996, "grad_norm": 2.6397457122802734, "learning_rate": 8.504614195294003e-07, "loss": 0.411, "step": 3095 }, { "epoch": 0.14958689665168864, "grad_norm": 2.5060784816741943, "learning_rate": 8.504131033483113e-07, "loss": 0.2824, "step": 3096 }, { "epoch": 0.1496352128327777, "grad_norm": 2.3310978412628174, "learning_rate": 8.503647871672223e-07, "loss": 0.2674, "step": 3097 }, { "epoch": 0.14968352901386675, "grad_norm": 2.884746551513672, "learning_rate": 8.503164709861332e-07, "loss": 0.2832, "step": 3098 }, { "epoch": 0.1497318451949558, "grad_norm": 2.4965708255767822, "learning_rate": 8.502681548050442e-07, "loss": 0.3381, "step": 3099 }, { "epoch": 0.14978016137604483, "grad_norm": 3.1574933528900146, "learning_rate": 8.502198386239551e-07, "loss": 0.2966, "step": 3100 }, { "epoch": 0.14982847755713388, "grad_norm": 2.1172642707824707, "learning_rate": 8.50171522442866e-07, "loss": 0.3029, "step": 3101 }, { "epoch": 0.14987679373822294, "grad_norm": 54.25661849975586, "learning_rate": 8.50123206261777e-07, "loss": 0.3449, "step": 3102 }, { "epoch": 0.149925109919312, "grad_norm": 2.1466286182403564, "learning_rate": 8.50074890080688e-07, "loss": 0.2387, "step": 3103 }, { "epoch": 0.149973426100401, "grad_norm": 4.175161838531494, "learning_rate": 8.50026573899599e-07, "loss": 0.3238, "step": 3104 }, { "epoch": 0.15002174228149007, "grad_norm": 5.871692180633545, "learning_rate": 8.499782577185098e-07, "loss": 0.3419, "step": 3105 }, { "epoch": 0.15007005846257912, "grad_norm": 2.563725709915161, "learning_rate": 8.499299415374208e-07, "loss": 0.3064, "step": 3106 }, { "epoch": 0.15011837464366817, "grad_norm": 2.646883249282837, "learning_rate": 8.498816253563318e-07, "loss": 0.3741, "step": 3107 }, { "epoch": 0.1501666908247572, "grad_norm": 3.3696415424346924, "learning_rate": 8.498333091752428e-07, "loss": 0.4162, "step": 3108 }, { "epoch": 0.15021500700584625, "grad_norm": 2.2648777961730957, "learning_rate": 8.497849929941538e-07, "loss": 0.2406, "step": 3109 }, { "epoch": 0.1502633231869353, "grad_norm": 2.9593918323516846, "learning_rate": 8.497366768130648e-07, "loss": 0.3688, "step": 3110 }, { "epoch": 0.15031163936802436, "grad_norm": 4.129216194152832, "learning_rate": 8.496883606319755e-07, "loss": 0.2215, "step": 3111 }, { "epoch": 0.1503599555491134, "grad_norm": 3.395522356033325, "learning_rate": 8.496400444508865e-07, "loss": 0.3623, "step": 3112 }, { "epoch": 0.15040827173020244, "grad_norm": 1.4593784809112549, "learning_rate": 8.495917282697975e-07, "loss": 0.1446, "step": 3113 }, { "epoch": 0.1504565879112915, "grad_norm": 5.959508895874023, "learning_rate": 8.495434120887085e-07, "loss": 0.288, "step": 3114 }, { "epoch": 0.15050490409238054, "grad_norm": 2.2748026847839355, "learning_rate": 8.494950959076195e-07, "loss": 0.2683, "step": 3115 }, { "epoch": 0.1505532202734696, "grad_norm": 4.490889549255371, "learning_rate": 8.494467797265304e-07, "loss": 0.5824, "step": 3116 }, { "epoch": 0.15060153645455862, "grad_norm": 6.534095287322998, "learning_rate": 8.493984635454413e-07, "loss": 0.2764, "step": 3117 }, { "epoch": 0.15064985263564767, "grad_norm": 9.48902702331543, "learning_rate": 8.493501473643523e-07, "loss": 0.4227, "step": 3118 }, { "epoch": 0.15069816881673673, "grad_norm": 2.394526958465576, "learning_rate": 8.493018311832632e-07, "loss": 0.2471, "step": 3119 }, { "epoch": 0.15074648499782578, "grad_norm": 3.1840646266937256, "learning_rate": 8.492535150021742e-07, "loss": 0.31, "step": 3120 }, { "epoch": 0.1507948011789148, "grad_norm": 2.417536735534668, "learning_rate": 8.492051988210851e-07, "loss": 0.2554, "step": 3121 }, { "epoch": 0.15084311736000386, "grad_norm": 3.6394224166870117, "learning_rate": 8.491568826399961e-07, "loss": 0.4466, "step": 3122 }, { "epoch": 0.1508914335410929, "grad_norm": 2.141761302947998, "learning_rate": 8.491085664589071e-07, "loss": 0.2044, "step": 3123 }, { "epoch": 0.15093974972218197, "grad_norm": 2.747793197631836, "learning_rate": 8.49060250277818e-07, "loss": 0.3373, "step": 3124 }, { "epoch": 0.15098806590327102, "grad_norm": 2.758481502532959, "learning_rate": 8.49011934096729e-07, "loss": 0.2939, "step": 3125 }, { "epoch": 0.15103638208436004, "grad_norm": 2.333171844482422, "learning_rate": 8.489636179156399e-07, "loss": 0.2176, "step": 3126 }, { "epoch": 0.1510846982654491, "grad_norm": 2.497180461883545, "learning_rate": 8.489153017345508e-07, "loss": 0.2683, "step": 3127 }, { "epoch": 0.15113301444653815, "grad_norm": 2.2975587844848633, "learning_rate": 8.488669855534618e-07, "loss": 0.2467, "step": 3128 }, { "epoch": 0.1511813306276272, "grad_norm": 3.0989577770233154, "learning_rate": 8.488186693723728e-07, "loss": 0.2664, "step": 3129 }, { "epoch": 0.15122964680871623, "grad_norm": 2.6033151149749756, "learning_rate": 8.487703531912837e-07, "loss": 0.2167, "step": 3130 }, { "epoch": 0.15127796298980528, "grad_norm": 2.221463441848755, "learning_rate": 8.487220370101946e-07, "loss": 0.2851, "step": 3131 }, { "epoch": 0.15132627917089433, "grad_norm": 3.6638998985290527, "learning_rate": 8.486737208291056e-07, "loss": 0.2718, "step": 3132 }, { "epoch": 0.1513745953519834, "grad_norm": 3.13460111618042, "learning_rate": 8.486254046480166e-07, "loss": 0.3381, "step": 3133 }, { "epoch": 0.1514229115330724, "grad_norm": 3.344283103942871, "learning_rate": 8.485770884669276e-07, "loss": 0.274, "step": 3134 }, { "epoch": 0.15147122771416147, "grad_norm": 4.31749153137207, "learning_rate": 8.485287722858386e-07, "loss": 0.3483, "step": 3135 }, { "epoch": 0.15151954389525052, "grad_norm": 3.3434853553771973, "learning_rate": 8.484804561047494e-07, "loss": 0.1939, "step": 3136 }, { "epoch": 0.15156786007633957, "grad_norm": 4.101446151733398, "learning_rate": 8.484321399236603e-07, "loss": 0.3086, "step": 3137 }, { "epoch": 0.15161617625742863, "grad_norm": 2.6368825435638428, "learning_rate": 8.483838237425713e-07, "loss": 0.2746, "step": 3138 }, { "epoch": 0.15166449243851765, "grad_norm": 2.524554967880249, "learning_rate": 8.483355075614823e-07, "loss": 0.2303, "step": 3139 }, { "epoch": 0.1517128086196067, "grad_norm": 3.498065233230591, "learning_rate": 8.482871913803933e-07, "loss": 0.3128, "step": 3140 }, { "epoch": 0.15176112480069576, "grad_norm": 2.5972228050231934, "learning_rate": 8.482388751993043e-07, "loss": 0.2579, "step": 3141 }, { "epoch": 0.1518094409817848, "grad_norm": 3.060744047164917, "learning_rate": 8.481905590182152e-07, "loss": 0.4726, "step": 3142 }, { "epoch": 0.15185775716287384, "grad_norm": 1.9383389949798584, "learning_rate": 8.481422428371261e-07, "loss": 0.2239, "step": 3143 }, { "epoch": 0.1519060733439629, "grad_norm": 2.7290525436401367, "learning_rate": 8.48093926656037e-07, "loss": 0.2276, "step": 3144 }, { "epoch": 0.15195438952505194, "grad_norm": 2.458303213119507, "learning_rate": 8.48045610474948e-07, "loss": 0.3137, "step": 3145 }, { "epoch": 0.152002705706141, "grad_norm": 2.6398332118988037, "learning_rate": 8.47997294293859e-07, "loss": 0.391, "step": 3146 }, { "epoch": 0.15205102188723002, "grad_norm": 7.079219341278076, "learning_rate": 8.479489781127699e-07, "loss": 0.3751, "step": 3147 }, { "epoch": 0.15209933806831907, "grad_norm": 3.243061065673828, "learning_rate": 8.479006619316809e-07, "loss": 0.3384, "step": 3148 }, { "epoch": 0.15214765424940813, "grad_norm": 1.8089826107025146, "learning_rate": 8.478523457505918e-07, "loss": 0.1874, "step": 3149 }, { "epoch": 0.15219597043049718, "grad_norm": 2.0017311573028564, "learning_rate": 8.478040295695028e-07, "loss": 0.1895, "step": 3150 }, { "epoch": 0.15224428661158623, "grad_norm": 3.4526946544647217, "learning_rate": 8.477557133884138e-07, "loss": 0.3822, "step": 3151 }, { "epoch": 0.15229260279267526, "grad_norm": 1.5213196277618408, "learning_rate": 8.477073972073246e-07, "loss": 0.1496, "step": 3152 }, { "epoch": 0.1523409189737643, "grad_norm": 3.1740684509277344, "learning_rate": 8.476590810262356e-07, "loss": 0.4102, "step": 3153 }, { "epoch": 0.15238923515485336, "grad_norm": 2.702746868133545, "learning_rate": 8.476107648451466e-07, "loss": 0.415, "step": 3154 }, { "epoch": 0.15243755133594242, "grad_norm": 2.4875144958496094, "learning_rate": 8.475624486640576e-07, "loss": 0.3067, "step": 3155 }, { "epoch": 0.15248586751703144, "grad_norm": 2.9235177040100098, "learning_rate": 8.475141324829685e-07, "loss": 0.2431, "step": 3156 }, { "epoch": 0.1525341836981205, "grad_norm": 6.232335567474365, "learning_rate": 8.474658163018794e-07, "loss": 0.3644, "step": 3157 }, { "epoch": 0.15258249987920955, "grad_norm": 5.0391316413879395, "learning_rate": 8.474175001207904e-07, "loss": 0.3626, "step": 3158 }, { "epoch": 0.1526308160602986, "grad_norm": 2.742316246032715, "learning_rate": 8.473691839397014e-07, "loss": 0.4049, "step": 3159 }, { "epoch": 0.15267913224138763, "grad_norm": 3.0869510173797607, "learning_rate": 8.473208677586124e-07, "loss": 0.2617, "step": 3160 }, { "epoch": 0.15272744842247668, "grad_norm": 2.890448808670044, "learning_rate": 8.472725515775233e-07, "loss": 0.3523, "step": 3161 }, { "epoch": 0.15277576460356573, "grad_norm": 2.106207847595215, "learning_rate": 8.472242353964341e-07, "loss": 0.2186, "step": 3162 }, { "epoch": 0.1528240807846548, "grad_norm": 2.694920539855957, "learning_rate": 8.471759192153451e-07, "loss": 0.3362, "step": 3163 }, { "epoch": 0.15287239696574384, "grad_norm": 3.078348159790039, "learning_rate": 8.471276030342561e-07, "loss": 0.3143, "step": 3164 }, { "epoch": 0.15292071314683287, "grad_norm": 4.229978561401367, "learning_rate": 8.470792868531671e-07, "loss": 0.2994, "step": 3165 }, { "epoch": 0.15296902932792192, "grad_norm": 25.72569465637207, "learning_rate": 8.470309706720781e-07, "loss": 0.3823, "step": 3166 }, { "epoch": 0.15301734550901097, "grad_norm": 1.9862074851989746, "learning_rate": 8.469826544909891e-07, "loss": 0.2139, "step": 3167 }, { "epoch": 0.15306566169010002, "grad_norm": 3.8062710762023926, "learning_rate": 8.469343383099e-07, "loss": 0.3247, "step": 3168 }, { "epoch": 0.15311397787118905, "grad_norm": 2.5097100734710693, "learning_rate": 8.468860221288108e-07, "loss": 0.3003, "step": 3169 }, { "epoch": 0.1531622940522781, "grad_norm": 2.4522290229797363, "learning_rate": 8.468377059477218e-07, "loss": 0.2537, "step": 3170 }, { "epoch": 0.15321061023336716, "grad_norm": 22.377376556396484, "learning_rate": 8.467893897666328e-07, "loss": 0.3613, "step": 3171 }, { "epoch": 0.1532589264144562, "grad_norm": 1.9833682775497437, "learning_rate": 8.467410735855438e-07, "loss": 0.1517, "step": 3172 }, { "epoch": 0.15330724259554523, "grad_norm": 4.398370742797852, "learning_rate": 8.466927574044547e-07, "loss": 0.3841, "step": 3173 }, { "epoch": 0.1533555587766343, "grad_norm": 3.452815532684326, "learning_rate": 8.466444412233657e-07, "loss": 0.3906, "step": 3174 }, { "epoch": 0.15340387495772334, "grad_norm": 2.554295301437378, "learning_rate": 8.465961250422766e-07, "loss": 0.3185, "step": 3175 }, { "epoch": 0.1534521911388124, "grad_norm": 3.4486396312713623, "learning_rate": 8.465478088611876e-07, "loss": 0.409, "step": 3176 }, { "epoch": 0.15350050731990145, "grad_norm": 7.3631391525268555, "learning_rate": 8.464994926800986e-07, "loss": 0.4116, "step": 3177 }, { "epoch": 0.15354882350099047, "grad_norm": 8.583759307861328, "learning_rate": 8.464511764990094e-07, "loss": 0.4365, "step": 3178 }, { "epoch": 0.15359713968207953, "grad_norm": 2.407358407974243, "learning_rate": 8.464028603179204e-07, "loss": 0.2468, "step": 3179 }, { "epoch": 0.15364545586316858, "grad_norm": 2.3660266399383545, "learning_rate": 8.463545441368314e-07, "loss": 0.3736, "step": 3180 }, { "epoch": 0.15369377204425763, "grad_norm": 2.477909564971924, "learning_rate": 8.463062279557424e-07, "loss": 0.3324, "step": 3181 }, { "epoch": 0.15374208822534666, "grad_norm": 5.5793538093566895, "learning_rate": 8.462579117746533e-07, "loss": 0.3049, "step": 3182 }, { "epoch": 0.1537904044064357, "grad_norm": 7.463108539581299, "learning_rate": 8.462095955935642e-07, "loss": 0.3181, "step": 3183 }, { "epoch": 0.15383872058752476, "grad_norm": 4.68549919128418, "learning_rate": 8.461612794124752e-07, "loss": 0.3458, "step": 3184 }, { "epoch": 0.15388703676861382, "grad_norm": 4.18910026550293, "learning_rate": 8.461129632313862e-07, "loss": 0.4572, "step": 3185 }, { "epoch": 0.15393535294970284, "grad_norm": 2.8242127895355225, "learning_rate": 8.460646470502971e-07, "loss": 0.4076, "step": 3186 }, { "epoch": 0.1539836691307919, "grad_norm": 2.382521629333496, "learning_rate": 8.460163308692081e-07, "loss": 0.3106, "step": 3187 }, { "epoch": 0.15403198531188095, "grad_norm": 2.9756221771240234, "learning_rate": 8.459680146881189e-07, "loss": 0.1948, "step": 3188 }, { "epoch": 0.15408030149297, "grad_norm": 2.5393588542938232, "learning_rate": 8.459196985070299e-07, "loss": 0.2776, "step": 3189 }, { "epoch": 0.15412861767405905, "grad_norm": 2.9619390964508057, "learning_rate": 8.458713823259409e-07, "loss": 0.3552, "step": 3190 }, { "epoch": 0.15417693385514808, "grad_norm": 3.478134870529175, "learning_rate": 8.458230661448519e-07, "loss": 0.3075, "step": 3191 }, { "epoch": 0.15422525003623713, "grad_norm": 22.553146362304688, "learning_rate": 8.457747499637629e-07, "loss": 0.3191, "step": 3192 }, { "epoch": 0.15427356621732619, "grad_norm": 3.124666929244995, "learning_rate": 8.457264337826739e-07, "loss": 0.3023, "step": 3193 }, { "epoch": 0.15432188239841524, "grad_norm": 5.71087121963501, "learning_rate": 8.456781176015846e-07, "loss": 0.3043, "step": 3194 }, { "epoch": 0.15437019857950426, "grad_norm": 1.8679267168045044, "learning_rate": 8.456298014204956e-07, "loss": 0.2053, "step": 3195 }, { "epoch": 0.15441851476059332, "grad_norm": 2.5186030864715576, "learning_rate": 8.455814852394066e-07, "loss": 0.2853, "step": 3196 }, { "epoch": 0.15446683094168237, "grad_norm": 5.586045265197754, "learning_rate": 8.455331690583176e-07, "loss": 0.4069, "step": 3197 }, { "epoch": 0.15451514712277142, "grad_norm": 2.9006710052490234, "learning_rate": 8.454848528772286e-07, "loss": 0.4013, "step": 3198 }, { "epoch": 0.15456346330386048, "grad_norm": 2.6992640495300293, "learning_rate": 8.454365366961395e-07, "loss": 0.38, "step": 3199 }, { "epoch": 0.1546117794849495, "grad_norm": 2.1769843101501465, "learning_rate": 8.453882205150505e-07, "loss": 0.2839, "step": 3200 }, { "epoch": 0.15466009566603856, "grad_norm": 5.38316011428833, "learning_rate": 8.453399043339614e-07, "loss": 0.6774, "step": 3201 }, { "epoch": 0.1547084118471276, "grad_norm": 2.577984571456909, "learning_rate": 8.452915881528724e-07, "loss": 0.2567, "step": 3202 }, { "epoch": 0.15475672802821666, "grad_norm": 3.7106330394744873, "learning_rate": 8.452432719717833e-07, "loss": 0.4085, "step": 3203 }, { "epoch": 0.1548050442093057, "grad_norm": 3.2870376110076904, "learning_rate": 8.451949557906942e-07, "loss": 0.3706, "step": 3204 }, { "epoch": 0.15485336039039474, "grad_norm": 4.855985164642334, "learning_rate": 8.451466396096052e-07, "loss": 0.3927, "step": 3205 }, { "epoch": 0.1549016765714838, "grad_norm": 2.983313798904419, "learning_rate": 8.450983234285162e-07, "loss": 0.3327, "step": 3206 }, { "epoch": 0.15494999275257285, "grad_norm": 2.84499192237854, "learning_rate": 8.450500072474271e-07, "loss": 0.3015, "step": 3207 }, { "epoch": 0.15499830893366187, "grad_norm": 2.3913121223449707, "learning_rate": 8.450016910663381e-07, "loss": 0.25, "step": 3208 }, { "epoch": 0.15504662511475092, "grad_norm": 1.746974229812622, "learning_rate": 8.44953374885249e-07, "loss": 0.1752, "step": 3209 }, { "epoch": 0.15509494129583998, "grad_norm": 2.3693699836730957, "learning_rate": 8.4490505870416e-07, "loss": 0.328, "step": 3210 }, { "epoch": 0.15514325747692903, "grad_norm": 3.0557706356048584, "learning_rate": 8.44856742523071e-07, "loss": 0.2668, "step": 3211 }, { "epoch": 0.15519157365801808, "grad_norm": 3.6461355686187744, "learning_rate": 8.448084263419819e-07, "loss": 0.2166, "step": 3212 }, { "epoch": 0.1552398898391071, "grad_norm": 3.6703269481658936, "learning_rate": 8.447601101608929e-07, "loss": 0.459, "step": 3213 }, { "epoch": 0.15528820602019616, "grad_norm": 3.541424512863159, "learning_rate": 8.447117939798037e-07, "loss": 0.2719, "step": 3214 }, { "epoch": 0.15533652220128522, "grad_norm": 2.705787420272827, "learning_rate": 8.446634777987147e-07, "loss": 0.2698, "step": 3215 }, { "epoch": 0.15538483838237427, "grad_norm": 2.4923012256622314, "learning_rate": 8.446151616176257e-07, "loss": 0.2894, "step": 3216 }, { "epoch": 0.1554331545634633, "grad_norm": 2.054488182067871, "learning_rate": 8.445668454365367e-07, "loss": 0.2146, "step": 3217 }, { "epoch": 0.15548147074455235, "grad_norm": 32.32229995727539, "learning_rate": 8.445185292554477e-07, "loss": 0.3195, "step": 3218 }, { "epoch": 0.1555297869256414, "grad_norm": 2.379899024963379, "learning_rate": 8.444702130743587e-07, "loss": 0.3016, "step": 3219 }, { "epoch": 0.15557810310673045, "grad_norm": 2.9873428344726562, "learning_rate": 8.444218968932694e-07, "loss": 0.4474, "step": 3220 }, { "epoch": 0.15562641928781948, "grad_norm": 3.780632257461548, "learning_rate": 8.443735807121804e-07, "loss": 0.4804, "step": 3221 }, { "epoch": 0.15567473546890853, "grad_norm": 14.1766939163208, "learning_rate": 8.443252645310914e-07, "loss": 0.4321, "step": 3222 }, { "epoch": 0.15572305164999758, "grad_norm": 2.3922410011291504, "learning_rate": 8.442769483500024e-07, "loss": 0.4025, "step": 3223 }, { "epoch": 0.15577136783108664, "grad_norm": 3.094494581222534, "learning_rate": 8.442286321689134e-07, "loss": 0.3455, "step": 3224 }, { "epoch": 0.1558196840121757, "grad_norm": 1.7226020097732544, "learning_rate": 8.441803159878243e-07, "loss": 0.2031, "step": 3225 }, { "epoch": 0.15586800019326472, "grad_norm": 5.122159004211426, "learning_rate": 8.441319998067352e-07, "loss": 0.3255, "step": 3226 }, { "epoch": 0.15591631637435377, "grad_norm": 3.326948881149292, "learning_rate": 8.440836836256462e-07, "loss": 0.3893, "step": 3227 }, { "epoch": 0.15596463255544282, "grad_norm": 3.2117650508880615, "learning_rate": 8.440353674445572e-07, "loss": 0.2408, "step": 3228 }, { "epoch": 0.15601294873653188, "grad_norm": 2.6308295726776123, "learning_rate": 8.439870512634681e-07, "loss": 0.2484, "step": 3229 }, { "epoch": 0.1560612649176209, "grad_norm": 2.639141321182251, "learning_rate": 8.43938735082379e-07, "loss": 0.3434, "step": 3230 }, { "epoch": 0.15610958109870995, "grad_norm": 2.7535922527313232, "learning_rate": 8.4389041890129e-07, "loss": 0.3731, "step": 3231 }, { "epoch": 0.156157897279799, "grad_norm": 1.9231263399124146, "learning_rate": 8.43842102720201e-07, "loss": 0.2212, "step": 3232 }, { "epoch": 0.15620621346088806, "grad_norm": 2.925013303756714, "learning_rate": 8.437937865391119e-07, "loss": 0.353, "step": 3233 }, { "epoch": 0.15625452964197709, "grad_norm": 2.042332649230957, "learning_rate": 8.437454703580229e-07, "loss": 0.2447, "step": 3234 }, { "epoch": 0.15630284582306614, "grad_norm": 2.5385892391204834, "learning_rate": 8.436971541769338e-07, "loss": 0.316, "step": 3235 }, { "epoch": 0.1563511620041552, "grad_norm": 10.07363510131836, "learning_rate": 8.436488379958448e-07, "loss": 0.3038, "step": 3236 }, { "epoch": 0.15639947818524425, "grad_norm": 2.3534228801727295, "learning_rate": 8.436005218147557e-07, "loss": 0.258, "step": 3237 }, { "epoch": 0.1564477943663333, "grad_norm": 3.547714948654175, "learning_rate": 8.435522056336667e-07, "loss": 0.437, "step": 3238 }, { "epoch": 0.15649611054742232, "grad_norm": 2.4625892639160156, "learning_rate": 8.435038894525776e-07, "loss": 0.3153, "step": 3239 }, { "epoch": 0.15654442672851138, "grad_norm": 3.549034833908081, "learning_rate": 8.434555732714885e-07, "loss": 0.2745, "step": 3240 }, { "epoch": 0.15659274290960043, "grad_norm": 3.185574769973755, "learning_rate": 8.434072570903995e-07, "loss": 0.4113, "step": 3241 }, { "epoch": 0.15664105909068948, "grad_norm": 2.9140048027038574, "learning_rate": 8.433589409093105e-07, "loss": 0.4428, "step": 3242 }, { "epoch": 0.1566893752717785, "grad_norm": 5.481339931488037, "learning_rate": 8.433106247282215e-07, "loss": 0.2512, "step": 3243 }, { "epoch": 0.15673769145286756, "grad_norm": 2.6770145893096924, "learning_rate": 8.432623085471325e-07, "loss": 0.2951, "step": 3244 }, { "epoch": 0.15678600763395661, "grad_norm": 21.38780403137207, "learning_rate": 8.432139923660435e-07, "loss": 0.3431, "step": 3245 }, { "epoch": 0.15683432381504567, "grad_norm": 2.5234079360961914, "learning_rate": 8.431656761849542e-07, "loss": 0.3484, "step": 3246 }, { "epoch": 0.1568826399961347, "grad_norm": 2.6629555225372314, "learning_rate": 8.431173600038652e-07, "loss": 0.3227, "step": 3247 }, { "epoch": 0.15693095617722375, "grad_norm": 3.076803684234619, "learning_rate": 8.430690438227762e-07, "loss": 0.4496, "step": 3248 }, { "epoch": 0.1569792723583128, "grad_norm": 2.6736466884613037, "learning_rate": 8.430207276416872e-07, "loss": 0.368, "step": 3249 }, { "epoch": 0.15702758853940185, "grad_norm": 3.6237308979034424, "learning_rate": 8.429724114605982e-07, "loss": 0.3597, "step": 3250 }, { "epoch": 0.1570759047204909, "grad_norm": 4.648858547210693, "learning_rate": 8.429240952795091e-07, "loss": 0.3363, "step": 3251 }, { "epoch": 0.15712422090157993, "grad_norm": 3.6530823707580566, "learning_rate": 8.4287577909842e-07, "loss": 0.3632, "step": 3252 }, { "epoch": 0.15717253708266898, "grad_norm": 3.4507129192352295, "learning_rate": 8.42827462917331e-07, "loss": 0.4282, "step": 3253 }, { "epoch": 0.15722085326375804, "grad_norm": 2.1437182426452637, "learning_rate": 8.427791467362419e-07, "loss": 0.268, "step": 3254 }, { "epoch": 0.1572691694448471, "grad_norm": 1.4789029359817505, "learning_rate": 8.427308305551529e-07, "loss": 0.1555, "step": 3255 }, { "epoch": 0.15731748562593612, "grad_norm": 2.349271535873413, "learning_rate": 8.426825143740638e-07, "loss": 0.2494, "step": 3256 }, { "epoch": 0.15736580180702517, "grad_norm": 4.189614772796631, "learning_rate": 8.426341981929748e-07, "loss": 0.4845, "step": 3257 }, { "epoch": 0.15741411798811422, "grad_norm": 2.8914942741394043, "learning_rate": 8.425858820118857e-07, "loss": 0.3079, "step": 3258 }, { "epoch": 0.15746243416920327, "grad_norm": 4.334005355834961, "learning_rate": 8.425375658307967e-07, "loss": 0.3618, "step": 3259 }, { "epoch": 0.1575107503502923, "grad_norm": 2.6541149616241455, "learning_rate": 8.424892496497077e-07, "loss": 0.339, "step": 3260 }, { "epoch": 0.15755906653138135, "grad_norm": 3.476529359817505, "learning_rate": 8.424409334686186e-07, "loss": 0.4225, "step": 3261 }, { "epoch": 0.1576073827124704, "grad_norm": 1.683027744293213, "learning_rate": 8.423926172875295e-07, "loss": 0.1609, "step": 3262 }, { "epoch": 0.15765569889355946, "grad_norm": 2.323517084121704, "learning_rate": 8.423443011064405e-07, "loss": 0.258, "step": 3263 }, { "epoch": 0.1577040150746485, "grad_norm": 2.3207175731658936, "learning_rate": 8.422959849253515e-07, "loss": 0.2379, "step": 3264 }, { "epoch": 0.15775233125573754, "grad_norm": 3.2685277462005615, "learning_rate": 8.422476687442624e-07, "loss": 0.4233, "step": 3265 }, { "epoch": 0.1578006474368266, "grad_norm": 3.0687074661254883, "learning_rate": 8.421993525631733e-07, "loss": 0.2713, "step": 3266 }, { "epoch": 0.15784896361791564, "grad_norm": 2.825162887573242, "learning_rate": 8.421510363820843e-07, "loss": 0.2435, "step": 3267 }, { "epoch": 0.1578972797990047, "grad_norm": 2.323317050933838, "learning_rate": 8.421027202009953e-07, "loss": 0.3645, "step": 3268 }, { "epoch": 0.15794559598009372, "grad_norm": 3.0447139739990234, "learning_rate": 8.420544040199063e-07, "loss": 0.5466, "step": 3269 }, { "epoch": 0.15799391216118278, "grad_norm": 2.8159515857696533, "learning_rate": 8.420060878388173e-07, "loss": 0.4315, "step": 3270 }, { "epoch": 0.15804222834227183, "grad_norm": 5.682948589324951, "learning_rate": 8.419577716577281e-07, "loss": 0.3462, "step": 3271 }, { "epoch": 0.15809054452336088, "grad_norm": 16.35104751586914, "learning_rate": 8.41909455476639e-07, "loss": 0.3066, "step": 3272 }, { "epoch": 0.1581388607044499, "grad_norm": 2.3845489025115967, "learning_rate": 8.4186113929555e-07, "loss": 0.2372, "step": 3273 }, { "epoch": 0.15818717688553896, "grad_norm": 2.8375232219696045, "learning_rate": 8.41812823114461e-07, "loss": 0.3226, "step": 3274 }, { "epoch": 0.158235493066628, "grad_norm": 2.6325230598449707, "learning_rate": 8.41764506933372e-07, "loss": 0.2642, "step": 3275 }, { "epoch": 0.15828380924771707, "grad_norm": 2.4418559074401855, "learning_rate": 8.41716190752283e-07, "loss": 0.2525, "step": 3276 }, { "epoch": 0.15833212542880612, "grad_norm": 2.2266368865966797, "learning_rate": 8.416678745711938e-07, "loss": 0.217, "step": 3277 }, { "epoch": 0.15838044160989515, "grad_norm": 6.1724066734313965, "learning_rate": 8.416195583901048e-07, "loss": 0.5695, "step": 3278 }, { "epoch": 0.1584287577909842, "grad_norm": 1.8496805429458618, "learning_rate": 8.415712422090157e-07, "loss": 0.2358, "step": 3279 }, { "epoch": 0.15847707397207325, "grad_norm": 3.5173864364624023, "learning_rate": 8.415229260279267e-07, "loss": 0.3373, "step": 3280 }, { "epoch": 0.1585253901531623, "grad_norm": 2.5983989238739014, "learning_rate": 8.414746098468377e-07, "loss": 0.3361, "step": 3281 }, { "epoch": 0.15857370633425133, "grad_norm": 1.7476211786270142, "learning_rate": 8.414262936657486e-07, "loss": 0.1936, "step": 3282 }, { "epoch": 0.15862202251534038, "grad_norm": 2.8285250663757324, "learning_rate": 8.413779774846596e-07, "loss": 0.3349, "step": 3283 }, { "epoch": 0.15867033869642944, "grad_norm": 6.015717506408691, "learning_rate": 8.413296613035705e-07, "loss": 0.3623, "step": 3284 }, { "epoch": 0.1587186548775185, "grad_norm": 2.9917755126953125, "learning_rate": 8.412813451224815e-07, "loss": 0.3741, "step": 3285 }, { "epoch": 0.15876697105860751, "grad_norm": 1.7594534158706665, "learning_rate": 8.412330289413925e-07, "loss": 0.1933, "step": 3286 }, { "epoch": 0.15881528723969657, "grad_norm": 3.158736228942871, "learning_rate": 8.411847127603034e-07, "loss": 0.3746, "step": 3287 }, { "epoch": 0.15886360342078562, "grad_norm": 3.7164418697357178, "learning_rate": 8.411363965792143e-07, "loss": 0.2861, "step": 3288 }, { "epoch": 0.15891191960187467, "grad_norm": 2.2365448474884033, "learning_rate": 8.410880803981253e-07, "loss": 0.1756, "step": 3289 }, { "epoch": 0.15896023578296373, "grad_norm": 2.670175313949585, "learning_rate": 8.410397642170362e-07, "loss": 0.2925, "step": 3290 }, { "epoch": 0.15900855196405275, "grad_norm": 2.8378961086273193, "learning_rate": 8.409914480359472e-07, "loss": 0.3399, "step": 3291 }, { "epoch": 0.1590568681451418, "grad_norm": 2.545365333557129, "learning_rate": 8.409431318548581e-07, "loss": 0.3435, "step": 3292 }, { "epoch": 0.15910518432623086, "grad_norm": 2.565133571624756, "learning_rate": 8.408948156737691e-07, "loss": 0.3386, "step": 3293 }, { "epoch": 0.1591535005073199, "grad_norm": 2.742326498031616, "learning_rate": 8.408464994926801e-07, "loss": 0.3241, "step": 3294 }, { "epoch": 0.15920181668840894, "grad_norm": 4.215724945068359, "learning_rate": 8.407981833115911e-07, "loss": 0.4267, "step": 3295 }, { "epoch": 0.159250132869498, "grad_norm": 4.1948442459106445, "learning_rate": 8.40749867130502e-07, "loss": 0.4472, "step": 3296 }, { "epoch": 0.15929844905058704, "grad_norm": 8.546067237854004, "learning_rate": 8.407015509494129e-07, "loss": 0.2575, "step": 3297 }, { "epoch": 0.1593467652316761, "grad_norm": 3.1572253704071045, "learning_rate": 8.406532347683238e-07, "loss": 0.4602, "step": 3298 }, { "epoch": 0.15939508141276512, "grad_norm": 2.233830213546753, "learning_rate": 8.406049185872348e-07, "loss": 0.3455, "step": 3299 }, { "epoch": 0.15944339759385417, "grad_norm": 12.560125350952148, "learning_rate": 8.405566024061458e-07, "loss": 0.2696, "step": 3300 }, { "epoch": 0.15949171377494323, "grad_norm": 2.6561970710754395, "learning_rate": 8.405082862250568e-07, "loss": 0.2842, "step": 3301 }, { "epoch": 0.15954002995603228, "grad_norm": 3.1062545776367188, "learning_rate": 8.404599700439678e-07, "loss": 0.3022, "step": 3302 }, { "epoch": 0.15958834613712133, "grad_norm": 2.680868625640869, "learning_rate": 8.404116538628786e-07, "loss": 0.3167, "step": 3303 }, { "epoch": 0.15963666231821036, "grad_norm": 4.4493865966796875, "learning_rate": 8.403633376817895e-07, "loss": 0.2379, "step": 3304 }, { "epoch": 0.1596849784992994, "grad_norm": 3.07531476020813, "learning_rate": 8.403150215007005e-07, "loss": 0.3188, "step": 3305 }, { "epoch": 0.15973329468038847, "grad_norm": 2.596153974533081, "learning_rate": 8.402667053196115e-07, "loss": 0.3422, "step": 3306 }, { "epoch": 0.15978161086147752, "grad_norm": 5.05538272857666, "learning_rate": 8.402183891385225e-07, "loss": 0.1714, "step": 3307 }, { "epoch": 0.15982992704256654, "grad_norm": 3.206554889678955, "learning_rate": 8.401700729574334e-07, "loss": 0.378, "step": 3308 }, { "epoch": 0.1598782432236556, "grad_norm": 2.6412558555603027, "learning_rate": 8.401217567763443e-07, "loss": 0.2581, "step": 3309 }, { "epoch": 0.15992655940474465, "grad_norm": 2.3201019763946533, "learning_rate": 8.400734405952553e-07, "loss": 0.2457, "step": 3310 }, { "epoch": 0.1599748755858337, "grad_norm": 2.6992435455322266, "learning_rate": 8.400251244141663e-07, "loss": 0.2123, "step": 3311 }, { "epoch": 0.16002319176692273, "grad_norm": 6.162845611572266, "learning_rate": 8.399768082330773e-07, "loss": 0.3424, "step": 3312 }, { "epoch": 0.16007150794801178, "grad_norm": 3.19161057472229, "learning_rate": 8.399284920519881e-07, "loss": 0.2694, "step": 3313 }, { "epoch": 0.16011982412910084, "grad_norm": 1.5206178426742554, "learning_rate": 8.398801758708991e-07, "loss": 0.161, "step": 3314 }, { "epoch": 0.1601681403101899, "grad_norm": 2.580728530883789, "learning_rate": 8.398318596898101e-07, "loss": 0.2655, "step": 3315 }, { "epoch": 0.16021645649127894, "grad_norm": 2.8521416187286377, "learning_rate": 8.39783543508721e-07, "loss": 0.3023, "step": 3316 }, { "epoch": 0.16026477267236797, "grad_norm": 4.078502178192139, "learning_rate": 8.39735227327632e-07, "loss": 0.2963, "step": 3317 }, { "epoch": 0.16031308885345702, "grad_norm": 3.1510262489318848, "learning_rate": 8.396869111465429e-07, "loss": 0.1988, "step": 3318 }, { "epoch": 0.16036140503454607, "grad_norm": 1.788498044013977, "learning_rate": 8.396385949654539e-07, "loss": 0.21, "step": 3319 }, { "epoch": 0.16040972121563513, "grad_norm": 2.6299915313720703, "learning_rate": 8.395902787843649e-07, "loss": 0.3221, "step": 3320 }, { "epoch": 0.16045803739672415, "grad_norm": 7.831316947937012, "learning_rate": 8.395419626032759e-07, "loss": 0.3493, "step": 3321 }, { "epoch": 0.1605063535778132, "grad_norm": 4.35728120803833, "learning_rate": 8.394936464221867e-07, "loss": 0.3949, "step": 3322 }, { "epoch": 0.16055466975890226, "grad_norm": 3.823399305343628, "learning_rate": 8.394453302410977e-07, "loss": 0.3794, "step": 3323 }, { "epoch": 0.1606029859399913, "grad_norm": 3.247784376144409, "learning_rate": 8.393970140600086e-07, "loss": 0.2183, "step": 3324 }, { "epoch": 0.16065130212108034, "grad_norm": 2.575212240219116, "learning_rate": 8.393486978789196e-07, "loss": 0.3555, "step": 3325 }, { "epoch": 0.1606996183021694, "grad_norm": 3.749835968017578, "learning_rate": 8.393003816978306e-07, "loss": 0.4271, "step": 3326 }, { "epoch": 0.16074793448325844, "grad_norm": 5.410661697387695, "learning_rate": 8.392520655167416e-07, "loss": 0.3933, "step": 3327 }, { "epoch": 0.1607962506643475, "grad_norm": 2.689578056335449, "learning_rate": 8.392037493356526e-07, "loss": 0.2905, "step": 3328 }, { "epoch": 0.16084456684543655, "grad_norm": 2.8920445442199707, "learning_rate": 8.391554331545634e-07, "loss": 0.3668, "step": 3329 }, { "epoch": 0.16089288302652557, "grad_norm": 4.538488864898682, "learning_rate": 8.391071169734743e-07, "loss": 0.4584, "step": 3330 }, { "epoch": 0.16094119920761463, "grad_norm": 22.121383666992188, "learning_rate": 8.390588007923853e-07, "loss": 0.2712, "step": 3331 }, { "epoch": 0.16098951538870368, "grad_norm": 3.739584445953369, "learning_rate": 8.390104846112963e-07, "loss": 0.166, "step": 3332 }, { "epoch": 0.16103783156979273, "grad_norm": 2.2329723834991455, "learning_rate": 8.389621684302073e-07, "loss": 0.2434, "step": 3333 }, { "epoch": 0.16108614775088176, "grad_norm": 2.9919509887695312, "learning_rate": 8.389138522491182e-07, "loss": 0.4252, "step": 3334 }, { "epoch": 0.1611344639319708, "grad_norm": 3.241521120071411, "learning_rate": 8.388655360680291e-07, "loss": 0.3297, "step": 3335 }, { "epoch": 0.16118278011305986, "grad_norm": 2.560772180557251, "learning_rate": 8.388172198869401e-07, "loss": 0.1857, "step": 3336 }, { "epoch": 0.16123109629414892, "grad_norm": 3.2573297023773193, "learning_rate": 8.387689037058511e-07, "loss": 0.2941, "step": 3337 }, { "epoch": 0.16127941247523797, "grad_norm": 2.972585439682007, "learning_rate": 8.38720587524762e-07, "loss": 0.3528, "step": 3338 }, { "epoch": 0.161327728656327, "grad_norm": 3.888270616531372, "learning_rate": 8.386722713436729e-07, "loss": 0.3957, "step": 3339 }, { "epoch": 0.16137604483741605, "grad_norm": 1.3750718832015991, "learning_rate": 8.386239551625839e-07, "loss": 0.1613, "step": 3340 }, { "epoch": 0.1614243610185051, "grad_norm": 2.5958781242370605, "learning_rate": 8.385756389814948e-07, "loss": 0.3124, "step": 3341 }, { "epoch": 0.16147267719959416, "grad_norm": 4.3174052238464355, "learning_rate": 8.385273228004058e-07, "loss": 0.2768, "step": 3342 }, { "epoch": 0.16152099338068318, "grad_norm": 5.123873710632324, "learning_rate": 8.384790066193168e-07, "loss": 0.2894, "step": 3343 }, { "epoch": 0.16156930956177223, "grad_norm": 4.822049617767334, "learning_rate": 8.384306904382277e-07, "loss": 0.3429, "step": 3344 }, { "epoch": 0.1616176257428613, "grad_norm": 2.5688347816467285, "learning_rate": 8.383823742571387e-07, "loss": 0.314, "step": 3345 }, { "epoch": 0.16166594192395034, "grad_norm": 2.8890488147735596, "learning_rate": 8.383340580760497e-07, "loss": 0.3635, "step": 3346 }, { "epoch": 0.16171425810503937, "grad_norm": 2.2494089603424072, "learning_rate": 8.382857418949606e-07, "loss": 0.2907, "step": 3347 }, { "epoch": 0.16176257428612842, "grad_norm": 3.851052761077881, "learning_rate": 8.382374257138715e-07, "loss": 0.3604, "step": 3348 }, { "epoch": 0.16181089046721747, "grad_norm": 2.24299693107605, "learning_rate": 8.381891095327825e-07, "loss": 0.2594, "step": 3349 }, { "epoch": 0.16185920664830653, "grad_norm": 3.2355587482452393, "learning_rate": 8.381407933516934e-07, "loss": 0.3388, "step": 3350 }, { "epoch": 0.16190752282939558, "grad_norm": 3.220184564590454, "learning_rate": 8.380924771706044e-07, "loss": 0.4081, "step": 3351 }, { "epoch": 0.1619558390104846, "grad_norm": 2.5059380531311035, "learning_rate": 8.380441609895154e-07, "loss": 0.25, "step": 3352 }, { "epoch": 0.16200415519157366, "grad_norm": 3.127518892288208, "learning_rate": 8.379958448084264e-07, "loss": 0.4106, "step": 3353 }, { "epoch": 0.1620524713726627, "grad_norm": 2.6583523750305176, "learning_rate": 8.379475286273373e-07, "loss": 0.3084, "step": 3354 }, { "epoch": 0.16210078755375176, "grad_norm": 4.599057197570801, "learning_rate": 8.378992124462481e-07, "loss": 0.3452, "step": 3355 }, { "epoch": 0.1621491037348408, "grad_norm": 6.6133646965026855, "learning_rate": 8.378508962651591e-07, "loss": 0.3254, "step": 3356 }, { "epoch": 0.16219741991592984, "grad_norm": 1.8068517446517944, "learning_rate": 8.378025800840701e-07, "loss": 0.1796, "step": 3357 }, { "epoch": 0.1622457360970189, "grad_norm": 2.5171167850494385, "learning_rate": 8.377542639029811e-07, "loss": 0.2984, "step": 3358 }, { "epoch": 0.16229405227810795, "grad_norm": 7.2969255447387695, "learning_rate": 8.377059477218921e-07, "loss": 0.2504, "step": 3359 }, { "epoch": 0.16234236845919697, "grad_norm": 4.661047458648682, "learning_rate": 8.376576315408029e-07, "loss": 0.335, "step": 3360 }, { "epoch": 0.16239068464028603, "grad_norm": 2.546750783920288, "learning_rate": 8.376093153597139e-07, "loss": 0.2779, "step": 3361 }, { "epoch": 0.16243900082137508, "grad_norm": 5.079381465911865, "learning_rate": 8.375609991786249e-07, "loss": 0.4286, "step": 3362 }, { "epoch": 0.16248731700246413, "grad_norm": 2.3655142784118652, "learning_rate": 8.375126829975359e-07, "loss": 0.2324, "step": 3363 }, { "epoch": 0.16253563318355319, "grad_norm": 2.182644844055176, "learning_rate": 8.374643668164468e-07, "loss": 0.2735, "step": 3364 }, { "epoch": 0.1625839493646422, "grad_norm": 2.538677930831909, "learning_rate": 8.374160506353577e-07, "loss": 0.3373, "step": 3365 }, { "epoch": 0.16263226554573126, "grad_norm": 3.181373357772827, "learning_rate": 8.373677344542687e-07, "loss": 0.3514, "step": 3366 }, { "epoch": 0.16268058172682032, "grad_norm": 2.1524813175201416, "learning_rate": 8.373194182731796e-07, "loss": 0.2759, "step": 3367 }, { "epoch": 0.16272889790790937, "grad_norm": 2.540015459060669, "learning_rate": 8.372711020920906e-07, "loss": 0.245, "step": 3368 }, { "epoch": 0.1627772140889984, "grad_norm": 2.80322265625, "learning_rate": 8.372227859110016e-07, "loss": 0.3102, "step": 3369 }, { "epoch": 0.16282553027008745, "grad_norm": 2.5197103023529053, "learning_rate": 8.371744697299125e-07, "loss": 0.2898, "step": 3370 }, { "epoch": 0.1628738464511765, "grad_norm": 5.82328462600708, "learning_rate": 8.371261535488235e-07, "loss": 0.4232, "step": 3371 }, { "epoch": 0.16292216263226555, "grad_norm": 2.6438565254211426, "learning_rate": 8.370778373677344e-07, "loss": 0.3545, "step": 3372 }, { "epoch": 0.16297047881335458, "grad_norm": 2.337533473968506, "learning_rate": 8.370295211866453e-07, "loss": 0.2545, "step": 3373 }, { "epoch": 0.16301879499444363, "grad_norm": 3.1469106674194336, "learning_rate": 8.369812050055563e-07, "loss": 0.4426, "step": 3374 }, { "epoch": 0.1630671111755327, "grad_norm": 2.646008014678955, "learning_rate": 8.369328888244673e-07, "loss": 0.3235, "step": 3375 }, { "epoch": 0.16311542735662174, "grad_norm": 3.6069486141204834, "learning_rate": 8.368845726433782e-07, "loss": 0.2643, "step": 3376 }, { "epoch": 0.1631637435377108, "grad_norm": 2.190826416015625, "learning_rate": 8.368362564622892e-07, "loss": 0.2589, "step": 3377 }, { "epoch": 0.16321205971879982, "grad_norm": 3.2881383895874023, "learning_rate": 8.367879402812002e-07, "loss": 0.4375, "step": 3378 }, { "epoch": 0.16326037589988887, "grad_norm": 2.892594337463379, "learning_rate": 8.367396241001112e-07, "loss": 0.4074, "step": 3379 }, { "epoch": 0.16330869208097792, "grad_norm": 3.196932077407837, "learning_rate": 8.36691307919022e-07, "loss": 0.1807, "step": 3380 }, { "epoch": 0.16335700826206698, "grad_norm": 2.796661615371704, "learning_rate": 8.366429917379329e-07, "loss": 0.2887, "step": 3381 }, { "epoch": 0.163405324443156, "grad_norm": 2.630265235900879, "learning_rate": 8.365946755568439e-07, "loss": 0.3, "step": 3382 }, { "epoch": 0.16345364062424506, "grad_norm": 6.117660999298096, "learning_rate": 8.365463593757549e-07, "loss": 0.4453, "step": 3383 }, { "epoch": 0.1635019568053341, "grad_norm": 2.185359239578247, "learning_rate": 8.364980431946659e-07, "loss": 0.2661, "step": 3384 }, { "epoch": 0.16355027298642316, "grad_norm": 2.9031858444213867, "learning_rate": 8.364497270135769e-07, "loss": 0.2858, "step": 3385 }, { "epoch": 0.1635985891675122, "grad_norm": 2.687316417694092, "learning_rate": 8.364014108324877e-07, "loss": 0.3336, "step": 3386 }, { "epoch": 0.16364690534860124, "grad_norm": 2.3176779747009277, "learning_rate": 8.363530946513987e-07, "loss": 0.3046, "step": 3387 }, { "epoch": 0.1636952215296903, "grad_norm": 2.993481159210205, "learning_rate": 8.363047784703097e-07, "loss": 0.2324, "step": 3388 }, { "epoch": 0.16374353771077935, "grad_norm": 1.9159289598464966, "learning_rate": 8.362564622892206e-07, "loss": 0.223, "step": 3389 }, { "epoch": 0.1637918538918684, "grad_norm": 2.915132999420166, "learning_rate": 8.362081461081316e-07, "loss": 0.3397, "step": 3390 }, { "epoch": 0.16384017007295743, "grad_norm": 2.489241361618042, "learning_rate": 8.361598299270425e-07, "loss": 0.3167, "step": 3391 }, { "epoch": 0.16388848625404648, "grad_norm": 1.9052836894989014, "learning_rate": 8.361115137459534e-07, "loss": 0.2308, "step": 3392 }, { "epoch": 0.16393680243513553, "grad_norm": 2.6144042015075684, "learning_rate": 8.360631975648644e-07, "loss": 0.2056, "step": 3393 }, { "epoch": 0.16398511861622458, "grad_norm": 4.059875011444092, "learning_rate": 8.360148813837754e-07, "loss": 0.3545, "step": 3394 }, { "epoch": 0.1640334347973136, "grad_norm": 3.051192045211792, "learning_rate": 8.359665652026864e-07, "loss": 0.3414, "step": 3395 }, { "epoch": 0.16408175097840266, "grad_norm": 7.180588722229004, "learning_rate": 8.359182490215973e-07, "loss": 0.4198, "step": 3396 }, { "epoch": 0.16413006715949172, "grad_norm": 1.967498779296875, "learning_rate": 8.358699328405083e-07, "loss": 0.2059, "step": 3397 }, { "epoch": 0.16417838334058077, "grad_norm": 1.4356975555419922, "learning_rate": 8.358216166594192e-07, "loss": 0.1834, "step": 3398 }, { "epoch": 0.1642266995216698, "grad_norm": 3.041815757751465, "learning_rate": 8.357733004783301e-07, "loss": 0.3831, "step": 3399 }, { "epoch": 0.16427501570275885, "grad_norm": 8.244634628295898, "learning_rate": 8.357249842972411e-07, "loss": 0.2834, "step": 3400 }, { "epoch": 0.1643233318838479, "grad_norm": 3.110018491744995, "learning_rate": 8.356766681161521e-07, "loss": 0.3143, "step": 3401 }, { "epoch": 0.16437164806493695, "grad_norm": 3.357659101486206, "learning_rate": 8.35628351935063e-07, "loss": 0.3353, "step": 3402 }, { "epoch": 0.164419964246026, "grad_norm": 2.486781120300293, "learning_rate": 8.35580035753974e-07, "loss": 0.2548, "step": 3403 }, { "epoch": 0.16446828042711503, "grad_norm": 2.6323113441467285, "learning_rate": 8.35531719572885e-07, "loss": 0.4101, "step": 3404 }, { "epoch": 0.16451659660820409, "grad_norm": 2.488396167755127, "learning_rate": 8.354834033917959e-07, "loss": 0.2801, "step": 3405 }, { "epoch": 0.16456491278929314, "grad_norm": 6.781274795532227, "learning_rate": 8.354350872107068e-07, "loss": 0.3179, "step": 3406 }, { "epoch": 0.1646132289703822, "grad_norm": 2.6539764404296875, "learning_rate": 8.353867710296177e-07, "loss": 0.3008, "step": 3407 }, { "epoch": 0.16466154515147122, "grad_norm": 2.1804308891296387, "learning_rate": 8.353384548485287e-07, "loss": 0.3151, "step": 3408 }, { "epoch": 0.16470986133256027, "grad_norm": 2.178297281265259, "learning_rate": 8.352901386674397e-07, "loss": 0.2747, "step": 3409 }, { "epoch": 0.16475817751364932, "grad_norm": 9.809137344360352, "learning_rate": 8.352418224863507e-07, "loss": 0.2998, "step": 3410 }, { "epoch": 0.16480649369473838, "grad_norm": 2.6138792037963867, "learning_rate": 8.351935063052617e-07, "loss": 0.3986, "step": 3411 }, { "epoch": 0.1648548098758274, "grad_norm": 2.4961066246032715, "learning_rate": 8.351451901241725e-07, "loss": 0.2692, "step": 3412 }, { "epoch": 0.16490312605691645, "grad_norm": 3.0842182636260986, "learning_rate": 8.350968739430835e-07, "loss": 0.3119, "step": 3413 }, { "epoch": 0.1649514422380055, "grad_norm": 2.502002239227295, "learning_rate": 8.350485577619944e-07, "loss": 0.3132, "step": 3414 }, { "epoch": 0.16499975841909456, "grad_norm": 2.291489601135254, "learning_rate": 8.350002415809054e-07, "loss": 0.2484, "step": 3415 }, { "epoch": 0.16504807460018361, "grad_norm": 2.386451482772827, "learning_rate": 8.349519253998164e-07, "loss": 0.2979, "step": 3416 }, { "epoch": 0.16509639078127264, "grad_norm": 2.689938545227051, "learning_rate": 8.349036092187273e-07, "loss": 0.3359, "step": 3417 }, { "epoch": 0.1651447069623617, "grad_norm": 2.1149837970733643, "learning_rate": 8.348552930376382e-07, "loss": 0.2428, "step": 3418 }, { "epoch": 0.16519302314345075, "grad_norm": 2.142975330352783, "learning_rate": 8.348069768565492e-07, "loss": 0.223, "step": 3419 }, { "epoch": 0.1652413393245398, "grad_norm": 2.216735363006592, "learning_rate": 8.347586606754602e-07, "loss": 0.247, "step": 3420 }, { "epoch": 0.16528965550562882, "grad_norm": 2.0581514835357666, "learning_rate": 8.347103444943712e-07, "loss": 0.313, "step": 3421 }, { "epoch": 0.16533797168671788, "grad_norm": 2.3875608444213867, "learning_rate": 8.34662028313282e-07, "loss": 0.2623, "step": 3422 }, { "epoch": 0.16538628786780693, "grad_norm": 2.626878261566162, "learning_rate": 8.34613712132193e-07, "loss": 0.2947, "step": 3423 }, { "epoch": 0.16543460404889598, "grad_norm": 2.906229019165039, "learning_rate": 8.345653959511039e-07, "loss": 0.2839, "step": 3424 }, { "epoch": 0.165482920229985, "grad_norm": 5.796820163726807, "learning_rate": 8.345170797700149e-07, "loss": 0.2502, "step": 3425 }, { "epoch": 0.16553123641107406, "grad_norm": 2.6782917976379395, "learning_rate": 8.344687635889259e-07, "loss": 0.4022, "step": 3426 }, { "epoch": 0.16557955259216312, "grad_norm": 3.7991020679473877, "learning_rate": 8.344204474078368e-07, "loss": 0.214, "step": 3427 }, { "epoch": 0.16562786877325217, "grad_norm": 2.71598744392395, "learning_rate": 8.343721312267478e-07, "loss": 0.2991, "step": 3428 }, { "epoch": 0.16567618495434122, "grad_norm": 6.987534523010254, "learning_rate": 8.343238150456588e-07, "loss": 0.3256, "step": 3429 }, { "epoch": 0.16572450113543025, "grad_norm": 2.454503297805786, "learning_rate": 8.342754988645698e-07, "loss": 0.2633, "step": 3430 }, { "epoch": 0.1657728173165193, "grad_norm": 2.461951494216919, "learning_rate": 8.342271826834806e-07, "loss": 0.2877, "step": 3431 }, { "epoch": 0.16582113349760835, "grad_norm": 3.7183542251586914, "learning_rate": 8.341788665023916e-07, "loss": 0.3504, "step": 3432 }, { "epoch": 0.1658694496786974, "grad_norm": 2.9665441513061523, "learning_rate": 8.341305503213025e-07, "loss": 0.3695, "step": 3433 }, { "epoch": 0.16591776585978643, "grad_norm": 1.7409770488739014, "learning_rate": 8.340822341402135e-07, "loss": 0.2071, "step": 3434 }, { "epoch": 0.16596608204087548, "grad_norm": 2.4002809524536133, "learning_rate": 8.340339179591245e-07, "loss": 0.2075, "step": 3435 }, { "epoch": 0.16601439822196454, "grad_norm": 2.0323801040649414, "learning_rate": 8.339856017780355e-07, "loss": 0.1494, "step": 3436 }, { "epoch": 0.1660627144030536, "grad_norm": 13.177261352539062, "learning_rate": 8.339372855969464e-07, "loss": 0.1969, "step": 3437 }, { "epoch": 0.16611103058414262, "grad_norm": 2.2387571334838867, "learning_rate": 8.338889694158573e-07, "loss": 0.3055, "step": 3438 }, { "epoch": 0.16615934676523167, "grad_norm": 2.927672863006592, "learning_rate": 8.338406532347683e-07, "loss": 0.4239, "step": 3439 }, { "epoch": 0.16620766294632072, "grad_norm": 2.340986967086792, "learning_rate": 8.337923370536792e-07, "loss": 0.2528, "step": 3440 }, { "epoch": 0.16625597912740978, "grad_norm": 2.41237473487854, "learning_rate": 8.337440208725902e-07, "loss": 0.2699, "step": 3441 }, { "epoch": 0.16630429530849883, "grad_norm": 2.1599886417388916, "learning_rate": 8.336957046915012e-07, "loss": 0.2316, "step": 3442 }, { "epoch": 0.16635261148958785, "grad_norm": 4.2875657081604, "learning_rate": 8.33647388510412e-07, "loss": 0.3604, "step": 3443 }, { "epoch": 0.1664009276706769, "grad_norm": 4.98173713684082, "learning_rate": 8.33599072329323e-07, "loss": 0.3258, "step": 3444 }, { "epoch": 0.16644924385176596, "grad_norm": 3.0751779079437256, "learning_rate": 8.33550756148234e-07, "loss": 0.3415, "step": 3445 }, { "epoch": 0.166497560032855, "grad_norm": 8.521737098693848, "learning_rate": 8.33502439967145e-07, "loss": 0.3648, "step": 3446 }, { "epoch": 0.16654587621394404, "grad_norm": 3.1813273429870605, "learning_rate": 8.33454123786056e-07, "loss": 0.326, "step": 3447 }, { "epoch": 0.1665941923950331, "grad_norm": 2.629509687423706, "learning_rate": 8.334058076049668e-07, "loss": 0.2244, "step": 3448 }, { "epoch": 0.16664250857612214, "grad_norm": 2.9536848068237305, "learning_rate": 8.333574914238778e-07, "loss": 0.2759, "step": 3449 }, { "epoch": 0.1666908247572112, "grad_norm": 1.3855092525482178, "learning_rate": 8.333091752427887e-07, "loss": 0.1274, "step": 3450 }, { "epoch": 0.16673914093830022, "grad_norm": 2.2837777137756348, "learning_rate": 8.332608590616997e-07, "loss": 0.2595, "step": 3451 }, { "epoch": 0.16678745711938928, "grad_norm": 2.153285503387451, "learning_rate": 8.332125428806107e-07, "loss": 0.2784, "step": 3452 }, { "epoch": 0.16683577330047833, "grad_norm": 4.387845039367676, "learning_rate": 8.331642266995216e-07, "loss": 0.4534, "step": 3453 }, { "epoch": 0.16688408948156738, "grad_norm": 3.028674840927124, "learning_rate": 8.331159105184326e-07, "loss": 0.2707, "step": 3454 }, { "epoch": 0.16693240566265644, "grad_norm": 2.471142053604126, "learning_rate": 8.330675943373436e-07, "loss": 0.2507, "step": 3455 }, { "epoch": 0.16698072184374546, "grad_norm": 2.2501490116119385, "learning_rate": 8.330192781562544e-07, "loss": 0.2784, "step": 3456 }, { "epoch": 0.16702903802483451, "grad_norm": 2.183349132537842, "learning_rate": 8.329709619751654e-07, "loss": 0.3041, "step": 3457 }, { "epoch": 0.16707735420592357, "grad_norm": 4.004702568054199, "learning_rate": 8.329226457940764e-07, "loss": 0.3118, "step": 3458 }, { "epoch": 0.16712567038701262, "grad_norm": 2.507567882537842, "learning_rate": 8.328743296129873e-07, "loss": 0.3245, "step": 3459 }, { "epoch": 0.16717398656810165, "grad_norm": 2.8189597129821777, "learning_rate": 8.328260134318983e-07, "loss": 0.272, "step": 3460 }, { "epoch": 0.1672223027491907, "grad_norm": 2.63498592376709, "learning_rate": 8.327776972508093e-07, "loss": 0.3498, "step": 3461 }, { "epoch": 0.16727061893027975, "grad_norm": 2.9059407711029053, "learning_rate": 8.327293810697203e-07, "loss": 0.352, "step": 3462 }, { "epoch": 0.1673189351113688, "grad_norm": 2.393362283706665, "learning_rate": 8.326810648886312e-07, "loss": 0.2672, "step": 3463 }, { "epoch": 0.16736725129245783, "grad_norm": 4.56162166595459, "learning_rate": 8.326327487075421e-07, "loss": 0.3317, "step": 3464 }, { "epoch": 0.16741556747354688, "grad_norm": 3.0763015747070312, "learning_rate": 8.32584432526453e-07, "loss": 0.3196, "step": 3465 }, { "epoch": 0.16746388365463594, "grad_norm": 2.809502363204956, "learning_rate": 8.32536116345364e-07, "loss": 0.3417, "step": 3466 }, { "epoch": 0.167512199835725, "grad_norm": 2.059892416000366, "learning_rate": 8.32487800164275e-07, "loss": 0.2353, "step": 3467 }, { "epoch": 0.16756051601681404, "grad_norm": 4.386944770812988, "learning_rate": 8.32439483983186e-07, "loss": 0.2661, "step": 3468 }, { "epoch": 0.16760883219790307, "grad_norm": 2.9652199745178223, "learning_rate": 8.323911678020968e-07, "loss": 0.3034, "step": 3469 }, { "epoch": 0.16765714837899212, "grad_norm": 3.4833123683929443, "learning_rate": 8.323428516210078e-07, "loss": 0.3391, "step": 3470 }, { "epoch": 0.16770546456008117, "grad_norm": 2.466618776321411, "learning_rate": 8.322945354399188e-07, "loss": 0.2306, "step": 3471 }, { "epoch": 0.16775378074117023, "grad_norm": 2.800220489501953, "learning_rate": 8.322462192588298e-07, "loss": 0.3779, "step": 3472 }, { "epoch": 0.16780209692225925, "grad_norm": 3.1869683265686035, "learning_rate": 8.321979030777408e-07, "loss": 0.3651, "step": 3473 }, { "epoch": 0.1678504131033483, "grad_norm": 1.7608122825622559, "learning_rate": 8.321495868966516e-07, "loss": 0.2026, "step": 3474 }, { "epoch": 0.16789872928443736, "grad_norm": 2.7324273586273193, "learning_rate": 8.321012707155625e-07, "loss": 0.335, "step": 3475 }, { "epoch": 0.1679470454655264, "grad_norm": 2.0366551876068115, "learning_rate": 8.320529545344735e-07, "loss": 0.1944, "step": 3476 }, { "epoch": 0.16799536164661547, "grad_norm": 3.153719425201416, "learning_rate": 8.320046383533845e-07, "loss": 0.2109, "step": 3477 }, { "epoch": 0.1680436778277045, "grad_norm": 2.4738850593566895, "learning_rate": 8.319563221722955e-07, "loss": 0.3175, "step": 3478 }, { "epoch": 0.16809199400879354, "grad_norm": 2.3740506172180176, "learning_rate": 8.319080059912064e-07, "loss": 0.308, "step": 3479 }, { "epoch": 0.1681403101898826, "grad_norm": 3.9027915000915527, "learning_rate": 8.318596898101174e-07, "loss": 0.4074, "step": 3480 }, { "epoch": 0.16818862637097165, "grad_norm": 1.4579592943191528, "learning_rate": 8.318113736290284e-07, "loss": 0.1678, "step": 3481 }, { "epoch": 0.16823694255206068, "grad_norm": 2.063958168029785, "learning_rate": 8.317630574479392e-07, "loss": 0.2387, "step": 3482 }, { "epoch": 0.16828525873314973, "grad_norm": 4.583450794219971, "learning_rate": 8.317147412668502e-07, "loss": 0.3776, "step": 3483 }, { "epoch": 0.16833357491423878, "grad_norm": 3.4596095085144043, "learning_rate": 8.316664250857612e-07, "loss": 0.3657, "step": 3484 }, { "epoch": 0.16838189109532783, "grad_norm": 2.409858226776123, "learning_rate": 8.316181089046721e-07, "loss": 0.275, "step": 3485 }, { "epoch": 0.16843020727641686, "grad_norm": 3.6281745433807373, "learning_rate": 8.315697927235831e-07, "loss": 0.2059, "step": 3486 }, { "epoch": 0.1684785234575059, "grad_norm": 4.967909812927246, "learning_rate": 8.315214765424941e-07, "loss": 0.2626, "step": 3487 }, { "epoch": 0.16852683963859497, "grad_norm": 2.544814348220825, "learning_rate": 8.31473160361405e-07, "loss": 0.3302, "step": 3488 }, { "epoch": 0.16857515581968402, "grad_norm": 4.6687798500061035, "learning_rate": 8.31424844180316e-07, "loss": 0.2476, "step": 3489 }, { "epoch": 0.16862347200077307, "grad_norm": 2.4122824668884277, "learning_rate": 8.313765279992268e-07, "loss": 0.2804, "step": 3490 }, { "epoch": 0.1686717881818621, "grad_norm": 2.786842107772827, "learning_rate": 8.313282118181378e-07, "loss": 0.3463, "step": 3491 }, { "epoch": 0.16872010436295115, "grad_norm": 2.9230847358703613, "learning_rate": 8.312798956370488e-07, "loss": 0.413, "step": 3492 }, { "epoch": 0.1687684205440402, "grad_norm": 2.638122797012329, "learning_rate": 8.312315794559598e-07, "loss": 0.3134, "step": 3493 }, { "epoch": 0.16881673672512926, "grad_norm": 2.209691286087036, "learning_rate": 8.311832632748708e-07, "loss": 0.2462, "step": 3494 }, { "epoch": 0.16886505290621828, "grad_norm": 4.2897562980651855, "learning_rate": 8.311349470937816e-07, "loss": 0.4127, "step": 3495 }, { "epoch": 0.16891336908730734, "grad_norm": 6.1716461181640625, "learning_rate": 8.310866309126926e-07, "loss": 0.2894, "step": 3496 }, { "epoch": 0.1689616852683964, "grad_norm": 2.3993747234344482, "learning_rate": 8.310383147316036e-07, "loss": 0.315, "step": 3497 }, { "epoch": 0.16901000144948544, "grad_norm": 2.998141288757324, "learning_rate": 8.309899985505146e-07, "loss": 0.3825, "step": 3498 }, { "epoch": 0.16905831763057447, "grad_norm": 2.114403009414673, "learning_rate": 8.309416823694255e-07, "loss": 0.2483, "step": 3499 }, { "epoch": 0.16910663381166352, "grad_norm": 2.5387790203094482, "learning_rate": 8.308933661883364e-07, "loss": 0.285, "step": 3500 }, { "epoch": 0.16915494999275257, "grad_norm": 4.676947593688965, "learning_rate": 8.308450500072473e-07, "loss": 0.2545, "step": 3501 }, { "epoch": 0.16920326617384163, "grad_norm": 4.061861991882324, "learning_rate": 8.307967338261583e-07, "loss": 0.3784, "step": 3502 }, { "epoch": 0.16925158235493068, "grad_norm": 2.666887044906616, "learning_rate": 8.307484176450693e-07, "loss": 0.272, "step": 3503 }, { "epoch": 0.1692998985360197, "grad_norm": 8.700143814086914, "learning_rate": 8.307001014639803e-07, "loss": 0.2131, "step": 3504 }, { "epoch": 0.16934821471710876, "grad_norm": 3.2737691402435303, "learning_rate": 8.306517852828912e-07, "loss": 0.3434, "step": 3505 }, { "epoch": 0.1693965308981978, "grad_norm": 1.3213165998458862, "learning_rate": 8.306034691018022e-07, "loss": 0.1641, "step": 3506 }, { "epoch": 0.16944484707928686, "grad_norm": 11.567052841186523, "learning_rate": 8.305551529207132e-07, "loss": 0.1487, "step": 3507 }, { "epoch": 0.1694931632603759, "grad_norm": 3.1183557510375977, "learning_rate": 8.30506836739624e-07, "loss": 0.3431, "step": 3508 }, { "epoch": 0.16954147944146494, "grad_norm": 4.0604658126831055, "learning_rate": 8.30458520558535e-07, "loss": 0.3429, "step": 3509 }, { "epoch": 0.169589795622554, "grad_norm": 2.8671815395355225, "learning_rate": 8.30410204377446e-07, "loss": 0.3653, "step": 3510 }, { "epoch": 0.16963811180364305, "grad_norm": 4.771848201751709, "learning_rate": 8.303618881963569e-07, "loss": 0.2883, "step": 3511 }, { "epoch": 0.16968642798473207, "grad_norm": 8.330883979797363, "learning_rate": 8.303135720152679e-07, "loss": 0.4644, "step": 3512 }, { "epoch": 0.16973474416582113, "grad_norm": 3.25681209564209, "learning_rate": 8.302652558341789e-07, "loss": 0.3231, "step": 3513 }, { "epoch": 0.16978306034691018, "grad_norm": 2.2563443183898926, "learning_rate": 8.302169396530898e-07, "loss": 0.2797, "step": 3514 }, { "epoch": 0.16983137652799923, "grad_norm": 2.548814535140991, "learning_rate": 8.301686234720008e-07, "loss": 0.2463, "step": 3515 }, { "epoch": 0.1698796927090883, "grad_norm": 10.726638793945312, "learning_rate": 8.301203072909116e-07, "loss": 0.2214, "step": 3516 }, { "epoch": 0.1699280088901773, "grad_norm": 14.69083309173584, "learning_rate": 8.300719911098226e-07, "loss": 0.2283, "step": 3517 }, { "epoch": 0.16997632507126637, "grad_norm": 2.634155035018921, "learning_rate": 8.300236749287336e-07, "loss": 0.2679, "step": 3518 }, { "epoch": 0.17002464125235542, "grad_norm": 1.9909262657165527, "learning_rate": 8.299753587476446e-07, "loss": 0.2401, "step": 3519 }, { "epoch": 0.17007295743344447, "grad_norm": 2.4442057609558105, "learning_rate": 8.299270425665555e-07, "loss": 0.2199, "step": 3520 }, { "epoch": 0.1701212736145335, "grad_norm": 4.492969512939453, "learning_rate": 8.298787263854664e-07, "loss": 0.2952, "step": 3521 }, { "epoch": 0.17016958979562255, "grad_norm": 2.2330703735351562, "learning_rate": 8.298304102043774e-07, "loss": 0.3023, "step": 3522 }, { "epoch": 0.1702179059767116, "grad_norm": 2.476348638534546, "learning_rate": 8.297820940232884e-07, "loss": 0.2725, "step": 3523 }, { "epoch": 0.17026622215780066, "grad_norm": 2.39495587348938, "learning_rate": 8.297337778421993e-07, "loss": 0.201, "step": 3524 }, { "epoch": 0.17031453833888968, "grad_norm": 2.213697910308838, "learning_rate": 8.296854616611103e-07, "loss": 0.2027, "step": 3525 }, { "epoch": 0.17036285451997873, "grad_norm": 2.9842612743377686, "learning_rate": 8.296371454800212e-07, "loss": 0.2395, "step": 3526 }, { "epoch": 0.1704111707010678, "grad_norm": 10.314472198486328, "learning_rate": 8.295888292989321e-07, "loss": 0.4207, "step": 3527 }, { "epoch": 0.17045948688215684, "grad_norm": 7.8544511795043945, "learning_rate": 8.295405131178431e-07, "loss": 0.3481, "step": 3528 }, { "epoch": 0.1705078030632459, "grad_norm": 2.7493088245391846, "learning_rate": 8.294921969367541e-07, "loss": 0.3739, "step": 3529 }, { "epoch": 0.17055611924433492, "grad_norm": 9.076350212097168, "learning_rate": 8.294438807556651e-07, "loss": 0.2725, "step": 3530 }, { "epoch": 0.17060443542542397, "grad_norm": 3.387166976928711, "learning_rate": 8.29395564574576e-07, "loss": 0.2847, "step": 3531 }, { "epoch": 0.17065275160651303, "grad_norm": 2.0628414154052734, "learning_rate": 8.29347248393487e-07, "loss": 0.2316, "step": 3532 }, { "epoch": 0.17070106778760208, "grad_norm": 4.587332248687744, "learning_rate": 8.292989322123978e-07, "loss": 0.3752, "step": 3533 }, { "epoch": 0.1707493839686911, "grad_norm": 2.719085931777954, "learning_rate": 8.292506160313088e-07, "loss": 0.2956, "step": 3534 }, { "epoch": 0.17079770014978016, "grad_norm": 3.318946361541748, "learning_rate": 8.292022998502198e-07, "loss": 0.309, "step": 3535 }, { "epoch": 0.1708460163308692, "grad_norm": 2.7593672275543213, "learning_rate": 8.291539836691308e-07, "loss": 0.3082, "step": 3536 }, { "epoch": 0.17089433251195826, "grad_norm": 4.651355266571045, "learning_rate": 8.291056674880417e-07, "loss": 0.2997, "step": 3537 }, { "epoch": 0.1709426486930473, "grad_norm": 2.8724944591522217, "learning_rate": 8.290573513069527e-07, "loss": 0.3754, "step": 3538 }, { "epoch": 0.17099096487413634, "grad_norm": 1.8857566118240356, "learning_rate": 8.290090351258637e-07, "loss": 0.2106, "step": 3539 }, { "epoch": 0.1710392810552254, "grad_norm": 3.362675428390503, "learning_rate": 8.289607189447746e-07, "loss": 0.3981, "step": 3540 }, { "epoch": 0.17108759723631445, "grad_norm": 2.5005156993865967, "learning_rate": 8.289124027636855e-07, "loss": 0.2895, "step": 3541 }, { "epoch": 0.1711359134174035, "grad_norm": 2.5121395587921143, "learning_rate": 8.288640865825964e-07, "loss": 0.2869, "step": 3542 }, { "epoch": 0.17118422959849253, "grad_norm": 2.873682737350464, "learning_rate": 8.288157704015074e-07, "loss": 0.3474, "step": 3543 }, { "epoch": 0.17123254577958158, "grad_norm": 3.110295295715332, "learning_rate": 8.287674542204184e-07, "loss": 0.225, "step": 3544 }, { "epoch": 0.17128086196067063, "grad_norm": 3.3059206008911133, "learning_rate": 8.287191380393294e-07, "loss": 0.2628, "step": 3545 }, { "epoch": 0.17132917814175969, "grad_norm": 3.167922258377075, "learning_rate": 8.286708218582403e-07, "loss": 0.3393, "step": 3546 }, { "epoch": 0.1713774943228487, "grad_norm": 3.8562209606170654, "learning_rate": 8.286225056771512e-07, "loss": 0.4332, "step": 3547 }, { "epoch": 0.17142581050393776, "grad_norm": 2.718677282333374, "learning_rate": 8.285741894960622e-07, "loss": 0.2942, "step": 3548 }, { "epoch": 0.17147412668502682, "grad_norm": 3.0562386512756348, "learning_rate": 8.285258733149732e-07, "loss": 0.3877, "step": 3549 }, { "epoch": 0.17152244286611587, "grad_norm": 2.671238422393799, "learning_rate": 8.284775571338841e-07, "loss": 0.3699, "step": 3550 }, { "epoch": 0.1715707590472049, "grad_norm": 2.279313087463379, "learning_rate": 8.284292409527951e-07, "loss": 0.2815, "step": 3551 }, { "epoch": 0.17161907522829395, "grad_norm": 6.419232368469238, "learning_rate": 8.283809247717059e-07, "loss": 0.3071, "step": 3552 }, { "epoch": 0.171667391409383, "grad_norm": 2.3787009716033936, "learning_rate": 8.283326085906169e-07, "loss": 0.2982, "step": 3553 }, { "epoch": 0.17171570759047206, "grad_norm": 2.6660773754119873, "learning_rate": 8.282842924095279e-07, "loss": 0.4455, "step": 3554 }, { "epoch": 0.1717640237715611, "grad_norm": 2.227128267288208, "learning_rate": 8.282359762284389e-07, "loss": 0.2951, "step": 3555 }, { "epoch": 0.17181233995265013, "grad_norm": 3.8857808113098145, "learning_rate": 8.281876600473499e-07, "loss": 0.2404, "step": 3556 }, { "epoch": 0.1718606561337392, "grad_norm": 2.6321656703948975, "learning_rate": 8.281393438662608e-07, "loss": 0.2832, "step": 3557 }, { "epoch": 0.17190897231482824, "grad_norm": 3.789214849472046, "learning_rate": 8.280910276851717e-07, "loss": 0.2945, "step": 3558 }, { "epoch": 0.1719572884959173, "grad_norm": 2.6507296562194824, "learning_rate": 8.280427115040826e-07, "loss": 0.3409, "step": 3559 }, { "epoch": 0.17200560467700632, "grad_norm": 2.3761396408081055, "learning_rate": 8.279943953229936e-07, "loss": 0.2804, "step": 3560 }, { "epoch": 0.17205392085809537, "grad_norm": 2.714001417160034, "learning_rate": 8.279460791419046e-07, "loss": 0.3561, "step": 3561 }, { "epoch": 0.17210223703918442, "grad_norm": 4.6012043952941895, "learning_rate": 8.278977629608156e-07, "loss": 0.2459, "step": 3562 }, { "epoch": 0.17215055322027348, "grad_norm": 3.483307361602783, "learning_rate": 8.278494467797265e-07, "loss": 0.4489, "step": 3563 }, { "epoch": 0.1721988694013625, "grad_norm": 2.3689708709716797, "learning_rate": 8.278011305986375e-07, "loss": 0.2652, "step": 3564 }, { "epoch": 0.17224718558245156, "grad_norm": 4.28718376159668, "learning_rate": 8.277528144175484e-07, "loss": 0.2763, "step": 3565 }, { "epoch": 0.1722955017635406, "grad_norm": 3.5802760124206543, "learning_rate": 8.277044982364593e-07, "loss": 0.4443, "step": 3566 }, { "epoch": 0.17234381794462966, "grad_norm": 2.292065382003784, "learning_rate": 8.276561820553703e-07, "loss": 0.2906, "step": 3567 }, { "epoch": 0.17239213412571872, "grad_norm": 2.6853084564208984, "learning_rate": 8.276078658742812e-07, "loss": 0.2135, "step": 3568 }, { "epoch": 0.17244045030680774, "grad_norm": 2.1738929748535156, "learning_rate": 8.275595496931922e-07, "loss": 0.238, "step": 3569 }, { "epoch": 0.1724887664878968, "grad_norm": 2.065455198287964, "learning_rate": 8.275112335121032e-07, "loss": 0.2414, "step": 3570 }, { "epoch": 0.17253708266898585, "grad_norm": 4.5786967277526855, "learning_rate": 8.274629173310142e-07, "loss": 0.5782, "step": 3571 }, { "epoch": 0.1725853988500749, "grad_norm": 2.340909242630005, "learning_rate": 8.274146011499251e-07, "loss": 0.2589, "step": 3572 }, { "epoch": 0.17263371503116393, "grad_norm": 2.508186101913452, "learning_rate": 8.27366284968836e-07, "loss": 0.2623, "step": 3573 }, { "epoch": 0.17268203121225298, "grad_norm": 5.232043266296387, "learning_rate": 8.27317968787747e-07, "loss": 0.4246, "step": 3574 }, { "epoch": 0.17273034739334203, "grad_norm": 2.9436285495758057, "learning_rate": 8.272696526066579e-07, "loss": 0.2638, "step": 3575 }, { "epoch": 0.17277866357443109, "grad_norm": 2.2888567447662354, "learning_rate": 8.272213364255689e-07, "loss": 0.2721, "step": 3576 }, { "epoch": 0.1728269797555201, "grad_norm": 4.194972515106201, "learning_rate": 8.271730202444799e-07, "loss": 0.5864, "step": 3577 }, { "epoch": 0.17287529593660916, "grad_norm": 1.9928656816482544, "learning_rate": 8.271247040633907e-07, "loss": 0.1812, "step": 3578 }, { "epoch": 0.17292361211769822, "grad_norm": 5.614376068115234, "learning_rate": 8.270763878823017e-07, "loss": 0.3487, "step": 3579 }, { "epoch": 0.17297192829878727, "grad_norm": 2.8579134941101074, "learning_rate": 8.270280717012127e-07, "loss": 0.2312, "step": 3580 }, { "epoch": 0.17302024447987632, "grad_norm": 3.325054883956909, "learning_rate": 8.269797555201237e-07, "loss": 0.3939, "step": 3581 }, { "epoch": 0.17306856066096535, "grad_norm": 9.48635482788086, "learning_rate": 8.269314393390347e-07, "loss": 0.2852, "step": 3582 }, { "epoch": 0.1731168768420544, "grad_norm": 3.382951498031616, "learning_rate": 8.268831231579455e-07, "loss": 0.3117, "step": 3583 }, { "epoch": 0.17316519302314345, "grad_norm": 3.3320751190185547, "learning_rate": 8.268348069768564e-07, "loss": 0.4934, "step": 3584 }, { "epoch": 0.1732135092042325, "grad_norm": 3.5637826919555664, "learning_rate": 8.267864907957674e-07, "loss": 0.4085, "step": 3585 }, { "epoch": 0.17326182538532153, "grad_norm": 2.597395658493042, "learning_rate": 8.267381746146784e-07, "loss": 0.3047, "step": 3586 }, { "epoch": 0.17331014156641059, "grad_norm": 3.1971912384033203, "learning_rate": 8.266898584335894e-07, "loss": 0.2566, "step": 3587 }, { "epoch": 0.17335845774749964, "grad_norm": 1.9783046245574951, "learning_rate": 8.266415422525004e-07, "loss": 0.2289, "step": 3588 }, { "epoch": 0.1734067739285887, "grad_norm": 3.0645906925201416, "learning_rate": 8.265932260714113e-07, "loss": 0.3727, "step": 3589 }, { "epoch": 0.17345509010967772, "grad_norm": 2.8311238288879395, "learning_rate": 8.265449098903223e-07, "loss": 0.4115, "step": 3590 }, { "epoch": 0.17350340629076677, "grad_norm": 3.939291000366211, "learning_rate": 8.264965937092332e-07, "loss": 0.342, "step": 3591 }, { "epoch": 0.17355172247185582, "grad_norm": 3.532984495162964, "learning_rate": 8.264482775281441e-07, "loss": 0.4054, "step": 3592 }, { "epoch": 0.17360003865294488, "grad_norm": 7.306441307067871, "learning_rate": 8.263999613470551e-07, "loss": 0.3819, "step": 3593 }, { "epoch": 0.17364835483403393, "grad_norm": 2.417454242706299, "learning_rate": 8.26351645165966e-07, "loss": 0.2774, "step": 3594 }, { "epoch": 0.17369667101512296, "grad_norm": 3.750676393508911, "learning_rate": 8.26303328984877e-07, "loss": 0.3118, "step": 3595 }, { "epoch": 0.173744987196212, "grad_norm": 2.8699800968170166, "learning_rate": 8.26255012803788e-07, "loss": 0.4141, "step": 3596 }, { "epoch": 0.17379330337730106, "grad_norm": 16.657411575317383, "learning_rate": 8.262066966226989e-07, "loss": 0.6294, "step": 3597 }, { "epoch": 0.17384161955839011, "grad_norm": 2.9966142177581787, "learning_rate": 8.261583804416099e-07, "loss": 0.3965, "step": 3598 }, { "epoch": 0.17388993573947914, "grad_norm": 7.63084077835083, "learning_rate": 8.261100642605208e-07, "loss": 0.2603, "step": 3599 }, { "epoch": 0.1739382519205682, "grad_norm": 2.6155667304992676, "learning_rate": 8.260617480794317e-07, "loss": 0.3077, "step": 3600 }, { "epoch": 0.17398656810165725, "grad_norm": 2.7996582984924316, "learning_rate": 8.260134318983427e-07, "loss": 0.3581, "step": 3601 }, { "epoch": 0.1740348842827463, "grad_norm": 2.43597149848938, "learning_rate": 8.259651157172537e-07, "loss": 0.2718, "step": 3602 }, { "epoch": 0.17408320046383532, "grad_norm": 5.700910568237305, "learning_rate": 8.259167995361647e-07, "loss": 0.2955, "step": 3603 }, { "epoch": 0.17413151664492438, "grad_norm": 2.3446457386016846, "learning_rate": 8.258684833550755e-07, "loss": 0.2436, "step": 3604 }, { "epoch": 0.17417983282601343, "grad_norm": 2.703850746154785, "learning_rate": 8.258201671739865e-07, "loss": 0.3708, "step": 3605 }, { "epoch": 0.17422814900710248, "grad_norm": 2.546200752258301, "learning_rate": 8.257718509928975e-07, "loss": 0.2757, "step": 3606 }, { "epoch": 0.17427646518819154, "grad_norm": 2.9745044708251953, "learning_rate": 8.257235348118085e-07, "loss": 0.2683, "step": 3607 }, { "epoch": 0.17432478136928056, "grad_norm": 5.000310897827148, "learning_rate": 8.256752186307195e-07, "loss": 0.2063, "step": 3608 }, { "epoch": 0.17437309755036962, "grad_norm": 1.9564342498779297, "learning_rate": 8.256269024496303e-07, "loss": 0.2339, "step": 3609 }, { "epoch": 0.17442141373145867, "grad_norm": 3.6884875297546387, "learning_rate": 8.255785862685412e-07, "loss": 0.3379, "step": 3610 }, { "epoch": 0.17446972991254772, "grad_norm": 3.2163755893707275, "learning_rate": 8.255302700874522e-07, "loss": 0.2875, "step": 3611 }, { "epoch": 0.17451804609363675, "grad_norm": 2.451875686645508, "learning_rate": 8.254819539063632e-07, "loss": 0.321, "step": 3612 }, { "epoch": 0.1745663622747258, "grad_norm": 3.8783631324768066, "learning_rate": 8.254336377252742e-07, "loss": 0.293, "step": 3613 }, { "epoch": 0.17461467845581485, "grad_norm": 3.1111881732940674, "learning_rate": 8.253853215441852e-07, "loss": 0.3741, "step": 3614 }, { "epoch": 0.1746629946369039, "grad_norm": 3.3348302841186523, "learning_rate": 8.253370053630961e-07, "loss": 0.312, "step": 3615 }, { "epoch": 0.17471131081799296, "grad_norm": 2.7373056411743164, "learning_rate": 8.25288689182007e-07, "loss": 0.3972, "step": 3616 }, { "epoch": 0.17475962699908199, "grad_norm": 2.4860737323760986, "learning_rate": 8.252403730009179e-07, "loss": 0.3003, "step": 3617 }, { "epoch": 0.17480794318017104, "grad_norm": 2.029737710952759, "learning_rate": 8.251920568198289e-07, "loss": 0.2417, "step": 3618 }, { "epoch": 0.1748562593612601, "grad_norm": 2.7591230869293213, "learning_rate": 8.251437406387399e-07, "loss": 0.4031, "step": 3619 }, { "epoch": 0.17490457554234914, "grad_norm": 2.706789493560791, "learning_rate": 8.250954244576508e-07, "loss": 0.3413, "step": 3620 }, { "epoch": 0.17495289172343817, "grad_norm": 3.062610149383545, "learning_rate": 8.250471082765618e-07, "loss": 0.3792, "step": 3621 }, { "epoch": 0.17500120790452722, "grad_norm": 2.2372844219207764, "learning_rate": 8.249987920954728e-07, "loss": 0.2206, "step": 3622 }, { "epoch": 0.17504952408561628, "grad_norm": 2.293044328689575, "learning_rate": 8.249504759143837e-07, "loss": 0.2504, "step": 3623 }, { "epoch": 0.17509784026670533, "grad_norm": 3.234039783477783, "learning_rate": 8.249021597332947e-07, "loss": 0.3388, "step": 3624 }, { "epoch": 0.17514615644779435, "grad_norm": 5.253650665283203, "learning_rate": 8.248538435522055e-07, "loss": 0.3452, "step": 3625 }, { "epoch": 0.1751944726288834, "grad_norm": 2.416700839996338, "learning_rate": 8.248055273711165e-07, "loss": 0.304, "step": 3626 }, { "epoch": 0.17524278880997246, "grad_norm": 2.473257541656494, "learning_rate": 8.247572111900275e-07, "loss": 0.3251, "step": 3627 }, { "epoch": 0.1752911049910615, "grad_norm": 3.8754544258117676, "learning_rate": 8.247088950089385e-07, "loss": 0.296, "step": 3628 }, { "epoch": 0.17533942117215057, "grad_norm": 2.4788644313812256, "learning_rate": 8.246605788278494e-07, "loss": 0.3174, "step": 3629 }, { "epoch": 0.1753877373532396, "grad_norm": 2.383852481842041, "learning_rate": 8.246122626467603e-07, "loss": 0.2997, "step": 3630 }, { "epoch": 0.17543605353432865, "grad_norm": 3.566549777984619, "learning_rate": 8.245639464656713e-07, "loss": 0.393, "step": 3631 }, { "epoch": 0.1754843697154177, "grad_norm": 2.6518373489379883, "learning_rate": 8.245156302845823e-07, "loss": 0.351, "step": 3632 }, { "epoch": 0.17553268589650675, "grad_norm": 2.6237566471099854, "learning_rate": 8.244673141034933e-07, "loss": 0.3553, "step": 3633 }, { "epoch": 0.17558100207759578, "grad_norm": 2.435366630554199, "learning_rate": 8.244189979224042e-07, "loss": 0.2744, "step": 3634 }, { "epoch": 0.17562931825868483, "grad_norm": 2.6409809589385986, "learning_rate": 8.24370681741315e-07, "loss": 0.3959, "step": 3635 }, { "epoch": 0.17567763443977388, "grad_norm": 2.6375768184661865, "learning_rate": 8.24322365560226e-07, "loss": 0.3535, "step": 3636 }, { "epoch": 0.17572595062086294, "grad_norm": 2.9353444576263428, "learning_rate": 8.24274049379137e-07, "loss": 0.3721, "step": 3637 }, { "epoch": 0.17577426680195196, "grad_norm": 5.553926467895508, "learning_rate": 8.24225733198048e-07, "loss": 0.2923, "step": 3638 }, { "epoch": 0.17582258298304101, "grad_norm": 4.02314567565918, "learning_rate": 8.24177417016959e-07, "loss": 0.4241, "step": 3639 }, { "epoch": 0.17587089916413007, "grad_norm": 3.0501651763916016, "learning_rate": 8.2412910083587e-07, "loss": 0.3459, "step": 3640 }, { "epoch": 0.17591921534521912, "grad_norm": 2.7237138748168945, "learning_rate": 8.240807846547809e-07, "loss": 0.2398, "step": 3641 }, { "epoch": 0.17596753152630817, "grad_norm": 3.3268744945526123, "learning_rate": 8.240324684736917e-07, "loss": 0.3473, "step": 3642 }, { "epoch": 0.1760158477073972, "grad_norm": 2.6163132190704346, "learning_rate": 8.239841522926027e-07, "loss": 0.2901, "step": 3643 }, { "epoch": 0.17606416388848625, "grad_norm": 2.1111841201782227, "learning_rate": 8.239358361115137e-07, "loss": 0.2465, "step": 3644 }, { "epoch": 0.1761124800695753, "grad_norm": 2.6003520488739014, "learning_rate": 8.238875199304247e-07, "loss": 0.2892, "step": 3645 }, { "epoch": 0.17616079625066436, "grad_norm": 2.8228330612182617, "learning_rate": 8.238392037493356e-07, "loss": 0.3755, "step": 3646 }, { "epoch": 0.17620911243175338, "grad_norm": 3.08771014213562, "learning_rate": 8.237908875682466e-07, "loss": 0.3925, "step": 3647 }, { "epoch": 0.17625742861284244, "grad_norm": 3.6373069286346436, "learning_rate": 8.237425713871575e-07, "loss": 0.3174, "step": 3648 }, { "epoch": 0.1763057447939315, "grad_norm": 5.08929967880249, "learning_rate": 8.236942552060685e-07, "loss": 0.2279, "step": 3649 }, { "epoch": 0.17635406097502054, "grad_norm": 3.389158248901367, "learning_rate": 8.236459390249795e-07, "loss": 0.3234, "step": 3650 }, { "epoch": 0.17640237715610957, "grad_norm": 1.8736144304275513, "learning_rate": 8.235976228438903e-07, "loss": 0.1962, "step": 3651 }, { "epoch": 0.17645069333719862, "grad_norm": 2.9578261375427246, "learning_rate": 8.235493066628013e-07, "loss": 0.3767, "step": 3652 }, { "epoch": 0.17649900951828768, "grad_norm": 2.3355796337127686, "learning_rate": 8.235009904817123e-07, "loss": 0.2563, "step": 3653 }, { "epoch": 0.17654732569937673, "grad_norm": 2.283334732055664, "learning_rate": 8.234526743006233e-07, "loss": 0.2442, "step": 3654 }, { "epoch": 0.17659564188046578, "grad_norm": 3.1259214878082275, "learning_rate": 8.234043581195342e-07, "loss": 0.3231, "step": 3655 }, { "epoch": 0.1766439580615548, "grad_norm": 3.0983405113220215, "learning_rate": 8.233560419384451e-07, "loss": 0.3688, "step": 3656 }, { "epoch": 0.17669227424264386, "grad_norm": 3.108602523803711, "learning_rate": 8.233077257573561e-07, "loss": 0.3887, "step": 3657 }, { "epoch": 0.1767405904237329, "grad_norm": 2.339421510696411, "learning_rate": 8.232594095762671e-07, "loss": 0.2768, "step": 3658 }, { "epoch": 0.17678890660482197, "grad_norm": 3.9676218032836914, "learning_rate": 8.23211093395178e-07, "loss": 0.3843, "step": 3659 }, { "epoch": 0.176837222785911, "grad_norm": 2.87430739402771, "learning_rate": 8.23162777214089e-07, "loss": 0.3508, "step": 3660 }, { "epoch": 0.17688553896700004, "grad_norm": 2.082288980484009, "learning_rate": 8.231144610329998e-07, "loss": 0.2436, "step": 3661 }, { "epoch": 0.1769338551480891, "grad_norm": 3.567714214324951, "learning_rate": 8.230661448519108e-07, "loss": 0.2443, "step": 3662 }, { "epoch": 0.17698217132917815, "grad_norm": 5.818774223327637, "learning_rate": 8.230178286708218e-07, "loss": 0.3049, "step": 3663 }, { "epoch": 0.17703048751026718, "grad_norm": 3.5618717670440674, "learning_rate": 8.229695124897328e-07, "loss": 0.3546, "step": 3664 }, { "epoch": 0.17707880369135623, "grad_norm": 1.8196618556976318, "learning_rate": 8.229211963086438e-07, "loss": 0.2457, "step": 3665 }, { "epoch": 0.17712711987244528, "grad_norm": 2.831165075302124, "learning_rate": 8.228728801275548e-07, "loss": 0.346, "step": 3666 }, { "epoch": 0.17717543605353434, "grad_norm": 3.612313985824585, "learning_rate": 8.228245639464656e-07, "loss": 0.2747, "step": 3667 }, { "epoch": 0.1772237522346234, "grad_norm": 2.5683159828186035, "learning_rate": 8.227762477653765e-07, "loss": 0.2492, "step": 3668 }, { "epoch": 0.1772720684157124, "grad_norm": 2.8817121982574463, "learning_rate": 8.227279315842875e-07, "loss": 0.3068, "step": 3669 }, { "epoch": 0.17732038459680147, "grad_norm": 3.0597341060638428, "learning_rate": 8.226796154031985e-07, "loss": 0.3746, "step": 3670 }, { "epoch": 0.17736870077789052, "grad_norm": 2.576651096343994, "learning_rate": 8.226312992221095e-07, "loss": 0.2553, "step": 3671 }, { "epoch": 0.17741701695897957, "grad_norm": 2.396139144897461, "learning_rate": 8.225829830410204e-07, "loss": 0.2695, "step": 3672 }, { "epoch": 0.1774653331400686, "grad_norm": 64.34297943115234, "learning_rate": 8.225346668599314e-07, "loss": 0.4054, "step": 3673 }, { "epoch": 0.17751364932115765, "grad_norm": 4.074048042297363, "learning_rate": 8.224863506788423e-07, "loss": 0.4719, "step": 3674 }, { "epoch": 0.1775619655022467, "grad_norm": 2.574802875518799, "learning_rate": 8.224380344977533e-07, "loss": 0.2959, "step": 3675 }, { "epoch": 0.17761028168333576, "grad_norm": 2.875269889831543, "learning_rate": 8.223897183166643e-07, "loss": 0.3512, "step": 3676 }, { "epoch": 0.17765859786442478, "grad_norm": 2.1754586696624756, "learning_rate": 8.223414021355751e-07, "loss": 0.2589, "step": 3677 }, { "epoch": 0.17770691404551384, "grad_norm": 2.1906843185424805, "learning_rate": 8.222930859544861e-07, "loss": 0.2547, "step": 3678 }, { "epoch": 0.1777552302266029, "grad_norm": 2.069944381713867, "learning_rate": 8.222447697733971e-07, "loss": 0.2158, "step": 3679 }, { "epoch": 0.17780354640769194, "grad_norm": 2.9910292625427246, "learning_rate": 8.22196453592308e-07, "loss": 0.2983, "step": 3680 }, { "epoch": 0.177851862588781, "grad_norm": 4.927300453186035, "learning_rate": 8.22148137411219e-07, "loss": 0.2538, "step": 3681 }, { "epoch": 0.17790017876987002, "grad_norm": 2.991016149520874, "learning_rate": 8.220998212301299e-07, "loss": 0.2809, "step": 3682 }, { "epoch": 0.17794849495095907, "grad_norm": 4.573217391967773, "learning_rate": 8.220515050490409e-07, "loss": 0.4242, "step": 3683 }, { "epoch": 0.17799681113204813, "grad_norm": 2.3769659996032715, "learning_rate": 8.220031888679519e-07, "loss": 0.2579, "step": 3684 }, { "epoch": 0.17804512731313718, "grad_norm": 2.9011712074279785, "learning_rate": 8.219548726868628e-07, "loss": 0.3566, "step": 3685 }, { "epoch": 0.1780934434942262, "grad_norm": 3.4562692642211914, "learning_rate": 8.219065565057738e-07, "loss": 0.2505, "step": 3686 }, { "epoch": 0.17814175967531526, "grad_norm": 2.7444567680358887, "learning_rate": 8.218582403246846e-07, "loss": 0.3221, "step": 3687 }, { "epoch": 0.1781900758564043, "grad_norm": 2.986780881881714, "learning_rate": 8.218099241435956e-07, "loss": 0.3403, "step": 3688 }, { "epoch": 0.17823839203749337, "grad_norm": 2.0262396335601807, "learning_rate": 8.217616079625066e-07, "loss": 0.2438, "step": 3689 }, { "epoch": 0.1782867082185824, "grad_norm": 2.6798598766326904, "learning_rate": 8.217132917814176e-07, "loss": 0.3232, "step": 3690 }, { "epoch": 0.17833502439967144, "grad_norm": 4.951196670532227, "learning_rate": 8.216649756003286e-07, "loss": 0.2893, "step": 3691 }, { "epoch": 0.1783833405807605, "grad_norm": 2.6571500301361084, "learning_rate": 8.216166594192395e-07, "loss": 0.3326, "step": 3692 }, { "epoch": 0.17843165676184955, "grad_norm": 2.2606050968170166, "learning_rate": 8.215683432381503e-07, "loss": 0.2416, "step": 3693 }, { "epoch": 0.1784799729429386, "grad_norm": 7.791503429412842, "learning_rate": 8.215200270570613e-07, "loss": 0.3308, "step": 3694 }, { "epoch": 0.17852828912402763, "grad_norm": 2.601755142211914, "learning_rate": 8.214717108759723e-07, "loss": 0.3003, "step": 3695 }, { "epoch": 0.17857660530511668, "grad_norm": 4.296886920928955, "learning_rate": 8.214233946948833e-07, "loss": 0.4219, "step": 3696 }, { "epoch": 0.17862492148620573, "grad_norm": 7.086441516876221, "learning_rate": 8.213750785137943e-07, "loss": 0.2959, "step": 3697 }, { "epoch": 0.1786732376672948, "grad_norm": 2.2561187744140625, "learning_rate": 8.213267623327052e-07, "loss": 0.2603, "step": 3698 }, { "epoch": 0.1787215538483838, "grad_norm": 2.6593832969665527, "learning_rate": 8.212784461516161e-07, "loss": 0.266, "step": 3699 }, { "epoch": 0.17876987002947287, "grad_norm": 2.7293541431427, "learning_rate": 8.212301299705271e-07, "loss": 0.3872, "step": 3700 }, { "epoch": 0.17881818621056192, "grad_norm": 2.298448085784912, "learning_rate": 8.21181813789438e-07, "loss": 0.2828, "step": 3701 }, { "epoch": 0.17886650239165097, "grad_norm": 2.4064693450927734, "learning_rate": 8.21133497608349e-07, "loss": 0.2603, "step": 3702 }, { "epoch": 0.17891481857274, "grad_norm": 6.344996929168701, "learning_rate": 8.210851814272599e-07, "loss": 0.2262, "step": 3703 }, { "epoch": 0.17896313475382905, "grad_norm": 2.7466049194335938, "learning_rate": 8.210368652461709e-07, "loss": 0.3562, "step": 3704 }, { "epoch": 0.1790114509349181, "grad_norm": 3.3816211223602295, "learning_rate": 8.209885490650819e-07, "loss": 0.2905, "step": 3705 }, { "epoch": 0.17905976711600716, "grad_norm": 2.589047431945801, "learning_rate": 8.209402328839928e-07, "loss": 0.3117, "step": 3706 }, { "epoch": 0.1791080832970962, "grad_norm": 4.019266605377197, "learning_rate": 8.208919167029038e-07, "loss": 0.2175, "step": 3707 }, { "epoch": 0.17915639947818524, "grad_norm": 1.6465885639190674, "learning_rate": 8.208436005218147e-07, "loss": 0.19, "step": 3708 }, { "epoch": 0.1792047156592743, "grad_norm": 3.086766004562378, "learning_rate": 8.207952843407257e-07, "loss": 0.3519, "step": 3709 }, { "epoch": 0.17925303184036334, "grad_norm": 5.959593296051025, "learning_rate": 8.207469681596366e-07, "loss": 0.4137, "step": 3710 }, { "epoch": 0.1793013480214524, "grad_norm": 7.53260612487793, "learning_rate": 8.206986519785476e-07, "loss": 0.3286, "step": 3711 }, { "epoch": 0.17934966420254142, "grad_norm": 2.011906147003174, "learning_rate": 8.206503357974585e-07, "loss": 0.2251, "step": 3712 }, { "epoch": 0.17939798038363047, "grad_norm": 3.128143787384033, "learning_rate": 8.206020196163694e-07, "loss": 0.3859, "step": 3713 }, { "epoch": 0.17944629656471953, "grad_norm": 2.896658182144165, "learning_rate": 8.205537034352804e-07, "loss": 0.3084, "step": 3714 }, { "epoch": 0.17949461274580858, "grad_norm": 3.03424072265625, "learning_rate": 8.205053872541914e-07, "loss": 0.3626, "step": 3715 }, { "epoch": 0.1795429289268976, "grad_norm": 5.012814044952393, "learning_rate": 8.204570710731024e-07, "loss": 0.3679, "step": 3716 }, { "epoch": 0.17959124510798666, "grad_norm": 2.158776044845581, "learning_rate": 8.204087548920134e-07, "loss": 0.2272, "step": 3717 }, { "epoch": 0.1796395612890757, "grad_norm": 3.128108263015747, "learning_rate": 8.203604387109241e-07, "loss": 0.2872, "step": 3718 }, { "epoch": 0.17968787747016476, "grad_norm": 5.3740644454956055, "learning_rate": 8.203121225298351e-07, "loss": 0.327, "step": 3719 }, { "epoch": 0.17973619365125382, "grad_norm": 3.047618865966797, "learning_rate": 8.202638063487461e-07, "loss": 0.3295, "step": 3720 }, { "epoch": 0.17978450983234284, "grad_norm": 2.714583396911621, "learning_rate": 8.202154901676571e-07, "loss": 0.2911, "step": 3721 }, { "epoch": 0.1798328260134319, "grad_norm": 70.09829711914062, "learning_rate": 8.201671739865681e-07, "loss": 0.3149, "step": 3722 }, { "epoch": 0.17988114219452095, "grad_norm": 2.594935655593872, "learning_rate": 8.201188578054791e-07, "loss": 0.2846, "step": 3723 }, { "epoch": 0.17992945837561, "grad_norm": 10.37669849395752, "learning_rate": 8.2007054162439e-07, "loss": 0.3322, "step": 3724 }, { "epoch": 0.17997777455669903, "grad_norm": 2.608003616333008, "learning_rate": 8.200222254433009e-07, "loss": 0.2413, "step": 3725 }, { "epoch": 0.18002609073778808, "grad_norm": 2.9008026123046875, "learning_rate": 8.199739092622119e-07, "loss": 0.3322, "step": 3726 }, { "epoch": 0.18007440691887713, "grad_norm": 2.8404035568237305, "learning_rate": 8.199255930811228e-07, "loss": 0.3373, "step": 3727 }, { "epoch": 0.1801227230999662, "grad_norm": 2.6883256435394287, "learning_rate": 8.198772769000338e-07, "loss": 0.2441, "step": 3728 }, { "epoch": 0.1801710392810552, "grad_norm": 3.9924092292785645, "learning_rate": 8.198289607189447e-07, "loss": 0.3921, "step": 3729 }, { "epoch": 0.18021935546214427, "grad_norm": 3.1479837894439697, "learning_rate": 8.197806445378557e-07, "loss": 0.3381, "step": 3730 }, { "epoch": 0.18026767164323332, "grad_norm": 2.0583364963531494, "learning_rate": 8.197323283567666e-07, "loss": 0.2054, "step": 3731 }, { "epoch": 0.18031598782432237, "grad_norm": 2.8313615322113037, "learning_rate": 8.196840121756776e-07, "loss": 0.2549, "step": 3732 }, { "epoch": 0.18036430400541142, "grad_norm": 2.745405435562134, "learning_rate": 8.196356959945886e-07, "loss": 0.3322, "step": 3733 }, { "epoch": 0.18041262018650045, "grad_norm": 4.474167346954346, "learning_rate": 8.195873798134995e-07, "loss": 0.3578, "step": 3734 }, { "epoch": 0.1804609363675895, "grad_norm": 3.447030782699585, "learning_rate": 8.195390636324104e-07, "loss": 0.3135, "step": 3735 }, { "epoch": 0.18050925254867856, "grad_norm": 2.389011859893799, "learning_rate": 8.194907474513214e-07, "loss": 0.2476, "step": 3736 }, { "epoch": 0.1805575687297676, "grad_norm": 6.138413906097412, "learning_rate": 8.194424312702324e-07, "loss": 0.3578, "step": 3737 }, { "epoch": 0.18060588491085663, "grad_norm": 3.0678908824920654, "learning_rate": 8.193941150891433e-07, "loss": 0.3209, "step": 3738 }, { "epoch": 0.1806542010919457, "grad_norm": 5.523421764373779, "learning_rate": 8.193457989080542e-07, "loss": 0.4313, "step": 3739 }, { "epoch": 0.18070251727303474, "grad_norm": 1.930655837059021, "learning_rate": 8.192974827269652e-07, "loss": 0.2378, "step": 3740 }, { "epoch": 0.1807508334541238, "grad_norm": 3.0329411029815674, "learning_rate": 8.192491665458762e-07, "loss": 0.5923, "step": 3741 }, { "epoch": 0.18079914963521282, "grad_norm": 7.629317283630371, "learning_rate": 8.192008503647872e-07, "loss": 0.4091, "step": 3742 }, { "epoch": 0.18084746581630187, "grad_norm": 2.893554925918579, "learning_rate": 8.191525341836982e-07, "loss": 0.3058, "step": 3743 }, { "epoch": 0.18089578199739093, "grad_norm": 2.314598560333252, "learning_rate": 8.191042180026089e-07, "loss": 0.242, "step": 3744 }, { "epoch": 0.18094409817847998, "grad_norm": 4.609470367431641, "learning_rate": 8.190559018215199e-07, "loss": 0.3555, "step": 3745 }, { "epoch": 0.18099241435956903, "grad_norm": 2.639188051223755, "learning_rate": 8.190075856404309e-07, "loss": 0.272, "step": 3746 }, { "epoch": 0.18104073054065806, "grad_norm": 14.278971672058105, "learning_rate": 8.189592694593419e-07, "loss": 0.2351, "step": 3747 }, { "epoch": 0.1810890467217471, "grad_norm": 2.4400746822357178, "learning_rate": 8.189109532782529e-07, "loss": 0.204, "step": 3748 }, { "epoch": 0.18113736290283616, "grad_norm": 3.207826852798462, "learning_rate": 8.188626370971639e-07, "loss": 0.4456, "step": 3749 }, { "epoch": 0.18118567908392522, "grad_norm": 4.858123779296875, "learning_rate": 8.188143209160747e-07, "loss": 0.3104, "step": 3750 }, { "epoch": 0.18123399526501424, "grad_norm": 2.267261028289795, "learning_rate": 8.187660047349857e-07, "loss": 0.2694, "step": 3751 }, { "epoch": 0.1812823114461033, "grad_norm": 2.4110500812530518, "learning_rate": 8.187176885538966e-07, "loss": 0.2506, "step": 3752 }, { "epoch": 0.18133062762719235, "grad_norm": 2.613942861557007, "learning_rate": 8.186693723728076e-07, "loss": 0.2779, "step": 3753 }, { "epoch": 0.1813789438082814, "grad_norm": 2.4377267360687256, "learning_rate": 8.186210561917186e-07, "loss": 0.2291, "step": 3754 }, { "epoch": 0.18142725998937043, "grad_norm": 3.3184397220611572, "learning_rate": 8.185727400106295e-07, "loss": 0.3129, "step": 3755 }, { "epoch": 0.18147557617045948, "grad_norm": 2.7540555000305176, "learning_rate": 8.185244238295405e-07, "loss": 0.3438, "step": 3756 }, { "epoch": 0.18152389235154853, "grad_norm": 2.4233205318450928, "learning_rate": 8.184761076484514e-07, "loss": 0.154, "step": 3757 }, { "epoch": 0.18157220853263759, "grad_norm": 3.134471893310547, "learning_rate": 8.184277914673624e-07, "loss": 0.4464, "step": 3758 }, { "epoch": 0.18162052471372664, "grad_norm": 3.3194994926452637, "learning_rate": 8.183794752862734e-07, "loss": 0.3448, "step": 3759 }, { "epoch": 0.18166884089481566, "grad_norm": 1.6792231798171997, "learning_rate": 8.183311591051843e-07, "loss": 0.2254, "step": 3760 }, { "epoch": 0.18171715707590472, "grad_norm": 2.914746046066284, "learning_rate": 8.182828429240952e-07, "loss": 0.3531, "step": 3761 }, { "epoch": 0.18176547325699377, "grad_norm": 2.2646000385284424, "learning_rate": 8.182345267430062e-07, "loss": 0.2628, "step": 3762 }, { "epoch": 0.18181378943808282, "grad_norm": 4.322925090789795, "learning_rate": 8.181862105619171e-07, "loss": 0.4101, "step": 3763 }, { "epoch": 0.18186210561917185, "grad_norm": 2.1882615089416504, "learning_rate": 8.181378943808281e-07, "loss": 0.2406, "step": 3764 }, { "epoch": 0.1819104218002609, "grad_norm": 2.801433563232422, "learning_rate": 8.18089578199739e-07, "loss": 0.3769, "step": 3765 }, { "epoch": 0.18195873798134996, "grad_norm": 2.994689702987671, "learning_rate": 8.1804126201865e-07, "loss": 0.4286, "step": 3766 }, { "epoch": 0.182007054162439, "grad_norm": 2.811772584915161, "learning_rate": 8.17992945837561e-07, "loss": 0.2636, "step": 3767 }, { "epoch": 0.18205537034352806, "grad_norm": 1.9401593208312988, "learning_rate": 8.17944629656472e-07, "loss": 0.2513, "step": 3768 }, { "epoch": 0.1821036865246171, "grad_norm": 2.284213066101074, "learning_rate": 8.17896313475383e-07, "loss": 0.2844, "step": 3769 }, { "epoch": 0.18215200270570614, "grad_norm": 2.421851634979248, "learning_rate": 8.178479972942937e-07, "loss": 0.3579, "step": 3770 }, { "epoch": 0.1822003188867952, "grad_norm": 61.639060974121094, "learning_rate": 8.177996811132047e-07, "loss": 0.3579, "step": 3771 }, { "epoch": 0.18224863506788425, "grad_norm": 5.5764288902282715, "learning_rate": 8.177513649321157e-07, "loss": 0.4652, "step": 3772 }, { "epoch": 0.18229695124897327, "grad_norm": 1.8536417484283447, "learning_rate": 8.177030487510267e-07, "loss": 0.1944, "step": 3773 }, { "epoch": 0.18234526743006232, "grad_norm": 3.137819528579712, "learning_rate": 8.176547325699377e-07, "loss": 0.3481, "step": 3774 }, { "epoch": 0.18239358361115138, "grad_norm": 2.0198452472686768, "learning_rate": 8.176064163888487e-07, "loss": 0.2654, "step": 3775 }, { "epoch": 0.18244189979224043, "grad_norm": 3.999588966369629, "learning_rate": 8.175581002077595e-07, "loss": 0.3967, "step": 3776 }, { "epoch": 0.18249021597332946, "grad_norm": 9.161894798278809, "learning_rate": 8.175097840266705e-07, "loss": 0.1998, "step": 3777 }, { "epoch": 0.1825385321544185, "grad_norm": 3.5322136878967285, "learning_rate": 8.174614678455814e-07, "loss": 0.2656, "step": 3778 }, { "epoch": 0.18258684833550756, "grad_norm": 3.0566964149475098, "learning_rate": 8.174131516644924e-07, "loss": 0.3921, "step": 3779 }, { "epoch": 0.18263516451659662, "grad_norm": 3.3819937705993652, "learning_rate": 8.173648354834034e-07, "loss": 0.3581, "step": 3780 }, { "epoch": 0.18268348069768567, "grad_norm": 4.334120750427246, "learning_rate": 8.173165193023143e-07, "loss": 0.3919, "step": 3781 }, { "epoch": 0.1827317968787747, "grad_norm": 2.851144552230835, "learning_rate": 8.172682031212252e-07, "loss": 0.3409, "step": 3782 }, { "epoch": 0.18278011305986375, "grad_norm": 2.7381532192230225, "learning_rate": 8.172198869401362e-07, "loss": 0.3134, "step": 3783 }, { "epoch": 0.1828284292409528, "grad_norm": 9.50661849975586, "learning_rate": 8.171715707590472e-07, "loss": 0.387, "step": 3784 }, { "epoch": 0.18287674542204185, "grad_norm": 2.6772751808166504, "learning_rate": 8.171232545779582e-07, "loss": 0.3108, "step": 3785 }, { "epoch": 0.18292506160313088, "grad_norm": 2.273085594177246, "learning_rate": 8.17074938396869e-07, "loss": 0.2612, "step": 3786 }, { "epoch": 0.18297337778421993, "grad_norm": 7.412182807922363, "learning_rate": 8.1702662221578e-07, "loss": 0.3184, "step": 3787 }, { "epoch": 0.18302169396530898, "grad_norm": 2.5416409969329834, "learning_rate": 8.16978306034691e-07, "loss": 0.3135, "step": 3788 }, { "epoch": 0.18307001014639804, "grad_norm": 2.370588541030884, "learning_rate": 8.169299898536019e-07, "loss": 0.2661, "step": 3789 }, { "epoch": 0.18311832632748706, "grad_norm": 2.463176727294922, "learning_rate": 8.168816736725129e-07, "loss": 0.2307, "step": 3790 }, { "epoch": 0.18316664250857612, "grad_norm": 3.7828266620635986, "learning_rate": 8.168333574914238e-07, "loss": 0.3787, "step": 3791 }, { "epoch": 0.18321495868966517, "grad_norm": 11.424561500549316, "learning_rate": 8.167850413103348e-07, "loss": 0.494, "step": 3792 }, { "epoch": 0.18326327487075422, "grad_norm": 2.8846194744110107, "learning_rate": 8.167367251292458e-07, "loss": 0.324, "step": 3793 }, { "epoch": 0.18331159105184328, "grad_norm": 1.8157532215118408, "learning_rate": 8.166884089481568e-07, "loss": 0.2212, "step": 3794 }, { "epoch": 0.1833599072329323, "grad_norm": 2.4165854454040527, "learning_rate": 8.166400927670676e-07, "loss": 0.318, "step": 3795 }, { "epoch": 0.18340822341402135, "grad_norm": 3.9958102703094482, "learning_rate": 8.165917765859785e-07, "loss": 0.444, "step": 3796 }, { "epoch": 0.1834565395951104, "grad_norm": 3.903433322906494, "learning_rate": 8.165434604048895e-07, "loss": 0.141, "step": 3797 }, { "epoch": 0.18350485577619946, "grad_norm": 2.623410940170288, "learning_rate": 8.164951442238005e-07, "loss": 0.3185, "step": 3798 }, { "epoch": 0.18355317195728849, "grad_norm": 7.757789611816406, "learning_rate": 8.164468280427115e-07, "loss": 0.3298, "step": 3799 }, { "epoch": 0.18360148813837754, "grad_norm": 2.2004361152648926, "learning_rate": 8.163985118616225e-07, "loss": 0.2322, "step": 3800 }, { "epoch": 0.1836498043194666, "grad_norm": 3.1772220134735107, "learning_rate": 8.163501956805335e-07, "loss": 0.3163, "step": 3801 }, { "epoch": 0.18369812050055565, "grad_norm": 2.407668352127075, "learning_rate": 8.163018794994443e-07, "loss": 0.3016, "step": 3802 }, { "epoch": 0.18374643668164467, "grad_norm": 2.9390170574188232, "learning_rate": 8.162535633183552e-07, "loss": 0.2201, "step": 3803 }, { "epoch": 0.18379475286273372, "grad_norm": 2.7750658988952637, "learning_rate": 8.162052471372662e-07, "loss": 0.3462, "step": 3804 }, { "epoch": 0.18384306904382278, "grad_norm": 2.4925172328948975, "learning_rate": 8.161569309561772e-07, "loss": 0.2618, "step": 3805 }, { "epoch": 0.18389138522491183, "grad_norm": 2.140735149383545, "learning_rate": 8.161086147750882e-07, "loss": 0.2418, "step": 3806 }, { "epoch": 0.18393970140600088, "grad_norm": 4.006753444671631, "learning_rate": 8.160602985939991e-07, "loss": 0.3475, "step": 3807 }, { "epoch": 0.1839880175870899, "grad_norm": 2.923405408859253, "learning_rate": 8.1601198241291e-07, "loss": 0.3068, "step": 3808 }, { "epoch": 0.18403633376817896, "grad_norm": 3.229125738143921, "learning_rate": 8.15963666231821e-07, "loss": 0.3748, "step": 3809 }, { "epoch": 0.18408464994926801, "grad_norm": 2.4946908950805664, "learning_rate": 8.15915350050732e-07, "loss": 0.3475, "step": 3810 }, { "epoch": 0.18413296613035707, "grad_norm": 6.198001861572266, "learning_rate": 8.15867033869643e-07, "loss": 0.298, "step": 3811 }, { "epoch": 0.1841812823114461, "grad_norm": 2.569979429244995, "learning_rate": 8.158187176885538e-07, "loss": 0.2443, "step": 3812 }, { "epoch": 0.18422959849253515, "grad_norm": 2.46770977973938, "learning_rate": 8.157704015074648e-07, "loss": 0.2848, "step": 3813 }, { "epoch": 0.1842779146736242, "grad_norm": 2.4214797019958496, "learning_rate": 8.157220853263757e-07, "loss": 0.2685, "step": 3814 }, { "epoch": 0.18432623085471325, "grad_norm": 2.84204363822937, "learning_rate": 8.156737691452867e-07, "loss": 0.2618, "step": 3815 }, { "epoch": 0.18437454703580228, "grad_norm": 7.348298072814941, "learning_rate": 8.156254529641977e-07, "loss": 0.3524, "step": 3816 }, { "epoch": 0.18442286321689133, "grad_norm": 3.0873427391052246, "learning_rate": 8.155771367831086e-07, "loss": 0.4002, "step": 3817 }, { "epoch": 0.18447117939798038, "grad_norm": 4.051760196685791, "learning_rate": 8.155288206020196e-07, "loss": 0.246, "step": 3818 }, { "epoch": 0.18451949557906944, "grad_norm": 1.7147223949432373, "learning_rate": 8.154805044209306e-07, "loss": 0.1875, "step": 3819 }, { "epoch": 0.1845678117601585, "grad_norm": 2.1168599128723145, "learning_rate": 8.154321882398415e-07, "loss": 0.3225, "step": 3820 }, { "epoch": 0.18461612794124752, "grad_norm": 3.5096445083618164, "learning_rate": 8.153838720587524e-07, "loss": 0.3745, "step": 3821 }, { "epoch": 0.18466444412233657, "grad_norm": 4.257881164550781, "learning_rate": 8.153355558776633e-07, "loss": 0.2661, "step": 3822 }, { "epoch": 0.18471276030342562, "grad_norm": 2.7522246837615967, "learning_rate": 8.152872396965743e-07, "loss": 0.2899, "step": 3823 }, { "epoch": 0.18476107648451467, "grad_norm": 4.2558274269104, "learning_rate": 8.152389235154853e-07, "loss": 0.4594, "step": 3824 }, { "epoch": 0.1848093926656037, "grad_norm": 6.412006855010986, "learning_rate": 8.151906073343963e-07, "loss": 0.2969, "step": 3825 }, { "epoch": 0.18485770884669275, "grad_norm": 2.495760202407837, "learning_rate": 8.151422911533073e-07, "loss": 0.2361, "step": 3826 }, { "epoch": 0.1849060250277818, "grad_norm": 2.463097333908081, "learning_rate": 8.150939749722182e-07, "loss": 0.2754, "step": 3827 }, { "epoch": 0.18495434120887086, "grad_norm": 2.058907985687256, "learning_rate": 8.15045658791129e-07, "loss": 0.2273, "step": 3828 }, { "epoch": 0.18500265738995988, "grad_norm": 2.95208740234375, "learning_rate": 8.1499734261004e-07, "loss": 0.3167, "step": 3829 }, { "epoch": 0.18505097357104894, "grad_norm": 2.7278456687927246, "learning_rate": 8.14949026428951e-07, "loss": 0.3659, "step": 3830 }, { "epoch": 0.185099289752138, "grad_norm": 2.1811749935150146, "learning_rate": 8.14900710247862e-07, "loss": 0.2593, "step": 3831 }, { "epoch": 0.18514760593322704, "grad_norm": 3.259913921356201, "learning_rate": 8.14852394066773e-07, "loss": 0.3593, "step": 3832 }, { "epoch": 0.1851959221143161, "grad_norm": 5.63586950302124, "learning_rate": 8.148040778856839e-07, "loss": 0.3069, "step": 3833 }, { "epoch": 0.18524423829540512, "grad_norm": 2.8375084400177, "learning_rate": 8.147557617045948e-07, "loss": 0.3973, "step": 3834 }, { "epoch": 0.18529255447649418, "grad_norm": 3.4204976558685303, "learning_rate": 8.147074455235058e-07, "loss": 0.238, "step": 3835 }, { "epoch": 0.18534087065758323, "grad_norm": 1.828518033027649, "learning_rate": 8.146591293424168e-07, "loss": 0.1803, "step": 3836 }, { "epoch": 0.18538918683867228, "grad_norm": 2.7444427013397217, "learning_rate": 8.146108131613277e-07, "loss": 0.3361, "step": 3837 }, { "epoch": 0.1854375030197613, "grad_norm": 1.8375440835952759, "learning_rate": 8.145624969802386e-07, "loss": 0.1782, "step": 3838 }, { "epoch": 0.18548581920085036, "grad_norm": 36.45734786987305, "learning_rate": 8.145141807991496e-07, "loss": 0.3853, "step": 3839 }, { "epoch": 0.1855341353819394, "grad_norm": 2.1215085983276367, "learning_rate": 8.144658646180605e-07, "loss": 0.2432, "step": 3840 }, { "epoch": 0.18558245156302847, "grad_norm": 4.138357639312744, "learning_rate": 8.144175484369715e-07, "loss": 0.2768, "step": 3841 }, { "epoch": 0.1856307677441175, "grad_norm": 9.148626327514648, "learning_rate": 8.143692322558825e-07, "loss": 0.308, "step": 3842 }, { "epoch": 0.18567908392520655, "grad_norm": 3.7120230197906494, "learning_rate": 8.143209160747934e-07, "loss": 0.3768, "step": 3843 }, { "epoch": 0.1857274001062956, "grad_norm": 3.7860896587371826, "learning_rate": 8.142725998937044e-07, "loss": 0.3575, "step": 3844 }, { "epoch": 0.18577571628738465, "grad_norm": 3.1409389972686768, "learning_rate": 8.142242837126153e-07, "loss": 0.3808, "step": 3845 }, { "epoch": 0.1858240324684737, "grad_norm": 2.619575023651123, "learning_rate": 8.141759675315262e-07, "loss": 0.3505, "step": 3846 }, { "epoch": 0.18587234864956273, "grad_norm": 4.643584728240967, "learning_rate": 8.141276513504372e-07, "loss": 0.3001, "step": 3847 }, { "epoch": 0.18592066483065178, "grad_norm": 2.7304859161376953, "learning_rate": 8.140793351693481e-07, "loss": 0.3446, "step": 3848 }, { "epoch": 0.18596898101174084, "grad_norm": 2.0849015712738037, "learning_rate": 8.140310189882591e-07, "loss": 0.2387, "step": 3849 }, { "epoch": 0.1860172971928299, "grad_norm": 2.0498147010803223, "learning_rate": 8.139827028071701e-07, "loss": 0.2111, "step": 3850 }, { "epoch": 0.18606561337391891, "grad_norm": 1.8439288139343262, "learning_rate": 8.139343866260811e-07, "loss": 0.1862, "step": 3851 }, { "epoch": 0.18611392955500797, "grad_norm": 2.6429827213287354, "learning_rate": 8.138860704449921e-07, "loss": 0.3261, "step": 3852 }, { "epoch": 0.18616224573609702, "grad_norm": 1.5493288040161133, "learning_rate": 8.13837754263903e-07, "loss": 0.1722, "step": 3853 }, { "epoch": 0.18621056191718607, "grad_norm": 3.066776990890503, "learning_rate": 8.137894380828138e-07, "loss": 0.4077, "step": 3854 }, { "epoch": 0.1862588780982751, "grad_norm": 2.336791515350342, "learning_rate": 8.137411219017248e-07, "loss": 0.2401, "step": 3855 }, { "epoch": 0.18630719427936415, "grad_norm": 3.280344009399414, "learning_rate": 8.136928057206358e-07, "loss": 0.2975, "step": 3856 }, { "epoch": 0.1863555104604532, "grad_norm": 5.174849510192871, "learning_rate": 8.136444895395468e-07, "loss": 0.4542, "step": 3857 }, { "epoch": 0.18640382664154226, "grad_norm": 3.354407548904419, "learning_rate": 8.135961733584578e-07, "loss": 0.2868, "step": 3858 }, { "epoch": 0.1864521428226313, "grad_norm": 2.6198067665100098, "learning_rate": 8.135478571773686e-07, "loss": 0.3519, "step": 3859 }, { "epoch": 0.18650045900372034, "grad_norm": 2.2125964164733887, "learning_rate": 8.134995409962796e-07, "loss": 0.259, "step": 3860 }, { "epoch": 0.1865487751848094, "grad_norm": 2.981527328491211, "learning_rate": 8.134512248151906e-07, "loss": 0.2746, "step": 3861 }, { "epoch": 0.18659709136589844, "grad_norm": 2.944234848022461, "learning_rate": 8.134029086341015e-07, "loss": 0.3607, "step": 3862 }, { "epoch": 0.1866454075469875, "grad_norm": 2.9163389205932617, "learning_rate": 8.133545924530125e-07, "loss": 0.3064, "step": 3863 }, { "epoch": 0.18669372372807652, "grad_norm": 2.728701114654541, "learning_rate": 8.133062762719234e-07, "loss": 0.3064, "step": 3864 }, { "epoch": 0.18674203990916557, "grad_norm": 2.9146676063537598, "learning_rate": 8.132579600908344e-07, "loss": 0.3873, "step": 3865 }, { "epoch": 0.18679035609025463, "grad_norm": 2.7848122119903564, "learning_rate": 8.132096439097453e-07, "loss": 0.2866, "step": 3866 }, { "epoch": 0.18683867227134368, "grad_norm": 2.2393758296966553, "learning_rate": 8.131613277286563e-07, "loss": 0.28, "step": 3867 }, { "epoch": 0.1868869884524327, "grad_norm": 3.5756170749664307, "learning_rate": 8.131130115475673e-07, "loss": 0.2596, "step": 3868 }, { "epoch": 0.18693530463352176, "grad_norm": 3.3204026222229004, "learning_rate": 8.130646953664782e-07, "loss": 0.303, "step": 3869 }, { "epoch": 0.1869836208146108, "grad_norm": 12.377497673034668, "learning_rate": 8.130163791853892e-07, "loss": 0.3725, "step": 3870 }, { "epoch": 0.18703193699569987, "grad_norm": 6.310421943664551, "learning_rate": 8.129680630043001e-07, "loss": 0.3546, "step": 3871 }, { "epoch": 0.18708025317678892, "grad_norm": 3.3375165462493896, "learning_rate": 8.12919746823211e-07, "loss": 0.3495, "step": 3872 }, { "epoch": 0.18712856935787794, "grad_norm": 2.3325183391571045, "learning_rate": 8.12871430642122e-07, "loss": 0.3617, "step": 3873 }, { "epoch": 0.187176885538967, "grad_norm": 1.8346244096755981, "learning_rate": 8.128231144610329e-07, "loss": 0.1619, "step": 3874 }, { "epoch": 0.18722520172005605, "grad_norm": 2.8127269744873047, "learning_rate": 8.127747982799439e-07, "loss": 0.3135, "step": 3875 }, { "epoch": 0.1872735179011451, "grad_norm": 1.8782644271850586, "learning_rate": 8.127264820988549e-07, "loss": 0.2097, "step": 3876 }, { "epoch": 0.18732183408223413, "grad_norm": 2.4143035411834717, "learning_rate": 8.126781659177659e-07, "loss": 0.2322, "step": 3877 }, { "epoch": 0.18737015026332318, "grad_norm": 4.1126227378845215, "learning_rate": 8.126298497366768e-07, "loss": 0.3334, "step": 3878 }, { "epoch": 0.18741846644441224, "grad_norm": 11.848310470581055, "learning_rate": 8.125815335555877e-07, "loss": 0.2759, "step": 3879 }, { "epoch": 0.1874667826255013, "grad_norm": 2.582338809967041, "learning_rate": 8.125332173744986e-07, "loss": 0.3245, "step": 3880 }, { "epoch": 0.1875150988065903, "grad_norm": 2.912148952484131, "learning_rate": 8.124849011934096e-07, "loss": 0.3787, "step": 3881 }, { "epoch": 0.18756341498767937, "grad_norm": 3.3503589630126953, "learning_rate": 8.124365850123206e-07, "loss": 0.3527, "step": 3882 }, { "epoch": 0.18761173116876842, "grad_norm": 3.044705629348755, "learning_rate": 8.123882688312316e-07, "loss": 0.4632, "step": 3883 }, { "epoch": 0.18766004734985747, "grad_norm": 4.116808891296387, "learning_rate": 8.123399526501426e-07, "loss": 0.324, "step": 3884 }, { "epoch": 0.18770836353094653, "grad_norm": 2.0373551845550537, "learning_rate": 8.122916364690534e-07, "loss": 0.2347, "step": 3885 }, { "epoch": 0.18775667971203555, "grad_norm": 2.5771472454071045, "learning_rate": 8.122433202879644e-07, "loss": 0.1965, "step": 3886 }, { "epoch": 0.1878049958931246, "grad_norm": 1.9466828107833862, "learning_rate": 8.121950041068754e-07, "loss": 0.2188, "step": 3887 }, { "epoch": 0.18785331207421366, "grad_norm": 3.0491466522216797, "learning_rate": 8.121466879257863e-07, "loss": 0.2749, "step": 3888 }, { "epoch": 0.1879016282553027, "grad_norm": 2.5973479747772217, "learning_rate": 8.120983717446973e-07, "loss": 0.3088, "step": 3889 }, { "epoch": 0.18794994443639174, "grad_norm": 1.9487404823303223, "learning_rate": 8.120500555636082e-07, "loss": 0.2692, "step": 3890 }, { "epoch": 0.1879982606174808, "grad_norm": 2.5827503204345703, "learning_rate": 8.120017393825191e-07, "loss": 0.2699, "step": 3891 }, { "epoch": 0.18804657679856984, "grad_norm": 4.638366222381592, "learning_rate": 8.119534232014301e-07, "loss": 0.2424, "step": 3892 }, { "epoch": 0.1880948929796589, "grad_norm": 5.020407199859619, "learning_rate": 8.119051070203411e-07, "loss": 0.3164, "step": 3893 }, { "epoch": 0.18814320916074792, "grad_norm": 6.923037528991699, "learning_rate": 8.118567908392521e-07, "loss": 0.4181, "step": 3894 }, { "epoch": 0.18819152534183697, "grad_norm": 2.4897756576538086, "learning_rate": 8.11808474658163e-07, "loss": 0.1762, "step": 3895 }, { "epoch": 0.18823984152292603, "grad_norm": 5.588736534118652, "learning_rate": 8.117601584770739e-07, "loss": 0.2946, "step": 3896 }, { "epoch": 0.18828815770401508, "grad_norm": 3.0829780101776123, "learning_rate": 8.117118422959849e-07, "loss": 0.2209, "step": 3897 }, { "epoch": 0.18833647388510413, "grad_norm": 3.4161221981048584, "learning_rate": 8.116635261148958e-07, "loss": 0.2147, "step": 3898 }, { "epoch": 0.18838479006619316, "grad_norm": 5.076138019561768, "learning_rate": 8.116152099338068e-07, "loss": 0.2904, "step": 3899 }, { "epoch": 0.1884331062472822, "grad_norm": 22.422632217407227, "learning_rate": 8.115668937527177e-07, "loss": 0.2481, "step": 3900 }, { "epoch": 0.18848142242837126, "grad_norm": 3.9743540287017822, "learning_rate": 8.115185775716287e-07, "loss": 0.3734, "step": 3901 }, { "epoch": 0.18852973860946032, "grad_norm": 2.8130948543548584, "learning_rate": 8.114702613905397e-07, "loss": 0.2822, "step": 3902 }, { "epoch": 0.18857805479054934, "grad_norm": 14.043756484985352, "learning_rate": 8.114219452094507e-07, "loss": 0.3065, "step": 3903 }, { "epoch": 0.1886263709716384, "grad_norm": 1.9451779127120972, "learning_rate": 8.113736290283615e-07, "loss": 0.2405, "step": 3904 }, { "epoch": 0.18867468715272745, "grad_norm": 9.896947860717773, "learning_rate": 8.113253128472725e-07, "loss": 0.2607, "step": 3905 }, { "epoch": 0.1887230033338165, "grad_norm": 2.316255569458008, "learning_rate": 8.112769966661834e-07, "loss": 0.2392, "step": 3906 }, { "epoch": 0.18877131951490556, "grad_norm": 3.648391008377075, "learning_rate": 8.112286804850944e-07, "loss": 0.3948, "step": 3907 }, { "epoch": 0.18881963569599458, "grad_norm": 4.684813499450684, "learning_rate": 8.111803643040054e-07, "loss": 0.3314, "step": 3908 }, { "epoch": 0.18886795187708363, "grad_norm": 2.319755792617798, "learning_rate": 8.111320481229164e-07, "loss": 0.2473, "step": 3909 }, { "epoch": 0.1889162680581727, "grad_norm": 3.4630112648010254, "learning_rate": 8.110837319418274e-07, "loss": 0.2964, "step": 3910 }, { "epoch": 0.18896458423926174, "grad_norm": 3.4874520301818848, "learning_rate": 8.110354157607382e-07, "loss": 0.3619, "step": 3911 }, { "epoch": 0.18901290042035077, "grad_norm": 2.199207305908203, "learning_rate": 8.109870995796492e-07, "loss": 0.2476, "step": 3912 }, { "epoch": 0.18906121660143982, "grad_norm": 2.2900941371917725, "learning_rate": 8.109387833985601e-07, "loss": 0.2597, "step": 3913 }, { "epoch": 0.18910953278252887, "grad_norm": 12.491915702819824, "learning_rate": 8.108904672174711e-07, "loss": 0.3385, "step": 3914 }, { "epoch": 0.18915784896361792, "grad_norm": 2.790071487426758, "learning_rate": 8.108421510363821e-07, "loss": 0.3218, "step": 3915 }, { "epoch": 0.18920616514470695, "grad_norm": 2.2816803455352783, "learning_rate": 8.10793834855293e-07, "loss": 0.3131, "step": 3916 }, { "epoch": 0.189254481325796, "grad_norm": 2.2402825355529785, "learning_rate": 8.107455186742039e-07, "loss": 0.2237, "step": 3917 }, { "epoch": 0.18930279750688506, "grad_norm": 2.421755075454712, "learning_rate": 8.106972024931149e-07, "loss": 0.3166, "step": 3918 }, { "epoch": 0.1893511136879741, "grad_norm": 2.263807773590088, "learning_rate": 8.106488863120259e-07, "loss": 0.2605, "step": 3919 }, { "epoch": 0.18939942986906316, "grad_norm": 1.4248144626617432, "learning_rate": 8.106005701309369e-07, "loss": 0.1344, "step": 3920 }, { "epoch": 0.1894477460501522, "grad_norm": 3.147932767868042, "learning_rate": 8.105522539498477e-07, "loss": 0.3125, "step": 3921 }, { "epoch": 0.18949606223124124, "grad_norm": 2.976414680480957, "learning_rate": 8.105039377687587e-07, "loss": 0.3494, "step": 3922 }, { "epoch": 0.1895443784123303, "grad_norm": 2.50357985496521, "learning_rate": 8.104556215876696e-07, "loss": 0.3264, "step": 3923 }, { "epoch": 0.18959269459341935, "grad_norm": 2.9441516399383545, "learning_rate": 8.104073054065806e-07, "loss": 0.4008, "step": 3924 }, { "epoch": 0.18964101077450837, "grad_norm": 2.496889114379883, "learning_rate": 8.103589892254916e-07, "loss": 0.3459, "step": 3925 }, { "epoch": 0.18968932695559743, "grad_norm": 2.6000311374664307, "learning_rate": 8.103106730444025e-07, "loss": 0.2643, "step": 3926 }, { "epoch": 0.18973764313668648, "grad_norm": 2.745816469192505, "learning_rate": 8.102623568633135e-07, "loss": 0.3318, "step": 3927 }, { "epoch": 0.18978595931777553, "grad_norm": 2.784261465072632, "learning_rate": 8.102140406822245e-07, "loss": 0.2429, "step": 3928 }, { "epoch": 0.18983427549886456, "grad_norm": 10.795702934265137, "learning_rate": 8.101657245011355e-07, "loss": 0.2416, "step": 3929 }, { "epoch": 0.1898825916799536, "grad_norm": 2.5503194332122803, "learning_rate": 8.101174083200463e-07, "loss": 0.3111, "step": 3930 }, { "epoch": 0.18993090786104266, "grad_norm": 2.642799139022827, "learning_rate": 8.100690921389573e-07, "loss": 0.3518, "step": 3931 }, { "epoch": 0.18997922404213172, "grad_norm": 4.079859733581543, "learning_rate": 8.100207759578682e-07, "loss": 0.3966, "step": 3932 }, { "epoch": 0.19002754022322077, "grad_norm": 2.1498734951019287, "learning_rate": 8.099724597767792e-07, "loss": 0.2467, "step": 3933 }, { "epoch": 0.1900758564043098, "grad_norm": 4.960547924041748, "learning_rate": 8.099241435956902e-07, "loss": 0.2888, "step": 3934 }, { "epoch": 0.19012417258539885, "grad_norm": 21.671703338623047, "learning_rate": 8.098758274146012e-07, "loss": 0.2739, "step": 3935 }, { "epoch": 0.1901724887664879, "grad_norm": 3.1444151401519775, "learning_rate": 8.098275112335121e-07, "loss": 0.4533, "step": 3936 }, { "epoch": 0.19022080494757695, "grad_norm": 3.097363233566284, "learning_rate": 8.09779195052423e-07, "loss": 0.3682, "step": 3937 }, { "epoch": 0.19026912112866598, "grad_norm": 2.4999523162841797, "learning_rate": 8.097308788713339e-07, "loss": 0.2793, "step": 3938 }, { "epoch": 0.19031743730975503, "grad_norm": 3.12239933013916, "learning_rate": 8.096825626902449e-07, "loss": 0.2523, "step": 3939 }, { "epoch": 0.1903657534908441, "grad_norm": 2.129570722579956, "learning_rate": 8.096342465091559e-07, "loss": 0.2631, "step": 3940 }, { "epoch": 0.19041406967193314, "grad_norm": 2.4462616443634033, "learning_rate": 8.095859303280669e-07, "loss": 0.2868, "step": 3941 }, { "epoch": 0.19046238585302216, "grad_norm": 2.40183424949646, "learning_rate": 8.095376141469777e-07, "loss": 0.3334, "step": 3942 }, { "epoch": 0.19051070203411122, "grad_norm": 2.4485535621643066, "learning_rate": 8.094892979658887e-07, "loss": 0.3039, "step": 3943 }, { "epoch": 0.19055901821520027, "grad_norm": 2.860844850540161, "learning_rate": 8.094409817847997e-07, "loss": 0.2675, "step": 3944 }, { "epoch": 0.19060733439628932, "grad_norm": 2.669794797897339, "learning_rate": 8.093926656037107e-07, "loss": 0.2577, "step": 3945 }, { "epoch": 0.19065565057737838, "grad_norm": 3.1495537757873535, "learning_rate": 8.093443494226217e-07, "loss": 0.2407, "step": 3946 }, { "epoch": 0.1907039667584674, "grad_norm": 2.1536672115325928, "learning_rate": 8.092960332415325e-07, "loss": 0.2683, "step": 3947 }, { "epoch": 0.19075228293955646, "grad_norm": 8.048828125, "learning_rate": 8.092477170604435e-07, "loss": 0.2709, "step": 3948 }, { "epoch": 0.1908005991206455, "grad_norm": 3.1546382904052734, "learning_rate": 8.091994008793544e-07, "loss": 0.1865, "step": 3949 }, { "epoch": 0.19084891530173456, "grad_norm": 2.0276498794555664, "learning_rate": 8.091510846982654e-07, "loss": 0.2339, "step": 3950 }, { "epoch": 0.1908972314828236, "grad_norm": 2.9289205074310303, "learning_rate": 8.091027685171764e-07, "loss": 0.3231, "step": 3951 }, { "epoch": 0.19094554766391264, "grad_norm": 2.9615161418914795, "learning_rate": 8.090544523360873e-07, "loss": 0.4434, "step": 3952 }, { "epoch": 0.1909938638450017, "grad_norm": 5.34763765335083, "learning_rate": 8.090061361549983e-07, "loss": 0.2568, "step": 3953 }, { "epoch": 0.19104218002609075, "grad_norm": 2.388338088989258, "learning_rate": 8.089578199739093e-07, "loss": 0.2372, "step": 3954 }, { "epoch": 0.19109049620717977, "grad_norm": 3.019672155380249, "learning_rate": 8.089095037928201e-07, "loss": 0.3641, "step": 3955 }, { "epoch": 0.19113881238826883, "grad_norm": 2.823214530944824, "learning_rate": 8.088611876117311e-07, "loss": 0.3047, "step": 3956 }, { "epoch": 0.19118712856935788, "grad_norm": 2.830018997192383, "learning_rate": 8.088128714306421e-07, "loss": 0.3324, "step": 3957 }, { "epoch": 0.19123544475044693, "grad_norm": 3.4844419956207275, "learning_rate": 8.08764555249553e-07, "loss": 0.2711, "step": 3958 }, { "epoch": 0.19128376093153598, "grad_norm": 1.2106083631515503, "learning_rate": 8.08716239068464e-07, "loss": 0.1313, "step": 3959 }, { "epoch": 0.191332077112625, "grad_norm": 3.5906808376312256, "learning_rate": 8.08667922887375e-07, "loss": 0.4241, "step": 3960 }, { "epoch": 0.19138039329371406, "grad_norm": 3.543591022491455, "learning_rate": 8.08619606706286e-07, "loss": 0.3518, "step": 3961 }, { "epoch": 0.19142870947480312, "grad_norm": 3.932621479034424, "learning_rate": 8.085712905251969e-07, "loss": 0.2824, "step": 3962 }, { "epoch": 0.19147702565589217, "grad_norm": 25.35531997680664, "learning_rate": 8.085229743441077e-07, "loss": 0.2578, "step": 3963 }, { "epoch": 0.1915253418369812, "grad_norm": 6.107210636138916, "learning_rate": 8.084746581630187e-07, "loss": 0.341, "step": 3964 }, { "epoch": 0.19157365801807025, "grad_norm": 2.742774486541748, "learning_rate": 8.084263419819297e-07, "loss": 0.3095, "step": 3965 }, { "epoch": 0.1916219741991593, "grad_norm": 1.8136107921600342, "learning_rate": 8.083780258008407e-07, "loss": 0.1899, "step": 3966 }, { "epoch": 0.19167029038024835, "grad_norm": 2.7414863109588623, "learning_rate": 8.083297096197517e-07, "loss": 0.2384, "step": 3967 }, { "epoch": 0.19171860656133738, "grad_norm": 2.786982774734497, "learning_rate": 8.082813934386625e-07, "loss": 0.2488, "step": 3968 }, { "epoch": 0.19176692274242643, "grad_norm": 2.7765119075775146, "learning_rate": 8.082330772575735e-07, "loss": 0.3736, "step": 3969 }, { "epoch": 0.19181523892351549, "grad_norm": 2.3147103786468506, "learning_rate": 8.081847610764845e-07, "loss": 0.288, "step": 3970 }, { "epoch": 0.19186355510460454, "grad_norm": 3.5414071083068848, "learning_rate": 8.081364448953955e-07, "loss": 0.4889, "step": 3971 }, { "epoch": 0.1919118712856936, "grad_norm": 2.669316053390503, "learning_rate": 8.080881287143064e-07, "loss": 0.328, "step": 3972 }, { "epoch": 0.19196018746678262, "grad_norm": 3.458109140396118, "learning_rate": 8.080398125332173e-07, "loss": 0.4183, "step": 3973 }, { "epoch": 0.19200850364787167, "grad_norm": 2.8641843795776367, "learning_rate": 8.079914963521282e-07, "loss": 0.3651, "step": 3974 }, { "epoch": 0.19205681982896072, "grad_norm": 1.9886752367019653, "learning_rate": 8.079431801710392e-07, "loss": 0.2386, "step": 3975 }, { "epoch": 0.19210513601004978, "grad_norm": 4.641997337341309, "learning_rate": 8.078948639899502e-07, "loss": 0.4734, "step": 3976 }, { "epoch": 0.1921534521911388, "grad_norm": 3.7657976150512695, "learning_rate": 8.078465478088612e-07, "loss": 0.3973, "step": 3977 }, { "epoch": 0.19220176837222785, "grad_norm": 4.39698600769043, "learning_rate": 8.077982316277721e-07, "loss": 0.3686, "step": 3978 }, { "epoch": 0.1922500845533169, "grad_norm": 2.202793598175049, "learning_rate": 8.077499154466831e-07, "loss": 0.2875, "step": 3979 }, { "epoch": 0.19229840073440596, "grad_norm": 6.231393814086914, "learning_rate": 8.07701599265594e-07, "loss": 0.3087, "step": 3980 }, { "epoch": 0.192346716915495, "grad_norm": 2.840514898300171, "learning_rate": 8.076532830845049e-07, "loss": 0.2596, "step": 3981 }, { "epoch": 0.19239503309658404, "grad_norm": 4.170706748962402, "learning_rate": 8.076049669034159e-07, "loss": 0.2424, "step": 3982 }, { "epoch": 0.1924433492776731, "grad_norm": 2.878962516784668, "learning_rate": 8.075566507223268e-07, "loss": 0.3381, "step": 3983 }, { "epoch": 0.19249166545876215, "grad_norm": 1.7398957014083862, "learning_rate": 8.075083345412378e-07, "loss": 0.2302, "step": 3984 }, { "epoch": 0.1925399816398512, "grad_norm": 2.8752079010009766, "learning_rate": 8.074600183601488e-07, "loss": 0.3598, "step": 3985 }, { "epoch": 0.19258829782094022, "grad_norm": 2.271967887878418, "learning_rate": 8.074117021790598e-07, "loss": 0.2351, "step": 3986 }, { "epoch": 0.19263661400202928, "grad_norm": 3.3061270713806152, "learning_rate": 8.073633859979707e-07, "loss": 0.4364, "step": 3987 }, { "epoch": 0.19268493018311833, "grad_norm": 1.9467215538024902, "learning_rate": 8.073150698168817e-07, "loss": 0.2358, "step": 3988 }, { "epoch": 0.19273324636420738, "grad_norm": 2.7534852027893066, "learning_rate": 8.072667536357925e-07, "loss": 0.3553, "step": 3989 }, { "epoch": 0.1927815625452964, "grad_norm": 3.2521121501922607, "learning_rate": 8.072184374547035e-07, "loss": 0.508, "step": 3990 }, { "epoch": 0.19282987872638546, "grad_norm": 1.9318352937698364, "learning_rate": 8.071701212736145e-07, "loss": 0.2296, "step": 3991 }, { "epoch": 0.19287819490747452, "grad_norm": 1.761191725730896, "learning_rate": 8.071218050925255e-07, "loss": 0.1777, "step": 3992 }, { "epoch": 0.19292651108856357, "grad_norm": 2.268031597137451, "learning_rate": 8.070734889114365e-07, "loss": 0.2818, "step": 3993 }, { "epoch": 0.1929748272696526, "grad_norm": 2.734189748764038, "learning_rate": 8.070251727303473e-07, "loss": 0.2846, "step": 3994 }, { "epoch": 0.19302314345074165, "grad_norm": 2.6798033714294434, "learning_rate": 8.069768565492583e-07, "loss": 0.3408, "step": 3995 }, { "epoch": 0.1930714596318307, "grad_norm": 2.6505062580108643, "learning_rate": 8.069285403681693e-07, "loss": 0.2595, "step": 3996 }, { "epoch": 0.19311977581291975, "grad_norm": 6.569301605224609, "learning_rate": 8.068802241870803e-07, "loss": 0.3541, "step": 3997 }, { "epoch": 0.1931680919940088, "grad_norm": 5.9799604415893555, "learning_rate": 8.068319080059912e-07, "loss": 0.3158, "step": 3998 }, { "epoch": 0.19321640817509783, "grad_norm": 6.410613059997559, "learning_rate": 8.067835918249021e-07, "loss": 0.3218, "step": 3999 }, { "epoch": 0.19326472435618688, "grad_norm": 2.3991594314575195, "learning_rate": 8.06735275643813e-07, "loss": 0.2481, "step": 4000 }, { "epoch": 0.19331304053727594, "grad_norm": 2.7579541206359863, "learning_rate": 8.06686959462724e-07, "loss": 0.3671, "step": 4001 }, { "epoch": 0.193361356718365, "grad_norm": 5.3524675369262695, "learning_rate": 8.06638643281635e-07, "loss": 0.4223, "step": 4002 }, { "epoch": 0.19340967289945402, "grad_norm": 3.2621665000915527, "learning_rate": 8.06590327100546e-07, "loss": 0.419, "step": 4003 }, { "epoch": 0.19345798908054307, "grad_norm": 2.760573625564575, "learning_rate": 8.065420109194569e-07, "loss": 0.431, "step": 4004 }, { "epoch": 0.19350630526163212, "grad_norm": 2.5730650424957275, "learning_rate": 8.064936947383679e-07, "loss": 0.3476, "step": 4005 }, { "epoch": 0.19355462144272118, "grad_norm": 1.924336552619934, "learning_rate": 8.064453785572787e-07, "loss": 0.1914, "step": 4006 }, { "epoch": 0.1936029376238102, "grad_norm": 5.693231105804443, "learning_rate": 8.063970623761897e-07, "loss": 0.4121, "step": 4007 }, { "epoch": 0.19365125380489925, "grad_norm": 2.7283406257629395, "learning_rate": 8.063487461951007e-07, "loss": 0.3813, "step": 4008 }, { "epoch": 0.1936995699859883, "grad_norm": 2.225785970687866, "learning_rate": 8.063004300140116e-07, "loss": 0.2681, "step": 4009 }, { "epoch": 0.19374788616707736, "grad_norm": 1.7643157243728638, "learning_rate": 8.062521138329226e-07, "loss": 0.2025, "step": 4010 }, { "epoch": 0.1937962023481664, "grad_norm": 2.1328954696655273, "learning_rate": 8.062037976518336e-07, "loss": 0.2302, "step": 4011 }, { "epoch": 0.19384451852925544, "grad_norm": 3.809739112854004, "learning_rate": 8.061554814707446e-07, "loss": 0.4243, "step": 4012 }, { "epoch": 0.1938928347103445, "grad_norm": 3.6961491107940674, "learning_rate": 8.061071652896555e-07, "loss": 0.1914, "step": 4013 }, { "epoch": 0.19394115089143354, "grad_norm": 2.6600167751312256, "learning_rate": 8.060588491085664e-07, "loss": 0.3259, "step": 4014 }, { "epoch": 0.1939894670725226, "grad_norm": 2.4572765827178955, "learning_rate": 8.060105329274773e-07, "loss": 0.1963, "step": 4015 }, { "epoch": 0.19403778325361162, "grad_norm": 2.3060731887817383, "learning_rate": 8.059622167463883e-07, "loss": 0.2625, "step": 4016 }, { "epoch": 0.19408609943470068, "grad_norm": 2.3943369388580322, "learning_rate": 8.059139005652993e-07, "loss": 0.2243, "step": 4017 }, { "epoch": 0.19413441561578973, "grad_norm": 2.1387698650360107, "learning_rate": 8.058655843842103e-07, "loss": 0.2952, "step": 4018 }, { "epoch": 0.19418273179687878, "grad_norm": 2.9871113300323486, "learning_rate": 8.058172682031212e-07, "loss": 0.238, "step": 4019 }, { "epoch": 0.1942310479779678, "grad_norm": 2.6080121994018555, "learning_rate": 8.057689520220321e-07, "loss": 0.2465, "step": 4020 }, { "epoch": 0.19427936415905686, "grad_norm": 4.933740615844727, "learning_rate": 8.057206358409431e-07, "loss": 0.3026, "step": 4021 }, { "epoch": 0.19432768034014591, "grad_norm": 2.93502140045166, "learning_rate": 8.056723196598541e-07, "loss": 0.3529, "step": 4022 }, { "epoch": 0.19437599652123497, "grad_norm": 2.832987070083618, "learning_rate": 8.05624003478765e-07, "loss": 0.4363, "step": 4023 }, { "epoch": 0.19442431270232402, "grad_norm": 2.76434588432312, "learning_rate": 8.05575687297676e-07, "loss": 0.3747, "step": 4024 }, { "epoch": 0.19447262888341305, "grad_norm": 2.250290632247925, "learning_rate": 8.055273711165868e-07, "loss": 0.2549, "step": 4025 }, { "epoch": 0.1945209450645021, "grad_norm": 1.8102089166641235, "learning_rate": 8.054790549354978e-07, "loss": 0.2234, "step": 4026 }, { "epoch": 0.19456926124559115, "grad_norm": 3.262352228164673, "learning_rate": 8.054307387544088e-07, "loss": 0.3177, "step": 4027 }, { "epoch": 0.1946175774266802, "grad_norm": 5.913208484649658, "learning_rate": 8.053824225733198e-07, "loss": 0.2889, "step": 4028 }, { "epoch": 0.19466589360776923, "grad_norm": 3.1054437160491943, "learning_rate": 8.053341063922308e-07, "loss": 0.2242, "step": 4029 }, { "epoch": 0.19471420978885828, "grad_norm": 2.969754695892334, "learning_rate": 8.052857902111417e-07, "loss": 0.3602, "step": 4030 }, { "epoch": 0.19476252596994734, "grad_norm": 2.177321195602417, "learning_rate": 8.052374740300526e-07, "loss": 0.2257, "step": 4031 }, { "epoch": 0.1948108421510364, "grad_norm": 2.0013415813446045, "learning_rate": 8.051891578489635e-07, "loss": 0.2139, "step": 4032 }, { "epoch": 0.19485915833212542, "grad_norm": 3.444725513458252, "learning_rate": 8.051408416678745e-07, "loss": 0.2794, "step": 4033 }, { "epoch": 0.19490747451321447, "grad_norm": 2.43912410736084, "learning_rate": 8.050925254867855e-07, "loss": 0.343, "step": 4034 }, { "epoch": 0.19495579069430352, "grad_norm": 1.3369836807250977, "learning_rate": 8.050442093056964e-07, "loss": 0.1473, "step": 4035 }, { "epoch": 0.19500410687539257, "grad_norm": 2.347452163696289, "learning_rate": 8.049958931246074e-07, "loss": 0.3227, "step": 4036 }, { "epoch": 0.19505242305648163, "grad_norm": 2.4761459827423096, "learning_rate": 8.049475769435184e-07, "loss": 0.3098, "step": 4037 }, { "epoch": 0.19510073923757065, "grad_norm": 2.2550809383392334, "learning_rate": 8.048992607624293e-07, "loss": 0.2481, "step": 4038 }, { "epoch": 0.1951490554186597, "grad_norm": 2.4123661518096924, "learning_rate": 8.048509445813403e-07, "loss": 0.3239, "step": 4039 }, { "epoch": 0.19519737159974876, "grad_norm": 2.4132378101348877, "learning_rate": 8.048026284002512e-07, "loss": 0.2906, "step": 4040 }, { "epoch": 0.1952456877808378, "grad_norm": 2.314948797225952, "learning_rate": 8.047543122191621e-07, "loss": 0.3136, "step": 4041 }, { "epoch": 0.19529400396192684, "grad_norm": 3.4355826377868652, "learning_rate": 8.047059960380731e-07, "loss": 0.3657, "step": 4042 }, { "epoch": 0.1953423201430159, "grad_norm": 2.517277717590332, "learning_rate": 8.046576798569841e-07, "loss": 0.2806, "step": 4043 }, { "epoch": 0.19539063632410494, "grad_norm": 3.021481513977051, "learning_rate": 8.046093636758951e-07, "loss": 0.3636, "step": 4044 }, { "epoch": 0.195438952505194, "grad_norm": 12.098498344421387, "learning_rate": 8.04561047494806e-07, "loss": 0.348, "step": 4045 }, { "epoch": 0.19548726868628305, "grad_norm": 1.9363101720809937, "learning_rate": 8.045127313137169e-07, "loss": 0.1934, "step": 4046 }, { "epoch": 0.19553558486737208, "grad_norm": 2.9742002487182617, "learning_rate": 8.044644151326279e-07, "loss": 0.2935, "step": 4047 }, { "epoch": 0.19558390104846113, "grad_norm": 3.374547243118286, "learning_rate": 8.044160989515388e-07, "loss": 0.2144, "step": 4048 }, { "epoch": 0.19563221722955018, "grad_norm": 1.9629491567611694, "learning_rate": 8.043677827704498e-07, "loss": 0.1785, "step": 4049 }, { "epoch": 0.19568053341063923, "grad_norm": 2.321045398712158, "learning_rate": 8.043194665893608e-07, "loss": 0.2983, "step": 4050 }, { "epoch": 0.19572884959172826, "grad_norm": 3.3766796588897705, "learning_rate": 8.042711504082716e-07, "loss": 0.3358, "step": 4051 }, { "epoch": 0.1957771657728173, "grad_norm": 3.330103874206543, "learning_rate": 8.042228342271826e-07, "loss": 0.2687, "step": 4052 }, { "epoch": 0.19582548195390637, "grad_norm": 2.650411367416382, "learning_rate": 8.041745180460936e-07, "loss": 0.3289, "step": 4053 }, { "epoch": 0.19587379813499542, "grad_norm": 2.959775447845459, "learning_rate": 8.041262018650046e-07, "loss": 0.354, "step": 4054 }, { "epoch": 0.19592211431608444, "grad_norm": 2.5682475566864014, "learning_rate": 8.040778856839156e-07, "loss": 0.383, "step": 4055 }, { "epoch": 0.1959704304971735, "grad_norm": 10.22612476348877, "learning_rate": 8.040295695028265e-07, "loss": 0.3071, "step": 4056 }, { "epoch": 0.19601874667826255, "grad_norm": 2.937117099761963, "learning_rate": 8.039812533217373e-07, "loss": 0.3623, "step": 4057 }, { "epoch": 0.1960670628593516, "grad_norm": 2.7710509300231934, "learning_rate": 8.039329371406483e-07, "loss": 0.4073, "step": 4058 }, { "epoch": 0.19611537904044066, "grad_norm": 2.7498931884765625, "learning_rate": 8.038846209595593e-07, "loss": 0.4136, "step": 4059 }, { "epoch": 0.19616369522152968, "grad_norm": 3.322798252105713, "learning_rate": 8.038363047784703e-07, "loss": 0.284, "step": 4060 }, { "epoch": 0.19621201140261874, "grad_norm": 2.3914055824279785, "learning_rate": 8.037879885973812e-07, "loss": 0.2815, "step": 4061 }, { "epoch": 0.1962603275837078, "grad_norm": 2.85652232170105, "learning_rate": 8.037396724162922e-07, "loss": 0.3912, "step": 4062 }, { "epoch": 0.19630864376479684, "grad_norm": 5.4452409744262695, "learning_rate": 8.036913562352032e-07, "loss": 0.489, "step": 4063 }, { "epoch": 0.19635695994588587, "grad_norm": 2.8278541564941406, "learning_rate": 8.036430400541141e-07, "loss": 0.3189, "step": 4064 }, { "epoch": 0.19640527612697492, "grad_norm": 2.4801008701324463, "learning_rate": 8.03594723873025e-07, "loss": 0.3686, "step": 4065 }, { "epoch": 0.19645359230806397, "grad_norm": 2.247359275817871, "learning_rate": 8.03546407691936e-07, "loss": 0.2492, "step": 4066 }, { "epoch": 0.19650190848915303, "grad_norm": 2.873814344406128, "learning_rate": 8.034980915108469e-07, "loss": 0.3057, "step": 4067 }, { "epoch": 0.19655022467024205, "grad_norm": 5.324544429779053, "learning_rate": 8.034497753297579e-07, "loss": 0.3635, "step": 4068 }, { "epoch": 0.1965985408513311, "grad_norm": 2.136225461959839, "learning_rate": 8.034014591486689e-07, "loss": 0.3103, "step": 4069 }, { "epoch": 0.19664685703242016, "grad_norm": 2.434475898742676, "learning_rate": 8.033531429675798e-07, "loss": 0.2352, "step": 4070 }, { "epoch": 0.1966951732135092, "grad_norm": 3.1897668838500977, "learning_rate": 8.033048267864908e-07, "loss": 0.341, "step": 4071 }, { "epoch": 0.19674348939459826, "grad_norm": 2.47025990486145, "learning_rate": 8.032565106054017e-07, "loss": 0.3335, "step": 4072 }, { "epoch": 0.1967918055756873, "grad_norm": 2.493779182434082, "learning_rate": 8.032081944243126e-07, "loss": 0.322, "step": 4073 }, { "epoch": 0.19684012175677634, "grad_norm": 2.1107447147369385, "learning_rate": 8.031598782432236e-07, "loss": 0.1945, "step": 4074 }, { "epoch": 0.1968884379378654, "grad_norm": 2.76151180267334, "learning_rate": 8.031115620621346e-07, "loss": 0.3786, "step": 4075 }, { "epoch": 0.19693675411895445, "grad_norm": 6.080319881439209, "learning_rate": 8.030632458810456e-07, "loss": 0.2297, "step": 4076 }, { "epoch": 0.19698507030004347, "grad_norm": 3.681563377380371, "learning_rate": 8.030149296999564e-07, "loss": 0.2908, "step": 4077 }, { "epoch": 0.19703338648113253, "grad_norm": 3.080543041229248, "learning_rate": 8.029666135188674e-07, "loss": 0.3465, "step": 4078 }, { "epoch": 0.19708170266222158, "grad_norm": 3.530881881713867, "learning_rate": 8.029182973377784e-07, "loss": 0.2512, "step": 4079 }, { "epoch": 0.19713001884331063, "grad_norm": 3.034637212753296, "learning_rate": 8.028699811566894e-07, "loss": 0.2934, "step": 4080 }, { "epoch": 0.19717833502439966, "grad_norm": 2.3592422008514404, "learning_rate": 8.028216649756004e-07, "loss": 0.2202, "step": 4081 }, { "epoch": 0.1972266512054887, "grad_norm": 2.5814309120178223, "learning_rate": 8.027733487945112e-07, "loss": 0.3021, "step": 4082 }, { "epoch": 0.19727496738657777, "grad_norm": 1.9380898475646973, "learning_rate": 8.027250326134221e-07, "loss": 0.2119, "step": 4083 }, { "epoch": 0.19732328356766682, "grad_norm": 3.77504301071167, "learning_rate": 8.026767164323331e-07, "loss": 0.3784, "step": 4084 }, { "epoch": 0.19737159974875587, "grad_norm": 2.7984864711761475, "learning_rate": 8.026284002512441e-07, "loss": 0.3541, "step": 4085 }, { "epoch": 0.1974199159298449, "grad_norm": 2.351895809173584, "learning_rate": 8.025800840701551e-07, "loss": 0.2927, "step": 4086 }, { "epoch": 0.19746823211093395, "grad_norm": 4.906167030334473, "learning_rate": 8.02531767889066e-07, "loss": 0.4205, "step": 4087 }, { "epoch": 0.197516548292023, "grad_norm": 2.1401686668395996, "learning_rate": 8.02483451707977e-07, "loss": 0.2314, "step": 4088 }, { "epoch": 0.19756486447311206, "grad_norm": 2.244384527206421, "learning_rate": 8.024351355268879e-07, "loss": 0.3185, "step": 4089 }, { "epoch": 0.19761318065420108, "grad_norm": 2.343316078186035, "learning_rate": 8.023868193457988e-07, "loss": 0.3173, "step": 4090 }, { "epoch": 0.19766149683529013, "grad_norm": 3.1821963787078857, "learning_rate": 8.023385031647098e-07, "loss": 0.3935, "step": 4091 }, { "epoch": 0.1977098130163792, "grad_norm": 2.226881980895996, "learning_rate": 8.022901869836208e-07, "loss": 0.2443, "step": 4092 }, { "epoch": 0.19775812919746824, "grad_norm": 3.384000778198242, "learning_rate": 8.022418708025317e-07, "loss": 0.3127, "step": 4093 }, { "epoch": 0.19780644537855727, "grad_norm": 2.448991537094116, "learning_rate": 8.021935546214427e-07, "loss": 0.2925, "step": 4094 }, { "epoch": 0.19785476155964632, "grad_norm": 3.7549808025360107, "learning_rate": 8.021452384403537e-07, "loss": 0.3735, "step": 4095 }, { "epoch": 0.19790307774073537, "grad_norm": 3.0674610137939453, "learning_rate": 8.020969222592646e-07, "loss": 0.2587, "step": 4096 }, { "epoch": 0.19795139392182443, "grad_norm": 2.9382333755493164, "learning_rate": 8.020486060781756e-07, "loss": 0.2564, "step": 4097 }, { "epoch": 0.19799971010291348, "grad_norm": 2.9513161182403564, "learning_rate": 8.020002898970865e-07, "loss": 0.4156, "step": 4098 }, { "epoch": 0.1980480262840025, "grad_norm": 2.5118799209594727, "learning_rate": 8.019519737159974e-07, "loss": 0.2676, "step": 4099 }, { "epoch": 0.19809634246509156, "grad_norm": 3.1826305389404297, "learning_rate": 8.019036575349084e-07, "loss": 0.4955, "step": 4100 }, { "epoch": 0.1981446586461806, "grad_norm": 2.8343288898468018, "learning_rate": 8.018553413538194e-07, "loss": 0.2938, "step": 4101 }, { "epoch": 0.19819297482726966, "grad_norm": 6.887574672698975, "learning_rate": 8.018070251727303e-07, "loss": 0.4357, "step": 4102 }, { "epoch": 0.1982412910083587, "grad_norm": 2.19712495803833, "learning_rate": 8.017587089916412e-07, "loss": 0.2441, "step": 4103 }, { "epoch": 0.19828960718944774, "grad_norm": 3.0434978008270264, "learning_rate": 8.017103928105522e-07, "loss": 0.2871, "step": 4104 }, { "epoch": 0.1983379233705368, "grad_norm": 3.05307674407959, "learning_rate": 8.016620766294632e-07, "loss": 0.3667, "step": 4105 }, { "epoch": 0.19838623955162585, "grad_norm": 3.5451748371124268, "learning_rate": 8.016137604483742e-07, "loss": 0.3023, "step": 4106 }, { "epoch": 0.19843455573271487, "grad_norm": 2.66532564163208, "learning_rate": 8.015654442672852e-07, "loss": 0.3645, "step": 4107 }, { "epoch": 0.19848287191380393, "grad_norm": 6.181522846221924, "learning_rate": 8.015171280861959e-07, "loss": 0.3915, "step": 4108 }, { "epoch": 0.19853118809489298, "grad_norm": 2.8201279640197754, "learning_rate": 8.014688119051069e-07, "loss": 0.3222, "step": 4109 }, { "epoch": 0.19857950427598203, "grad_norm": 3.3281946182250977, "learning_rate": 8.014204957240179e-07, "loss": 0.4101, "step": 4110 }, { "epoch": 0.19862782045707109, "grad_norm": 4.776504039764404, "learning_rate": 8.013721795429289e-07, "loss": 0.3488, "step": 4111 }, { "epoch": 0.1986761366381601, "grad_norm": 2.71287202835083, "learning_rate": 8.013238633618399e-07, "loss": 0.2377, "step": 4112 }, { "epoch": 0.19872445281924916, "grad_norm": 2.93355655670166, "learning_rate": 8.012755471807508e-07, "loss": 0.2186, "step": 4113 }, { "epoch": 0.19877276900033822, "grad_norm": 11.706363677978516, "learning_rate": 8.012272309996618e-07, "loss": 0.415, "step": 4114 }, { "epoch": 0.19882108518142727, "grad_norm": 3.191392660140991, "learning_rate": 8.011789148185726e-07, "loss": 0.3714, "step": 4115 }, { "epoch": 0.1988694013625163, "grad_norm": 3.340620279312134, "learning_rate": 8.011305986374836e-07, "loss": 0.35, "step": 4116 }, { "epoch": 0.19891771754360535, "grad_norm": 3.9078803062438965, "learning_rate": 8.010822824563946e-07, "loss": 0.2945, "step": 4117 }, { "epoch": 0.1989660337246944, "grad_norm": 2.616142511367798, "learning_rate": 8.010339662753056e-07, "loss": 0.2463, "step": 4118 }, { "epoch": 0.19901434990578346, "grad_norm": 3.8888092041015625, "learning_rate": 8.009856500942165e-07, "loss": 0.2044, "step": 4119 }, { "epoch": 0.19906266608687248, "grad_norm": 3.214226722717285, "learning_rate": 8.009373339131275e-07, "loss": 0.3113, "step": 4120 }, { "epoch": 0.19911098226796153, "grad_norm": 2.6918177604675293, "learning_rate": 8.008890177320384e-07, "loss": 0.3506, "step": 4121 }, { "epoch": 0.1991592984490506, "grad_norm": 1.632333755493164, "learning_rate": 8.008407015509494e-07, "loss": 0.1547, "step": 4122 }, { "epoch": 0.19920761463013964, "grad_norm": 3.997971534729004, "learning_rate": 8.007923853698604e-07, "loss": 0.2286, "step": 4123 }, { "epoch": 0.1992559308112287, "grad_norm": 4.68573522567749, "learning_rate": 8.007440691887712e-07, "loss": 0.2428, "step": 4124 }, { "epoch": 0.19930424699231772, "grad_norm": 4.253247261047363, "learning_rate": 8.006957530076822e-07, "loss": 0.5262, "step": 4125 }, { "epoch": 0.19935256317340677, "grad_norm": 2.6882104873657227, "learning_rate": 8.006474368265932e-07, "loss": 0.29, "step": 4126 }, { "epoch": 0.19940087935449582, "grad_norm": 2.503014087677002, "learning_rate": 8.005991206455042e-07, "loss": 0.2787, "step": 4127 }, { "epoch": 0.19944919553558488, "grad_norm": 2.498227596282959, "learning_rate": 8.005508044644151e-07, "loss": 0.3152, "step": 4128 }, { "epoch": 0.1994975117166739, "grad_norm": 2.5137062072753906, "learning_rate": 8.00502488283326e-07, "loss": 0.3462, "step": 4129 }, { "epoch": 0.19954582789776296, "grad_norm": 7.773223400115967, "learning_rate": 8.00454172102237e-07, "loss": 0.434, "step": 4130 }, { "epoch": 0.199594144078852, "grad_norm": 3.1145143508911133, "learning_rate": 8.00405855921148e-07, "loss": 0.3073, "step": 4131 }, { "epoch": 0.19964246025994106, "grad_norm": 1.8429878950119019, "learning_rate": 8.00357539740059e-07, "loss": 0.2187, "step": 4132 }, { "epoch": 0.1996907764410301, "grad_norm": 1.6524261236190796, "learning_rate": 8.003092235589699e-07, "loss": 0.1868, "step": 4133 }, { "epoch": 0.19973909262211914, "grad_norm": 2.386139392852783, "learning_rate": 8.002609073778807e-07, "loss": 0.2116, "step": 4134 }, { "epoch": 0.1997874088032082, "grad_norm": 3.2392830848693848, "learning_rate": 8.002125911967917e-07, "loss": 0.3237, "step": 4135 }, { "epoch": 0.19983572498429725, "grad_norm": 3.226871967315674, "learning_rate": 8.001642750157027e-07, "loss": 0.3426, "step": 4136 }, { "epoch": 0.1998840411653863, "grad_norm": 2.3270530700683594, "learning_rate": 8.001159588346137e-07, "loss": 0.2775, "step": 4137 }, { "epoch": 0.19993235734647533, "grad_norm": 2.52111554145813, "learning_rate": 8.000676426535247e-07, "loss": 0.3033, "step": 4138 }, { "epoch": 0.19998067352756438, "grad_norm": 2.126649856567383, "learning_rate": 8.000193264724356e-07, "loss": 0.1891, "step": 4139 }, { "epoch": 0.20002898970865343, "grad_norm": 11.426857948303223, "learning_rate": 7.999710102913465e-07, "loss": 0.2247, "step": 4140 }, { "epoch": 0.20007730588974248, "grad_norm": 7.284478664398193, "learning_rate": 7.999226941102574e-07, "loss": 0.2265, "step": 4141 }, { "epoch": 0.2001256220708315, "grad_norm": 3.259866237640381, "learning_rate": 7.998743779291684e-07, "loss": 0.3847, "step": 4142 }, { "epoch": 0.20017393825192056, "grad_norm": 5.846317291259766, "learning_rate": 7.998260617480794e-07, "loss": 0.2746, "step": 4143 }, { "epoch": 0.20022225443300962, "grad_norm": 3.070694923400879, "learning_rate": 7.997777455669904e-07, "loss": 0.3613, "step": 4144 }, { "epoch": 0.20027057061409867, "grad_norm": 3.2135353088378906, "learning_rate": 7.997294293859013e-07, "loss": 0.4351, "step": 4145 }, { "epoch": 0.2003188867951877, "grad_norm": 4.484731197357178, "learning_rate": 7.996811132048123e-07, "loss": 0.2457, "step": 4146 }, { "epoch": 0.20036720297627675, "grad_norm": 6.0136847496032715, "learning_rate": 7.996327970237232e-07, "loss": 0.4368, "step": 4147 }, { "epoch": 0.2004155191573658, "grad_norm": 2.70196533203125, "learning_rate": 7.995844808426342e-07, "loss": 0.2879, "step": 4148 }, { "epoch": 0.20046383533845485, "grad_norm": 2.8215949535369873, "learning_rate": 7.995361646615452e-07, "loss": 0.2937, "step": 4149 }, { "epoch": 0.2005121515195439, "grad_norm": 5.4615254402160645, "learning_rate": 7.99487848480456e-07, "loss": 0.3047, "step": 4150 }, { "epoch": 0.20056046770063293, "grad_norm": 2.146162509918213, "learning_rate": 7.99439532299367e-07, "loss": 0.3062, "step": 4151 }, { "epoch": 0.20060878388172199, "grad_norm": 3.128037691116333, "learning_rate": 7.99391216118278e-07, "loss": 0.2366, "step": 4152 }, { "epoch": 0.20065710006281104, "grad_norm": 7.126518249511719, "learning_rate": 7.993428999371889e-07, "loss": 0.42, "step": 4153 }, { "epoch": 0.2007054162439001, "grad_norm": 7.639153957366943, "learning_rate": 7.992945837560999e-07, "loss": 0.4459, "step": 4154 }, { "epoch": 0.20075373242498912, "grad_norm": 1.6612533330917358, "learning_rate": 7.992462675750108e-07, "loss": 0.1834, "step": 4155 }, { "epoch": 0.20080204860607817, "grad_norm": 2.631633758544922, "learning_rate": 7.991979513939218e-07, "loss": 0.2703, "step": 4156 }, { "epoch": 0.20085036478716722, "grad_norm": 1.480059027671814, "learning_rate": 7.991496352128328e-07, "loss": 0.1907, "step": 4157 }, { "epoch": 0.20089868096825628, "grad_norm": 2.6256003379821777, "learning_rate": 7.991013190317437e-07, "loss": 0.3936, "step": 4158 }, { "epoch": 0.2009469971493453, "grad_norm": 3.015127420425415, "learning_rate": 7.990530028506547e-07, "loss": 0.3507, "step": 4159 }, { "epoch": 0.20099531333043436, "grad_norm": 2.706372022628784, "learning_rate": 7.990046866695655e-07, "loss": 0.3382, "step": 4160 }, { "epoch": 0.2010436295115234, "grad_norm": 3.003225803375244, "learning_rate": 7.989563704884765e-07, "loss": 0.3798, "step": 4161 }, { "epoch": 0.20109194569261246, "grad_norm": 2.268064260482788, "learning_rate": 7.989080543073875e-07, "loss": 0.2507, "step": 4162 }, { "epoch": 0.20114026187370151, "grad_norm": 1.701709508895874, "learning_rate": 7.988597381262985e-07, "loss": 0.178, "step": 4163 }, { "epoch": 0.20118857805479054, "grad_norm": 9.344252586364746, "learning_rate": 7.988114219452095e-07, "loss": 0.3455, "step": 4164 }, { "epoch": 0.2012368942358796, "grad_norm": 3.4192981719970703, "learning_rate": 7.987631057641204e-07, "loss": 0.2581, "step": 4165 }, { "epoch": 0.20128521041696865, "grad_norm": 3.3571553230285645, "learning_rate": 7.987147895830312e-07, "loss": 0.3068, "step": 4166 }, { "epoch": 0.2013335265980577, "grad_norm": 2.38948655128479, "learning_rate": 7.986664734019422e-07, "loss": 0.2025, "step": 4167 }, { "epoch": 0.20138184277914672, "grad_norm": 3.197380542755127, "learning_rate": 7.986181572208532e-07, "loss": 0.3521, "step": 4168 }, { "epoch": 0.20143015896023578, "grad_norm": 3.0373008251190186, "learning_rate": 7.985698410397642e-07, "loss": 0.3908, "step": 4169 }, { "epoch": 0.20147847514132483, "grad_norm": 3.568153142929077, "learning_rate": 7.985215248586752e-07, "loss": 0.3316, "step": 4170 }, { "epoch": 0.20152679132241388, "grad_norm": 2.3222923278808594, "learning_rate": 7.984732086775861e-07, "loss": 0.2787, "step": 4171 }, { "epoch": 0.2015751075035029, "grad_norm": 2.6432156562805176, "learning_rate": 7.98424892496497e-07, "loss": 0.3405, "step": 4172 }, { "epoch": 0.20162342368459196, "grad_norm": 3.648188352584839, "learning_rate": 7.98376576315408e-07, "loss": 0.1984, "step": 4173 }, { "epoch": 0.20167173986568102, "grad_norm": 3.0256104469299316, "learning_rate": 7.98328260134319e-07, "loss": 0.4411, "step": 4174 }, { "epoch": 0.20172005604677007, "grad_norm": 7.1947784423828125, "learning_rate": 7.982799439532299e-07, "loss": 0.3756, "step": 4175 }, { "epoch": 0.20176837222785912, "grad_norm": 3.6273088455200195, "learning_rate": 7.982316277721408e-07, "loss": 0.4295, "step": 4176 }, { "epoch": 0.20181668840894815, "grad_norm": 2.4789774417877197, "learning_rate": 7.981833115910518e-07, "loss": 0.227, "step": 4177 }, { "epoch": 0.2018650045900372, "grad_norm": 2.3555359840393066, "learning_rate": 7.981349954099628e-07, "loss": 0.2611, "step": 4178 }, { "epoch": 0.20191332077112625, "grad_norm": 13.913748741149902, "learning_rate": 7.980866792288737e-07, "loss": 0.3139, "step": 4179 }, { "epoch": 0.2019616369522153, "grad_norm": 3.0779929161071777, "learning_rate": 7.980383630477847e-07, "loss": 0.355, "step": 4180 }, { "epoch": 0.20200995313330433, "grad_norm": 2.414580821990967, "learning_rate": 7.979900468666956e-07, "loss": 0.2758, "step": 4181 }, { "epoch": 0.20205826931439339, "grad_norm": 3.3837597370147705, "learning_rate": 7.979417306856066e-07, "loss": 0.3488, "step": 4182 }, { "epoch": 0.20210658549548244, "grad_norm": 2.4003939628601074, "learning_rate": 7.978934145045175e-07, "loss": 0.227, "step": 4183 }, { "epoch": 0.2021549016765715, "grad_norm": 2.3918354511260986, "learning_rate": 7.978450983234285e-07, "loss": 0.3075, "step": 4184 }, { "epoch": 0.20220321785766054, "grad_norm": 2.4198451042175293, "learning_rate": 7.977967821423394e-07, "loss": 0.3232, "step": 4185 }, { "epoch": 0.20225153403874957, "grad_norm": 2.965771198272705, "learning_rate": 7.977484659612503e-07, "loss": 0.2123, "step": 4186 }, { "epoch": 0.20229985021983862, "grad_norm": 3.958962917327881, "learning_rate": 7.977001497801613e-07, "loss": 0.2808, "step": 4187 }, { "epoch": 0.20234816640092768, "grad_norm": 3.477627754211426, "learning_rate": 7.976518335990723e-07, "loss": 0.3272, "step": 4188 }, { "epoch": 0.20239648258201673, "grad_norm": 2.070801019668579, "learning_rate": 7.976035174179833e-07, "loss": 0.2765, "step": 4189 }, { "epoch": 0.20244479876310575, "grad_norm": 2.72857403755188, "learning_rate": 7.975552012368943e-07, "loss": 0.3582, "step": 4190 }, { "epoch": 0.2024931149441948, "grad_norm": 151.3383331298828, "learning_rate": 7.975068850558052e-07, "loss": 0.2877, "step": 4191 }, { "epoch": 0.20254143112528386, "grad_norm": 2.7148401737213135, "learning_rate": 7.97458568874716e-07, "loss": 0.3408, "step": 4192 }, { "epoch": 0.2025897473063729, "grad_norm": 2.404285192489624, "learning_rate": 7.97410252693627e-07, "loss": 0.277, "step": 4193 }, { "epoch": 0.20263806348746194, "grad_norm": 8.910663604736328, "learning_rate": 7.97361936512538e-07, "loss": 0.3124, "step": 4194 }, { "epoch": 0.202686379668551, "grad_norm": 2.7774240970611572, "learning_rate": 7.97313620331449e-07, "loss": 0.3116, "step": 4195 }, { "epoch": 0.20273469584964005, "grad_norm": 3.013106107711792, "learning_rate": 7.9726530415036e-07, "loss": 0.376, "step": 4196 }, { "epoch": 0.2027830120307291, "grad_norm": 3.622469902038574, "learning_rate": 7.972169879692709e-07, "loss": 0.234, "step": 4197 }, { "epoch": 0.20283132821181815, "grad_norm": 3.744377851486206, "learning_rate": 7.971686717881818e-07, "loss": 0.3625, "step": 4198 }, { "epoch": 0.20287964439290718, "grad_norm": 2.5115363597869873, "learning_rate": 7.971203556070928e-07, "loss": 0.2281, "step": 4199 }, { "epoch": 0.20292796057399623, "grad_norm": 2.831549882888794, "learning_rate": 7.970720394260037e-07, "loss": 0.3387, "step": 4200 }, { "epoch": 0.20297627675508528, "grad_norm": 8.351354598999023, "learning_rate": 7.970237232449147e-07, "loss": 0.3071, "step": 4201 }, { "epoch": 0.20302459293617434, "grad_norm": 2.005960464477539, "learning_rate": 7.969754070638256e-07, "loss": 0.1855, "step": 4202 }, { "epoch": 0.20307290911726336, "grad_norm": 2.5161776542663574, "learning_rate": 7.969270908827366e-07, "loss": 0.275, "step": 4203 }, { "epoch": 0.20312122529835241, "grad_norm": 5.646961688995361, "learning_rate": 7.968787747016475e-07, "loss": 0.2967, "step": 4204 }, { "epoch": 0.20316954147944147, "grad_norm": 3.5451691150665283, "learning_rate": 7.968304585205585e-07, "loss": 0.3826, "step": 4205 }, { "epoch": 0.20321785766053052, "grad_norm": 2.8047122955322266, "learning_rate": 7.967821423394695e-07, "loss": 0.3196, "step": 4206 }, { "epoch": 0.20326617384161955, "grad_norm": 4.995886325836182, "learning_rate": 7.967338261583804e-07, "loss": 0.2296, "step": 4207 }, { "epoch": 0.2033144900227086, "grad_norm": 2.1353836059570312, "learning_rate": 7.966855099772914e-07, "loss": 0.2994, "step": 4208 }, { "epoch": 0.20336280620379765, "grad_norm": 4.00143575668335, "learning_rate": 7.966371937962023e-07, "loss": 0.2548, "step": 4209 }, { "epoch": 0.2034111223848867, "grad_norm": 1.9936333894729614, "learning_rate": 7.965888776151133e-07, "loss": 0.2461, "step": 4210 }, { "epoch": 0.20345943856597576, "grad_norm": 2.0446701049804688, "learning_rate": 7.965405614340242e-07, "loss": 0.2255, "step": 4211 }, { "epoch": 0.20350775474706478, "grad_norm": 1.9162262678146362, "learning_rate": 7.964922452529351e-07, "loss": 0.2222, "step": 4212 }, { "epoch": 0.20355607092815384, "grad_norm": 3.05049204826355, "learning_rate": 7.964439290718461e-07, "loss": 0.3788, "step": 4213 }, { "epoch": 0.2036043871092429, "grad_norm": 3.079981565475464, "learning_rate": 7.963956128907571e-07, "loss": 0.4165, "step": 4214 }, { "epoch": 0.20365270329033194, "grad_norm": 3.599496603012085, "learning_rate": 7.963472967096681e-07, "loss": 0.5275, "step": 4215 }, { "epoch": 0.20370101947142097, "grad_norm": 2.9099888801574707, "learning_rate": 7.962989805285791e-07, "loss": 0.2887, "step": 4216 }, { "epoch": 0.20374933565251002, "grad_norm": 1.8592348098754883, "learning_rate": 7.962506643474898e-07, "loss": 0.1981, "step": 4217 }, { "epoch": 0.20379765183359907, "grad_norm": 1.8763861656188965, "learning_rate": 7.962023481664008e-07, "loss": 0.1621, "step": 4218 }, { "epoch": 0.20384596801468813, "grad_norm": 2.285517930984497, "learning_rate": 7.961540319853118e-07, "loss": 0.2462, "step": 4219 }, { "epoch": 0.20389428419577715, "grad_norm": 2.8632922172546387, "learning_rate": 7.961057158042228e-07, "loss": 0.3106, "step": 4220 }, { "epoch": 0.2039426003768662, "grad_norm": 2.283010244369507, "learning_rate": 7.960573996231338e-07, "loss": 0.3165, "step": 4221 }, { "epoch": 0.20399091655795526, "grad_norm": 2.6796813011169434, "learning_rate": 7.960090834420448e-07, "loss": 0.3744, "step": 4222 }, { "epoch": 0.2040392327390443, "grad_norm": 3.7452926635742188, "learning_rate": 7.959607672609557e-07, "loss": 0.291, "step": 4223 }, { "epoch": 0.20408754892013337, "grad_norm": 4.426087856292725, "learning_rate": 7.959124510798666e-07, "loss": 0.3682, "step": 4224 }, { "epoch": 0.2041358651012224, "grad_norm": 2.2917709350585938, "learning_rate": 7.958641348987775e-07, "loss": 0.2391, "step": 4225 }, { "epoch": 0.20418418128231144, "grad_norm": 2.422081232070923, "learning_rate": 7.958158187176885e-07, "loss": 0.2934, "step": 4226 }, { "epoch": 0.2042324974634005, "grad_norm": 2.0628323554992676, "learning_rate": 7.957675025365995e-07, "loss": 0.1893, "step": 4227 }, { "epoch": 0.20428081364448955, "grad_norm": 2.3680312633514404, "learning_rate": 7.957191863555104e-07, "loss": 0.3196, "step": 4228 }, { "epoch": 0.20432912982557858, "grad_norm": 1.8615343570709229, "learning_rate": 7.956708701744214e-07, "loss": 0.1939, "step": 4229 }, { "epoch": 0.20437744600666763, "grad_norm": 3.709252119064331, "learning_rate": 7.956225539933323e-07, "loss": 0.4161, "step": 4230 }, { "epoch": 0.20442576218775668, "grad_norm": 3.031637668609619, "learning_rate": 7.955742378122433e-07, "loss": 0.4504, "step": 4231 }, { "epoch": 0.20447407836884574, "grad_norm": 2.943204164505005, "learning_rate": 7.955259216311543e-07, "loss": 0.3859, "step": 4232 }, { "epoch": 0.20452239454993476, "grad_norm": 1.6877663135528564, "learning_rate": 7.954776054500652e-07, "loss": 0.1513, "step": 4233 }, { "epoch": 0.2045707107310238, "grad_norm": 3.1659646034240723, "learning_rate": 7.954292892689761e-07, "loss": 0.3672, "step": 4234 }, { "epoch": 0.20461902691211287, "grad_norm": 1.573120355606079, "learning_rate": 7.953809730878871e-07, "loss": 0.1667, "step": 4235 }, { "epoch": 0.20466734309320192, "grad_norm": 3.574483633041382, "learning_rate": 7.953326569067981e-07, "loss": 0.3495, "step": 4236 }, { "epoch": 0.20471565927429097, "grad_norm": 3.825052261352539, "learning_rate": 7.95284340725709e-07, "loss": 0.2568, "step": 4237 }, { "epoch": 0.20476397545538, "grad_norm": 2.1074883937835693, "learning_rate": 7.952360245446199e-07, "loss": 0.2733, "step": 4238 }, { "epoch": 0.20481229163646905, "grad_norm": 2.5778520107269287, "learning_rate": 7.951877083635309e-07, "loss": 0.2951, "step": 4239 }, { "epoch": 0.2048606078175581, "grad_norm": 2.9173643589019775, "learning_rate": 7.951393921824419e-07, "loss": 0.4036, "step": 4240 }, { "epoch": 0.20490892399864716, "grad_norm": 2.684659719467163, "learning_rate": 7.950910760013529e-07, "loss": 0.2465, "step": 4241 }, { "epoch": 0.20495724017973618, "grad_norm": 2.9136974811553955, "learning_rate": 7.950427598202639e-07, "loss": 0.3799, "step": 4242 }, { "epoch": 0.20500555636082524, "grad_norm": 2.678317070007324, "learning_rate": 7.949944436391746e-07, "loss": 0.348, "step": 4243 }, { "epoch": 0.2050538725419143, "grad_norm": 2.4346988201141357, "learning_rate": 7.949461274580856e-07, "loss": 0.2908, "step": 4244 }, { "epoch": 0.20510218872300334, "grad_norm": 2.446611166000366, "learning_rate": 7.948978112769966e-07, "loss": 0.3087, "step": 4245 }, { "epoch": 0.20515050490409237, "grad_norm": 2.2622721195220947, "learning_rate": 7.948494950959076e-07, "loss": 0.2827, "step": 4246 }, { "epoch": 0.20519882108518142, "grad_norm": 2.4365181922912598, "learning_rate": 7.948011789148186e-07, "loss": 0.2822, "step": 4247 }, { "epoch": 0.20524713726627047, "grad_norm": 2.4971516132354736, "learning_rate": 7.947528627337296e-07, "loss": 0.2158, "step": 4248 }, { "epoch": 0.20529545344735953, "grad_norm": 2.8762576580047607, "learning_rate": 7.947045465526404e-07, "loss": 0.2285, "step": 4249 }, { "epoch": 0.20534376962844858, "grad_norm": 2.114508628845215, "learning_rate": 7.946562303715514e-07, "loss": 0.1702, "step": 4250 }, { "epoch": 0.2053920858095376, "grad_norm": 4.735641002655029, "learning_rate": 7.946079141904623e-07, "loss": 0.2648, "step": 4251 }, { "epoch": 0.20544040199062666, "grad_norm": 2.536256790161133, "learning_rate": 7.945595980093733e-07, "loss": 0.3002, "step": 4252 }, { "epoch": 0.2054887181717157, "grad_norm": 3.3622570037841797, "learning_rate": 7.945112818282843e-07, "loss": 0.2966, "step": 4253 }, { "epoch": 0.20553703435280476, "grad_norm": 3.8504204750061035, "learning_rate": 7.944629656471952e-07, "loss": 0.2308, "step": 4254 }, { "epoch": 0.2055853505338938, "grad_norm": 2.9939701557159424, "learning_rate": 7.944146494661062e-07, "loss": 0.3916, "step": 4255 }, { "epoch": 0.20563366671498284, "grad_norm": 3.3828680515289307, "learning_rate": 7.943663332850171e-07, "loss": 0.3499, "step": 4256 }, { "epoch": 0.2056819828960719, "grad_norm": 3.0172502994537354, "learning_rate": 7.943180171039281e-07, "loss": 0.3707, "step": 4257 }, { "epoch": 0.20573029907716095, "grad_norm": 1.4202216863632202, "learning_rate": 7.942697009228391e-07, "loss": 0.1675, "step": 4258 }, { "epoch": 0.20577861525824998, "grad_norm": 3.599505662918091, "learning_rate": 7.942213847417499e-07, "loss": 0.4272, "step": 4259 }, { "epoch": 0.20582693143933903, "grad_norm": 6.985313415527344, "learning_rate": 7.941730685606609e-07, "loss": 0.3527, "step": 4260 }, { "epoch": 0.20587524762042808, "grad_norm": 3.65814471244812, "learning_rate": 7.941247523795719e-07, "loss": 0.3995, "step": 4261 }, { "epoch": 0.20592356380151713, "grad_norm": 2.3299331665039062, "learning_rate": 7.940764361984828e-07, "loss": 0.304, "step": 4262 }, { "epoch": 0.2059718799826062, "grad_norm": 2.7252538204193115, "learning_rate": 7.940281200173938e-07, "loss": 0.2359, "step": 4263 }, { "epoch": 0.2060201961636952, "grad_norm": 2.5676143169403076, "learning_rate": 7.939798038363047e-07, "loss": 0.2476, "step": 4264 }, { "epoch": 0.20606851234478427, "grad_norm": 3.7982215881347656, "learning_rate": 7.939314876552157e-07, "loss": 0.3974, "step": 4265 }, { "epoch": 0.20611682852587332, "grad_norm": 1.7896091938018799, "learning_rate": 7.938831714741267e-07, "loss": 0.245, "step": 4266 }, { "epoch": 0.20616514470696237, "grad_norm": 3.1838395595550537, "learning_rate": 7.938348552930377e-07, "loss": 0.3406, "step": 4267 }, { "epoch": 0.2062134608880514, "grad_norm": 1.5340306758880615, "learning_rate": 7.937865391119486e-07, "loss": 0.1554, "step": 4268 }, { "epoch": 0.20626177706914045, "grad_norm": 3.570695638656616, "learning_rate": 7.937382229308594e-07, "loss": 0.3527, "step": 4269 }, { "epoch": 0.2063100932502295, "grad_norm": 4.716060161590576, "learning_rate": 7.936899067497704e-07, "loss": 0.3255, "step": 4270 }, { "epoch": 0.20635840943131856, "grad_norm": 2.616206407546997, "learning_rate": 7.936415905686814e-07, "loss": 0.2766, "step": 4271 }, { "epoch": 0.20640672561240758, "grad_norm": 4.292959213256836, "learning_rate": 7.935932743875924e-07, "loss": 0.4799, "step": 4272 }, { "epoch": 0.20645504179349664, "grad_norm": 2.875601291656494, "learning_rate": 7.935449582065034e-07, "loss": 0.3345, "step": 4273 }, { "epoch": 0.2065033579745857, "grad_norm": 2.557607412338257, "learning_rate": 7.934966420254143e-07, "loss": 0.2347, "step": 4274 }, { "epoch": 0.20655167415567474, "grad_norm": 2.1595773696899414, "learning_rate": 7.934483258443252e-07, "loss": 0.2032, "step": 4275 }, { "epoch": 0.2065999903367638, "grad_norm": 2.33890438079834, "learning_rate": 7.934000096632361e-07, "loss": 0.1883, "step": 4276 }, { "epoch": 0.20664830651785282, "grad_norm": 6.508013725280762, "learning_rate": 7.933516934821471e-07, "loss": 0.2704, "step": 4277 }, { "epoch": 0.20669662269894187, "grad_norm": 1.4961532354354858, "learning_rate": 7.933033773010581e-07, "loss": 0.1853, "step": 4278 }, { "epoch": 0.20674493888003093, "grad_norm": 4.98799991607666, "learning_rate": 7.932550611199691e-07, "loss": 0.3968, "step": 4279 }, { "epoch": 0.20679325506111998, "grad_norm": 2.2334234714508057, "learning_rate": 7.9320674493888e-07, "loss": 0.2364, "step": 4280 }, { "epoch": 0.206841571242209, "grad_norm": 2.981560468673706, "learning_rate": 7.931584287577909e-07, "loss": 0.2646, "step": 4281 }, { "epoch": 0.20688988742329806, "grad_norm": 2.604283332824707, "learning_rate": 7.931101125767019e-07, "loss": 0.1824, "step": 4282 }, { "epoch": 0.2069382036043871, "grad_norm": 3.3283462524414062, "learning_rate": 7.930617963956129e-07, "loss": 0.3358, "step": 4283 }, { "epoch": 0.20698651978547616, "grad_norm": 3.0963897705078125, "learning_rate": 7.930134802145239e-07, "loss": 0.213, "step": 4284 }, { "epoch": 0.2070348359665652, "grad_norm": 5.241774559020996, "learning_rate": 7.929651640334347e-07, "loss": 0.4766, "step": 4285 }, { "epoch": 0.20708315214765424, "grad_norm": 4.454643249511719, "learning_rate": 7.929168478523457e-07, "loss": 0.2481, "step": 4286 }, { "epoch": 0.2071314683287433, "grad_norm": 2.8078525066375732, "learning_rate": 7.928685316712567e-07, "loss": 0.3692, "step": 4287 }, { "epoch": 0.20717978450983235, "grad_norm": 4.135210990905762, "learning_rate": 7.928202154901676e-07, "loss": 0.2797, "step": 4288 }, { "epoch": 0.2072281006909214, "grad_norm": 3.15095853805542, "learning_rate": 7.927718993090786e-07, "loss": 0.2511, "step": 4289 }, { "epoch": 0.20727641687201043, "grad_norm": 2.5620789527893066, "learning_rate": 7.927235831279895e-07, "loss": 0.2589, "step": 4290 }, { "epoch": 0.20732473305309948, "grad_norm": 2.780136823654175, "learning_rate": 7.926752669469005e-07, "loss": 0.3485, "step": 4291 }, { "epoch": 0.20737304923418853, "grad_norm": 2.3918302059173584, "learning_rate": 7.926269507658115e-07, "loss": 0.2765, "step": 4292 }, { "epoch": 0.2074213654152776, "grad_norm": 2.381887197494507, "learning_rate": 7.925786345847224e-07, "loss": 0.2617, "step": 4293 }, { "epoch": 0.2074696815963666, "grad_norm": 2.415335178375244, "learning_rate": 7.925303184036333e-07, "loss": 0.3337, "step": 4294 }, { "epoch": 0.20751799777745566, "grad_norm": 2.4212706089019775, "learning_rate": 7.924820022225442e-07, "loss": 0.2847, "step": 4295 }, { "epoch": 0.20756631395854472, "grad_norm": 1.7396513223648071, "learning_rate": 7.924336860414552e-07, "loss": 0.2649, "step": 4296 }, { "epoch": 0.20761463013963377, "grad_norm": 4.939333915710449, "learning_rate": 7.923853698603662e-07, "loss": 0.2563, "step": 4297 }, { "epoch": 0.2076629463207228, "grad_norm": 2.401076316833496, "learning_rate": 7.923370536792772e-07, "loss": 0.283, "step": 4298 }, { "epoch": 0.20771126250181185, "grad_norm": 9.51859188079834, "learning_rate": 7.922887374981882e-07, "loss": 0.2606, "step": 4299 }, { "epoch": 0.2077595786829009, "grad_norm": 1.9781343936920166, "learning_rate": 7.92240421317099e-07, "loss": 0.1945, "step": 4300 }, { "epoch": 0.20780789486398996, "grad_norm": 2.0867714881896973, "learning_rate": 7.9219210513601e-07, "loss": 0.1836, "step": 4301 }, { "epoch": 0.207856211045079, "grad_norm": 2.7864224910736084, "learning_rate": 7.921437889549209e-07, "loss": 0.2975, "step": 4302 }, { "epoch": 0.20790452722616803, "grad_norm": 1.9229542016983032, "learning_rate": 7.920954727738319e-07, "loss": 0.1642, "step": 4303 }, { "epoch": 0.2079528434072571, "grad_norm": 2.56270170211792, "learning_rate": 7.920471565927429e-07, "loss": 0.3468, "step": 4304 }, { "epoch": 0.20800115958834614, "grad_norm": 2.62372088432312, "learning_rate": 7.919988404116539e-07, "loss": 0.2594, "step": 4305 }, { "epoch": 0.2080494757694352, "grad_norm": 2.832271099090576, "learning_rate": 7.919505242305648e-07, "loss": 0.3597, "step": 4306 }, { "epoch": 0.20809779195052422, "grad_norm": 2.0445356369018555, "learning_rate": 7.919022080494757e-07, "loss": 0.2817, "step": 4307 }, { "epoch": 0.20814610813161327, "grad_norm": 4.387575626373291, "learning_rate": 7.918538918683867e-07, "loss": 0.2656, "step": 4308 }, { "epoch": 0.20819442431270233, "grad_norm": 15.870640754699707, "learning_rate": 7.918055756872977e-07, "loss": 0.2649, "step": 4309 }, { "epoch": 0.20824274049379138, "grad_norm": 2.0336670875549316, "learning_rate": 7.917572595062086e-07, "loss": 0.2019, "step": 4310 }, { "epoch": 0.2082910566748804, "grad_norm": 2.327415704727173, "learning_rate": 7.917089433251195e-07, "loss": 0.287, "step": 4311 }, { "epoch": 0.20833937285596946, "grad_norm": 2.4530396461486816, "learning_rate": 7.916606271440305e-07, "loss": 0.2505, "step": 4312 }, { "epoch": 0.2083876890370585, "grad_norm": 2.370213270187378, "learning_rate": 7.916123109629414e-07, "loss": 0.3032, "step": 4313 }, { "epoch": 0.20843600521814756, "grad_norm": 2.8641247749328613, "learning_rate": 7.915639947818524e-07, "loss": 0.3382, "step": 4314 }, { "epoch": 0.20848432139923662, "grad_norm": 2.266977548599243, "learning_rate": 7.915156786007634e-07, "loss": 0.3401, "step": 4315 }, { "epoch": 0.20853263758032564, "grad_norm": 2.458700656890869, "learning_rate": 7.914673624196743e-07, "loss": 0.4051, "step": 4316 }, { "epoch": 0.2085809537614147, "grad_norm": 2.346639633178711, "learning_rate": 7.914190462385853e-07, "loss": 0.2411, "step": 4317 }, { "epoch": 0.20862926994250375, "grad_norm": 1.8872756958007812, "learning_rate": 7.913707300574963e-07, "loss": 0.1955, "step": 4318 }, { "epoch": 0.2086775861235928, "grad_norm": 3.0484778881073, "learning_rate": 7.913224138764072e-07, "loss": 0.3874, "step": 4319 }, { "epoch": 0.20872590230468183, "grad_norm": 2.882856845855713, "learning_rate": 7.912740976953181e-07, "loss": 0.382, "step": 4320 }, { "epoch": 0.20877421848577088, "grad_norm": 4.141261577606201, "learning_rate": 7.91225781514229e-07, "loss": 0.3472, "step": 4321 }, { "epoch": 0.20882253466685993, "grad_norm": 7.620251655578613, "learning_rate": 7.9117746533314e-07, "loss": 0.3369, "step": 4322 }, { "epoch": 0.20887085084794899, "grad_norm": 3.746868371963501, "learning_rate": 7.91129149152051e-07, "loss": 0.4284, "step": 4323 }, { "epoch": 0.20891916702903804, "grad_norm": 2.3885695934295654, "learning_rate": 7.91080832970962e-07, "loss": 0.2876, "step": 4324 }, { "epoch": 0.20896748321012706, "grad_norm": 2.6614584922790527, "learning_rate": 7.91032516789873e-07, "loss": 0.2673, "step": 4325 }, { "epoch": 0.20901579939121612, "grad_norm": 6.527892589569092, "learning_rate": 7.909842006087838e-07, "loss": 0.2571, "step": 4326 }, { "epoch": 0.20906411557230517, "grad_norm": 3.1261439323425293, "learning_rate": 7.909358844276947e-07, "loss": 0.2943, "step": 4327 }, { "epoch": 0.20911243175339422, "grad_norm": 4.959040641784668, "learning_rate": 7.908875682466057e-07, "loss": 0.3554, "step": 4328 }, { "epoch": 0.20916074793448325, "grad_norm": 4.820040225982666, "learning_rate": 7.908392520655167e-07, "loss": 0.2799, "step": 4329 }, { "epoch": 0.2092090641155723, "grad_norm": 6.593330383300781, "learning_rate": 7.907909358844277e-07, "loss": 0.3262, "step": 4330 }, { "epoch": 0.20925738029666135, "grad_norm": 2.771225929260254, "learning_rate": 7.907426197033387e-07, "loss": 0.2286, "step": 4331 }, { "epoch": 0.2093056964777504, "grad_norm": 1.8652210235595703, "learning_rate": 7.906943035222495e-07, "loss": 0.247, "step": 4332 }, { "epoch": 0.20935401265883943, "grad_norm": 2.3777642250061035, "learning_rate": 7.906459873411605e-07, "loss": 0.2938, "step": 4333 }, { "epoch": 0.2094023288399285, "grad_norm": 2.947174310684204, "learning_rate": 7.905976711600715e-07, "loss": 0.3316, "step": 4334 }, { "epoch": 0.20945064502101754, "grad_norm": 3.3575806617736816, "learning_rate": 7.905493549789825e-07, "loss": 0.3361, "step": 4335 }, { "epoch": 0.2094989612021066, "grad_norm": 7.517913818359375, "learning_rate": 7.905010387978934e-07, "loss": 0.3529, "step": 4336 }, { "epoch": 0.20954727738319565, "grad_norm": 2.5633745193481445, "learning_rate": 7.904527226168043e-07, "loss": 0.4202, "step": 4337 }, { "epoch": 0.20959559356428467, "grad_norm": 4.5072526931762695, "learning_rate": 7.904044064357153e-07, "loss": 0.2105, "step": 4338 }, { "epoch": 0.20964390974537372, "grad_norm": 2.655606508255005, "learning_rate": 7.903560902546262e-07, "loss": 0.287, "step": 4339 }, { "epoch": 0.20969222592646278, "grad_norm": 35.14727783203125, "learning_rate": 7.903077740735372e-07, "loss": 0.2604, "step": 4340 }, { "epoch": 0.20974054210755183, "grad_norm": 1.8067706823349, "learning_rate": 7.902594578924482e-07, "loss": 0.22, "step": 4341 }, { "epoch": 0.20978885828864086, "grad_norm": 3.9998257160186768, "learning_rate": 7.902111417113591e-07, "loss": 0.2967, "step": 4342 }, { "epoch": 0.2098371744697299, "grad_norm": 2.6087753772735596, "learning_rate": 7.901628255302701e-07, "loss": 0.1975, "step": 4343 }, { "epoch": 0.20988549065081896, "grad_norm": 4.0120344161987305, "learning_rate": 7.90114509349181e-07, "loss": 0.2665, "step": 4344 }, { "epoch": 0.20993380683190802, "grad_norm": 2.0008089542388916, "learning_rate": 7.900661931680919e-07, "loss": 0.235, "step": 4345 }, { "epoch": 0.20998212301299704, "grad_norm": 48.60403060913086, "learning_rate": 7.900178769870029e-07, "loss": 0.2355, "step": 4346 }, { "epoch": 0.2100304391940861, "grad_norm": 2.4438536167144775, "learning_rate": 7.899695608059138e-07, "loss": 0.3432, "step": 4347 }, { "epoch": 0.21007875537517515, "grad_norm": 4.97822380065918, "learning_rate": 7.899212446248248e-07, "loss": 0.25, "step": 4348 }, { "epoch": 0.2101270715562642, "grad_norm": 3.6297426223754883, "learning_rate": 7.898729284437358e-07, "loss": 0.515, "step": 4349 }, { "epoch": 0.21017538773735325, "grad_norm": 38.602149963378906, "learning_rate": 7.898246122626468e-07, "loss": 0.355, "step": 4350 }, { "epoch": 0.21022370391844228, "grad_norm": 2.1439945697784424, "learning_rate": 7.897762960815578e-07, "loss": 0.2885, "step": 4351 }, { "epoch": 0.21027202009953133, "grad_norm": 2.396190643310547, "learning_rate": 7.897279799004685e-07, "loss": 0.2963, "step": 4352 }, { "epoch": 0.21032033628062038, "grad_norm": 3.755218505859375, "learning_rate": 7.896796637193795e-07, "loss": 0.3736, "step": 4353 }, { "epoch": 0.21036865246170944, "grad_norm": 3.575308084487915, "learning_rate": 7.896313475382905e-07, "loss": 0.2951, "step": 4354 }, { "epoch": 0.21041696864279846, "grad_norm": 3.877725601196289, "learning_rate": 7.895830313572015e-07, "loss": 0.3483, "step": 4355 }, { "epoch": 0.21046528482388752, "grad_norm": 6.306342601776123, "learning_rate": 7.895347151761125e-07, "loss": 0.3981, "step": 4356 }, { "epoch": 0.21051360100497657, "grad_norm": 4.297966957092285, "learning_rate": 7.894863989950235e-07, "loss": 0.3125, "step": 4357 }, { "epoch": 0.21056191718606562, "grad_norm": 2.924375295639038, "learning_rate": 7.894380828139343e-07, "loss": 0.2743, "step": 4358 }, { "epoch": 0.21061023336715465, "grad_norm": 1.7084378004074097, "learning_rate": 7.893897666328453e-07, "loss": 0.223, "step": 4359 }, { "epoch": 0.2106585495482437, "grad_norm": 1.6835665702819824, "learning_rate": 7.893414504517563e-07, "loss": 0.1778, "step": 4360 }, { "epoch": 0.21070686572933275, "grad_norm": 2.962416648864746, "learning_rate": 7.892931342706672e-07, "loss": 0.3392, "step": 4361 }, { "epoch": 0.2107551819104218, "grad_norm": 2.2833657264709473, "learning_rate": 7.892448180895782e-07, "loss": 0.2256, "step": 4362 }, { "epoch": 0.21080349809151086, "grad_norm": 2.4510021209716797, "learning_rate": 7.891965019084891e-07, "loss": 0.246, "step": 4363 }, { "epoch": 0.21085181427259989, "grad_norm": 2.9263150691986084, "learning_rate": 7.891481857274e-07, "loss": 0.2876, "step": 4364 }, { "epoch": 0.21090013045368894, "grad_norm": 2.545922040939331, "learning_rate": 7.89099869546311e-07, "loss": 0.3191, "step": 4365 }, { "epoch": 0.210948446634778, "grad_norm": 1.99259352684021, "learning_rate": 7.89051553365222e-07, "loss": 0.2046, "step": 4366 }, { "epoch": 0.21099676281586704, "grad_norm": 5.43715763092041, "learning_rate": 7.89003237184133e-07, "loss": 0.4282, "step": 4367 }, { "epoch": 0.21104507899695607, "grad_norm": 2.1644210815429688, "learning_rate": 7.889549210030439e-07, "loss": 0.2111, "step": 4368 }, { "epoch": 0.21109339517804512, "grad_norm": 5.019657135009766, "learning_rate": 7.889066048219548e-07, "loss": 0.4124, "step": 4369 }, { "epoch": 0.21114171135913418, "grad_norm": 3.2322208881378174, "learning_rate": 7.888582886408658e-07, "loss": 0.3119, "step": 4370 }, { "epoch": 0.21119002754022323, "grad_norm": 21.651235580444336, "learning_rate": 7.888099724597767e-07, "loss": 0.2792, "step": 4371 }, { "epoch": 0.21123834372131226, "grad_norm": 3.076368808746338, "learning_rate": 7.887616562786877e-07, "loss": 0.3446, "step": 4372 }, { "epoch": 0.2112866599024013, "grad_norm": 2.065046548843384, "learning_rate": 7.887133400975986e-07, "loss": 0.2412, "step": 4373 }, { "epoch": 0.21133497608349036, "grad_norm": 5.2017316818237305, "learning_rate": 7.886650239165096e-07, "loss": 0.2585, "step": 4374 }, { "epoch": 0.21138329226457941, "grad_norm": 16.99859046936035, "learning_rate": 7.886167077354206e-07, "loss": 0.3076, "step": 4375 }, { "epoch": 0.21143160844566847, "grad_norm": 1.9338808059692383, "learning_rate": 7.885683915543316e-07, "loss": 0.2108, "step": 4376 }, { "epoch": 0.2114799246267575, "grad_norm": 4.1739020347595215, "learning_rate": 7.885200753732425e-07, "loss": 0.2775, "step": 4377 }, { "epoch": 0.21152824080784655, "grad_norm": 3.09334397315979, "learning_rate": 7.884717591921533e-07, "loss": 0.3869, "step": 4378 }, { "epoch": 0.2115765569889356, "grad_norm": 3.1645407676696777, "learning_rate": 7.884234430110643e-07, "loss": 0.2864, "step": 4379 }, { "epoch": 0.21162487317002465, "grad_norm": 2.194235324859619, "learning_rate": 7.883751268299753e-07, "loss": 0.2606, "step": 4380 }, { "epoch": 0.21167318935111368, "grad_norm": 2.4381697177886963, "learning_rate": 7.883268106488863e-07, "loss": 0.3283, "step": 4381 }, { "epoch": 0.21172150553220273, "grad_norm": 2.705768585205078, "learning_rate": 7.882784944677973e-07, "loss": 0.2514, "step": 4382 }, { "epoch": 0.21176982171329178, "grad_norm": 2.3959972858428955, "learning_rate": 7.882301782867083e-07, "loss": 0.2343, "step": 4383 }, { "epoch": 0.21181813789438084, "grad_norm": 2.423795700073242, "learning_rate": 7.881818621056191e-07, "loss": 0.2704, "step": 4384 }, { "epoch": 0.21186645407546986, "grad_norm": 2.95222544670105, "learning_rate": 7.881335459245301e-07, "loss": 0.2808, "step": 4385 }, { "epoch": 0.21191477025655892, "grad_norm": 3.286872386932373, "learning_rate": 7.88085229743441e-07, "loss": 0.3176, "step": 4386 }, { "epoch": 0.21196308643764797, "grad_norm": 2.703984260559082, "learning_rate": 7.88036913562352e-07, "loss": 0.2997, "step": 4387 }, { "epoch": 0.21201140261873702, "grad_norm": 3.9821832180023193, "learning_rate": 7.87988597381263e-07, "loss": 0.3848, "step": 4388 }, { "epoch": 0.21205971879982607, "grad_norm": 16.859338760375977, "learning_rate": 7.879402812001739e-07, "loss": 0.3409, "step": 4389 }, { "epoch": 0.2121080349809151, "grad_norm": 2.5099618434906006, "learning_rate": 7.878919650190848e-07, "loss": 0.2855, "step": 4390 }, { "epoch": 0.21215635116200415, "grad_norm": 2.822770833969116, "learning_rate": 7.878436488379958e-07, "loss": 0.3874, "step": 4391 }, { "epoch": 0.2122046673430932, "grad_norm": 2.245762586593628, "learning_rate": 7.877953326569068e-07, "loss": 0.2199, "step": 4392 }, { "epoch": 0.21225298352418226, "grad_norm": 2.870593547821045, "learning_rate": 7.877470164758178e-07, "loss": 0.2586, "step": 4393 }, { "epoch": 0.21230129970527128, "grad_norm": 3.09548020362854, "learning_rate": 7.876987002947286e-07, "loss": 0.3424, "step": 4394 }, { "epoch": 0.21234961588636034, "grad_norm": 2.45371150970459, "learning_rate": 7.876503841136396e-07, "loss": 0.3419, "step": 4395 }, { "epoch": 0.2123979320674494, "grad_norm": 3.7551350593566895, "learning_rate": 7.876020679325505e-07, "loss": 0.2458, "step": 4396 }, { "epoch": 0.21244624824853844, "grad_norm": 3.5896472930908203, "learning_rate": 7.875537517514615e-07, "loss": 0.3468, "step": 4397 }, { "epoch": 0.21249456442962747, "grad_norm": 2.223071813583374, "learning_rate": 7.875054355703725e-07, "loss": 0.2284, "step": 4398 }, { "epoch": 0.21254288061071652, "grad_norm": 2.133077621459961, "learning_rate": 7.874571193892834e-07, "loss": 0.2548, "step": 4399 }, { "epoch": 0.21259119679180558, "grad_norm": 1.7248371839523315, "learning_rate": 7.874088032081944e-07, "loss": 0.2069, "step": 4400 }, { "epoch": 0.21263951297289463, "grad_norm": 9.303474426269531, "learning_rate": 7.873604870271054e-07, "loss": 0.3941, "step": 4401 }, { "epoch": 0.21268782915398368, "grad_norm": 2.626577615737915, "learning_rate": 7.873121708460164e-07, "loss": 0.3103, "step": 4402 }, { "epoch": 0.2127361453350727, "grad_norm": 1.9368129968643188, "learning_rate": 7.872638546649272e-07, "loss": 0.15, "step": 4403 }, { "epoch": 0.21278446151616176, "grad_norm": 4.60552978515625, "learning_rate": 7.872155384838381e-07, "loss": 0.325, "step": 4404 }, { "epoch": 0.2128327776972508, "grad_norm": 2.792093515396118, "learning_rate": 7.871672223027491e-07, "loss": 0.2509, "step": 4405 }, { "epoch": 0.21288109387833987, "grad_norm": 3.047590970993042, "learning_rate": 7.871189061216601e-07, "loss": 0.4488, "step": 4406 }, { "epoch": 0.2129294100594289, "grad_norm": 3.075392723083496, "learning_rate": 7.870705899405711e-07, "loss": 0.3361, "step": 4407 }, { "epoch": 0.21297772624051794, "grad_norm": 2.5730226039886475, "learning_rate": 7.870222737594821e-07, "loss": 0.3523, "step": 4408 }, { "epoch": 0.213026042421607, "grad_norm": 1.6600459814071655, "learning_rate": 7.86973957578393e-07, "loss": 0.1922, "step": 4409 }, { "epoch": 0.21307435860269605, "grad_norm": 2.2249345779418945, "learning_rate": 7.869256413973039e-07, "loss": 0.2604, "step": 4410 }, { "epoch": 0.21312267478378508, "grad_norm": 2.4718542098999023, "learning_rate": 7.868773252162148e-07, "loss": 0.3296, "step": 4411 }, { "epoch": 0.21317099096487413, "grad_norm": 3.5930352210998535, "learning_rate": 7.868290090351258e-07, "loss": 0.3053, "step": 4412 }, { "epoch": 0.21321930714596318, "grad_norm": 3.138570785522461, "learning_rate": 7.867806928540368e-07, "loss": 0.4059, "step": 4413 }, { "epoch": 0.21326762332705224, "grad_norm": 1.733976125717163, "learning_rate": 7.867323766729478e-07, "loss": 0.211, "step": 4414 }, { "epoch": 0.2133159395081413, "grad_norm": 3.0776851177215576, "learning_rate": 7.866840604918586e-07, "loss": 0.3833, "step": 4415 }, { "epoch": 0.21336425568923031, "grad_norm": 2.7687370777130127, "learning_rate": 7.866357443107696e-07, "loss": 0.3815, "step": 4416 }, { "epoch": 0.21341257187031937, "grad_norm": 2.455448865890503, "learning_rate": 7.865874281296806e-07, "loss": 0.3137, "step": 4417 }, { "epoch": 0.21346088805140842, "grad_norm": 2.1671011447906494, "learning_rate": 7.865391119485916e-07, "loss": 0.2519, "step": 4418 }, { "epoch": 0.21350920423249747, "grad_norm": 3.1957619190216064, "learning_rate": 7.864907957675026e-07, "loss": 0.389, "step": 4419 }, { "epoch": 0.2135575204135865, "grad_norm": 2.771728277206421, "learning_rate": 7.864424795864134e-07, "loss": 0.3879, "step": 4420 }, { "epoch": 0.21360583659467555, "grad_norm": 2.818314790725708, "learning_rate": 7.863941634053244e-07, "loss": 0.3277, "step": 4421 }, { "epoch": 0.2136541527757646, "grad_norm": 14.066436767578125, "learning_rate": 7.863458472242353e-07, "loss": 0.3646, "step": 4422 }, { "epoch": 0.21370246895685366, "grad_norm": 3.480604410171509, "learning_rate": 7.862975310431463e-07, "loss": 0.396, "step": 4423 }, { "epoch": 0.21375078513794268, "grad_norm": 2.130035877227783, "learning_rate": 7.862492148620573e-07, "loss": 0.2865, "step": 4424 }, { "epoch": 0.21379910131903174, "grad_norm": 3.5942466259002686, "learning_rate": 7.862008986809682e-07, "loss": 0.1605, "step": 4425 }, { "epoch": 0.2138474175001208, "grad_norm": 3.0735385417938232, "learning_rate": 7.861525824998792e-07, "loss": 0.2241, "step": 4426 }, { "epoch": 0.21389573368120984, "grad_norm": 1.7571933269500732, "learning_rate": 7.861042663187902e-07, "loss": 0.1743, "step": 4427 }, { "epoch": 0.2139440498622989, "grad_norm": 2.3597609996795654, "learning_rate": 7.86055950137701e-07, "loss": 0.2415, "step": 4428 }, { "epoch": 0.21399236604338792, "grad_norm": 2.6538891792297363, "learning_rate": 7.86007633956612e-07, "loss": 0.2606, "step": 4429 }, { "epoch": 0.21404068222447697, "grad_norm": 3.1693434715270996, "learning_rate": 7.859593177755229e-07, "loss": 0.264, "step": 4430 }, { "epoch": 0.21408899840556603, "grad_norm": 2.5586562156677246, "learning_rate": 7.859110015944339e-07, "loss": 0.2232, "step": 4431 }, { "epoch": 0.21413731458665508, "grad_norm": 2.8752903938293457, "learning_rate": 7.858626854133449e-07, "loss": 0.4055, "step": 4432 }, { "epoch": 0.2141856307677441, "grad_norm": 3.002474784851074, "learning_rate": 7.858143692322559e-07, "loss": 0.4013, "step": 4433 }, { "epoch": 0.21423394694883316, "grad_norm": 2.4458367824554443, "learning_rate": 7.857660530511669e-07, "loss": 0.3037, "step": 4434 }, { "epoch": 0.2142822631299222, "grad_norm": 2.4865834712982178, "learning_rate": 7.857177368700778e-07, "loss": 0.2968, "step": 4435 }, { "epoch": 0.21433057931101127, "grad_norm": 2.689688205718994, "learning_rate": 7.856694206889887e-07, "loss": 0.3414, "step": 4436 }, { "epoch": 0.2143788954921003, "grad_norm": 2.2116687297821045, "learning_rate": 7.856211045078996e-07, "loss": 0.2507, "step": 4437 }, { "epoch": 0.21442721167318934, "grad_norm": 2.665168285369873, "learning_rate": 7.855727883268106e-07, "loss": 0.2937, "step": 4438 }, { "epoch": 0.2144755278542784, "grad_norm": 3.668623447418213, "learning_rate": 7.855244721457216e-07, "loss": 0.3496, "step": 4439 }, { "epoch": 0.21452384403536745, "grad_norm": 2.560633659362793, "learning_rate": 7.854761559646326e-07, "loss": 0.2946, "step": 4440 }, { "epoch": 0.2145721602164565, "grad_norm": 3.004249095916748, "learning_rate": 7.854278397835434e-07, "loss": 0.4446, "step": 4441 }, { "epoch": 0.21462047639754553, "grad_norm": 2.7719051837921143, "learning_rate": 7.853795236024544e-07, "loss": 0.3424, "step": 4442 }, { "epoch": 0.21466879257863458, "grad_norm": 2.492406129837036, "learning_rate": 7.853312074213654e-07, "loss": 0.3706, "step": 4443 }, { "epoch": 0.21471710875972363, "grad_norm": 1.8886204957962036, "learning_rate": 7.852828912402764e-07, "loss": 0.1637, "step": 4444 }, { "epoch": 0.2147654249408127, "grad_norm": 2.589322805404663, "learning_rate": 7.852345750591874e-07, "loss": 0.3561, "step": 4445 }, { "epoch": 0.2148137411219017, "grad_norm": 1.3954678773880005, "learning_rate": 7.851862588780982e-07, "loss": 0.144, "step": 4446 }, { "epoch": 0.21486205730299077, "grad_norm": 2.1118783950805664, "learning_rate": 7.851379426970091e-07, "loss": 0.2363, "step": 4447 }, { "epoch": 0.21491037348407982, "grad_norm": 3.198965311050415, "learning_rate": 7.850896265159201e-07, "loss": 0.2889, "step": 4448 }, { "epoch": 0.21495868966516887, "grad_norm": 1.8232635259628296, "learning_rate": 7.850413103348311e-07, "loss": 0.1988, "step": 4449 }, { "epoch": 0.2150070058462579, "grad_norm": 3.0178253650665283, "learning_rate": 7.849929941537421e-07, "loss": 0.3154, "step": 4450 }, { "epoch": 0.21505532202734695, "grad_norm": 3.138434886932373, "learning_rate": 7.84944677972653e-07, "loss": 0.3241, "step": 4451 }, { "epoch": 0.215103638208436, "grad_norm": 2.85878849029541, "learning_rate": 7.84896361791564e-07, "loss": 0.3474, "step": 4452 }, { "epoch": 0.21515195438952506, "grad_norm": 2.696192741394043, "learning_rate": 7.84848045610475e-07, "loss": 0.2603, "step": 4453 }, { "epoch": 0.2152002705706141, "grad_norm": 3.0197935104370117, "learning_rate": 7.847997294293858e-07, "loss": 0.4085, "step": 4454 }, { "epoch": 0.21524858675170314, "grad_norm": 2.1271770000457764, "learning_rate": 7.847514132482968e-07, "loss": 0.2276, "step": 4455 }, { "epoch": 0.2152969029327922, "grad_norm": 2.4687297344207764, "learning_rate": 7.847030970672077e-07, "loss": 0.2802, "step": 4456 }, { "epoch": 0.21534521911388124, "grad_norm": 2.678668975830078, "learning_rate": 7.846547808861187e-07, "loss": 0.308, "step": 4457 }, { "epoch": 0.2153935352949703, "grad_norm": 1.7823216915130615, "learning_rate": 7.846064647050297e-07, "loss": 0.2035, "step": 4458 }, { "epoch": 0.21544185147605932, "grad_norm": 2.2270090579986572, "learning_rate": 7.845581485239407e-07, "loss": 0.2111, "step": 4459 }, { "epoch": 0.21549016765714837, "grad_norm": 2.85536527633667, "learning_rate": 7.845098323428516e-07, "loss": 0.2005, "step": 4460 }, { "epoch": 0.21553848383823743, "grad_norm": 4.061201095581055, "learning_rate": 7.844615161617626e-07, "loss": 0.3826, "step": 4461 }, { "epoch": 0.21558680001932648, "grad_norm": 4.540688991546631, "learning_rate": 7.844131999806734e-07, "loss": 0.2123, "step": 4462 }, { "epoch": 0.2156351162004155, "grad_norm": 2.2733612060546875, "learning_rate": 7.843648837995844e-07, "loss": 0.3331, "step": 4463 }, { "epoch": 0.21568343238150456, "grad_norm": 2.3479721546173096, "learning_rate": 7.843165676184954e-07, "loss": 0.3007, "step": 4464 }, { "epoch": 0.2157317485625936, "grad_norm": 1.6359423398971558, "learning_rate": 7.842682514374064e-07, "loss": 0.1842, "step": 4465 }, { "epoch": 0.21578006474368266, "grad_norm": 3.1428780555725098, "learning_rate": 7.842199352563174e-07, "loss": 0.2668, "step": 4466 }, { "epoch": 0.21582838092477172, "grad_norm": 6.167180061340332, "learning_rate": 7.841716190752282e-07, "loss": 0.2734, "step": 4467 }, { "epoch": 0.21587669710586074, "grad_norm": 2.5919833183288574, "learning_rate": 7.841233028941392e-07, "loss": 0.2242, "step": 4468 }, { "epoch": 0.2159250132869498, "grad_norm": 3.640038251876831, "learning_rate": 7.840749867130502e-07, "loss": 0.2878, "step": 4469 }, { "epoch": 0.21597332946803885, "grad_norm": 2.7679789066314697, "learning_rate": 7.840266705319612e-07, "loss": 0.222, "step": 4470 }, { "epoch": 0.2160216456491279, "grad_norm": 4.916796684265137, "learning_rate": 7.839783543508721e-07, "loss": 0.2498, "step": 4471 }, { "epoch": 0.21606996183021693, "grad_norm": 2.2076663970947266, "learning_rate": 7.83930038169783e-07, "loss": 0.2609, "step": 4472 }, { "epoch": 0.21611827801130598, "grad_norm": 2.3143651485443115, "learning_rate": 7.838817219886939e-07, "loss": 0.2236, "step": 4473 }, { "epoch": 0.21616659419239503, "grad_norm": 2.2179172039031982, "learning_rate": 7.838334058076049e-07, "loss": 0.3287, "step": 4474 }, { "epoch": 0.2162149103734841, "grad_norm": 3.898922920227051, "learning_rate": 7.837850896265159e-07, "loss": 0.3353, "step": 4475 }, { "epoch": 0.21626322655457314, "grad_norm": 1.690290093421936, "learning_rate": 7.837367734454269e-07, "loss": 0.1993, "step": 4476 }, { "epoch": 0.21631154273566217, "grad_norm": 2.6657140254974365, "learning_rate": 7.836884572643378e-07, "loss": 0.2514, "step": 4477 }, { "epoch": 0.21635985891675122, "grad_norm": 2.149536609649658, "learning_rate": 7.836401410832488e-07, "loss": 0.2271, "step": 4478 }, { "epoch": 0.21640817509784027, "grad_norm": 2.533428430557251, "learning_rate": 7.835918249021596e-07, "loss": 0.2857, "step": 4479 }, { "epoch": 0.21645649127892932, "grad_norm": 4.080658912658691, "learning_rate": 7.835435087210706e-07, "loss": 0.3994, "step": 4480 }, { "epoch": 0.21650480746001835, "grad_norm": 2.1994340419769287, "learning_rate": 7.834951925399816e-07, "loss": 0.2671, "step": 4481 }, { "epoch": 0.2165531236411074, "grad_norm": 3.1376736164093018, "learning_rate": 7.834468763588925e-07, "loss": 0.2584, "step": 4482 }, { "epoch": 0.21660143982219646, "grad_norm": 3.3491833209991455, "learning_rate": 7.833985601778035e-07, "loss": 0.3143, "step": 4483 }, { "epoch": 0.2166497560032855, "grad_norm": 1.7429461479187012, "learning_rate": 7.833502439967145e-07, "loss": 0.1955, "step": 4484 }, { "epoch": 0.21669807218437453, "grad_norm": 2.010296583175659, "learning_rate": 7.833019278156255e-07, "loss": 0.1888, "step": 4485 }, { "epoch": 0.2167463883654636, "grad_norm": 3.178650379180908, "learning_rate": 7.832536116345364e-07, "loss": 0.2582, "step": 4486 }, { "epoch": 0.21679470454655264, "grad_norm": 3.3128223419189453, "learning_rate": 7.832052954534474e-07, "loss": 0.2468, "step": 4487 }, { "epoch": 0.2168430207276417, "grad_norm": 3.0129687786102295, "learning_rate": 7.831569792723582e-07, "loss": 0.3728, "step": 4488 }, { "epoch": 0.21689133690873075, "grad_norm": 2.432960271835327, "learning_rate": 7.831086630912692e-07, "loss": 0.2031, "step": 4489 }, { "epoch": 0.21693965308981977, "grad_norm": 2.6113979816436768, "learning_rate": 7.830603469101802e-07, "loss": 0.3421, "step": 4490 }, { "epoch": 0.21698796927090883, "grad_norm": 2.9650444984436035, "learning_rate": 7.830120307290912e-07, "loss": 0.3529, "step": 4491 }, { "epoch": 0.21703628545199788, "grad_norm": 2.7180562019348145, "learning_rate": 7.829637145480021e-07, "loss": 0.2996, "step": 4492 }, { "epoch": 0.21708460163308693, "grad_norm": 2.282489776611328, "learning_rate": 7.82915398366913e-07, "loss": 0.2565, "step": 4493 }, { "epoch": 0.21713291781417596, "grad_norm": 4.692864894866943, "learning_rate": 7.82867082185824e-07, "loss": 0.4286, "step": 4494 }, { "epoch": 0.217181233995265, "grad_norm": 2.5484418869018555, "learning_rate": 7.82818766004735e-07, "loss": 0.207, "step": 4495 }, { "epoch": 0.21722955017635406, "grad_norm": 2.6525092124938965, "learning_rate": 7.827704498236459e-07, "loss": 0.3409, "step": 4496 }, { "epoch": 0.21727786635744312, "grad_norm": 4.692233562469482, "learning_rate": 7.827221336425569e-07, "loss": 0.4172, "step": 4497 }, { "epoch": 0.21732618253853214, "grad_norm": 3.114673614501953, "learning_rate": 7.826738174614677e-07, "loss": 0.3835, "step": 4498 }, { "epoch": 0.2173744987196212, "grad_norm": 3.239363670349121, "learning_rate": 7.826255012803787e-07, "loss": 0.3044, "step": 4499 }, { "epoch": 0.21742281490071025, "grad_norm": 2.368793249130249, "learning_rate": 7.825771850992897e-07, "loss": 0.3266, "step": 4500 }, { "epoch": 0.2174711310817993, "grad_norm": 2.765285015106201, "learning_rate": 7.825288689182007e-07, "loss": 0.2613, "step": 4501 }, { "epoch": 0.21751944726288835, "grad_norm": 13.776534080505371, "learning_rate": 7.824805527371117e-07, "loss": 0.3476, "step": 4502 }, { "epoch": 0.21756776344397738, "grad_norm": 4.22501277923584, "learning_rate": 7.824322365560226e-07, "loss": 0.3202, "step": 4503 }, { "epoch": 0.21761607962506643, "grad_norm": 2.102440357208252, "learning_rate": 7.823839203749335e-07, "loss": 0.3325, "step": 4504 }, { "epoch": 0.2176643958061555, "grad_norm": 2.5639681816101074, "learning_rate": 7.823356041938444e-07, "loss": 0.3973, "step": 4505 }, { "epoch": 0.21771271198724454, "grad_norm": 2.5299487113952637, "learning_rate": 7.822872880127554e-07, "loss": 0.1781, "step": 4506 }, { "epoch": 0.21776102816833356, "grad_norm": 2.702784299850464, "learning_rate": 7.822389718316664e-07, "loss": 0.2456, "step": 4507 }, { "epoch": 0.21780934434942262, "grad_norm": 2.1882331371307373, "learning_rate": 7.821906556505773e-07, "loss": 0.2922, "step": 4508 }, { "epoch": 0.21785766053051167, "grad_norm": 6.177511215209961, "learning_rate": 7.821423394694883e-07, "loss": 0.3279, "step": 4509 }, { "epoch": 0.21790597671160072, "grad_norm": 2.292459487915039, "learning_rate": 7.820940232883993e-07, "loss": 0.2876, "step": 4510 }, { "epoch": 0.21795429289268975, "grad_norm": 2.2756309509277344, "learning_rate": 7.820457071073102e-07, "loss": 0.2739, "step": 4511 }, { "epoch": 0.2180026090737788, "grad_norm": 2.4460816383361816, "learning_rate": 7.819973909262212e-07, "loss": 0.2459, "step": 4512 }, { "epoch": 0.21805092525486786, "grad_norm": 3.159996509552002, "learning_rate": 7.819490747451321e-07, "loss": 0.4842, "step": 4513 }, { "epoch": 0.2180992414359569, "grad_norm": 2.96755051612854, "learning_rate": 7.81900758564043e-07, "loss": 0.3298, "step": 4514 }, { "epoch": 0.21814755761704596, "grad_norm": 3.874763011932373, "learning_rate": 7.81852442382954e-07, "loss": 0.3537, "step": 4515 }, { "epoch": 0.218195873798135, "grad_norm": 1.930314064025879, "learning_rate": 7.81804126201865e-07, "loss": 0.2339, "step": 4516 }, { "epoch": 0.21824418997922404, "grad_norm": 5.119231224060059, "learning_rate": 7.81755810020776e-07, "loss": 0.3489, "step": 4517 }, { "epoch": 0.2182925061603131, "grad_norm": 2.706784725189209, "learning_rate": 7.817074938396869e-07, "loss": 0.2702, "step": 4518 }, { "epoch": 0.21834082234140215, "grad_norm": 3.592747449874878, "learning_rate": 7.816591776585978e-07, "loss": 0.1904, "step": 4519 }, { "epoch": 0.21838913852249117, "grad_norm": 2.856935977935791, "learning_rate": 7.816108614775088e-07, "loss": 0.4132, "step": 4520 }, { "epoch": 0.21843745470358022, "grad_norm": 2.6947991847991943, "learning_rate": 7.815625452964197e-07, "loss": 0.3113, "step": 4521 }, { "epoch": 0.21848577088466928, "grad_norm": 80.34122467041016, "learning_rate": 7.815142291153307e-07, "loss": 0.1871, "step": 4522 }, { "epoch": 0.21853408706575833, "grad_norm": 2.712217092514038, "learning_rate": 7.814659129342417e-07, "loss": 0.3698, "step": 4523 }, { "epoch": 0.21858240324684736, "grad_norm": 2.973224401473999, "learning_rate": 7.814175967531525e-07, "loss": 0.2996, "step": 4524 }, { "epoch": 0.2186307194279364, "grad_norm": 1.585904836654663, "learning_rate": 7.813692805720635e-07, "loss": 0.1713, "step": 4525 }, { "epoch": 0.21867903560902546, "grad_norm": 6.305344104766846, "learning_rate": 7.813209643909745e-07, "loss": 0.3448, "step": 4526 }, { "epoch": 0.21872735179011452, "grad_norm": 4.469231128692627, "learning_rate": 7.812726482098855e-07, "loss": 0.3461, "step": 4527 }, { "epoch": 0.21877566797120357, "grad_norm": 2.9139344692230225, "learning_rate": 7.812243320287965e-07, "loss": 0.2307, "step": 4528 }, { "epoch": 0.2188239841522926, "grad_norm": 2.5033481121063232, "learning_rate": 7.811760158477074e-07, "loss": 0.3045, "step": 4529 }, { "epoch": 0.21887230033338165, "grad_norm": 7.772347450256348, "learning_rate": 7.811276996666182e-07, "loss": 0.3831, "step": 4530 }, { "epoch": 0.2189206165144707, "grad_norm": 2.7382869720458984, "learning_rate": 7.810793834855292e-07, "loss": 0.3475, "step": 4531 }, { "epoch": 0.21896893269555975, "grad_norm": 1.4444425106048584, "learning_rate": 7.810310673044402e-07, "loss": 0.1615, "step": 4532 }, { "epoch": 0.21901724887664878, "grad_norm": 1.8104513883590698, "learning_rate": 7.809827511233512e-07, "loss": 0.2274, "step": 4533 }, { "epoch": 0.21906556505773783, "grad_norm": 2.2957606315612793, "learning_rate": 7.809344349422621e-07, "loss": 0.3063, "step": 4534 }, { "epoch": 0.21911388123882689, "grad_norm": 2.5026814937591553, "learning_rate": 7.808861187611731e-07, "loss": 0.2971, "step": 4535 }, { "epoch": 0.21916219741991594, "grad_norm": 2.768913984298706, "learning_rate": 7.808378025800841e-07, "loss": 0.2362, "step": 4536 }, { "epoch": 0.21921051360100496, "grad_norm": 2.503441333770752, "learning_rate": 7.80789486398995e-07, "loss": 0.288, "step": 4537 }, { "epoch": 0.21925882978209402, "grad_norm": 2.1884045600891113, "learning_rate": 7.807411702179059e-07, "loss": 0.2529, "step": 4538 }, { "epoch": 0.21930714596318307, "grad_norm": 2.4414873123168945, "learning_rate": 7.806928540368168e-07, "loss": 0.2698, "step": 4539 }, { "epoch": 0.21935546214427212, "grad_norm": 1.9954229593276978, "learning_rate": 7.806445378557278e-07, "loss": 0.2219, "step": 4540 }, { "epoch": 0.21940377832536118, "grad_norm": 2.0829107761383057, "learning_rate": 7.805962216746388e-07, "loss": 0.2311, "step": 4541 }, { "epoch": 0.2194520945064502, "grad_norm": 1.6084049940109253, "learning_rate": 7.805479054935498e-07, "loss": 0.187, "step": 4542 }, { "epoch": 0.21950041068753925, "grad_norm": 5.82145881652832, "learning_rate": 7.804995893124607e-07, "loss": 0.2528, "step": 4543 }, { "epoch": 0.2195487268686283, "grad_norm": 2.7039220333099365, "learning_rate": 7.804512731313717e-07, "loss": 0.2739, "step": 4544 }, { "epoch": 0.21959704304971736, "grad_norm": 2.1419291496276855, "learning_rate": 7.804029569502826e-07, "loss": 0.2293, "step": 4545 }, { "epoch": 0.2196453592308064, "grad_norm": 2.431762456893921, "learning_rate": 7.803546407691936e-07, "loss": 0.3342, "step": 4546 }, { "epoch": 0.21969367541189544, "grad_norm": 2.112330913543701, "learning_rate": 7.803063245881045e-07, "loss": 0.2589, "step": 4547 }, { "epoch": 0.2197419915929845, "grad_norm": 2.8716888427734375, "learning_rate": 7.802580084070155e-07, "loss": 0.3024, "step": 4548 }, { "epoch": 0.21979030777407355, "grad_norm": 2.813671112060547, "learning_rate": 7.802096922259265e-07, "loss": 0.3185, "step": 4549 }, { "epoch": 0.21983862395516257, "grad_norm": 40.66287612915039, "learning_rate": 7.801613760448373e-07, "loss": 0.3217, "step": 4550 }, { "epoch": 0.21988694013625162, "grad_norm": 3.3462088108062744, "learning_rate": 7.801130598637483e-07, "loss": 0.399, "step": 4551 }, { "epoch": 0.21993525631734068, "grad_norm": 2.5945310592651367, "learning_rate": 7.800647436826593e-07, "loss": 0.2982, "step": 4552 }, { "epoch": 0.21998357249842973, "grad_norm": 2.8796327114105225, "learning_rate": 7.800164275015703e-07, "loss": 0.2685, "step": 4553 }, { "epoch": 0.22003188867951878, "grad_norm": 2.7883846759796143, "learning_rate": 7.799681113204813e-07, "loss": 0.4065, "step": 4554 }, { "epoch": 0.2200802048606078, "grad_norm": 2.209182024002075, "learning_rate": 7.799197951393921e-07, "loss": 0.2602, "step": 4555 }, { "epoch": 0.22012852104169686, "grad_norm": 2.0928609371185303, "learning_rate": 7.79871478958303e-07, "loss": 0.1958, "step": 4556 }, { "epoch": 0.22017683722278591, "grad_norm": 2.703162670135498, "learning_rate": 7.79823162777214e-07, "loss": 0.3738, "step": 4557 }, { "epoch": 0.22022515340387497, "grad_norm": 5.202444553375244, "learning_rate": 7.79774846596125e-07, "loss": 0.4053, "step": 4558 }, { "epoch": 0.220273469584964, "grad_norm": 2.4708828926086426, "learning_rate": 7.79726530415036e-07, "loss": 0.3028, "step": 4559 }, { "epoch": 0.22032178576605305, "grad_norm": 3.0087180137634277, "learning_rate": 7.796782142339469e-07, "loss": 0.3874, "step": 4560 }, { "epoch": 0.2203701019471421, "grad_norm": 3.7030580043792725, "learning_rate": 7.796298980528579e-07, "loss": 0.3394, "step": 4561 }, { "epoch": 0.22041841812823115, "grad_norm": 2.6537442207336426, "learning_rate": 7.795815818717689e-07, "loss": 0.2995, "step": 4562 }, { "epoch": 0.22046673430932018, "grad_norm": 3.0483129024505615, "learning_rate": 7.795332656906797e-07, "loss": 0.4654, "step": 4563 }, { "epoch": 0.22051505049040923, "grad_norm": 2.097198009490967, "learning_rate": 7.794849495095907e-07, "loss": 0.1916, "step": 4564 }, { "epoch": 0.22056336667149828, "grad_norm": 2.4887282848358154, "learning_rate": 7.794366333285016e-07, "loss": 0.2178, "step": 4565 }, { "epoch": 0.22061168285258734, "grad_norm": 3.4218835830688477, "learning_rate": 7.793883171474126e-07, "loss": 0.337, "step": 4566 }, { "epoch": 0.2206599990336764, "grad_norm": 2.3579225540161133, "learning_rate": 7.793400009663236e-07, "loss": 0.2127, "step": 4567 }, { "epoch": 0.22070831521476542, "grad_norm": 2.4615321159362793, "learning_rate": 7.792916847852346e-07, "loss": 0.3012, "step": 4568 }, { "epoch": 0.22075663139585447, "grad_norm": 2.7193355560302734, "learning_rate": 7.792433686041455e-07, "loss": 0.3511, "step": 4569 }, { "epoch": 0.22080494757694352, "grad_norm": 2.2520782947540283, "learning_rate": 7.791950524230565e-07, "loss": 0.2809, "step": 4570 }, { "epoch": 0.22085326375803258, "grad_norm": 2.9324796199798584, "learning_rate": 7.791467362419674e-07, "loss": 0.3944, "step": 4571 }, { "epoch": 0.2209015799391216, "grad_norm": 2.915318250656128, "learning_rate": 7.790984200608783e-07, "loss": 0.2785, "step": 4572 }, { "epoch": 0.22094989612021065, "grad_norm": 2.5068671703338623, "learning_rate": 7.790501038797893e-07, "loss": 0.3168, "step": 4573 }, { "epoch": 0.2209982123012997, "grad_norm": 3.370088815689087, "learning_rate": 7.790017876987003e-07, "loss": 0.3577, "step": 4574 }, { "epoch": 0.22104652848238876, "grad_norm": 2.232116460800171, "learning_rate": 7.789534715176112e-07, "loss": 0.2363, "step": 4575 }, { "epoch": 0.22109484466347779, "grad_norm": 3.1992483139038086, "learning_rate": 7.789051553365221e-07, "loss": 0.3623, "step": 4576 }, { "epoch": 0.22114316084456684, "grad_norm": 1.626112937927246, "learning_rate": 7.788568391554331e-07, "loss": 0.1948, "step": 4577 }, { "epoch": 0.2211914770256559, "grad_norm": 6.697475910186768, "learning_rate": 7.788085229743441e-07, "loss": 0.3299, "step": 4578 }, { "epoch": 0.22123979320674494, "grad_norm": 3.35552978515625, "learning_rate": 7.787602067932551e-07, "loss": 0.4343, "step": 4579 }, { "epoch": 0.221288109387834, "grad_norm": 2.1759631633758545, "learning_rate": 7.787118906121661e-07, "loss": 0.228, "step": 4580 }, { "epoch": 0.22133642556892302, "grad_norm": 2.093818187713623, "learning_rate": 7.786635744310769e-07, "loss": 0.3401, "step": 4581 }, { "epoch": 0.22138474175001208, "grad_norm": 2.9410593509674072, "learning_rate": 7.786152582499878e-07, "loss": 0.4199, "step": 4582 }, { "epoch": 0.22143305793110113, "grad_norm": 16.0241641998291, "learning_rate": 7.785669420688988e-07, "loss": 0.2838, "step": 4583 }, { "epoch": 0.22148137411219018, "grad_norm": 4.050350666046143, "learning_rate": 7.785186258878098e-07, "loss": 0.2642, "step": 4584 }, { "epoch": 0.2215296902932792, "grad_norm": 1.7099111080169678, "learning_rate": 7.784703097067208e-07, "loss": 0.1759, "step": 4585 }, { "epoch": 0.22157800647436826, "grad_norm": 3.6775364875793457, "learning_rate": 7.784219935256317e-07, "loss": 0.2118, "step": 4586 }, { "epoch": 0.22162632265545731, "grad_norm": 2.52754545211792, "learning_rate": 7.783736773445427e-07, "loss": 0.2156, "step": 4587 }, { "epoch": 0.22167463883654637, "grad_norm": 2.0744736194610596, "learning_rate": 7.783253611634536e-07, "loss": 0.2156, "step": 4588 }, { "epoch": 0.2217229550176354, "grad_norm": 1.959580898284912, "learning_rate": 7.782770449823645e-07, "loss": 0.1698, "step": 4589 }, { "epoch": 0.22177127119872445, "grad_norm": 2.6441471576690674, "learning_rate": 7.782287288012755e-07, "loss": 0.3078, "step": 4590 }, { "epoch": 0.2218195873798135, "grad_norm": 5.517317771911621, "learning_rate": 7.781804126201864e-07, "loss": 0.3981, "step": 4591 }, { "epoch": 0.22186790356090255, "grad_norm": 3.286358118057251, "learning_rate": 7.781320964390974e-07, "loss": 0.2448, "step": 4592 }, { "epoch": 0.2219162197419916, "grad_norm": 3.353717803955078, "learning_rate": 7.780837802580084e-07, "loss": 0.4668, "step": 4593 }, { "epoch": 0.22196453592308063, "grad_norm": 2.1455163955688477, "learning_rate": 7.780354640769194e-07, "loss": 0.2647, "step": 4594 }, { "epoch": 0.22201285210416968, "grad_norm": 2.346538543701172, "learning_rate": 7.779871478958303e-07, "loss": 0.2807, "step": 4595 }, { "epoch": 0.22206116828525874, "grad_norm": 2.4316141605377197, "learning_rate": 7.779388317147413e-07, "loss": 0.2392, "step": 4596 }, { "epoch": 0.2221094844663478, "grad_norm": 7.365656852722168, "learning_rate": 7.778905155336521e-07, "loss": 0.3177, "step": 4597 }, { "epoch": 0.22215780064743681, "grad_norm": 2.650346279144287, "learning_rate": 7.778421993525631e-07, "loss": 0.3207, "step": 4598 }, { "epoch": 0.22220611682852587, "grad_norm": 2.2591004371643066, "learning_rate": 7.777938831714741e-07, "loss": 0.3065, "step": 4599 }, { "epoch": 0.22225443300961492, "grad_norm": 1.9259477853775024, "learning_rate": 7.777455669903851e-07, "loss": 0.162, "step": 4600 }, { "epoch": 0.22230274919070397, "grad_norm": 2.783783197402954, "learning_rate": 7.77697250809296e-07, "loss": 0.3386, "step": 4601 }, { "epoch": 0.222351065371793, "grad_norm": 3.0856266021728516, "learning_rate": 7.776489346282069e-07, "loss": 0.3676, "step": 4602 }, { "epoch": 0.22239938155288205, "grad_norm": 2.632148265838623, "learning_rate": 7.776006184471179e-07, "loss": 0.3616, "step": 4603 }, { "epoch": 0.2224476977339711, "grad_norm": 1.8903313875198364, "learning_rate": 7.775523022660289e-07, "loss": 0.1713, "step": 4604 }, { "epoch": 0.22249601391506016, "grad_norm": 2.582512855529785, "learning_rate": 7.775039860849399e-07, "loss": 0.2111, "step": 4605 }, { "epoch": 0.2225443300961492, "grad_norm": 3.2244958877563477, "learning_rate": 7.774556699038508e-07, "loss": 0.4368, "step": 4606 }, { "epoch": 0.22259264627723824, "grad_norm": 4.161839485168457, "learning_rate": 7.774073537227616e-07, "loss": 0.3124, "step": 4607 }, { "epoch": 0.2226409624583273, "grad_norm": 3.693784713745117, "learning_rate": 7.773590375416726e-07, "loss": 0.3673, "step": 4608 }, { "epoch": 0.22268927863941634, "grad_norm": 4.620481014251709, "learning_rate": 7.773107213605836e-07, "loss": 0.3046, "step": 4609 }, { "epoch": 0.2227375948205054, "grad_norm": 2.148059368133545, "learning_rate": 7.772624051794946e-07, "loss": 0.2556, "step": 4610 }, { "epoch": 0.22278591100159442, "grad_norm": 3.7394607067108154, "learning_rate": 7.772140889984056e-07, "loss": 0.3591, "step": 4611 }, { "epoch": 0.22283422718268348, "grad_norm": 1.800376534461975, "learning_rate": 7.771657728173165e-07, "loss": 0.1663, "step": 4612 }, { "epoch": 0.22288254336377253, "grad_norm": 2.5091640949249268, "learning_rate": 7.771174566362275e-07, "loss": 0.3007, "step": 4613 }, { "epoch": 0.22293085954486158, "grad_norm": 2.2874958515167236, "learning_rate": 7.770691404551383e-07, "loss": 0.275, "step": 4614 }, { "epoch": 0.22297917572595063, "grad_norm": 2.6534204483032227, "learning_rate": 7.770208242740493e-07, "loss": 0.4061, "step": 4615 }, { "epoch": 0.22302749190703966, "grad_norm": 3.2231791019439697, "learning_rate": 7.769725080929603e-07, "loss": 0.4095, "step": 4616 }, { "epoch": 0.2230758080881287, "grad_norm": 2.3451342582702637, "learning_rate": 7.769241919118712e-07, "loss": 0.3202, "step": 4617 }, { "epoch": 0.22312412426921777, "grad_norm": 3.32486629486084, "learning_rate": 7.768758757307822e-07, "loss": 0.3138, "step": 4618 }, { "epoch": 0.22317244045030682, "grad_norm": 2.560451030731201, "learning_rate": 7.768275595496932e-07, "loss": 0.2979, "step": 4619 }, { "epoch": 0.22322075663139584, "grad_norm": 2.57271671295166, "learning_rate": 7.767792433686041e-07, "loss": 0.2876, "step": 4620 }, { "epoch": 0.2232690728124849, "grad_norm": 3.200641393661499, "learning_rate": 7.767309271875151e-07, "loss": 0.3971, "step": 4621 }, { "epoch": 0.22331738899357395, "grad_norm": 3.0477263927459717, "learning_rate": 7.766826110064261e-07, "loss": 0.3011, "step": 4622 }, { "epoch": 0.223365705174663, "grad_norm": 6.943805694580078, "learning_rate": 7.766342948253369e-07, "loss": 0.3126, "step": 4623 }, { "epoch": 0.22341402135575203, "grad_norm": 5.408891677856445, "learning_rate": 7.765859786442479e-07, "loss": 0.3831, "step": 4624 }, { "epoch": 0.22346233753684108, "grad_norm": 19.06586456298828, "learning_rate": 7.765376624631589e-07, "loss": 0.3872, "step": 4625 }, { "epoch": 0.22351065371793014, "grad_norm": 2.7545268535614014, "learning_rate": 7.764893462820699e-07, "loss": 0.1869, "step": 4626 }, { "epoch": 0.2235589698990192, "grad_norm": 2.9056777954101562, "learning_rate": 7.764410301009808e-07, "loss": 0.3776, "step": 4627 }, { "epoch": 0.22360728608010824, "grad_norm": 2.197667121887207, "learning_rate": 7.763927139198917e-07, "loss": 0.2219, "step": 4628 }, { "epoch": 0.22365560226119727, "grad_norm": 3.5749967098236084, "learning_rate": 7.763443977388027e-07, "loss": 0.3515, "step": 4629 }, { "epoch": 0.22370391844228632, "grad_norm": 2.7781333923339844, "learning_rate": 7.762960815577137e-07, "loss": 0.3581, "step": 4630 }, { "epoch": 0.22375223462337537, "grad_norm": 3.3989713191986084, "learning_rate": 7.762477653766246e-07, "loss": 0.3718, "step": 4631 }, { "epoch": 0.22380055080446443, "grad_norm": 1.6635260581970215, "learning_rate": 7.761994491955356e-07, "loss": 0.1774, "step": 4632 }, { "epoch": 0.22384886698555345, "grad_norm": 2.3831090927124023, "learning_rate": 7.761511330144464e-07, "loss": 0.2003, "step": 4633 }, { "epoch": 0.2238971831666425, "grad_norm": 1.9346706867218018, "learning_rate": 7.761028168333574e-07, "loss": 0.1731, "step": 4634 }, { "epoch": 0.22394549934773156, "grad_norm": 3.1334428787231445, "learning_rate": 7.760545006522684e-07, "loss": 0.3737, "step": 4635 }, { "epoch": 0.2239938155288206, "grad_norm": 2.2885680198669434, "learning_rate": 7.760061844711794e-07, "loss": 0.2679, "step": 4636 }, { "epoch": 0.22404213170990964, "grad_norm": 12.001982688903809, "learning_rate": 7.759578682900904e-07, "loss": 0.211, "step": 4637 }, { "epoch": 0.2240904478909987, "grad_norm": 2.0380332469940186, "learning_rate": 7.759095521090013e-07, "loss": 0.1822, "step": 4638 }, { "epoch": 0.22413876407208774, "grad_norm": 2.58005690574646, "learning_rate": 7.758612359279121e-07, "loss": 0.2506, "step": 4639 }, { "epoch": 0.2241870802531768, "grad_norm": 6.710004806518555, "learning_rate": 7.758129197468231e-07, "loss": 0.2697, "step": 4640 }, { "epoch": 0.22423539643426585, "grad_norm": 3.484121322631836, "learning_rate": 7.757646035657341e-07, "loss": 0.3567, "step": 4641 }, { "epoch": 0.22428371261535487, "grad_norm": 120.70475769042969, "learning_rate": 7.757162873846451e-07, "loss": 0.2433, "step": 4642 }, { "epoch": 0.22433202879644393, "grad_norm": 3.172497510910034, "learning_rate": 7.75667971203556e-07, "loss": 0.3556, "step": 4643 }, { "epoch": 0.22438034497753298, "grad_norm": 1.8290364742279053, "learning_rate": 7.75619655022467e-07, "loss": 0.2088, "step": 4644 }, { "epoch": 0.22442866115862203, "grad_norm": 2.957904815673828, "learning_rate": 7.75571338841378e-07, "loss": 0.4359, "step": 4645 }, { "epoch": 0.22447697733971106, "grad_norm": 2.5261199474334717, "learning_rate": 7.755230226602889e-07, "loss": 0.2864, "step": 4646 }, { "epoch": 0.2245252935208001, "grad_norm": 2.3164923191070557, "learning_rate": 7.754747064791999e-07, "loss": 0.2692, "step": 4647 }, { "epoch": 0.22457360970188917, "grad_norm": 6.073535919189453, "learning_rate": 7.754263902981108e-07, "loss": 0.2924, "step": 4648 }, { "epoch": 0.22462192588297822, "grad_norm": 2.4657628536224365, "learning_rate": 7.753780741170217e-07, "loss": 0.3175, "step": 4649 }, { "epoch": 0.22467024206406724, "grad_norm": 3.197948455810547, "learning_rate": 7.753297579359327e-07, "loss": 0.3849, "step": 4650 }, { "epoch": 0.2247185582451563, "grad_norm": 3.541297197341919, "learning_rate": 7.752814417548437e-07, "loss": 0.3325, "step": 4651 }, { "epoch": 0.22476687442624535, "grad_norm": 3.5447349548339844, "learning_rate": 7.752331255737546e-07, "loss": 0.2856, "step": 4652 }, { "epoch": 0.2248151906073344, "grad_norm": 2.5255258083343506, "learning_rate": 7.751848093926656e-07, "loss": 0.2242, "step": 4653 }, { "epoch": 0.22486350678842346, "grad_norm": 3.823889970779419, "learning_rate": 7.751364932115765e-07, "loss": 0.3199, "step": 4654 }, { "epoch": 0.22491182296951248, "grad_norm": 2.368086338043213, "learning_rate": 7.750881770304875e-07, "loss": 0.2842, "step": 4655 }, { "epoch": 0.22496013915060153, "grad_norm": 3.284306049346924, "learning_rate": 7.750398608493985e-07, "loss": 0.4202, "step": 4656 }, { "epoch": 0.2250084553316906, "grad_norm": 2.828251361846924, "learning_rate": 7.749915446683094e-07, "loss": 0.322, "step": 4657 }, { "epoch": 0.22505677151277964, "grad_norm": 2.7105910778045654, "learning_rate": 7.749432284872204e-07, "loss": 0.269, "step": 4658 }, { "epoch": 0.22510508769386867, "grad_norm": 3.0293736457824707, "learning_rate": 7.748949123061312e-07, "loss": 0.3248, "step": 4659 }, { "epoch": 0.22515340387495772, "grad_norm": 4.956627368927002, "learning_rate": 7.748465961250422e-07, "loss": 0.3028, "step": 4660 }, { "epoch": 0.22520172005604677, "grad_norm": 3.117847204208374, "learning_rate": 7.747982799439532e-07, "loss": 0.2432, "step": 4661 }, { "epoch": 0.22525003623713583, "grad_norm": 2.139409065246582, "learning_rate": 7.747499637628642e-07, "loss": 0.2826, "step": 4662 }, { "epoch": 0.22529835241822485, "grad_norm": 2.0994904041290283, "learning_rate": 7.747016475817752e-07, "loss": 0.2115, "step": 4663 }, { "epoch": 0.2253466685993139, "grad_norm": 2.1391165256500244, "learning_rate": 7.746533314006861e-07, "loss": 0.2104, "step": 4664 }, { "epoch": 0.22539498478040296, "grad_norm": 4.711083889007568, "learning_rate": 7.746050152195969e-07, "loss": 0.3627, "step": 4665 }, { "epoch": 0.225443300961492, "grad_norm": 2.891263008117676, "learning_rate": 7.745566990385079e-07, "loss": 0.3664, "step": 4666 }, { "epoch": 0.22549161714258106, "grad_norm": 2.5693447589874268, "learning_rate": 7.745083828574189e-07, "loss": 0.2113, "step": 4667 }, { "epoch": 0.2255399333236701, "grad_norm": 3.171736478805542, "learning_rate": 7.744600666763299e-07, "loss": 0.3249, "step": 4668 }, { "epoch": 0.22558824950475914, "grad_norm": 3.1260457038879395, "learning_rate": 7.744117504952408e-07, "loss": 0.3735, "step": 4669 }, { "epoch": 0.2256365656858482, "grad_norm": 4.949263095855713, "learning_rate": 7.743634343141518e-07, "loss": 0.3045, "step": 4670 }, { "epoch": 0.22568488186693725, "grad_norm": 3.047361135482788, "learning_rate": 7.743151181330627e-07, "loss": 0.327, "step": 4671 }, { "epoch": 0.22573319804802627, "grad_norm": 2.921543598175049, "learning_rate": 7.742668019519737e-07, "loss": 0.3406, "step": 4672 }, { "epoch": 0.22578151422911533, "grad_norm": 2.408541440963745, "learning_rate": 7.742184857708846e-07, "loss": 0.3725, "step": 4673 }, { "epoch": 0.22582983041020438, "grad_norm": 3.0333945751190186, "learning_rate": 7.741701695897956e-07, "loss": 0.274, "step": 4674 }, { "epoch": 0.22587814659129343, "grad_norm": 3.275524377822876, "learning_rate": 7.741218534087065e-07, "loss": 0.2932, "step": 4675 }, { "epoch": 0.22592646277238246, "grad_norm": 2.3426082134246826, "learning_rate": 7.740735372276175e-07, "loss": 0.2826, "step": 4676 }, { "epoch": 0.2259747789534715, "grad_norm": 3.1100950241088867, "learning_rate": 7.740252210465285e-07, "loss": 0.3304, "step": 4677 }, { "epoch": 0.22602309513456056, "grad_norm": 2.2222156524658203, "learning_rate": 7.739769048654394e-07, "loss": 0.1979, "step": 4678 }, { "epoch": 0.22607141131564962, "grad_norm": 3.7124931812286377, "learning_rate": 7.739285886843504e-07, "loss": 0.4271, "step": 4679 }, { "epoch": 0.22611972749673867, "grad_norm": 1.945607304573059, "learning_rate": 7.738802725032613e-07, "loss": 0.2156, "step": 4680 }, { "epoch": 0.2261680436778277, "grad_norm": 1.9340132474899292, "learning_rate": 7.738319563221723e-07, "loss": 0.1939, "step": 4681 }, { "epoch": 0.22621635985891675, "grad_norm": 2.8658878803253174, "learning_rate": 7.737836401410832e-07, "loss": 0.2621, "step": 4682 }, { "epoch": 0.2262646760400058, "grad_norm": 2.1061203479766846, "learning_rate": 7.737353239599942e-07, "loss": 0.1744, "step": 4683 }, { "epoch": 0.22631299222109486, "grad_norm": 3.9562385082244873, "learning_rate": 7.736870077789051e-07, "loss": 0.3044, "step": 4684 }, { "epoch": 0.22636130840218388, "grad_norm": 5.810737133026123, "learning_rate": 7.73638691597816e-07, "loss": 0.2337, "step": 4685 }, { "epoch": 0.22640962458327293, "grad_norm": 4.10613489151001, "learning_rate": 7.73590375416727e-07, "loss": 0.3633, "step": 4686 }, { "epoch": 0.226457940764362, "grad_norm": 1.6344499588012695, "learning_rate": 7.73542059235638e-07, "loss": 0.1918, "step": 4687 }, { "epoch": 0.22650625694545104, "grad_norm": 2.968205690383911, "learning_rate": 7.73493743054549e-07, "loss": 0.4453, "step": 4688 }, { "epoch": 0.22655457312654007, "grad_norm": 2.862765312194824, "learning_rate": 7.7344542687346e-07, "loss": 0.3157, "step": 4689 }, { "epoch": 0.22660288930762912, "grad_norm": 2.920133590698242, "learning_rate": 7.733971106923707e-07, "loss": 0.3743, "step": 4690 }, { "epoch": 0.22665120548871817, "grad_norm": 2.3792905807495117, "learning_rate": 7.733487945112817e-07, "loss": 0.3318, "step": 4691 }, { "epoch": 0.22669952166980722, "grad_norm": 1.8648273944854736, "learning_rate": 7.733004783301927e-07, "loss": 0.1997, "step": 4692 }, { "epoch": 0.22674783785089628, "grad_norm": 2.5799472332000732, "learning_rate": 7.732521621491037e-07, "loss": 0.4048, "step": 4693 }, { "epoch": 0.2267961540319853, "grad_norm": 3.878633975982666, "learning_rate": 7.732038459680147e-07, "loss": 0.3276, "step": 4694 }, { "epoch": 0.22684447021307436, "grad_norm": 2.6512250900268555, "learning_rate": 7.731555297869256e-07, "loss": 0.3113, "step": 4695 }, { "epoch": 0.2268927863941634, "grad_norm": 3.0203118324279785, "learning_rate": 7.731072136058366e-07, "loss": 0.4255, "step": 4696 }, { "epoch": 0.22694110257525246, "grad_norm": 5.4497833251953125, "learning_rate": 7.730588974247475e-07, "loss": 0.3097, "step": 4697 }, { "epoch": 0.2269894187563415, "grad_norm": 2.634427070617676, "learning_rate": 7.730105812436585e-07, "loss": 0.4019, "step": 4698 }, { "epoch": 0.22703773493743054, "grad_norm": 3.219999313354492, "learning_rate": 7.729622650625694e-07, "loss": 0.2959, "step": 4699 }, { "epoch": 0.2270860511185196, "grad_norm": 2.0993471145629883, "learning_rate": 7.729139488814804e-07, "loss": 0.2561, "step": 4700 }, { "epoch": 0.22713436729960865, "grad_norm": 1.8310056924819946, "learning_rate": 7.728656327003913e-07, "loss": 0.1982, "step": 4701 }, { "epoch": 0.22718268348069767, "grad_norm": 2.752166509628296, "learning_rate": 7.728173165193023e-07, "loss": 0.2153, "step": 4702 }, { "epoch": 0.22723099966178673, "grad_norm": 1.981201171875, "learning_rate": 7.727690003382132e-07, "loss": 0.16, "step": 4703 }, { "epoch": 0.22727931584287578, "grad_norm": 2.0524566173553467, "learning_rate": 7.727206841571242e-07, "loss": 0.2814, "step": 4704 }, { "epoch": 0.22732763202396483, "grad_norm": 3.1289961338043213, "learning_rate": 7.726723679760352e-07, "loss": 0.3469, "step": 4705 }, { "epoch": 0.22737594820505388, "grad_norm": 8.808453559875488, "learning_rate": 7.726240517949461e-07, "loss": 0.4582, "step": 4706 }, { "epoch": 0.2274242643861429, "grad_norm": 2.9809138774871826, "learning_rate": 7.72575735613857e-07, "loss": 0.4301, "step": 4707 }, { "epoch": 0.22747258056723196, "grad_norm": 2.346085786819458, "learning_rate": 7.72527419432768e-07, "loss": 0.3053, "step": 4708 }, { "epoch": 0.22752089674832102, "grad_norm": 2.8661088943481445, "learning_rate": 7.72479103251679e-07, "loss": 0.4924, "step": 4709 }, { "epoch": 0.22756921292941007, "grad_norm": 2.830779790878296, "learning_rate": 7.724307870705899e-07, "loss": 0.3314, "step": 4710 }, { "epoch": 0.2276175291104991, "grad_norm": 2.5492918491363525, "learning_rate": 7.723824708895008e-07, "loss": 0.4282, "step": 4711 }, { "epoch": 0.22766584529158815, "grad_norm": 8.571024894714355, "learning_rate": 7.723341547084118e-07, "loss": 0.2824, "step": 4712 }, { "epoch": 0.2277141614726772, "grad_norm": 2.3894197940826416, "learning_rate": 7.722858385273228e-07, "loss": 0.2237, "step": 4713 }, { "epoch": 0.22776247765376625, "grad_norm": 2.516655683517456, "learning_rate": 7.722375223462338e-07, "loss": 0.3166, "step": 4714 }, { "epoch": 0.22781079383485528, "grad_norm": 2.584404230117798, "learning_rate": 7.721892061651448e-07, "loss": 0.1898, "step": 4715 }, { "epoch": 0.22785911001594433, "grad_norm": 2.4054219722747803, "learning_rate": 7.721408899840555e-07, "loss": 0.2093, "step": 4716 }, { "epoch": 0.22790742619703339, "grad_norm": 2.608708620071411, "learning_rate": 7.720925738029665e-07, "loss": 0.3606, "step": 4717 }, { "epoch": 0.22795574237812244, "grad_norm": 2.805225133895874, "learning_rate": 7.720442576218775e-07, "loss": 0.3301, "step": 4718 }, { "epoch": 0.2280040585592115, "grad_norm": 2.093705177307129, "learning_rate": 7.719959414407885e-07, "loss": 0.2702, "step": 4719 }, { "epoch": 0.22805237474030052, "grad_norm": 3.0162353515625, "learning_rate": 7.719476252596995e-07, "loss": 0.395, "step": 4720 }, { "epoch": 0.22810069092138957, "grad_norm": 6.977963447570801, "learning_rate": 7.718993090786104e-07, "loss": 0.2707, "step": 4721 }, { "epoch": 0.22814900710247862, "grad_norm": 2.531242609024048, "learning_rate": 7.718509928975213e-07, "loss": 0.3387, "step": 4722 }, { "epoch": 0.22819732328356768, "grad_norm": 2.199673652648926, "learning_rate": 7.718026767164323e-07, "loss": 0.2842, "step": 4723 }, { "epoch": 0.2282456394646567, "grad_norm": 2.611496686935425, "learning_rate": 7.717543605353432e-07, "loss": 0.2447, "step": 4724 }, { "epoch": 0.22829395564574576, "grad_norm": 2.0560619831085205, "learning_rate": 7.717060443542542e-07, "loss": 0.1468, "step": 4725 }, { "epoch": 0.2283422718268348, "grad_norm": 2.3396918773651123, "learning_rate": 7.716577281731652e-07, "loss": 0.2568, "step": 4726 }, { "epoch": 0.22839058800792386, "grad_norm": 2.384145736694336, "learning_rate": 7.716094119920761e-07, "loss": 0.2796, "step": 4727 }, { "epoch": 0.2284389041890129, "grad_norm": 2.740978717803955, "learning_rate": 7.715610958109871e-07, "loss": 0.265, "step": 4728 }, { "epoch": 0.22848722037010194, "grad_norm": 3.033041000366211, "learning_rate": 7.71512779629898e-07, "loss": 0.3777, "step": 4729 }, { "epoch": 0.228535536551191, "grad_norm": 2.7184574604034424, "learning_rate": 7.71464463448809e-07, "loss": 0.3382, "step": 4730 }, { "epoch": 0.22858385273228005, "grad_norm": 2.160935878753662, "learning_rate": 7.7141614726772e-07, "loss": 0.2201, "step": 4731 }, { "epoch": 0.2286321689133691, "grad_norm": 2.4079010486602783, "learning_rate": 7.713678310866308e-07, "loss": 0.246, "step": 4732 }, { "epoch": 0.22868048509445812, "grad_norm": 4.761903285980225, "learning_rate": 7.713195149055418e-07, "loss": 0.3187, "step": 4733 }, { "epoch": 0.22872880127554718, "grad_norm": 1.5354955196380615, "learning_rate": 7.712711987244528e-07, "loss": 0.2044, "step": 4734 }, { "epoch": 0.22877711745663623, "grad_norm": 2.712538003921509, "learning_rate": 7.712228825433637e-07, "loss": 0.2586, "step": 4735 }, { "epoch": 0.22882543363772528, "grad_norm": 2.0128097534179688, "learning_rate": 7.711745663622747e-07, "loss": 0.221, "step": 4736 }, { "epoch": 0.2288737498188143, "grad_norm": 2.5610711574554443, "learning_rate": 7.711262501811856e-07, "loss": 0.3347, "step": 4737 }, { "epoch": 0.22892206599990336, "grad_norm": 2.3214352130889893, "learning_rate": 7.710779340000966e-07, "loss": 0.2245, "step": 4738 }, { "epoch": 0.22897038218099242, "grad_norm": 2.1113600730895996, "learning_rate": 7.710296178190076e-07, "loss": 0.1924, "step": 4739 }, { "epoch": 0.22901869836208147, "grad_norm": 3.249459981918335, "learning_rate": 7.709813016379186e-07, "loss": 0.4208, "step": 4740 }, { "epoch": 0.2290670145431705, "grad_norm": 2.375213861465454, "learning_rate": 7.709329854568295e-07, "loss": 0.1661, "step": 4741 }, { "epoch": 0.22911533072425955, "grad_norm": 2.780529499053955, "learning_rate": 7.708846692757403e-07, "loss": 0.3796, "step": 4742 }, { "epoch": 0.2291636469053486, "grad_norm": 2.640404462814331, "learning_rate": 7.708363530946513e-07, "loss": 0.3223, "step": 4743 }, { "epoch": 0.22921196308643765, "grad_norm": 2.4326210021972656, "learning_rate": 7.707880369135623e-07, "loss": 0.3336, "step": 4744 }, { "epoch": 0.2292602792675267, "grad_norm": 1.5563509464263916, "learning_rate": 7.707397207324733e-07, "loss": 0.1471, "step": 4745 }, { "epoch": 0.22930859544861573, "grad_norm": 1.9023663997650146, "learning_rate": 7.706914045513843e-07, "loss": 0.2037, "step": 4746 }, { "epoch": 0.22935691162970478, "grad_norm": 2.1366331577301025, "learning_rate": 7.706430883702952e-07, "loss": 0.2091, "step": 4747 }, { "epoch": 0.22940522781079384, "grad_norm": 2.0110507011413574, "learning_rate": 7.705947721892061e-07, "loss": 0.2032, "step": 4748 }, { "epoch": 0.2294535439918829, "grad_norm": 2.269538640975952, "learning_rate": 7.70546456008117e-07, "loss": 0.2315, "step": 4749 }, { "epoch": 0.22950186017297192, "grad_norm": 5.801553726196289, "learning_rate": 7.70498139827028e-07, "loss": 0.2359, "step": 4750 }, { "epoch": 0.22955017635406097, "grad_norm": 2.1382713317871094, "learning_rate": 7.70449823645939e-07, "loss": 0.2386, "step": 4751 }, { "epoch": 0.22959849253515002, "grad_norm": 2.4424846172332764, "learning_rate": 7.7040150746485e-07, "loss": 0.2799, "step": 4752 }, { "epoch": 0.22964680871623908, "grad_norm": 3.808107852935791, "learning_rate": 7.703531912837609e-07, "loss": 0.3663, "step": 4753 }, { "epoch": 0.22969512489732813, "grad_norm": 2.2281529903411865, "learning_rate": 7.703048751026718e-07, "loss": 0.2985, "step": 4754 }, { "epoch": 0.22974344107841715, "grad_norm": 3.3831706047058105, "learning_rate": 7.702565589215828e-07, "loss": 0.448, "step": 4755 }, { "epoch": 0.2297917572595062, "grad_norm": 3.09592342376709, "learning_rate": 7.702082427404938e-07, "loss": 0.3551, "step": 4756 }, { "epoch": 0.22984007344059526, "grad_norm": 2.339116334915161, "learning_rate": 7.701599265594048e-07, "loss": 0.3045, "step": 4757 }, { "epoch": 0.2298883896216843, "grad_norm": 3.008573293685913, "learning_rate": 7.701116103783156e-07, "loss": 0.3372, "step": 4758 }, { "epoch": 0.22993670580277334, "grad_norm": 2.719381093978882, "learning_rate": 7.700632941972266e-07, "loss": 0.2522, "step": 4759 }, { "epoch": 0.2299850219838624, "grad_norm": 2.688767671585083, "learning_rate": 7.700149780161376e-07, "loss": 0.2855, "step": 4760 }, { "epoch": 0.23003333816495145, "grad_norm": 10.341358184814453, "learning_rate": 7.699666618350485e-07, "loss": 0.3995, "step": 4761 }, { "epoch": 0.2300816543460405, "grad_norm": 3.4967615604400635, "learning_rate": 7.699183456539595e-07, "loss": 0.3185, "step": 4762 }, { "epoch": 0.23012997052712952, "grad_norm": 9.074162483215332, "learning_rate": 7.698700294728704e-07, "loss": 0.3577, "step": 4763 }, { "epoch": 0.23017828670821858, "grad_norm": 3.013813018798828, "learning_rate": 7.698217132917814e-07, "loss": 0.3718, "step": 4764 }, { "epoch": 0.23022660288930763, "grad_norm": 2.278275489807129, "learning_rate": 7.697733971106924e-07, "loss": 0.2561, "step": 4765 }, { "epoch": 0.23027491907039668, "grad_norm": 2.245814561843872, "learning_rate": 7.697250809296034e-07, "loss": 0.2336, "step": 4766 }, { "epoch": 0.23032323525148574, "grad_norm": 3.1490087509155273, "learning_rate": 7.696767647485142e-07, "loss": 0.3694, "step": 4767 }, { "epoch": 0.23037155143257476, "grad_norm": 2.327544689178467, "learning_rate": 7.696284485674251e-07, "loss": 0.2944, "step": 4768 }, { "epoch": 0.23041986761366381, "grad_norm": 2.0465445518493652, "learning_rate": 7.695801323863361e-07, "loss": 0.2393, "step": 4769 }, { "epoch": 0.23046818379475287, "grad_norm": 2.047463893890381, "learning_rate": 7.695318162052471e-07, "loss": 0.2067, "step": 4770 }, { "epoch": 0.23051649997584192, "grad_norm": 3.609875202178955, "learning_rate": 7.694835000241581e-07, "loss": 0.2519, "step": 4771 }, { "epoch": 0.23056481615693095, "grad_norm": 1.0962176322937012, "learning_rate": 7.694351838430691e-07, "loss": 0.1264, "step": 4772 }, { "epoch": 0.23061313233802, "grad_norm": 3.1935126781463623, "learning_rate": 7.693868676619799e-07, "loss": 0.3575, "step": 4773 }, { "epoch": 0.23066144851910905, "grad_norm": 4.099767208099365, "learning_rate": 7.693385514808908e-07, "loss": 0.3819, "step": 4774 }, { "epoch": 0.2307097647001981, "grad_norm": 4.275262355804443, "learning_rate": 7.692902352998018e-07, "loss": 0.2732, "step": 4775 }, { "epoch": 0.23075808088128713, "grad_norm": 4.388251304626465, "learning_rate": 7.692419191187128e-07, "loss": 0.3554, "step": 4776 }, { "epoch": 0.23080639706237618, "grad_norm": 4.721278667449951, "learning_rate": 7.691936029376238e-07, "loss": 0.2772, "step": 4777 }, { "epoch": 0.23085471324346524, "grad_norm": 2.292205333709717, "learning_rate": 7.691452867565348e-07, "loss": 0.3156, "step": 4778 }, { "epoch": 0.2309030294245543, "grad_norm": 3.648350477218628, "learning_rate": 7.690969705754457e-07, "loss": 0.3511, "step": 4779 }, { "epoch": 0.23095134560564334, "grad_norm": 4.617166519165039, "learning_rate": 7.690486543943566e-07, "loss": 0.38, "step": 4780 }, { "epoch": 0.23099966178673237, "grad_norm": 3.129836082458496, "learning_rate": 7.690003382132676e-07, "loss": 0.2809, "step": 4781 }, { "epoch": 0.23104797796782142, "grad_norm": 2.5066349506378174, "learning_rate": 7.689520220321786e-07, "loss": 0.2769, "step": 4782 }, { "epoch": 0.23109629414891047, "grad_norm": 2.344600200653076, "learning_rate": 7.689037058510895e-07, "loss": 0.3132, "step": 4783 }, { "epoch": 0.23114461032999953, "grad_norm": 3.9231367111206055, "learning_rate": 7.688553896700004e-07, "loss": 0.3395, "step": 4784 }, { "epoch": 0.23119292651108855, "grad_norm": 2.454700231552124, "learning_rate": 7.688070734889114e-07, "loss": 0.3165, "step": 4785 }, { "epoch": 0.2312412426921776, "grad_norm": 5.6007490158081055, "learning_rate": 7.687587573078223e-07, "loss": 0.2197, "step": 4786 }, { "epoch": 0.23128955887326666, "grad_norm": 2.785008430480957, "learning_rate": 7.687104411267333e-07, "loss": 0.2681, "step": 4787 }, { "epoch": 0.2313378750543557, "grad_norm": 2.226146697998047, "learning_rate": 7.686621249456443e-07, "loss": 0.2896, "step": 4788 }, { "epoch": 0.23138619123544474, "grad_norm": 2.0550105571746826, "learning_rate": 7.686138087645552e-07, "loss": 0.1906, "step": 4789 }, { "epoch": 0.2314345074165338, "grad_norm": 3.4283816814422607, "learning_rate": 7.685654925834662e-07, "loss": 0.2009, "step": 4790 }, { "epoch": 0.23148282359762284, "grad_norm": 2.7259159088134766, "learning_rate": 7.685171764023772e-07, "loss": 0.3458, "step": 4791 }, { "epoch": 0.2315311397787119, "grad_norm": 2.5898027420043945, "learning_rate": 7.684688602212881e-07, "loss": 0.2877, "step": 4792 }, { "epoch": 0.23157945595980095, "grad_norm": 2.2052032947540283, "learning_rate": 7.68420544040199e-07, "loss": 0.2149, "step": 4793 }, { "epoch": 0.23162777214088998, "grad_norm": 2.243328809738159, "learning_rate": 7.683722278591099e-07, "loss": 0.3139, "step": 4794 }, { "epoch": 0.23167608832197903, "grad_norm": 2.8671772480010986, "learning_rate": 7.683239116780209e-07, "loss": 0.2787, "step": 4795 }, { "epoch": 0.23172440450306808, "grad_norm": 9.855156898498535, "learning_rate": 7.682755954969319e-07, "loss": 0.3317, "step": 4796 }, { "epoch": 0.23177272068415714, "grad_norm": 5.837789535522461, "learning_rate": 7.682272793158429e-07, "loss": 0.2389, "step": 4797 }, { "epoch": 0.23182103686524616, "grad_norm": 3.444308042526245, "learning_rate": 7.681789631347539e-07, "loss": 0.4044, "step": 4798 }, { "epoch": 0.2318693530463352, "grad_norm": 2.8811287879943848, "learning_rate": 7.681306469536647e-07, "loss": 0.2672, "step": 4799 }, { "epoch": 0.23191766922742427, "grad_norm": 2.762514591217041, "learning_rate": 7.680823307725756e-07, "loss": 0.465, "step": 4800 }, { "epoch": 0.23196598540851332, "grad_norm": 3.0850000381469727, "learning_rate": 7.680340145914866e-07, "loss": 0.4024, "step": 4801 }, { "epoch": 0.23201430158960235, "grad_norm": 3.303140878677368, "learning_rate": 7.679856984103976e-07, "loss": 0.4111, "step": 4802 }, { "epoch": 0.2320626177706914, "grad_norm": 13.222003936767578, "learning_rate": 7.679373822293086e-07, "loss": 0.3298, "step": 4803 }, { "epoch": 0.23211093395178045, "grad_norm": 2.119511604309082, "learning_rate": 7.678890660482196e-07, "loss": 0.3296, "step": 4804 }, { "epoch": 0.2321592501328695, "grad_norm": 4.134300708770752, "learning_rate": 7.678407498671304e-07, "loss": 0.355, "step": 4805 }, { "epoch": 0.23220756631395856, "grad_norm": 2.929442882537842, "learning_rate": 7.677924336860414e-07, "loss": 0.3905, "step": 4806 }, { "epoch": 0.23225588249504758, "grad_norm": 3.2966740131378174, "learning_rate": 7.677441175049524e-07, "loss": 0.3553, "step": 4807 }, { "epoch": 0.23230419867613664, "grad_norm": 3.918998956680298, "learning_rate": 7.676958013238634e-07, "loss": 0.3241, "step": 4808 }, { "epoch": 0.2323525148572257, "grad_norm": 2.5939767360687256, "learning_rate": 7.676474851427743e-07, "loss": 0.3419, "step": 4809 }, { "epoch": 0.23240083103831474, "grad_norm": 3.689351797103882, "learning_rate": 7.675991689616852e-07, "loss": 0.2214, "step": 4810 }, { "epoch": 0.23244914721940377, "grad_norm": 2.6923859119415283, "learning_rate": 7.675508527805962e-07, "loss": 0.2218, "step": 4811 }, { "epoch": 0.23249746340049282, "grad_norm": 2.552353858947754, "learning_rate": 7.675025365995071e-07, "loss": 0.2682, "step": 4812 }, { "epoch": 0.23254577958158187, "grad_norm": 2.9107632637023926, "learning_rate": 7.674542204184181e-07, "loss": 0.3186, "step": 4813 }, { "epoch": 0.23259409576267093, "grad_norm": 6.026315212249756, "learning_rate": 7.674059042373291e-07, "loss": 0.3606, "step": 4814 }, { "epoch": 0.23264241194375995, "grad_norm": 1.8966283798217773, "learning_rate": 7.6735758805624e-07, "loss": 0.1613, "step": 4815 }, { "epoch": 0.232690728124849, "grad_norm": 2.7142584323883057, "learning_rate": 7.67309271875151e-07, "loss": 0.2744, "step": 4816 }, { "epoch": 0.23273904430593806, "grad_norm": 2.14742112159729, "learning_rate": 7.672609556940619e-07, "loss": 0.2626, "step": 4817 }, { "epoch": 0.2327873604870271, "grad_norm": 2.3357417583465576, "learning_rate": 7.672126395129728e-07, "loss": 0.3146, "step": 4818 }, { "epoch": 0.23283567666811616, "grad_norm": 2.474419593811035, "learning_rate": 7.671643233318838e-07, "loss": 0.2812, "step": 4819 }, { "epoch": 0.2328839928492052, "grad_norm": 1.8777457475662231, "learning_rate": 7.671160071507947e-07, "loss": 0.198, "step": 4820 }, { "epoch": 0.23293230903029424, "grad_norm": 3.6181087493896484, "learning_rate": 7.670676909697057e-07, "loss": 0.1596, "step": 4821 }, { "epoch": 0.2329806252113833, "grad_norm": 2.202669858932495, "learning_rate": 7.670193747886167e-07, "loss": 0.2611, "step": 4822 }, { "epoch": 0.23302894139247235, "grad_norm": 10.801556587219238, "learning_rate": 7.669710586075277e-07, "loss": 0.3473, "step": 4823 }, { "epoch": 0.23307725757356137, "grad_norm": 3.648573160171509, "learning_rate": 7.669227424264387e-07, "loss": 0.3895, "step": 4824 }, { "epoch": 0.23312557375465043, "grad_norm": 2.509277582168579, "learning_rate": 7.668744262453494e-07, "loss": 0.2801, "step": 4825 }, { "epoch": 0.23317388993573948, "grad_norm": 3.4300625324249268, "learning_rate": 7.668261100642604e-07, "loss": 0.3585, "step": 4826 }, { "epoch": 0.23322220611682853, "grad_norm": 2.560438394546509, "learning_rate": 7.667777938831714e-07, "loss": 0.3075, "step": 4827 }, { "epoch": 0.23327052229791756, "grad_norm": 2.780733823776245, "learning_rate": 7.667294777020824e-07, "loss": 0.2557, "step": 4828 }, { "epoch": 0.2333188384790066, "grad_norm": 3.9068100452423096, "learning_rate": 7.666811615209934e-07, "loss": 0.3327, "step": 4829 }, { "epoch": 0.23336715466009567, "grad_norm": 2.1102936267852783, "learning_rate": 7.666328453399043e-07, "loss": 0.294, "step": 4830 }, { "epoch": 0.23341547084118472, "grad_norm": 2.7273197174072266, "learning_rate": 7.665845291588152e-07, "loss": 0.3301, "step": 4831 }, { "epoch": 0.23346378702227377, "grad_norm": 2.533233642578125, "learning_rate": 7.665362129777262e-07, "loss": 0.2701, "step": 4832 }, { "epoch": 0.2335121032033628, "grad_norm": 5.279065132141113, "learning_rate": 7.664878967966372e-07, "loss": 0.2929, "step": 4833 }, { "epoch": 0.23356041938445185, "grad_norm": 2.9994823932647705, "learning_rate": 7.664395806155481e-07, "loss": 0.3647, "step": 4834 }, { "epoch": 0.2336087355655409, "grad_norm": 6.628347396850586, "learning_rate": 7.663912644344591e-07, "loss": 0.4064, "step": 4835 }, { "epoch": 0.23365705174662996, "grad_norm": 3.0618762969970703, "learning_rate": 7.6634294825337e-07, "loss": 0.4074, "step": 4836 }, { "epoch": 0.23370536792771898, "grad_norm": 2.502939224243164, "learning_rate": 7.662946320722809e-07, "loss": 0.2236, "step": 4837 }, { "epoch": 0.23375368410880804, "grad_norm": 3.2082736492156982, "learning_rate": 7.662463158911919e-07, "loss": 0.3919, "step": 4838 }, { "epoch": 0.2338020002898971, "grad_norm": 2.687929630279541, "learning_rate": 7.661979997101029e-07, "loss": 0.3907, "step": 4839 }, { "epoch": 0.23385031647098614, "grad_norm": 2.7188472747802734, "learning_rate": 7.661496835290139e-07, "loss": 0.329, "step": 4840 }, { "epoch": 0.23389863265207517, "grad_norm": 2.5350472927093506, "learning_rate": 7.661013673479248e-07, "loss": 0.2795, "step": 4841 }, { "epoch": 0.23394694883316422, "grad_norm": 2.4140450954437256, "learning_rate": 7.660530511668357e-07, "loss": 0.2993, "step": 4842 }, { "epoch": 0.23399526501425327, "grad_norm": 2.2644810676574707, "learning_rate": 7.660047349857467e-07, "loss": 0.2387, "step": 4843 }, { "epoch": 0.23404358119534233, "grad_norm": 3.5948429107666016, "learning_rate": 7.659564188046576e-07, "loss": 0.2479, "step": 4844 }, { "epoch": 0.23409189737643138, "grad_norm": 2.9071691036224365, "learning_rate": 7.659081026235686e-07, "loss": 0.285, "step": 4845 }, { "epoch": 0.2341402135575204, "grad_norm": 2.760136127471924, "learning_rate": 7.658597864424795e-07, "loss": 0.2059, "step": 4846 }, { "epoch": 0.23418852973860946, "grad_norm": 3.410193920135498, "learning_rate": 7.658114702613905e-07, "loss": 0.3157, "step": 4847 }, { "epoch": 0.2342368459196985, "grad_norm": 2.1869242191314697, "learning_rate": 7.657631540803015e-07, "loss": 0.233, "step": 4848 }, { "epoch": 0.23428516210078756, "grad_norm": 2.0570178031921387, "learning_rate": 7.657148378992125e-07, "loss": 0.261, "step": 4849 }, { "epoch": 0.2343334782818766, "grad_norm": 2.7597908973693848, "learning_rate": 7.656665217181234e-07, "loss": 0.2438, "step": 4850 }, { "epoch": 0.23438179446296564, "grad_norm": 2.420905828475952, "learning_rate": 7.656182055370342e-07, "loss": 0.2693, "step": 4851 }, { "epoch": 0.2344301106440547, "grad_norm": 2.5450780391693115, "learning_rate": 7.655698893559452e-07, "loss": 0.3566, "step": 4852 }, { "epoch": 0.23447842682514375, "grad_norm": 2.8850958347320557, "learning_rate": 7.655215731748562e-07, "loss": 0.4099, "step": 4853 }, { "epoch": 0.23452674300623277, "grad_norm": 2.645575523376465, "learning_rate": 7.654732569937672e-07, "loss": 0.2818, "step": 4854 }, { "epoch": 0.23457505918732183, "grad_norm": 3.8027172088623047, "learning_rate": 7.654249408126782e-07, "loss": 0.3482, "step": 4855 }, { "epoch": 0.23462337536841088, "grad_norm": 2.18951153755188, "learning_rate": 7.653766246315891e-07, "loss": 0.2255, "step": 4856 }, { "epoch": 0.23467169154949993, "grad_norm": 2.40531587600708, "learning_rate": 7.653283084505e-07, "loss": 0.282, "step": 4857 }, { "epoch": 0.234720007730589, "grad_norm": 2.755265474319458, "learning_rate": 7.65279992269411e-07, "loss": 0.3439, "step": 4858 }, { "epoch": 0.234768323911678, "grad_norm": 2.404979705810547, "learning_rate": 7.65231676088322e-07, "loss": 0.2473, "step": 4859 }, { "epoch": 0.23481664009276706, "grad_norm": 4.222693920135498, "learning_rate": 7.651833599072329e-07, "loss": 0.2405, "step": 4860 }, { "epoch": 0.23486495627385612, "grad_norm": 5.262125015258789, "learning_rate": 7.651350437261439e-07, "loss": 0.3813, "step": 4861 }, { "epoch": 0.23491327245494517, "grad_norm": 4.298902988433838, "learning_rate": 7.650867275450548e-07, "loss": 0.2797, "step": 4862 }, { "epoch": 0.2349615886360342, "grad_norm": 4.603606700897217, "learning_rate": 7.650384113639657e-07, "loss": 0.3618, "step": 4863 }, { "epoch": 0.23500990481712325, "grad_norm": 2.6971426010131836, "learning_rate": 7.649900951828767e-07, "loss": 0.3296, "step": 4864 }, { "epoch": 0.2350582209982123, "grad_norm": 3.2241737842559814, "learning_rate": 7.649417790017877e-07, "loss": 0.3088, "step": 4865 }, { "epoch": 0.23510653717930136, "grad_norm": 2.7135889530181885, "learning_rate": 7.648934628206987e-07, "loss": 0.3763, "step": 4866 }, { "epoch": 0.23515485336039038, "grad_norm": 1.9058552980422974, "learning_rate": 7.648451466396096e-07, "loss": 0.1534, "step": 4867 }, { "epoch": 0.23520316954147943, "grad_norm": 3.7690110206604004, "learning_rate": 7.647968304585205e-07, "loss": 0.3907, "step": 4868 }, { "epoch": 0.2352514857225685, "grad_norm": 2.9694371223449707, "learning_rate": 7.647485142774314e-07, "loss": 0.2808, "step": 4869 }, { "epoch": 0.23529980190365754, "grad_norm": 2.0668697357177734, "learning_rate": 7.647001980963424e-07, "loss": 0.2481, "step": 4870 }, { "epoch": 0.2353481180847466, "grad_norm": 2.715850353240967, "learning_rate": 7.646518819152534e-07, "loss": 0.3192, "step": 4871 }, { "epoch": 0.23539643426583562, "grad_norm": 3.2336947917938232, "learning_rate": 7.646035657341643e-07, "loss": 0.3653, "step": 4872 }, { "epoch": 0.23544475044692467, "grad_norm": 3.135387897491455, "learning_rate": 7.645552495530753e-07, "loss": 0.3352, "step": 4873 }, { "epoch": 0.23549306662801373, "grad_norm": 2.1840455532073975, "learning_rate": 7.645069333719863e-07, "loss": 0.2446, "step": 4874 }, { "epoch": 0.23554138280910278, "grad_norm": 1.971327304840088, "learning_rate": 7.644586171908973e-07, "loss": 0.2449, "step": 4875 }, { "epoch": 0.2355896989901918, "grad_norm": 2.9809985160827637, "learning_rate": 7.644103010098081e-07, "loss": 0.4233, "step": 4876 }, { "epoch": 0.23563801517128086, "grad_norm": 3.259462594985962, "learning_rate": 7.64361984828719e-07, "loss": 0.3804, "step": 4877 }, { "epoch": 0.2356863313523699, "grad_norm": 2.4379429817199707, "learning_rate": 7.6431366864763e-07, "loss": 0.3331, "step": 4878 }, { "epoch": 0.23573464753345896, "grad_norm": 2.718191623687744, "learning_rate": 7.64265352466541e-07, "loss": 0.2621, "step": 4879 }, { "epoch": 0.235782963714548, "grad_norm": 34.508148193359375, "learning_rate": 7.64217036285452e-07, "loss": 0.4126, "step": 4880 }, { "epoch": 0.23583127989563704, "grad_norm": 3.580073595046997, "learning_rate": 7.64168720104363e-07, "loss": 0.3884, "step": 4881 }, { "epoch": 0.2358795960767261, "grad_norm": 1.8004658222198486, "learning_rate": 7.641204039232738e-07, "loss": 0.2635, "step": 4882 }, { "epoch": 0.23592791225781515, "grad_norm": 2.803223133087158, "learning_rate": 7.640720877421848e-07, "loss": 0.2864, "step": 4883 }, { "epoch": 0.2359762284389042, "grad_norm": 3.1231837272644043, "learning_rate": 7.640237715610957e-07, "loss": 0.3939, "step": 4884 }, { "epoch": 0.23602454461999323, "grad_norm": 2.420576572418213, "learning_rate": 7.639754553800067e-07, "loss": 0.2362, "step": 4885 }, { "epoch": 0.23607286080108228, "grad_norm": 2.722076416015625, "learning_rate": 7.639271391989177e-07, "loss": 0.2582, "step": 4886 }, { "epoch": 0.23612117698217133, "grad_norm": 3.80023193359375, "learning_rate": 7.638788230178287e-07, "loss": 0.2398, "step": 4887 }, { "epoch": 0.23616949316326039, "grad_norm": 2.816070318222046, "learning_rate": 7.638305068367396e-07, "loss": 0.3789, "step": 4888 }, { "epoch": 0.2362178093443494, "grad_norm": 1.7551231384277344, "learning_rate": 7.637821906556505e-07, "loss": 0.2023, "step": 4889 }, { "epoch": 0.23626612552543846, "grad_norm": 2.062122344970703, "learning_rate": 7.637338744745615e-07, "loss": 0.2577, "step": 4890 }, { "epoch": 0.23631444170652752, "grad_norm": 2.0340986251831055, "learning_rate": 7.636855582934725e-07, "loss": 0.2571, "step": 4891 }, { "epoch": 0.23636275788761657, "grad_norm": 2.931974172592163, "learning_rate": 7.636372421123835e-07, "loss": 0.4307, "step": 4892 }, { "epoch": 0.23641107406870562, "grad_norm": 1.561354398727417, "learning_rate": 7.635889259312943e-07, "loss": 0.1557, "step": 4893 }, { "epoch": 0.23645939024979465, "grad_norm": 2.9847559928894043, "learning_rate": 7.635406097502053e-07, "loss": 0.3469, "step": 4894 }, { "epoch": 0.2365077064308837, "grad_norm": 3.1514625549316406, "learning_rate": 7.634922935691162e-07, "loss": 0.253, "step": 4895 }, { "epoch": 0.23655602261197275, "grad_norm": 7.6482696533203125, "learning_rate": 7.634439773880272e-07, "loss": 0.3781, "step": 4896 }, { "epoch": 0.2366043387930618, "grad_norm": 4.271120071411133, "learning_rate": 7.633956612069382e-07, "loss": 0.3179, "step": 4897 }, { "epoch": 0.23665265497415083, "grad_norm": 1.6047911643981934, "learning_rate": 7.633473450258491e-07, "loss": 0.2122, "step": 4898 }, { "epoch": 0.2367009711552399, "grad_norm": 2.352796792984009, "learning_rate": 7.632990288447601e-07, "loss": 0.2519, "step": 4899 }, { "epoch": 0.23674928733632894, "grad_norm": 2.184286594390869, "learning_rate": 7.632507126636711e-07, "loss": 0.2209, "step": 4900 }, { "epoch": 0.236797603517418, "grad_norm": 2.0570881366729736, "learning_rate": 7.63202396482582e-07, "loss": 0.2552, "step": 4901 }, { "epoch": 0.23684591969850702, "grad_norm": 2.8825113773345947, "learning_rate": 7.631540803014929e-07, "loss": 0.3444, "step": 4902 }, { "epoch": 0.23689423587959607, "grad_norm": 2.4022998809814453, "learning_rate": 7.631057641204038e-07, "loss": 0.298, "step": 4903 }, { "epoch": 0.23694255206068512, "grad_norm": 5.0883708000183105, "learning_rate": 7.630574479393148e-07, "loss": 0.3499, "step": 4904 }, { "epoch": 0.23699086824177418, "grad_norm": 2.6064047813415527, "learning_rate": 7.630091317582258e-07, "loss": 0.3069, "step": 4905 }, { "epoch": 0.23703918442286323, "grad_norm": 2.202268362045288, "learning_rate": 7.629608155771368e-07, "loss": 0.2264, "step": 4906 }, { "epoch": 0.23708750060395226, "grad_norm": 2.838142156600952, "learning_rate": 7.629124993960478e-07, "loss": 0.2243, "step": 4907 }, { "epoch": 0.2371358167850413, "grad_norm": 3.2068142890930176, "learning_rate": 7.628641832149586e-07, "loss": 0.3298, "step": 4908 }, { "epoch": 0.23718413296613036, "grad_norm": 3.2768828868865967, "learning_rate": 7.628158670338696e-07, "loss": 0.315, "step": 4909 }, { "epoch": 0.23723244914721942, "grad_norm": 4.651310920715332, "learning_rate": 7.627675508527805e-07, "loss": 0.3022, "step": 4910 }, { "epoch": 0.23728076532830844, "grad_norm": 2.785443067550659, "learning_rate": 7.627192346716915e-07, "loss": 0.3574, "step": 4911 }, { "epoch": 0.2373290815093975, "grad_norm": 2.785815954208374, "learning_rate": 7.626709184906025e-07, "loss": 0.3742, "step": 4912 }, { "epoch": 0.23737739769048655, "grad_norm": 2.591708183288574, "learning_rate": 7.626226023095135e-07, "loss": 0.2815, "step": 4913 }, { "epoch": 0.2374257138715756, "grad_norm": 6.111705303192139, "learning_rate": 7.625742861284243e-07, "loss": 0.339, "step": 4914 }, { "epoch": 0.23747403005266463, "grad_norm": 2.58901309967041, "learning_rate": 7.625259699473353e-07, "loss": 0.2991, "step": 4915 }, { "epoch": 0.23752234623375368, "grad_norm": 2.4575278759002686, "learning_rate": 7.624776537662463e-07, "loss": 0.1951, "step": 4916 }, { "epoch": 0.23757066241484273, "grad_norm": 4.2979607582092285, "learning_rate": 7.624293375851573e-07, "loss": 0.3866, "step": 4917 }, { "epoch": 0.23761897859593178, "grad_norm": 15.102218627929688, "learning_rate": 7.623810214040683e-07, "loss": 0.365, "step": 4918 }, { "epoch": 0.23766729477702084, "grad_norm": 2.9122793674468994, "learning_rate": 7.623327052229791e-07, "loss": 0.3563, "step": 4919 }, { "epoch": 0.23771561095810986, "grad_norm": 4.113455772399902, "learning_rate": 7.622843890418901e-07, "loss": 0.3974, "step": 4920 }, { "epoch": 0.23776392713919892, "grad_norm": 3.3583171367645264, "learning_rate": 7.62236072860801e-07, "loss": 0.1477, "step": 4921 }, { "epoch": 0.23781224332028797, "grad_norm": 2.0348923206329346, "learning_rate": 7.62187756679712e-07, "loss": 0.2531, "step": 4922 }, { "epoch": 0.23786055950137702, "grad_norm": 2.411616563796997, "learning_rate": 7.62139440498623e-07, "loss": 0.1852, "step": 4923 }, { "epoch": 0.23790887568246605, "grad_norm": 1.7501786947250366, "learning_rate": 7.620911243175339e-07, "loss": 0.1864, "step": 4924 }, { "epoch": 0.2379571918635551, "grad_norm": 2.8894076347351074, "learning_rate": 7.620428081364449e-07, "loss": 0.3639, "step": 4925 }, { "epoch": 0.23800550804464415, "grad_norm": 2.642786741256714, "learning_rate": 7.619944919553559e-07, "loss": 0.1776, "step": 4926 }, { "epoch": 0.2380538242257332, "grad_norm": 3.2145802974700928, "learning_rate": 7.619461757742667e-07, "loss": 0.2698, "step": 4927 }, { "epoch": 0.23810214040682223, "grad_norm": 2.937107563018799, "learning_rate": 7.618978595931777e-07, "loss": 0.3264, "step": 4928 }, { "epoch": 0.23815045658791129, "grad_norm": 2.8874194622039795, "learning_rate": 7.618495434120886e-07, "loss": 0.2936, "step": 4929 }, { "epoch": 0.23819877276900034, "grad_norm": 3.623997211456299, "learning_rate": 7.618012272309996e-07, "loss": 0.5053, "step": 4930 }, { "epoch": 0.2382470889500894, "grad_norm": 2.3389313220977783, "learning_rate": 7.617529110499106e-07, "loss": 0.3113, "step": 4931 }, { "epoch": 0.23829540513117844, "grad_norm": 2.325094223022461, "learning_rate": 7.617045948688216e-07, "loss": 0.2817, "step": 4932 }, { "epoch": 0.23834372131226747, "grad_norm": 2.660313844680786, "learning_rate": 7.616562786877325e-07, "loss": 0.3095, "step": 4933 }, { "epoch": 0.23839203749335652, "grad_norm": 2.4166033267974854, "learning_rate": 7.616079625066434e-07, "loss": 0.2707, "step": 4934 }, { "epoch": 0.23844035367444558, "grad_norm": 2.956312417984009, "learning_rate": 7.615596463255543e-07, "loss": 0.2473, "step": 4935 }, { "epoch": 0.23848866985553463, "grad_norm": 2.357635736465454, "learning_rate": 7.615113301444653e-07, "loss": 0.164, "step": 4936 }, { "epoch": 0.23853698603662365, "grad_norm": 2.510533571243286, "learning_rate": 7.614630139633763e-07, "loss": 0.2413, "step": 4937 }, { "epoch": 0.2385853022177127, "grad_norm": 2.7533485889434814, "learning_rate": 7.614146977822873e-07, "loss": 0.3466, "step": 4938 }, { "epoch": 0.23863361839880176, "grad_norm": 3.5260629653930664, "learning_rate": 7.613663816011983e-07, "loss": 0.5005, "step": 4939 }, { "epoch": 0.23868193457989081, "grad_norm": 3.3582396507263184, "learning_rate": 7.613180654201091e-07, "loss": 0.3921, "step": 4940 }, { "epoch": 0.23873025076097984, "grad_norm": 2.280808687210083, "learning_rate": 7.612697492390201e-07, "loss": 0.2283, "step": 4941 }, { "epoch": 0.2387785669420689, "grad_norm": 1.7419832944869995, "learning_rate": 7.612214330579311e-07, "loss": 0.195, "step": 4942 }, { "epoch": 0.23882688312315795, "grad_norm": 2.961951971054077, "learning_rate": 7.611731168768421e-07, "loss": 0.3283, "step": 4943 }, { "epoch": 0.238875199304247, "grad_norm": 3.4831666946411133, "learning_rate": 7.61124800695753e-07, "loss": 0.1684, "step": 4944 }, { "epoch": 0.23892351548533605, "grad_norm": 1.8852218389511108, "learning_rate": 7.610764845146639e-07, "loss": 0.2055, "step": 4945 }, { "epoch": 0.23897183166642508, "grad_norm": 2.658252239227295, "learning_rate": 7.610281683335748e-07, "loss": 0.298, "step": 4946 }, { "epoch": 0.23902014784751413, "grad_norm": 2.7258307933807373, "learning_rate": 7.609798521524858e-07, "loss": 0.2507, "step": 4947 }, { "epoch": 0.23906846402860318, "grad_norm": 7.848755836486816, "learning_rate": 7.609315359713968e-07, "loss": 0.2996, "step": 4948 }, { "epoch": 0.23911678020969224, "grad_norm": 2.7925753593444824, "learning_rate": 7.608832197903078e-07, "loss": 0.3158, "step": 4949 }, { "epoch": 0.23916509639078126, "grad_norm": 2.9249956607818604, "learning_rate": 7.608349036092187e-07, "loss": 0.4008, "step": 4950 }, { "epoch": 0.23921341257187032, "grad_norm": 3.2387523651123047, "learning_rate": 7.607865874281297e-07, "loss": 0.4365, "step": 4951 }, { "epoch": 0.23926172875295937, "grad_norm": 2.0177266597747803, "learning_rate": 7.607382712470406e-07, "loss": 0.2312, "step": 4952 }, { "epoch": 0.23931004493404842, "grad_norm": 4.046980381011963, "learning_rate": 7.606899550659515e-07, "loss": 0.1491, "step": 4953 }, { "epoch": 0.23935836111513745, "grad_norm": 2.0941109657287598, "learning_rate": 7.606416388848625e-07, "loss": 0.241, "step": 4954 }, { "epoch": 0.2394066772962265, "grad_norm": 2.055576801300049, "learning_rate": 7.605933227037734e-07, "loss": 0.2321, "step": 4955 }, { "epoch": 0.23945499347731555, "grad_norm": 1.8536465167999268, "learning_rate": 7.605450065226844e-07, "loss": 0.2088, "step": 4956 }, { "epoch": 0.2395033096584046, "grad_norm": 4.612725734710693, "learning_rate": 7.604966903415954e-07, "loss": 0.3188, "step": 4957 }, { "epoch": 0.23955162583949366, "grad_norm": 2.5256035327911377, "learning_rate": 7.604483741605064e-07, "loss": 0.244, "step": 4958 }, { "epoch": 0.23959994202058268, "grad_norm": 2.2486913204193115, "learning_rate": 7.604000579794173e-07, "loss": 0.2681, "step": 4959 }, { "epoch": 0.23964825820167174, "grad_norm": 4.375328540802002, "learning_rate": 7.603517417983281e-07, "loss": 0.3556, "step": 4960 }, { "epoch": 0.2396965743827608, "grad_norm": 4.499166488647461, "learning_rate": 7.603034256172391e-07, "loss": 0.3419, "step": 4961 }, { "epoch": 0.23974489056384984, "grad_norm": 2.396636724472046, "learning_rate": 7.602551094361501e-07, "loss": 0.2374, "step": 4962 }, { "epoch": 0.23979320674493887, "grad_norm": 1.897364616394043, "learning_rate": 7.602067932550611e-07, "loss": 0.214, "step": 4963 }, { "epoch": 0.23984152292602792, "grad_norm": 2.4802117347717285, "learning_rate": 7.601584770739721e-07, "loss": 0.2457, "step": 4964 }, { "epoch": 0.23988983910711698, "grad_norm": 2.290921926498413, "learning_rate": 7.60110160892883e-07, "loss": 0.3036, "step": 4965 }, { "epoch": 0.23993815528820603, "grad_norm": 2.9747183322906494, "learning_rate": 7.600618447117939e-07, "loss": 0.3498, "step": 4966 }, { "epoch": 0.23998647146929505, "grad_norm": 4.881371021270752, "learning_rate": 7.600135285307049e-07, "loss": 0.3202, "step": 4967 }, { "epoch": 0.2400347876503841, "grad_norm": 8.041399955749512, "learning_rate": 7.599652123496159e-07, "loss": 0.4594, "step": 4968 }, { "epoch": 0.24008310383147316, "grad_norm": 4.822211265563965, "learning_rate": 7.599168961685268e-07, "loss": 0.2639, "step": 4969 }, { "epoch": 0.2401314200125622, "grad_norm": 3.5975406169891357, "learning_rate": 7.598685799874378e-07, "loss": 0.329, "step": 4970 }, { "epoch": 0.24017973619365127, "grad_norm": 2.689196825027466, "learning_rate": 7.598202638063487e-07, "loss": 0.2717, "step": 4971 }, { "epoch": 0.2402280523747403, "grad_norm": 2.6950602531433105, "learning_rate": 7.597719476252596e-07, "loss": 0.2423, "step": 4972 }, { "epoch": 0.24027636855582934, "grad_norm": 1.209395170211792, "learning_rate": 7.597236314441706e-07, "loss": 0.1307, "step": 4973 }, { "epoch": 0.2403246847369184, "grad_norm": 2.647320508956909, "learning_rate": 7.596753152630816e-07, "loss": 0.3346, "step": 4974 }, { "epoch": 0.24037300091800745, "grad_norm": 4.0602006912231445, "learning_rate": 7.596269990819926e-07, "loss": 0.3997, "step": 4975 }, { "epoch": 0.24042131709909648, "grad_norm": 1.2266534566879272, "learning_rate": 7.595786829009035e-07, "loss": 0.1277, "step": 4976 }, { "epoch": 0.24046963328018553, "grad_norm": 2.8921947479248047, "learning_rate": 7.595303667198145e-07, "loss": 0.2004, "step": 4977 }, { "epoch": 0.24051794946127458, "grad_norm": 2.10733962059021, "learning_rate": 7.594820505387253e-07, "loss": 0.2194, "step": 4978 }, { "epoch": 0.24056626564236364, "grad_norm": 4.470938682556152, "learning_rate": 7.594337343576363e-07, "loss": 0.401, "step": 4979 }, { "epoch": 0.24061458182345266, "grad_norm": 2.679100275039673, "learning_rate": 7.593854181765473e-07, "loss": 0.4567, "step": 4980 }, { "epoch": 0.24066289800454171, "grad_norm": 8.701005935668945, "learning_rate": 7.593371019954582e-07, "loss": 0.3503, "step": 4981 }, { "epoch": 0.24071121418563077, "grad_norm": 2.7681424617767334, "learning_rate": 7.592887858143692e-07, "loss": 0.3585, "step": 4982 }, { "epoch": 0.24075953036671982, "grad_norm": 2.2411720752716064, "learning_rate": 7.592404696332802e-07, "loss": 0.2892, "step": 4983 }, { "epoch": 0.24080784654780887, "grad_norm": 1.8406286239624023, "learning_rate": 7.591921534521912e-07, "loss": 0.2993, "step": 4984 }, { "epoch": 0.2408561627288979, "grad_norm": 13.235750198364258, "learning_rate": 7.591438372711021e-07, "loss": 0.3693, "step": 4985 }, { "epoch": 0.24090447890998695, "grad_norm": 2.85815691947937, "learning_rate": 7.590955210900129e-07, "loss": 0.4074, "step": 4986 }, { "epoch": 0.240952795091076, "grad_norm": 2.27719783782959, "learning_rate": 7.590472049089239e-07, "loss": 0.2516, "step": 4987 }, { "epoch": 0.24100111127216506, "grad_norm": 2.5437159538269043, "learning_rate": 7.589988887278349e-07, "loss": 0.3435, "step": 4988 }, { "epoch": 0.24104942745325408, "grad_norm": 2.4732155799865723, "learning_rate": 7.589505725467459e-07, "loss": 0.2737, "step": 4989 }, { "epoch": 0.24109774363434314, "grad_norm": 3.464682102203369, "learning_rate": 7.589022563656569e-07, "loss": 0.2833, "step": 4990 }, { "epoch": 0.2411460598154322, "grad_norm": 2.577922821044922, "learning_rate": 7.588539401845678e-07, "loss": 0.3328, "step": 4991 }, { "epoch": 0.24119437599652124, "grad_norm": 2.166226625442505, "learning_rate": 7.588056240034787e-07, "loss": 0.2552, "step": 4992 }, { "epoch": 0.24124269217761027, "grad_norm": 2.5285043716430664, "learning_rate": 7.587573078223897e-07, "loss": 0.2866, "step": 4993 }, { "epoch": 0.24129100835869932, "grad_norm": 1.717702031135559, "learning_rate": 7.587089916413007e-07, "loss": 0.1568, "step": 4994 }, { "epoch": 0.24133932453978837, "grad_norm": 2.263315439224243, "learning_rate": 7.586606754602116e-07, "loss": 0.2688, "step": 4995 }, { "epoch": 0.24138764072087743, "grad_norm": 2.865473508834839, "learning_rate": 7.586123592791226e-07, "loss": 0.3681, "step": 4996 }, { "epoch": 0.24143595690196648, "grad_norm": 2.6515748500823975, "learning_rate": 7.585640430980334e-07, "loss": 0.324, "step": 4997 }, { "epoch": 0.2414842730830555, "grad_norm": 3.1292500495910645, "learning_rate": 7.585157269169444e-07, "loss": 0.2757, "step": 4998 }, { "epoch": 0.24153258926414456, "grad_norm": 1.8680256605148315, "learning_rate": 7.584674107358554e-07, "loss": 0.2371, "step": 4999 }, { "epoch": 0.2415809054452336, "grad_norm": 4.0087080001831055, "learning_rate": 7.584190945547664e-07, "loss": 0.3651, "step": 5000 }, { "epoch": 0.24162922162632267, "grad_norm": 4.531729221343994, "learning_rate": 7.583707783736774e-07, "loss": 0.3247, "step": 5001 }, { "epoch": 0.2416775378074117, "grad_norm": 2.871236562728882, "learning_rate": 7.583224621925883e-07, "loss": 0.3397, "step": 5002 }, { "epoch": 0.24172585398850074, "grad_norm": 2.803579568862915, "learning_rate": 7.582741460114992e-07, "loss": 0.237, "step": 5003 }, { "epoch": 0.2417741701695898, "grad_norm": 2.4115240573883057, "learning_rate": 7.582258298304101e-07, "loss": 0.2722, "step": 5004 }, { "epoch": 0.24182248635067885, "grad_norm": 3.5771446228027344, "learning_rate": 7.581775136493211e-07, "loss": 0.3438, "step": 5005 }, { "epoch": 0.24187080253176788, "grad_norm": 2.7085492610931396, "learning_rate": 7.581291974682321e-07, "loss": 0.3373, "step": 5006 }, { "epoch": 0.24191911871285693, "grad_norm": 3.1925394535064697, "learning_rate": 7.58080881287143e-07, "loss": 0.2983, "step": 5007 }, { "epoch": 0.24196743489394598, "grad_norm": 6.625551700592041, "learning_rate": 7.58032565106054e-07, "loss": 0.3999, "step": 5008 }, { "epoch": 0.24201575107503503, "grad_norm": 4.948273181915283, "learning_rate": 7.57984248924965e-07, "loss": 0.1981, "step": 5009 }, { "epoch": 0.2420640672561241, "grad_norm": 6.822253704071045, "learning_rate": 7.579359327438759e-07, "loss": 0.3792, "step": 5010 }, { "epoch": 0.2421123834372131, "grad_norm": 2.558607339859009, "learning_rate": 7.578876165627868e-07, "loss": 0.2444, "step": 5011 }, { "epoch": 0.24216069961830217, "grad_norm": 3.3482282161712646, "learning_rate": 7.578393003816977e-07, "loss": 0.3075, "step": 5012 }, { "epoch": 0.24220901579939122, "grad_norm": 2.327571392059326, "learning_rate": 7.577909842006087e-07, "loss": 0.2074, "step": 5013 }, { "epoch": 0.24225733198048027, "grad_norm": 2.729308843612671, "learning_rate": 7.577426680195197e-07, "loss": 0.3926, "step": 5014 }, { "epoch": 0.2423056481615693, "grad_norm": 3.1531476974487305, "learning_rate": 7.576943518384307e-07, "loss": 0.3247, "step": 5015 }, { "epoch": 0.24235396434265835, "grad_norm": 1.8470653295516968, "learning_rate": 7.576460356573417e-07, "loss": 0.2041, "step": 5016 }, { "epoch": 0.2424022805237474, "grad_norm": 2.3651318550109863, "learning_rate": 7.575977194762526e-07, "loss": 0.2481, "step": 5017 }, { "epoch": 0.24245059670483646, "grad_norm": 2.45733642578125, "learning_rate": 7.575494032951635e-07, "loss": 0.2589, "step": 5018 }, { "epoch": 0.24249891288592548, "grad_norm": 2.471466064453125, "learning_rate": 7.575010871140745e-07, "loss": 0.3264, "step": 5019 }, { "epoch": 0.24254722906701454, "grad_norm": 2.8125905990600586, "learning_rate": 7.574527709329854e-07, "loss": 0.3128, "step": 5020 }, { "epoch": 0.2425955452481036, "grad_norm": 1.828529953956604, "learning_rate": 7.574044547518964e-07, "loss": 0.1874, "step": 5021 }, { "epoch": 0.24264386142919264, "grad_norm": 2.8021934032440186, "learning_rate": 7.573561385708074e-07, "loss": 0.4364, "step": 5022 }, { "epoch": 0.2426921776102817, "grad_norm": 3.0496861934661865, "learning_rate": 7.573078223897182e-07, "loss": 0.3909, "step": 5023 }, { "epoch": 0.24274049379137072, "grad_norm": 9.492484092712402, "learning_rate": 7.572595062086292e-07, "loss": 0.3789, "step": 5024 }, { "epoch": 0.24278880997245977, "grad_norm": 6.991070747375488, "learning_rate": 7.572111900275402e-07, "loss": 0.2957, "step": 5025 }, { "epoch": 0.24283712615354883, "grad_norm": 2.867720365524292, "learning_rate": 7.571628738464512e-07, "loss": 0.2417, "step": 5026 }, { "epoch": 0.24288544233463788, "grad_norm": 2.4117140769958496, "learning_rate": 7.571145576653622e-07, "loss": 0.2936, "step": 5027 }, { "epoch": 0.2429337585157269, "grad_norm": 7.210855484008789, "learning_rate": 7.57066241484273e-07, "loss": 0.224, "step": 5028 }, { "epoch": 0.24298207469681596, "grad_norm": 2.2263994216918945, "learning_rate": 7.570179253031839e-07, "loss": 0.2947, "step": 5029 }, { "epoch": 0.243030390877905, "grad_norm": 2.511277914047241, "learning_rate": 7.569696091220949e-07, "loss": 0.2943, "step": 5030 }, { "epoch": 0.24307870705899406, "grad_norm": 3.707209348678589, "learning_rate": 7.569212929410059e-07, "loss": 0.3395, "step": 5031 }, { "epoch": 0.2431270232400831, "grad_norm": 3.3663218021392822, "learning_rate": 7.568729767599169e-07, "loss": 0.4848, "step": 5032 }, { "epoch": 0.24317533942117214, "grad_norm": 2.63401198387146, "learning_rate": 7.568246605788278e-07, "loss": 0.3361, "step": 5033 }, { "epoch": 0.2432236556022612, "grad_norm": 2.3784213066101074, "learning_rate": 7.567763443977388e-07, "loss": 0.2269, "step": 5034 }, { "epoch": 0.24327197178335025, "grad_norm": 3.736759662628174, "learning_rate": 7.567280282166498e-07, "loss": 0.3096, "step": 5035 }, { "epoch": 0.2433202879644393, "grad_norm": 2.5536255836486816, "learning_rate": 7.566797120355607e-07, "loss": 0.2974, "step": 5036 }, { "epoch": 0.24336860414552833, "grad_norm": 4.7177324295043945, "learning_rate": 7.566313958544716e-07, "loss": 0.2738, "step": 5037 }, { "epoch": 0.24341692032661738, "grad_norm": 3.5259299278259277, "learning_rate": 7.565830796733825e-07, "loss": 0.3637, "step": 5038 }, { "epoch": 0.24346523650770643, "grad_norm": 3.934635639190674, "learning_rate": 7.565347634922935e-07, "loss": 0.4302, "step": 5039 }, { "epoch": 0.2435135526887955, "grad_norm": 6.651780605316162, "learning_rate": 7.564864473112045e-07, "loss": 0.3361, "step": 5040 }, { "epoch": 0.2435618688698845, "grad_norm": 4.311716079711914, "learning_rate": 7.564381311301155e-07, "loss": 0.2496, "step": 5041 }, { "epoch": 0.24361018505097357, "grad_norm": 3.7158212661743164, "learning_rate": 7.563898149490264e-07, "loss": 0.3359, "step": 5042 }, { "epoch": 0.24365850123206262, "grad_norm": 2.372999429702759, "learning_rate": 7.563414987679374e-07, "loss": 0.3139, "step": 5043 }, { "epoch": 0.24370681741315167, "grad_norm": 3.165940999984741, "learning_rate": 7.562931825868483e-07, "loss": 0.3988, "step": 5044 }, { "epoch": 0.24375513359424072, "grad_norm": 3.11983585357666, "learning_rate": 7.562448664057592e-07, "loss": 0.3245, "step": 5045 }, { "epoch": 0.24380344977532975, "grad_norm": 2.5217156410217285, "learning_rate": 7.561965502246702e-07, "loss": 0.2939, "step": 5046 }, { "epoch": 0.2438517659564188, "grad_norm": 2.9894490242004395, "learning_rate": 7.561482340435812e-07, "loss": 0.2703, "step": 5047 }, { "epoch": 0.24390008213750786, "grad_norm": 3.359989881515503, "learning_rate": 7.560999178624922e-07, "loss": 0.3435, "step": 5048 }, { "epoch": 0.2439483983185969, "grad_norm": 1.9195294380187988, "learning_rate": 7.56051601681403e-07, "loss": 0.2506, "step": 5049 }, { "epoch": 0.24399671449968593, "grad_norm": 2.4460854530334473, "learning_rate": 7.56003285500314e-07, "loss": 0.2607, "step": 5050 }, { "epoch": 0.244045030680775, "grad_norm": 1.3063101768493652, "learning_rate": 7.55954969319225e-07, "loss": 0.1483, "step": 5051 }, { "epoch": 0.24409334686186404, "grad_norm": 3.594792127609253, "learning_rate": 7.55906653138136e-07, "loss": 0.2962, "step": 5052 }, { "epoch": 0.2441416630429531, "grad_norm": 2.5474557876586914, "learning_rate": 7.55858336957047e-07, "loss": 0.3244, "step": 5053 }, { "epoch": 0.24418997922404212, "grad_norm": 1.5788049697875977, "learning_rate": 7.558100207759578e-07, "loss": 0.1468, "step": 5054 }, { "epoch": 0.24423829540513117, "grad_norm": 3.099663019180298, "learning_rate": 7.557617045948687e-07, "loss": 0.3284, "step": 5055 }, { "epoch": 0.24428661158622023, "grad_norm": 4.945630073547363, "learning_rate": 7.557133884137797e-07, "loss": 0.2384, "step": 5056 }, { "epoch": 0.24433492776730928, "grad_norm": 2.522191047668457, "learning_rate": 7.556650722326907e-07, "loss": 0.3048, "step": 5057 }, { "epoch": 0.24438324394839833, "grad_norm": 2.7409117221832275, "learning_rate": 7.556167560516017e-07, "loss": 0.2708, "step": 5058 }, { "epoch": 0.24443156012948736, "grad_norm": 3.0001707077026367, "learning_rate": 7.555684398705126e-07, "loss": 0.3337, "step": 5059 }, { "epoch": 0.2444798763105764, "grad_norm": 2.036297082901001, "learning_rate": 7.555201236894236e-07, "loss": 0.2357, "step": 5060 }, { "epoch": 0.24452819249166546, "grad_norm": 2.043186664581299, "learning_rate": 7.554718075083345e-07, "loss": 0.1915, "step": 5061 }, { "epoch": 0.24457650867275452, "grad_norm": 2.8311338424682617, "learning_rate": 7.554234913272454e-07, "loss": 0.3657, "step": 5062 }, { "epoch": 0.24462482485384354, "grad_norm": 1.709669589996338, "learning_rate": 7.553751751461564e-07, "loss": 0.1836, "step": 5063 }, { "epoch": 0.2446731410349326, "grad_norm": 2.494227886199951, "learning_rate": 7.553268589650673e-07, "loss": 0.3139, "step": 5064 }, { "epoch": 0.24472145721602165, "grad_norm": 2.7851803302764893, "learning_rate": 7.552785427839783e-07, "loss": 0.3109, "step": 5065 }, { "epoch": 0.2447697733971107, "grad_norm": 2.0810563564300537, "learning_rate": 7.552302266028893e-07, "loss": 0.2247, "step": 5066 }, { "epoch": 0.24481808957819973, "grad_norm": 2.9243240356445312, "learning_rate": 7.551819104218003e-07, "loss": 0.4346, "step": 5067 }, { "epoch": 0.24486640575928878, "grad_norm": 1.4421542882919312, "learning_rate": 7.551335942407112e-07, "loss": 0.1555, "step": 5068 }, { "epoch": 0.24491472194037783, "grad_norm": 2.8502469062805176, "learning_rate": 7.550852780596222e-07, "loss": 0.3347, "step": 5069 }, { "epoch": 0.2449630381214669, "grad_norm": 2.9841058254241943, "learning_rate": 7.55036961878533e-07, "loss": 0.4053, "step": 5070 }, { "epoch": 0.24501135430255594, "grad_norm": 2.6019527912139893, "learning_rate": 7.54988645697444e-07, "loss": 0.3268, "step": 5071 }, { "epoch": 0.24505967048364496, "grad_norm": 2.192918300628662, "learning_rate": 7.54940329516355e-07, "loss": 0.2395, "step": 5072 }, { "epoch": 0.24510798666473402, "grad_norm": 2.3162310123443604, "learning_rate": 7.54892013335266e-07, "loss": 0.2544, "step": 5073 }, { "epoch": 0.24515630284582307, "grad_norm": 3.224334478378296, "learning_rate": 7.548436971541769e-07, "loss": 0.3586, "step": 5074 }, { "epoch": 0.24520461902691212, "grad_norm": 2.9139020442962646, "learning_rate": 7.547953809730878e-07, "loss": 0.3572, "step": 5075 }, { "epoch": 0.24525293520800115, "grad_norm": 3.0477256774902344, "learning_rate": 7.547470647919988e-07, "loss": 0.291, "step": 5076 }, { "epoch": 0.2453012513890902, "grad_norm": 2.1449625492095947, "learning_rate": 7.546987486109098e-07, "loss": 0.2447, "step": 5077 }, { "epoch": 0.24534956757017926, "grad_norm": 2.7823262214660645, "learning_rate": 7.546504324298208e-07, "loss": 0.3425, "step": 5078 }, { "epoch": 0.2453978837512683, "grad_norm": 2.9260456562042236, "learning_rate": 7.546021162487317e-07, "loss": 0.2497, "step": 5079 }, { "epoch": 0.24544619993235733, "grad_norm": 2.648155927658081, "learning_rate": 7.545538000676425e-07, "loss": 0.2833, "step": 5080 }, { "epoch": 0.2454945161134464, "grad_norm": 3.664768934249878, "learning_rate": 7.545054838865535e-07, "loss": 0.5445, "step": 5081 }, { "epoch": 0.24554283229453544, "grad_norm": 2.8454067707061768, "learning_rate": 7.544571677054645e-07, "loss": 0.27, "step": 5082 }, { "epoch": 0.2455911484756245, "grad_norm": 2.8595244884490967, "learning_rate": 7.544088515243755e-07, "loss": 0.3407, "step": 5083 }, { "epoch": 0.24563946465671355, "grad_norm": 2.825111150741577, "learning_rate": 7.543605353432865e-07, "loss": 0.2791, "step": 5084 }, { "epoch": 0.24568778083780257, "grad_norm": 3.3703832626342773, "learning_rate": 7.543122191621974e-07, "loss": 0.4072, "step": 5085 }, { "epoch": 0.24573609701889162, "grad_norm": 2.8201799392700195, "learning_rate": 7.542639029811084e-07, "loss": 0.1736, "step": 5086 }, { "epoch": 0.24578441319998068, "grad_norm": 4.403817176818848, "learning_rate": 7.542155868000192e-07, "loss": 0.2354, "step": 5087 }, { "epoch": 0.24583272938106973, "grad_norm": 3.2385289669036865, "learning_rate": 7.541672706189302e-07, "loss": 0.2955, "step": 5088 }, { "epoch": 0.24588104556215876, "grad_norm": 2.8463194370269775, "learning_rate": 7.541189544378412e-07, "loss": 0.399, "step": 5089 }, { "epoch": 0.2459293617432478, "grad_norm": 2.7672629356384277, "learning_rate": 7.540706382567521e-07, "loss": 0.4485, "step": 5090 }, { "epoch": 0.24597767792433686, "grad_norm": 4.99940299987793, "learning_rate": 7.540223220756631e-07, "loss": 0.309, "step": 5091 }, { "epoch": 0.24602599410542592, "grad_norm": 3.796679735183716, "learning_rate": 7.539740058945741e-07, "loss": 0.3316, "step": 5092 }, { "epoch": 0.24607431028651494, "grad_norm": 1.8055860996246338, "learning_rate": 7.53925689713485e-07, "loss": 0.203, "step": 5093 }, { "epoch": 0.246122626467604, "grad_norm": 2.41209077835083, "learning_rate": 7.53877373532396e-07, "loss": 0.2611, "step": 5094 }, { "epoch": 0.24617094264869305, "grad_norm": 6.007416725158691, "learning_rate": 7.53829057351307e-07, "loss": 0.2972, "step": 5095 }, { "epoch": 0.2462192588297821, "grad_norm": 1.7823739051818848, "learning_rate": 7.537807411702178e-07, "loss": 0.2031, "step": 5096 }, { "epoch": 0.24626757501087115, "grad_norm": 2.204211711883545, "learning_rate": 7.537324249891288e-07, "loss": 0.229, "step": 5097 }, { "epoch": 0.24631589119196018, "grad_norm": 2.439535617828369, "learning_rate": 7.536841088080398e-07, "loss": 0.1971, "step": 5098 }, { "epoch": 0.24636420737304923, "grad_norm": 3.1562798023223877, "learning_rate": 7.536357926269508e-07, "loss": 0.3361, "step": 5099 }, { "epoch": 0.24641252355413829, "grad_norm": 3.605267286300659, "learning_rate": 7.535874764458617e-07, "loss": 0.4009, "step": 5100 }, { "epoch": 0.24646083973522734, "grad_norm": 2.0559325218200684, "learning_rate": 7.535391602647726e-07, "loss": 0.2, "step": 5101 }, { "epoch": 0.24650915591631636, "grad_norm": 3.4506375789642334, "learning_rate": 7.534908440836836e-07, "loss": 0.3374, "step": 5102 }, { "epoch": 0.24655747209740542, "grad_norm": 4.114334583282471, "learning_rate": 7.534425279025946e-07, "loss": 0.5036, "step": 5103 }, { "epoch": 0.24660578827849447, "grad_norm": 3.8463268280029297, "learning_rate": 7.533942117215056e-07, "loss": 0.1822, "step": 5104 }, { "epoch": 0.24665410445958352, "grad_norm": 2.8811843395233154, "learning_rate": 7.533458955404165e-07, "loss": 0.3903, "step": 5105 }, { "epoch": 0.24670242064067255, "grad_norm": 3.9455859661102295, "learning_rate": 7.532975793593273e-07, "loss": 0.2933, "step": 5106 }, { "epoch": 0.2467507368217616, "grad_norm": 2.4008448123931885, "learning_rate": 7.532492631782383e-07, "loss": 0.2856, "step": 5107 }, { "epoch": 0.24679905300285065, "grad_norm": 2.5464608669281006, "learning_rate": 7.532009469971493e-07, "loss": 0.2991, "step": 5108 }, { "epoch": 0.2468473691839397, "grad_norm": 4.745504379272461, "learning_rate": 7.531526308160603e-07, "loss": 0.3198, "step": 5109 }, { "epoch": 0.24689568536502876, "grad_norm": 7.012850761413574, "learning_rate": 7.531043146349713e-07, "loss": 0.3111, "step": 5110 }, { "epoch": 0.2469440015461178, "grad_norm": 2.68435001373291, "learning_rate": 7.530559984538822e-07, "loss": 0.3147, "step": 5111 }, { "epoch": 0.24699231772720684, "grad_norm": 2.331113338470459, "learning_rate": 7.53007682272793e-07, "loss": 0.2561, "step": 5112 }, { "epoch": 0.2470406339082959, "grad_norm": 2.4650352001190186, "learning_rate": 7.52959366091704e-07, "loss": 0.2251, "step": 5113 }, { "epoch": 0.24708895008938495, "grad_norm": 2.675786018371582, "learning_rate": 7.52911049910615e-07, "loss": 0.3188, "step": 5114 }, { "epoch": 0.24713726627047397, "grad_norm": 2.579068899154663, "learning_rate": 7.52862733729526e-07, "loss": 0.2371, "step": 5115 }, { "epoch": 0.24718558245156302, "grad_norm": 3.612135410308838, "learning_rate": 7.528144175484369e-07, "loss": 0.2556, "step": 5116 }, { "epoch": 0.24723389863265208, "grad_norm": 3.204007148742676, "learning_rate": 7.527661013673479e-07, "loss": 0.3909, "step": 5117 }, { "epoch": 0.24728221481374113, "grad_norm": 2.871108293533325, "learning_rate": 7.527177851862589e-07, "loss": 0.344, "step": 5118 }, { "epoch": 0.24733053099483016, "grad_norm": 3.949204444885254, "learning_rate": 7.526694690051698e-07, "loss": 0.3813, "step": 5119 }, { "epoch": 0.2473788471759192, "grad_norm": 2.123363733291626, "learning_rate": 7.526211528240808e-07, "loss": 0.2313, "step": 5120 }, { "epoch": 0.24742716335700826, "grad_norm": 3.7449228763580322, "learning_rate": 7.525728366429916e-07, "loss": 0.2931, "step": 5121 }, { "epoch": 0.24747547953809731, "grad_norm": 2.6588451862335205, "learning_rate": 7.525245204619026e-07, "loss": 0.3032, "step": 5122 }, { "epoch": 0.24752379571918637, "grad_norm": 1.9754997491836548, "learning_rate": 7.524762042808136e-07, "loss": 0.2142, "step": 5123 }, { "epoch": 0.2475721119002754, "grad_norm": 3.0399301052093506, "learning_rate": 7.524278880997246e-07, "loss": 0.371, "step": 5124 }, { "epoch": 0.24762042808136445, "grad_norm": 2.5430872440338135, "learning_rate": 7.523795719186355e-07, "loss": 0.2885, "step": 5125 }, { "epoch": 0.2476687442624535, "grad_norm": 10.181968688964844, "learning_rate": 7.523312557375465e-07, "loss": 0.3965, "step": 5126 }, { "epoch": 0.24771706044354255, "grad_norm": 2.450495958328247, "learning_rate": 7.522829395564574e-07, "loss": 0.3463, "step": 5127 }, { "epoch": 0.24776537662463158, "grad_norm": 4.103266716003418, "learning_rate": 7.522346233753684e-07, "loss": 0.435, "step": 5128 }, { "epoch": 0.24781369280572063, "grad_norm": 2.4886839389801025, "learning_rate": 7.521863071942794e-07, "loss": 0.2296, "step": 5129 }, { "epoch": 0.24786200898680968, "grad_norm": 2.7260329723358154, "learning_rate": 7.521379910131903e-07, "loss": 0.2898, "step": 5130 }, { "epoch": 0.24791032516789874, "grad_norm": 5.056642532348633, "learning_rate": 7.520896748321013e-07, "loss": 0.2958, "step": 5131 }, { "epoch": 0.24795864134898776, "grad_norm": 2.8382515907287598, "learning_rate": 7.520413586510121e-07, "loss": 0.3488, "step": 5132 }, { "epoch": 0.24800695753007682, "grad_norm": 1.810133457183838, "learning_rate": 7.519930424699231e-07, "loss": 0.2353, "step": 5133 }, { "epoch": 0.24805527371116587, "grad_norm": 1.763594627380371, "learning_rate": 7.519447262888341e-07, "loss": 0.1873, "step": 5134 }, { "epoch": 0.24810358989225492, "grad_norm": 3.309852361679077, "learning_rate": 7.518964101077451e-07, "loss": 0.4725, "step": 5135 }, { "epoch": 0.24815190607334398, "grad_norm": 3.712435722351074, "learning_rate": 7.518480939266561e-07, "loss": 0.201, "step": 5136 }, { "epoch": 0.248200222254433, "grad_norm": 2.2014987468719482, "learning_rate": 7.51799777745567e-07, "loss": 0.278, "step": 5137 }, { "epoch": 0.24824853843552205, "grad_norm": 2.7048611640930176, "learning_rate": 7.517514615644778e-07, "loss": 0.3133, "step": 5138 }, { "epoch": 0.2482968546166111, "grad_norm": 4.394967079162598, "learning_rate": 7.517031453833888e-07, "loss": 0.4035, "step": 5139 }, { "epoch": 0.24834517079770016, "grad_norm": 5.773996353149414, "learning_rate": 7.516548292022998e-07, "loss": 0.1555, "step": 5140 }, { "epoch": 0.24839348697878919, "grad_norm": 2.8599228858947754, "learning_rate": 7.516065130212108e-07, "loss": 0.2966, "step": 5141 }, { "epoch": 0.24844180315987824, "grad_norm": 2.949334144592285, "learning_rate": 7.515581968401217e-07, "loss": 0.3108, "step": 5142 }, { "epoch": 0.2484901193409673, "grad_norm": 2.8559741973876953, "learning_rate": 7.515098806590327e-07, "loss": 0.2917, "step": 5143 }, { "epoch": 0.24853843552205634, "grad_norm": 2.448331117630005, "learning_rate": 7.514615644779436e-07, "loss": 0.3276, "step": 5144 }, { "epoch": 0.24858675170314537, "grad_norm": 5.503905773162842, "learning_rate": 7.514132482968546e-07, "loss": 0.2963, "step": 5145 }, { "epoch": 0.24863506788423442, "grad_norm": 2.482367515563965, "learning_rate": 7.513649321157656e-07, "loss": 0.2801, "step": 5146 }, { "epoch": 0.24868338406532348, "grad_norm": 2.798152446746826, "learning_rate": 7.513166159346764e-07, "loss": 0.336, "step": 5147 }, { "epoch": 0.24873170024641253, "grad_norm": 3.653764009475708, "learning_rate": 7.512682997535874e-07, "loss": 0.2842, "step": 5148 }, { "epoch": 0.24878001642750158, "grad_norm": 2.4328298568725586, "learning_rate": 7.512199835724984e-07, "loss": 0.2801, "step": 5149 }, { "epoch": 0.2488283326085906, "grad_norm": 3.8517119884490967, "learning_rate": 7.511716673914094e-07, "loss": 0.4333, "step": 5150 }, { "epoch": 0.24887664878967966, "grad_norm": 3.5054450035095215, "learning_rate": 7.511233512103203e-07, "loss": 0.3037, "step": 5151 }, { "epoch": 0.24892496497076871, "grad_norm": 2.5745139122009277, "learning_rate": 7.510750350292313e-07, "loss": 0.147, "step": 5152 }, { "epoch": 0.24897328115185777, "grad_norm": 2.033609390258789, "learning_rate": 7.510267188481422e-07, "loss": 0.2292, "step": 5153 }, { "epoch": 0.2490215973329468, "grad_norm": 2.374319553375244, "learning_rate": 7.509784026670532e-07, "loss": 0.2272, "step": 5154 }, { "epoch": 0.24906991351403585, "grad_norm": 2.3220813274383545, "learning_rate": 7.509300864859641e-07, "loss": 0.2711, "step": 5155 }, { "epoch": 0.2491182296951249, "grad_norm": 3.622699022293091, "learning_rate": 7.508817703048751e-07, "loss": 0.3663, "step": 5156 }, { "epoch": 0.24916654587621395, "grad_norm": 2.9426093101501465, "learning_rate": 7.50833454123786e-07, "loss": 0.3157, "step": 5157 }, { "epoch": 0.24921486205730298, "grad_norm": 2.2804830074310303, "learning_rate": 7.507851379426969e-07, "loss": 0.2555, "step": 5158 }, { "epoch": 0.24926317823839203, "grad_norm": 4.879279613494873, "learning_rate": 7.507368217616079e-07, "loss": 0.4972, "step": 5159 }, { "epoch": 0.24931149441948108, "grad_norm": 2.412041664123535, "learning_rate": 7.506885055805189e-07, "loss": 0.2172, "step": 5160 }, { "epoch": 0.24935981060057014, "grad_norm": 2.701245069503784, "learning_rate": 7.506401893994299e-07, "loss": 0.3074, "step": 5161 }, { "epoch": 0.2494081267816592, "grad_norm": 3.883373260498047, "learning_rate": 7.505918732183409e-07, "loss": 0.275, "step": 5162 }, { "epoch": 0.24945644296274821, "grad_norm": 3.052604913711548, "learning_rate": 7.505435570372516e-07, "loss": 0.3479, "step": 5163 }, { "epoch": 0.24950475914383727, "grad_norm": 2.625084161758423, "learning_rate": 7.504952408561626e-07, "loss": 0.3337, "step": 5164 }, { "epoch": 0.24955307532492632, "grad_norm": 2.4050233364105225, "learning_rate": 7.504469246750736e-07, "loss": 0.1984, "step": 5165 }, { "epoch": 0.24960139150601537, "grad_norm": 2.474510431289673, "learning_rate": 7.503986084939846e-07, "loss": 0.283, "step": 5166 }, { "epoch": 0.2496497076871044, "grad_norm": 2.704850435256958, "learning_rate": 7.503502923128956e-07, "loss": 0.312, "step": 5167 }, { "epoch": 0.24969802386819345, "grad_norm": 2.0272488594055176, "learning_rate": 7.503019761318065e-07, "loss": 0.2034, "step": 5168 }, { "epoch": 0.2497463400492825, "grad_norm": 3.550175189971924, "learning_rate": 7.502536599507175e-07, "loss": 0.2337, "step": 5169 }, { "epoch": 0.24979465623037156, "grad_norm": 2.268028974533081, "learning_rate": 7.502053437696284e-07, "loss": 0.2082, "step": 5170 }, { "epoch": 0.24984297241146058, "grad_norm": 3.2380428314208984, "learning_rate": 7.501570275885394e-07, "loss": 0.3929, "step": 5171 }, { "epoch": 0.24989128859254964, "grad_norm": 7.752634048461914, "learning_rate": 7.501087114074503e-07, "loss": 0.2277, "step": 5172 }, { "epoch": 0.2499396047736387, "grad_norm": 2.43882155418396, "learning_rate": 7.500603952263612e-07, "loss": 0.2953, "step": 5173 }, { "epoch": 0.24998792095472774, "grad_norm": 2.7585933208465576, "learning_rate": 7.500120790452722e-07, "loss": 0.3459, "step": 5174 }, { "epoch": 0.2500362371358168, "grad_norm": 1.9410607814788818, "learning_rate": 7.499637628641832e-07, "loss": 0.2395, "step": 5175 }, { "epoch": 0.2500845533169058, "grad_norm": 4.595141887664795, "learning_rate": 7.499154466830941e-07, "loss": 0.2499, "step": 5176 }, { "epoch": 0.2501328694979949, "grad_norm": 3.158505439758301, "learning_rate": 7.498671305020051e-07, "loss": 0.3227, "step": 5177 }, { "epoch": 0.25018118567908393, "grad_norm": 3.070676803588867, "learning_rate": 7.498188143209161e-07, "loss": 0.4069, "step": 5178 }, { "epoch": 0.25022950186017295, "grad_norm": 2.391575336456299, "learning_rate": 7.49770498139827e-07, "loss": 0.304, "step": 5179 }, { "epoch": 0.25027781804126203, "grad_norm": 2.2379770278930664, "learning_rate": 7.49722181958738e-07, "loss": 0.2564, "step": 5180 }, { "epoch": 0.25032613422235106, "grad_norm": 3.087228298187256, "learning_rate": 7.496738657776489e-07, "loss": 0.3124, "step": 5181 }, { "epoch": 0.2503744504034401, "grad_norm": 2.0669593811035156, "learning_rate": 7.496255495965599e-07, "loss": 0.229, "step": 5182 }, { "epoch": 0.25042276658452917, "grad_norm": 10.044392585754395, "learning_rate": 7.495772334154708e-07, "loss": 0.289, "step": 5183 }, { "epoch": 0.2504710827656182, "grad_norm": 1.9260436296463013, "learning_rate": 7.495289172343817e-07, "loss": 0.2196, "step": 5184 }, { "epoch": 0.2505193989467073, "grad_norm": 2.4118247032165527, "learning_rate": 7.494806010532927e-07, "loss": 0.3146, "step": 5185 }, { "epoch": 0.2505677151277963, "grad_norm": 6.527277946472168, "learning_rate": 7.494322848722037e-07, "loss": 0.3525, "step": 5186 }, { "epoch": 0.2506160313088853, "grad_norm": 2.9447004795074463, "learning_rate": 7.493839686911147e-07, "loss": 0.3921, "step": 5187 }, { "epoch": 0.2506643474899744, "grad_norm": 2.717583179473877, "learning_rate": 7.493356525100257e-07, "loss": 0.3383, "step": 5188 }, { "epoch": 0.25071266367106343, "grad_norm": 2.312652826309204, "learning_rate": 7.492873363289364e-07, "loss": 0.3179, "step": 5189 }, { "epoch": 0.2507609798521525, "grad_norm": 1.700553297996521, "learning_rate": 7.492390201478474e-07, "loss": 0.1475, "step": 5190 }, { "epoch": 0.25080929603324154, "grad_norm": 12.64185905456543, "learning_rate": 7.491907039667584e-07, "loss": 0.2959, "step": 5191 }, { "epoch": 0.25085761221433056, "grad_norm": 2.3858437538146973, "learning_rate": 7.491423877856694e-07, "loss": 0.2467, "step": 5192 }, { "epoch": 0.25090592839541964, "grad_norm": 3.2522659301757812, "learning_rate": 7.490940716045804e-07, "loss": 0.3987, "step": 5193 }, { "epoch": 0.25095424457650867, "grad_norm": 3.390929698944092, "learning_rate": 7.490457554234913e-07, "loss": 0.4248, "step": 5194 }, { "epoch": 0.2510025607575977, "grad_norm": 3.0823824405670166, "learning_rate": 7.489974392424022e-07, "loss": 0.3805, "step": 5195 }, { "epoch": 0.2510508769386868, "grad_norm": 7.253442764282227, "learning_rate": 7.489491230613132e-07, "loss": 0.323, "step": 5196 }, { "epoch": 0.2510991931197758, "grad_norm": 4.912155628204346, "learning_rate": 7.489008068802241e-07, "loss": 0.4471, "step": 5197 }, { "epoch": 0.2511475093008649, "grad_norm": 2.1874072551727295, "learning_rate": 7.488524906991351e-07, "loss": 0.2264, "step": 5198 }, { "epoch": 0.2511958254819539, "grad_norm": 3.034017562866211, "learning_rate": 7.48804174518046e-07, "loss": 0.408, "step": 5199 }, { "epoch": 0.25124414166304293, "grad_norm": 5.1372294425964355, "learning_rate": 7.48755858336957e-07, "loss": 0.355, "step": 5200 }, { "epoch": 0.251292457844132, "grad_norm": 3.7796690464019775, "learning_rate": 7.48707542155868e-07, "loss": 0.3634, "step": 5201 }, { "epoch": 0.25134077402522104, "grad_norm": 2.3491718769073486, "learning_rate": 7.486592259747789e-07, "loss": 0.2811, "step": 5202 }, { "epoch": 0.2513890902063101, "grad_norm": 2.989150047302246, "learning_rate": 7.486109097936899e-07, "loss": 0.3852, "step": 5203 }, { "epoch": 0.25143740638739914, "grad_norm": 2.25559139251709, "learning_rate": 7.485625936126009e-07, "loss": 0.2949, "step": 5204 }, { "epoch": 0.25148572256848817, "grad_norm": 5.292452812194824, "learning_rate": 7.485142774315118e-07, "loss": 0.2374, "step": 5205 }, { "epoch": 0.25153403874957725, "grad_norm": 2.809663772583008, "learning_rate": 7.484659612504227e-07, "loss": 0.3974, "step": 5206 }, { "epoch": 0.2515823549306663, "grad_norm": 12.62151050567627, "learning_rate": 7.484176450693337e-07, "loss": 0.1656, "step": 5207 }, { "epoch": 0.2516306711117553, "grad_norm": 3.5667223930358887, "learning_rate": 7.483693288882446e-07, "loss": 0.4764, "step": 5208 }, { "epoch": 0.2516789872928444, "grad_norm": 4.297202110290527, "learning_rate": 7.483210127071556e-07, "loss": 0.3418, "step": 5209 }, { "epoch": 0.2517273034739334, "grad_norm": 2.8606910705566406, "learning_rate": 7.482726965260665e-07, "loss": 0.329, "step": 5210 }, { "epoch": 0.2517756196550225, "grad_norm": 2.228757858276367, "learning_rate": 7.482243803449775e-07, "loss": 0.2513, "step": 5211 }, { "epoch": 0.2518239358361115, "grad_norm": 3.006459951400757, "learning_rate": 7.481760641638885e-07, "loss": 0.4056, "step": 5212 }, { "epoch": 0.25187225201720054, "grad_norm": 4.080069065093994, "learning_rate": 7.481277479827995e-07, "loss": 0.3828, "step": 5213 }, { "epoch": 0.2519205681982896, "grad_norm": 3.6420841217041016, "learning_rate": 7.480794318017105e-07, "loss": 0.3025, "step": 5214 }, { "epoch": 0.25196888437937864, "grad_norm": 3.0549890995025635, "learning_rate": 7.480311156206212e-07, "loss": 0.1706, "step": 5215 }, { "epoch": 0.2520172005604677, "grad_norm": 11.640034675598145, "learning_rate": 7.479827994395322e-07, "loss": 0.413, "step": 5216 }, { "epoch": 0.25206551674155675, "grad_norm": 2.8418941497802734, "learning_rate": 7.479344832584432e-07, "loss": 0.3, "step": 5217 }, { "epoch": 0.2521138329226458, "grad_norm": 4.117597579956055, "learning_rate": 7.478861670773542e-07, "loss": 0.3234, "step": 5218 }, { "epoch": 0.25216214910373486, "grad_norm": 4.032301902770996, "learning_rate": 7.478378508962652e-07, "loss": 0.2308, "step": 5219 }, { "epoch": 0.2522104652848239, "grad_norm": 4.138357639312744, "learning_rate": 7.477895347151761e-07, "loss": 0.4374, "step": 5220 }, { "epoch": 0.2522587814659129, "grad_norm": 2.0478153228759766, "learning_rate": 7.47741218534087e-07, "loss": 0.2467, "step": 5221 }, { "epoch": 0.252307097647002, "grad_norm": 2.033994436264038, "learning_rate": 7.47692902352998e-07, "loss": 0.2772, "step": 5222 }, { "epoch": 0.252355413828091, "grad_norm": 3.1583900451660156, "learning_rate": 7.476445861719089e-07, "loss": 0.5187, "step": 5223 }, { "epoch": 0.2524037300091801, "grad_norm": 1.4345070123672485, "learning_rate": 7.475962699908199e-07, "loss": 0.1651, "step": 5224 }, { "epoch": 0.2524520461902691, "grad_norm": 2.2353858947753906, "learning_rate": 7.475479538097308e-07, "loss": 0.2504, "step": 5225 }, { "epoch": 0.25250036237135814, "grad_norm": 2.5836453437805176, "learning_rate": 7.474996376286418e-07, "loss": 0.3051, "step": 5226 }, { "epoch": 0.2525486785524472, "grad_norm": 2.065103769302368, "learning_rate": 7.474513214475527e-07, "loss": 0.2671, "step": 5227 }, { "epoch": 0.25259699473353625, "grad_norm": 4.700089931488037, "learning_rate": 7.474030052664637e-07, "loss": 0.2536, "step": 5228 }, { "epoch": 0.25264531091462533, "grad_norm": 3.004345655441284, "learning_rate": 7.473546890853747e-07, "loss": 0.2204, "step": 5229 }, { "epoch": 0.25269362709571436, "grad_norm": 2.2592544555664062, "learning_rate": 7.473063729042857e-07, "loss": 0.2572, "step": 5230 }, { "epoch": 0.2527419432768034, "grad_norm": 1.879062294960022, "learning_rate": 7.472580567231965e-07, "loss": 0.2626, "step": 5231 }, { "epoch": 0.25279025945789246, "grad_norm": 1.4742798805236816, "learning_rate": 7.472097405421075e-07, "loss": 0.154, "step": 5232 }, { "epoch": 0.2528385756389815, "grad_norm": 2.8280045986175537, "learning_rate": 7.471614243610185e-07, "loss": 0.3846, "step": 5233 }, { "epoch": 0.2528868918200705, "grad_norm": 3.063585042953491, "learning_rate": 7.471131081799294e-07, "loss": 0.4285, "step": 5234 }, { "epoch": 0.2529352080011596, "grad_norm": 3.2831015586853027, "learning_rate": 7.470647919988404e-07, "loss": 0.4016, "step": 5235 }, { "epoch": 0.2529835241822486, "grad_norm": 43.38501739501953, "learning_rate": 7.470164758177513e-07, "loss": 0.261, "step": 5236 }, { "epoch": 0.2530318403633377, "grad_norm": 10.31129264831543, "learning_rate": 7.469681596366623e-07, "loss": 0.2366, "step": 5237 }, { "epoch": 0.2530801565444267, "grad_norm": 2.2246360778808594, "learning_rate": 7.469198434555733e-07, "loss": 0.2456, "step": 5238 }, { "epoch": 0.25312847272551575, "grad_norm": 3.2856969833374023, "learning_rate": 7.468715272744843e-07, "loss": 0.3314, "step": 5239 }, { "epoch": 0.25317678890660483, "grad_norm": 2.7501416206359863, "learning_rate": 7.468232110933951e-07, "loss": 0.3403, "step": 5240 }, { "epoch": 0.25322510508769386, "grad_norm": 2.958138942718506, "learning_rate": 7.46774894912306e-07, "loss": 0.2651, "step": 5241 }, { "epoch": 0.25327342126878294, "grad_norm": 2.0901310443878174, "learning_rate": 7.46726578731217e-07, "loss": 0.2042, "step": 5242 }, { "epoch": 0.25332173744987196, "grad_norm": 2.645232915878296, "learning_rate": 7.46678262550128e-07, "loss": 0.2322, "step": 5243 }, { "epoch": 0.253370053630961, "grad_norm": 2.0348355770111084, "learning_rate": 7.46629946369039e-07, "loss": 0.2414, "step": 5244 }, { "epoch": 0.25341836981205007, "grad_norm": 2.517916679382324, "learning_rate": 7.4658163018795e-07, "loss": 0.2234, "step": 5245 }, { "epoch": 0.2534666859931391, "grad_norm": 2.1851859092712402, "learning_rate": 7.465333140068609e-07, "loss": 0.2301, "step": 5246 }, { "epoch": 0.2535150021742282, "grad_norm": 12.912461280822754, "learning_rate": 7.464849978257718e-07, "loss": 0.2742, "step": 5247 }, { "epoch": 0.2535633183553172, "grad_norm": 3.138476848602295, "learning_rate": 7.464366816446827e-07, "loss": 0.245, "step": 5248 }, { "epoch": 0.2536116345364062, "grad_norm": 2.809169292449951, "learning_rate": 7.463883654635937e-07, "loss": 0.352, "step": 5249 }, { "epoch": 0.2536599507174953, "grad_norm": 2.528224229812622, "learning_rate": 7.463400492825047e-07, "loss": 0.2242, "step": 5250 }, { "epoch": 0.25370826689858433, "grad_norm": 2.728970766067505, "learning_rate": 7.462917331014156e-07, "loss": 0.396, "step": 5251 }, { "epoch": 0.25375658307967336, "grad_norm": 2.4896767139434814, "learning_rate": 7.462434169203266e-07, "loss": 0.211, "step": 5252 }, { "epoch": 0.25380489926076244, "grad_norm": 3.0511600971221924, "learning_rate": 7.461951007392375e-07, "loss": 0.2346, "step": 5253 }, { "epoch": 0.25385321544185147, "grad_norm": 38.66176223754883, "learning_rate": 7.461467845581485e-07, "loss": 0.2919, "step": 5254 }, { "epoch": 0.25390153162294055, "grad_norm": 2.986666440963745, "learning_rate": 7.460984683770595e-07, "loss": 0.3407, "step": 5255 }, { "epoch": 0.25394984780402957, "grad_norm": 2.8658077716827393, "learning_rate": 7.460501521959705e-07, "loss": 0.2804, "step": 5256 }, { "epoch": 0.2539981639851186, "grad_norm": 2.782005548477173, "learning_rate": 7.460018360148813e-07, "loss": 0.3866, "step": 5257 }, { "epoch": 0.2540464801662077, "grad_norm": 1.9640709161758423, "learning_rate": 7.459535198337923e-07, "loss": 0.2354, "step": 5258 }, { "epoch": 0.2540947963472967, "grad_norm": 4.143271446228027, "learning_rate": 7.459052036527032e-07, "loss": 0.2753, "step": 5259 }, { "epoch": 0.2541431125283858, "grad_norm": 5.983206272125244, "learning_rate": 7.458568874716142e-07, "loss": 0.3313, "step": 5260 }, { "epoch": 0.2541914287094748, "grad_norm": 2.683180570602417, "learning_rate": 7.458085712905252e-07, "loss": 0.2995, "step": 5261 }, { "epoch": 0.25423974489056383, "grad_norm": 5.4995646476745605, "learning_rate": 7.457602551094361e-07, "loss": 0.3777, "step": 5262 }, { "epoch": 0.2542880610716529, "grad_norm": 2.2819225788116455, "learning_rate": 7.457119389283471e-07, "loss": 0.248, "step": 5263 }, { "epoch": 0.25433637725274194, "grad_norm": 3.0004782676696777, "learning_rate": 7.456636227472581e-07, "loss": 0.306, "step": 5264 }, { "epoch": 0.25438469343383097, "grad_norm": 1.4787918329238892, "learning_rate": 7.45615306566169e-07, "loss": 0.1221, "step": 5265 }, { "epoch": 0.25443300961492005, "grad_norm": 2.433027505874634, "learning_rate": 7.455669903850799e-07, "loss": 0.3315, "step": 5266 }, { "epoch": 0.2544813257960091, "grad_norm": 3.071718454360962, "learning_rate": 7.455186742039908e-07, "loss": 0.3801, "step": 5267 }, { "epoch": 0.25452964197709815, "grad_norm": 3.4863131046295166, "learning_rate": 7.454703580229018e-07, "loss": 0.4106, "step": 5268 }, { "epoch": 0.2545779581581872, "grad_norm": 2.2239274978637695, "learning_rate": 7.454220418418128e-07, "loss": 0.2334, "step": 5269 }, { "epoch": 0.2546262743392762, "grad_norm": 3.001253604888916, "learning_rate": 7.453737256607238e-07, "loss": 0.2199, "step": 5270 }, { "epoch": 0.2546745905203653, "grad_norm": 2.5578956604003906, "learning_rate": 7.453254094796348e-07, "loss": 0.3408, "step": 5271 }, { "epoch": 0.2547229067014543, "grad_norm": 2.216280460357666, "learning_rate": 7.452770932985456e-07, "loss": 0.2086, "step": 5272 }, { "epoch": 0.2547712228825434, "grad_norm": 3.753758430480957, "learning_rate": 7.452287771174565e-07, "loss": 0.3756, "step": 5273 }, { "epoch": 0.2548195390636324, "grad_norm": 1.5217747688293457, "learning_rate": 7.451804609363675e-07, "loss": 0.1772, "step": 5274 }, { "epoch": 0.25486785524472144, "grad_norm": 2.84724497795105, "learning_rate": 7.451321447552785e-07, "loss": 0.3272, "step": 5275 }, { "epoch": 0.2549161714258105, "grad_norm": 2.688929796218872, "learning_rate": 7.450838285741895e-07, "loss": 0.2281, "step": 5276 }, { "epoch": 0.25496448760689955, "grad_norm": 2.103285551071167, "learning_rate": 7.450355123931004e-07, "loss": 0.192, "step": 5277 }, { "epoch": 0.2550128037879886, "grad_norm": 3.05259108543396, "learning_rate": 7.449871962120114e-07, "loss": 0.3209, "step": 5278 }, { "epoch": 0.25506111996907765, "grad_norm": 11.473015785217285, "learning_rate": 7.449388800309223e-07, "loss": 0.3728, "step": 5279 }, { "epoch": 0.2551094361501667, "grad_norm": 2.4066107273101807, "learning_rate": 7.448905638498333e-07, "loss": 0.3346, "step": 5280 }, { "epoch": 0.25515775233125576, "grad_norm": 2.130858898162842, "learning_rate": 7.448422476687443e-07, "loss": 0.1729, "step": 5281 }, { "epoch": 0.2552060685123448, "grad_norm": 2.9144792556762695, "learning_rate": 7.447939314876552e-07, "loss": 0.3032, "step": 5282 }, { "epoch": 0.2552543846934338, "grad_norm": 2.8364243507385254, "learning_rate": 7.447456153065661e-07, "loss": 0.3272, "step": 5283 }, { "epoch": 0.2553027008745229, "grad_norm": 2.3294217586517334, "learning_rate": 7.446972991254771e-07, "loss": 0.2823, "step": 5284 }, { "epoch": 0.2553510170556119, "grad_norm": 2.378675937652588, "learning_rate": 7.44648982944388e-07, "loss": 0.292, "step": 5285 }, { "epoch": 0.255399333236701, "grad_norm": 3.501377820968628, "learning_rate": 7.44600666763299e-07, "loss": 0.4393, "step": 5286 }, { "epoch": 0.25544764941779, "grad_norm": 2.691739082336426, "learning_rate": 7.4455235058221e-07, "loss": 0.3991, "step": 5287 }, { "epoch": 0.25549596559887905, "grad_norm": 2.6136178970336914, "learning_rate": 7.445040344011209e-07, "loss": 0.3131, "step": 5288 }, { "epoch": 0.25554428177996813, "grad_norm": 2.851215362548828, "learning_rate": 7.444557182200319e-07, "loss": 0.392, "step": 5289 }, { "epoch": 0.25559259796105716, "grad_norm": 1.6699854135513306, "learning_rate": 7.444074020389428e-07, "loss": 0.1778, "step": 5290 }, { "epoch": 0.2556409141421462, "grad_norm": 2.394913673400879, "learning_rate": 7.443590858578538e-07, "loss": 0.2471, "step": 5291 }, { "epoch": 0.25568923032323526, "grad_norm": 2.7100794315338135, "learning_rate": 7.443107696767647e-07, "loss": 0.3032, "step": 5292 }, { "epoch": 0.2557375465043243, "grad_norm": 6.144251346588135, "learning_rate": 7.442624534956756e-07, "loss": 0.4281, "step": 5293 }, { "epoch": 0.25578586268541337, "grad_norm": 3.087399482727051, "learning_rate": 7.442141373145866e-07, "loss": 0.3486, "step": 5294 }, { "epoch": 0.2558341788665024, "grad_norm": 3.544252634048462, "learning_rate": 7.441658211334976e-07, "loss": 0.3792, "step": 5295 }, { "epoch": 0.2558824950475914, "grad_norm": 2.273942708969116, "learning_rate": 7.441175049524086e-07, "loss": 0.1921, "step": 5296 }, { "epoch": 0.2559308112286805, "grad_norm": 2.8561441898345947, "learning_rate": 7.440691887713196e-07, "loss": 0.2671, "step": 5297 }, { "epoch": 0.2559791274097695, "grad_norm": 3.021653413772583, "learning_rate": 7.440208725902303e-07, "loss": 0.3946, "step": 5298 }, { "epoch": 0.2560274435908586, "grad_norm": 2.648162841796875, "learning_rate": 7.439725564091413e-07, "loss": 0.2621, "step": 5299 }, { "epoch": 0.25607575977194763, "grad_norm": 2.702688217163086, "learning_rate": 7.439242402280523e-07, "loss": 0.3411, "step": 5300 }, { "epoch": 0.25612407595303666, "grad_norm": 2.0553817749023438, "learning_rate": 7.438759240469633e-07, "loss": 0.2337, "step": 5301 }, { "epoch": 0.25617239213412574, "grad_norm": 4.218410968780518, "learning_rate": 7.438276078658743e-07, "loss": 0.3767, "step": 5302 }, { "epoch": 0.25622070831521476, "grad_norm": 2.420853614807129, "learning_rate": 7.437792916847852e-07, "loss": 0.257, "step": 5303 }, { "epoch": 0.2562690244963038, "grad_norm": 2.3584365844726562, "learning_rate": 7.437309755036961e-07, "loss": 0.2252, "step": 5304 }, { "epoch": 0.25631734067739287, "grad_norm": 2.1783761978149414, "learning_rate": 7.436826593226071e-07, "loss": 0.2819, "step": 5305 }, { "epoch": 0.2563656568584819, "grad_norm": 3.2611007690429688, "learning_rate": 7.436343431415181e-07, "loss": 0.4183, "step": 5306 }, { "epoch": 0.256413973039571, "grad_norm": 2.5374815464019775, "learning_rate": 7.43586026960429e-07, "loss": 0.3978, "step": 5307 }, { "epoch": 0.25646228922066, "grad_norm": 2.2801055908203125, "learning_rate": 7.4353771077934e-07, "loss": 0.2657, "step": 5308 }, { "epoch": 0.256510605401749, "grad_norm": 2.3162620067596436, "learning_rate": 7.434893945982509e-07, "loss": 0.253, "step": 5309 }, { "epoch": 0.2565589215828381, "grad_norm": 2.248272657394409, "learning_rate": 7.434410784171619e-07, "loss": 0.2491, "step": 5310 }, { "epoch": 0.25660723776392713, "grad_norm": 3.692554235458374, "learning_rate": 7.433927622360728e-07, "loss": 0.3124, "step": 5311 }, { "epoch": 0.2566555539450162, "grad_norm": 2.4790940284729004, "learning_rate": 7.433444460549838e-07, "loss": 0.2446, "step": 5312 }, { "epoch": 0.25670387012610524, "grad_norm": 9.1321382522583, "learning_rate": 7.432961298738948e-07, "loss": 0.2986, "step": 5313 }, { "epoch": 0.25675218630719426, "grad_norm": 3.584042549133301, "learning_rate": 7.432478136928057e-07, "loss": 0.4368, "step": 5314 }, { "epoch": 0.25680050248828334, "grad_norm": 2.0891528129577637, "learning_rate": 7.431994975117167e-07, "loss": 0.2221, "step": 5315 }, { "epoch": 0.25684881866937237, "grad_norm": 2.868309259414673, "learning_rate": 7.431511813306276e-07, "loss": 0.3396, "step": 5316 }, { "epoch": 0.2568971348504614, "grad_norm": 2.377300500869751, "learning_rate": 7.431028651495385e-07, "loss": 0.2882, "step": 5317 }, { "epoch": 0.2569454510315505, "grad_norm": 2.8219966888427734, "learning_rate": 7.430545489684495e-07, "loss": 0.425, "step": 5318 }, { "epoch": 0.2569937672126395, "grad_norm": 1.9130092859268188, "learning_rate": 7.430062327873604e-07, "loss": 0.1762, "step": 5319 }, { "epoch": 0.2570420833937286, "grad_norm": 2.171943187713623, "learning_rate": 7.429579166062714e-07, "loss": 0.2506, "step": 5320 }, { "epoch": 0.2570903995748176, "grad_norm": 3.065894365310669, "learning_rate": 7.429096004251824e-07, "loss": 0.3365, "step": 5321 }, { "epoch": 0.25713871575590663, "grad_norm": 5.815096378326416, "learning_rate": 7.428612842440934e-07, "loss": 0.3945, "step": 5322 }, { "epoch": 0.2571870319369957, "grad_norm": 2.8760383129119873, "learning_rate": 7.428129680630044e-07, "loss": 0.2336, "step": 5323 }, { "epoch": 0.25723534811808474, "grad_norm": 3.8795886039733887, "learning_rate": 7.427646518819151e-07, "loss": 0.3967, "step": 5324 }, { "epoch": 0.2572836642991738, "grad_norm": 3.640488862991333, "learning_rate": 7.427163357008261e-07, "loss": 0.3954, "step": 5325 }, { "epoch": 0.25733198048026285, "grad_norm": 2.4331297874450684, "learning_rate": 7.426680195197371e-07, "loss": 0.2755, "step": 5326 }, { "epoch": 0.25738029666135187, "grad_norm": 3.143850803375244, "learning_rate": 7.426197033386481e-07, "loss": 0.3497, "step": 5327 }, { "epoch": 0.25742861284244095, "grad_norm": 3.1480553150177, "learning_rate": 7.425713871575591e-07, "loss": 0.3484, "step": 5328 }, { "epoch": 0.25747692902353, "grad_norm": 4.43885612487793, "learning_rate": 7.4252307097647e-07, "loss": 0.3368, "step": 5329 }, { "epoch": 0.257525245204619, "grad_norm": 2.971210479736328, "learning_rate": 7.424747547953809e-07, "loss": 0.2717, "step": 5330 }, { "epoch": 0.2575735613857081, "grad_norm": 3.1182479858398438, "learning_rate": 7.424264386142919e-07, "loss": 0.3622, "step": 5331 }, { "epoch": 0.2576218775667971, "grad_norm": 2.358158826828003, "learning_rate": 7.423781224332028e-07, "loss": 0.2698, "step": 5332 }, { "epoch": 0.2576701937478862, "grad_norm": 2.6949448585510254, "learning_rate": 7.423298062521138e-07, "loss": 0.3002, "step": 5333 }, { "epoch": 0.2577185099289752, "grad_norm": 2.226154327392578, "learning_rate": 7.422814900710248e-07, "loss": 0.3114, "step": 5334 }, { "epoch": 0.25776682611006424, "grad_norm": 2.518162727355957, "learning_rate": 7.422331738899357e-07, "loss": 0.2811, "step": 5335 }, { "epoch": 0.2578151422911533, "grad_norm": 2.3110134601593018, "learning_rate": 7.421848577088466e-07, "loss": 0.2726, "step": 5336 }, { "epoch": 0.25786345847224235, "grad_norm": 2.6984336376190186, "learning_rate": 7.421365415277576e-07, "loss": 0.2564, "step": 5337 }, { "epoch": 0.2579117746533314, "grad_norm": 2.6765501499176025, "learning_rate": 7.420882253466686e-07, "loss": 0.2658, "step": 5338 }, { "epoch": 0.25796009083442045, "grad_norm": 5.455158233642578, "learning_rate": 7.420399091655796e-07, "loss": 0.3646, "step": 5339 }, { "epoch": 0.2580084070155095, "grad_norm": 4.910270690917969, "learning_rate": 7.419915929844905e-07, "loss": 0.3699, "step": 5340 }, { "epoch": 0.25805672319659856, "grad_norm": 3.4454495906829834, "learning_rate": 7.419432768034014e-07, "loss": 0.3477, "step": 5341 }, { "epoch": 0.2581050393776876, "grad_norm": 2.6977009773254395, "learning_rate": 7.418949606223124e-07, "loss": 0.2611, "step": 5342 }, { "epoch": 0.2581533555587766, "grad_norm": 2.509490489959717, "learning_rate": 7.418466444412233e-07, "loss": 0.2519, "step": 5343 }, { "epoch": 0.2582016717398657, "grad_norm": 3.323589324951172, "learning_rate": 7.417983282601343e-07, "loss": 0.2459, "step": 5344 }, { "epoch": 0.2582499879209547, "grad_norm": 3.5639209747314453, "learning_rate": 7.417500120790452e-07, "loss": 0.2301, "step": 5345 }, { "epoch": 0.2582983041020438, "grad_norm": 3.4789609909057617, "learning_rate": 7.417016958979562e-07, "loss": 0.2614, "step": 5346 }, { "epoch": 0.2583466202831328, "grad_norm": 3.6890738010406494, "learning_rate": 7.416533797168672e-07, "loss": 0.4225, "step": 5347 }, { "epoch": 0.25839493646422185, "grad_norm": 5.450486660003662, "learning_rate": 7.416050635357782e-07, "loss": 0.154, "step": 5348 }, { "epoch": 0.25844325264531093, "grad_norm": 2.691434383392334, "learning_rate": 7.41556747354689e-07, "loss": 0.2782, "step": 5349 }, { "epoch": 0.25849156882639995, "grad_norm": 2.4645304679870605, "learning_rate": 7.415084311735999e-07, "loss": 0.2122, "step": 5350 }, { "epoch": 0.25853988500748903, "grad_norm": 2.353379011154175, "learning_rate": 7.414601149925109e-07, "loss": 0.2917, "step": 5351 }, { "epoch": 0.25858820118857806, "grad_norm": 2.7277894020080566, "learning_rate": 7.414117988114219e-07, "loss": 0.2456, "step": 5352 }, { "epoch": 0.2586365173696671, "grad_norm": 1.8382817506790161, "learning_rate": 7.413634826303329e-07, "loss": 0.2351, "step": 5353 }, { "epoch": 0.25868483355075617, "grad_norm": 2.5167617797851562, "learning_rate": 7.413151664492439e-07, "loss": 0.2435, "step": 5354 }, { "epoch": 0.2587331497318452, "grad_norm": 11.827816009521484, "learning_rate": 7.412668502681547e-07, "loss": 0.3472, "step": 5355 }, { "epoch": 0.2587814659129342, "grad_norm": 2.676866054534912, "learning_rate": 7.412185340870657e-07, "loss": 0.3696, "step": 5356 }, { "epoch": 0.2588297820940233, "grad_norm": 2.8693487644195557, "learning_rate": 7.411702179059767e-07, "loss": 0.2855, "step": 5357 }, { "epoch": 0.2588780982751123, "grad_norm": 2.0006978511810303, "learning_rate": 7.411219017248876e-07, "loss": 0.2857, "step": 5358 }, { "epoch": 0.2589264144562014, "grad_norm": 2.864166259765625, "learning_rate": 7.410735855437986e-07, "loss": 0.3569, "step": 5359 }, { "epoch": 0.25897473063729043, "grad_norm": 2.79386043548584, "learning_rate": 7.410252693627096e-07, "loss": 0.2737, "step": 5360 }, { "epoch": 0.25902304681837945, "grad_norm": 2.3572869300842285, "learning_rate": 7.409769531816205e-07, "loss": 0.2969, "step": 5361 }, { "epoch": 0.25907136299946854, "grad_norm": 2.322502613067627, "learning_rate": 7.409286370005314e-07, "loss": 0.2641, "step": 5362 }, { "epoch": 0.25911967918055756, "grad_norm": 2.5294644832611084, "learning_rate": 7.408803208194424e-07, "loss": 0.2544, "step": 5363 }, { "epoch": 0.25916799536164664, "grad_norm": 2.695964813232422, "learning_rate": 7.408320046383534e-07, "loss": 0.2982, "step": 5364 }, { "epoch": 0.25921631154273567, "grad_norm": 4.549217224121094, "learning_rate": 7.407836884572644e-07, "loss": 0.3995, "step": 5365 }, { "epoch": 0.2592646277238247, "grad_norm": 2.7509658336639404, "learning_rate": 7.407353722761752e-07, "loss": 0.3785, "step": 5366 }, { "epoch": 0.2593129439049138, "grad_norm": 3.3945140838623047, "learning_rate": 7.406870560950862e-07, "loss": 0.3928, "step": 5367 }, { "epoch": 0.2593612600860028, "grad_norm": 2.4663126468658447, "learning_rate": 7.406387399139971e-07, "loss": 0.2951, "step": 5368 }, { "epoch": 0.2594095762670918, "grad_norm": 3.734065055847168, "learning_rate": 7.405904237329081e-07, "loss": 0.2801, "step": 5369 }, { "epoch": 0.2594578924481809, "grad_norm": 2.335827350616455, "learning_rate": 7.405421075518191e-07, "loss": 0.2955, "step": 5370 }, { "epoch": 0.25950620862926993, "grad_norm": 3.2163619995117188, "learning_rate": 7.4049379137073e-07, "loss": 0.3903, "step": 5371 }, { "epoch": 0.259554524810359, "grad_norm": 2.3098196983337402, "learning_rate": 7.40445475189641e-07, "loss": 0.2187, "step": 5372 }, { "epoch": 0.25960284099144804, "grad_norm": 2.258427143096924, "learning_rate": 7.40397159008552e-07, "loss": 0.1953, "step": 5373 }, { "epoch": 0.25965115717253706, "grad_norm": 7.298981666564941, "learning_rate": 7.40348842827463e-07, "loss": 0.3392, "step": 5374 }, { "epoch": 0.25969947335362614, "grad_norm": 2.6650161743164062, "learning_rate": 7.403005266463738e-07, "loss": 0.259, "step": 5375 }, { "epoch": 0.25974778953471517, "grad_norm": 4.614869117736816, "learning_rate": 7.402522104652847e-07, "loss": 0.3683, "step": 5376 }, { "epoch": 0.25979610571580425, "grad_norm": 2.490145206451416, "learning_rate": 7.402038942841957e-07, "loss": 0.3429, "step": 5377 }, { "epoch": 0.2598444218968933, "grad_norm": 6.140491962432861, "learning_rate": 7.401555781031067e-07, "loss": 0.3541, "step": 5378 }, { "epoch": 0.2598927380779823, "grad_norm": 2.033140182495117, "learning_rate": 7.401072619220177e-07, "loss": 0.2119, "step": 5379 }, { "epoch": 0.2599410542590714, "grad_norm": 3.338031530380249, "learning_rate": 7.400589457409287e-07, "loss": 0.4001, "step": 5380 }, { "epoch": 0.2599893704401604, "grad_norm": 2.424657106399536, "learning_rate": 7.400106295598395e-07, "loss": 0.2615, "step": 5381 }, { "epoch": 0.26003768662124943, "grad_norm": 3.197162628173828, "learning_rate": 7.399623133787505e-07, "loss": 0.3647, "step": 5382 }, { "epoch": 0.2600860028023385, "grad_norm": 1.9603766202926636, "learning_rate": 7.399139971976614e-07, "loss": 0.183, "step": 5383 }, { "epoch": 0.26013431898342754, "grad_norm": 3.9487667083740234, "learning_rate": 7.398656810165724e-07, "loss": 0.3905, "step": 5384 }, { "epoch": 0.2601826351645166, "grad_norm": 3.6352949142456055, "learning_rate": 7.398173648354834e-07, "loss": 0.544, "step": 5385 }, { "epoch": 0.26023095134560564, "grad_norm": 3.2224879264831543, "learning_rate": 7.397690486543944e-07, "loss": 0.2786, "step": 5386 }, { "epoch": 0.26027926752669467, "grad_norm": 4.691555023193359, "learning_rate": 7.397207324733052e-07, "loss": 0.3171, "step": 5387 }, { "epoch": 0.26032758370778375, "grad_norm": 10.21530532836914, "learning_rate": 7.396724162922162e-07, "loss": 0.2423, "step": 5388 }, { "epoch": 0.2603758998888728, "grad_norm": 3.157259941101074, "learning_rate": 7.396241001111272e-07, "loss": 0.146, "step": 5389 }, { "epoch": 0.26042421606996186, "grad_norm": 2.7900278568267822, "learning_rate": 7.395757839300382e-07, "loss": 0.2469, "step": 5390 }, { "epoch": 0.2604725322510509, "grad_norm": 9.58544635772705, "learning_rate": 7.395274677489492e-07, "loss": 0.2382, "step": 5391 }, { "epoch": 0.2605208484321399, "grad_norm": 2.559603691101074, "learning_rate": 7.3947915156786e-07, "loss": 0.3112, "step": 5392 }, { "epoch": 0.260569164613229, "grad_norm": 4.2643585205078125, "learning_rate": 7.39430835386771e-07, "loss": 0.2972, "step": 5393 }, { "epoch": 0.260617480794318, "grad_norm": 2.697878360748291, "learning_rate": 7.393825192056819e-07, "loss": 0.3301, "step": 5394 }, { "epoch": 0.26066579697540704, "grad_norm": 7.736194610595703, "learning_rate": 7.393342030245929e-07, "loss": 0.3897, "step": 5395 }, { "epoch": 0.2607141131564961, "grad_norm": 2.389218807220459, "learning_rate": 7.392858868435039e-07, "loss": 0.2713, "step": 5396 }, { "epoch": 0.26076242933758514, "grad_norm": 2.97105073928833, "learning_rate": 7.392375706624148e-07, "loss": 0.3062, "step": 5397 }, { "epoch": 0.2608107455186742, "grad_norm": 2.7410850524902344, "learning_rate": 7.391892544813258e-07, "loss": 0.3019, "step": 5398 }, { "epoch": 0.26085906169976325, "grad_norm": 2.648189067840576, "learning_rate": 7.391409383002368e-07, "loss": 0.3306, "step": 5399 }, { "epoch": 0.2609073778808523, "grad_norm": 2.76979398727417, "learning_rate": 7.390926221191476e-07, "loss": 0.2513, "step": 5400 }, { "epoch": 0.26095569406194136, "grad_norm": 2.733989715576172, "learning_rate": 7.390443059380586e-07, "loss": 0.3845, "step": 5401 }, { "epoch": 0.2610040102430304, "grad_norm": 2.5702712535858154, "learning_rate": 7.389959897569695e-07, "loss": 0.332, "step": 5402 }, { "epoch": 0.26105232642411946, "grad_norm": 1.9192993640899658, "learning_rate": 7.389476735758805e-07, "loss": 0.2554, "step": 5403 }, { "epoch": 0.2611006426052085, "grad_norm": 3.3289976119995117, "learning_rate": 7.388993573947915e-07, "loss": 0.3232, "step": 5404 }, { "epoch": 0.2611489587862975, "grad_norm": 6.3317975997924805, "learning_rate": 7.388510412137025e-07, "loss": 0.3852, "step": 5405 }, { "epoch": 0.2611972749673866, "grad_norm": 12.089493751525879, "learning_rate": 7.388027250326135e-07, "loss": 0.1741, "step": 5406 }, { "epoch": 0.2612455911484756, "grad_norm": 2.1958000659942627, "learning_rate": 7.387544088515243e-07, "loss": 0.247, "step": 5407 }, { "epoch": 0.26129390732956465, "grad_norm": 5.07353401184082, "learning_rate": 7.387060926704352e-07, "loss": 0.4181, "step": 5408 }, { "epoch": 0.2613422235106537, "grad_norm": 2.7525148391723633, "learning_rate": 7.386577764893462e-07, "loss": 0.3397, "step": 5409 }, { "epoch": 0.26139053969174275, "grad_norm": 6.52570915222168, "learning_rate": 7.386094603082572e-07, "loss": 0.2449, "step": 5410 }, { "epoch": 0.26143885587283183, "grad_norm": 3.8495988845825195, "learning_rate": 7.385611441271682e-07, "loss": 0.2797, "step": 5411 }, { "epoch": 0.26148717205392086, "grad_norm": 2.4535725116729736, "learning_rate": 7.385128279460791e-07, "loss": 0.2937, "step": 5412 }, { "epoch": 0.2615354882350099, "grad_norm": 3.478066921234131, "learning_rate": 7.3846451176499e-07, "loss": 0.3833, "step": 5413 }, { "epoch": 0.26158380441609896, "grad_norm": 3.266960382461548, "learning_rate": 7.38416195583901e-07, "loss": 0.3306, "step": 5414 }, { "epoch": 0.261632120597188, "grad_norm": 6.435799598693848, "learning_rate": 7.38367879402812e-07, "loss": 0.3984, "step": 5415 }, { "epoch": 0.26168043677827707, "grad_norm": 1.9500690698623657, "learning_rate": 7.38319563221723e-07, "loss": 0.2285, "step": 5416 }, { "epoch": 0.2617287529593661, "grad_norm": 2.2039198875427246, "learning_rate": 7.382712470406339e-07, "loss": 0.2649, "step": 5417 }, { "epoch": 0.2617770691404551, "grad_norm": 2.4266672134399414, "learning_rate": 7.382229308595448e-07, "loss": 0.2412, "step": 5418 }, { "epoch": 0.2618253853215442, "grad_norm": 2.991337776184082, "learning_rate": 7.381746146784557e-07, "loss": 0.3473, "step": 5419 }, { "epoch": 0.2618737015026332, "grad_norm": 2.132791042327881, "learning_rate": 7.381262984973667e-07, "loss": 0.245, "step": 5420 }, { "epoch": 0.26192201768372225, "grad_norm": 2.478130578994751, "learning_rate": 7.380779823162777e-07, "loss": 0.3045, "step": 5421 }, { "epoch": 0.26197033386481133, "grad_norm": 9.41119384765625, "learning_rate": 7.380296661351887e-07, "loss": 0.3095, "step": 5422 }, { "epoch": 0.26201865004590036, "grad_norm": 3.129810094833374, "learning_rate": 7.379813499540996e-07, "loss": 0.3563, "step": 5423 }, { "epoch": 0.26206696622698944, "grad_norm": 2.4041483402252197, "learning_rate": 7.379330337730106e-07, "loss": 0.2937, "step": 5424 }, { "epoch": 0.26211528240807846, "grad_norm": 3.765104055404663, "learning_rate": 7.378847175919216e-07, "loss": 0.4118, "step": 5425 }, { "epoch": 0.2621635985891675, "grad_norm": 2.1274654865264893, "learning_rate": 7.378364014108324e-07, "loss": 0.2577, "step": 5426 }, { "epoch": 0.26221191477025657, "grad_norm": 3.143176317214966, "learning_rate": 7.377880852297434e-07, "loss": 0.4422, "step": 5427 }, { "epoch": 0.2622602309513456, "grad_norm": 2.5845797061920166, "learning_rate": 7.377397690486543e-07, "loss": 0.2887, "step": 5428 }, { "epoch": 0.2623085471324347, "grad_norm": 4.236306190490723, "learning_rate": 7.376914528675653e-07, "loss": 0.3993, "step": 5429 }, { "epoch": 0.2623568633135237, "grad_norm": 3.003620147705078, "learning_rate": 7.376431366864763e-07, "loss": 0.2758, "step": 5430 }, { "epoch": 0.26240517949461273, "grad_norm": 3.025129556655884, "learning_rate": 7.375948205053873e-07, "loss": 0.3129, "step": 5431 }, { "epoch": 0.2624534956757018, "grad_norm": 2.4259836673736572, "learning_rate": 7.375465043242982e-07, "loss": 0.3327, "step": 5432 }, { "epoch": 0.26250181185679083, "grad_norm": 3.012603282928467, "learning_rate": 7.37498188143209e-07, "loss": 0.2334, "step": 5433 }, { "epoch": 0.26255012803787986, "grad_norm": 2.82051682472229, "learning_rate": 7.3744987196212e-07, "loss": 0.3559, "step": 5434 }, { "epoch": 0.26259844421896894, "grad_norm": 2.0556795597076416, "learning_rate": 7.37401555781031e-07, "loss": 0.233, "step": 5435 }, { "epoch": 0.26264676040005797, "grad_norm": 2.2333779335021973, "learning_rate": 7.37353239599942e-07, "loss": 0.2482, "step": 5436 }, { "epoch": 0.26269507658114705, "grad_norm": 13.938433647155762, "learning_rate": 7.37304923418853e-07, "loss": 0.1952, "step": 5437 }, { "epoch": 0.26274339276223607, "grad_norm": 2.298840045928955, "learning_rate": 7.372566072377638e-07, "loss": 0.2849, "step": 5438 }, { "epoch": 0.2627917089433251, "grad_norm": 1.9538322687149048, "learning_rate": 7.372082910566748e-07, "loss": 0.1841, "step": 5439 }, { "epoch": 0.2628400251244142, "grad_norm": 2.2568631172180176, "learning_rate": 7.371599748755858e-07, "loss": 0.2522, "step": 5440 }, { "epoch": 0.2628883413055032, "grad_norm": 2.656010150909424, "learning_rate": 7.371116586944968e-07, "loss": 0.2612, "step": 5441 }, { "epoch": 0.2629366574865923, "grad_norm": 3.8066158294677734, "learning_rate": 7.370633425134077e-07, "loss": 0.2799, "step": 5442 }, { "epoch": 0.2629849736676813, "grad_norm": 3.6035995483398438, "learning_rate": 7.370150263323187e-07, "loss": 0.242, "step": 5443 }, { "epoch": 0.26303328984877034, "grad_norm": 3.833385705947876, "learning_rate": 7.369667101512296e-07, "loss": 0.2872, "step": 5444 }, { "epoch": 0.2630816060298594, "grad_norm": 3.984103202819824, "learning_rate": 7.369183939701405e-07, "loss": 0.3242, "step": 5445 }, { "epoch": 0.26312992221094844, "grad_norm": 1.969956398010254, "learning_rate": 7.368700777890515e-07, "loss": 0.2051, "step": 5446 }, { "epoch": 0.26317823839203747, "grad_norm": 3.3555476665496826, "learning_rate": 7.368217616079625e-07, "loss": 0.4401, "step": 5447 }, { "epoch": 0.26322655457312655, "grad_norm": 2.7892158031463623, "learning_rate": 7.367734454268735e-07, "loss": 0.2896, "step": 5448 }, { "epoch": 0.2632748707542156, "grad_norm": 3.1894493103027344, "learning_rate": 7.367251292457844e-07, "loss": 0.2579, "step": 5449 }, { "epoch": 0.26332318693530465, "grad_norm": 2.2341501712799072, "learning_rate": 7.366768130646954e-07, "loss": 0.3115, "step": 5450 }, { "epoch": 0.2633715031163937, "grad_norm": 2.657644748687744, "learning_rate": 7.366284968836062e-07, "loss": 0.2651, "step": 5451 }, { "epoch": 0.2634198192974827, "grad_norm": 5.309545516967773, "learning_rate": 7.365801807025172e-07, "loss": 0.431, "step": 5452 }, { "epoch": 0.2634681354785718, "grad_norm": 1.994186282157898, "learning_rate": 7.365318645214282e-07, "loss": 0.2183, "step": 5453 }, { "epoch": 0.2635164516596608, "grad_norm": 2.9077041149139404, "learning_rate": 7.364835483403391e-07, "loss": 0.3319, "step": 5454 }, { "epoch": 0.2635647678407499, "grad_norm": 2.843623638153076, "learning_rate": 7.364352321592501e-07, "loss": 0.3916, "step": 5455 }, { "epoch": 0.2636130840218389, "grad_norm": 2.140878677368164, "learning_rate": 7.363869159781611e-07, "loss": 0.3015, "step": 5456 }, { "epoch": 0.26366140020292794, "grad_norm": 2.907588481903076, "learning_rate": 7.363385997970721e-07, "loss": 0.2303, "step": 5457 }, { "epoch": 0.263709716384017, "grad_norm": 2.4329257011413574, "learning_rate": 7.36290283615983e-07, "loss": 0.3572, "step": 5458 }, { "epoch": 0.26375803256510605, "grad_norm": 2.363574981689453, "learning_rate": 7.362419674348938e-07, "loss": 0.2839, "step": 5459 }, { "epoch": 0.2638063487461951, "grad_norm": 2.709723472595215, "learning_rate": 7.361936512538048e-07, "loss": 0.3399, "step": 5460 }, { "epoch": 0.26385466492728415, "grad_norm": 2.1712169647216797, "learning_rate": 7.361453350727158e-07, "loss": 0.3448, "step": 5461 }, { "epoch": 0.2639029811083732, "grad_norm": 7.596611022949219, "learning_rate": 7.360970188916268e-07, "loss": 0.3696, "step": 5462 }, { "epoch": 0.26395129728946226, "grad_norm": 3.15838360786438, "learning_rate": 7.360487027105378e-07, "loss": 0.3308, "step": 5463 }, { "epoch": 0.2639996134705513, "grad_norm": 3.6761486530303955, "learning_rate": 7.360003865294486e-07, "loss": 0.3174, "step": 5464 }, { "epoch": 0.2640479296516403, "grad_norm": 3.7366273403167725, "learning_rate": 7.359520703483596e-07, "loss": 0.1812, "step": 5465 }, { "epoch": 0.2640962458327294, "grad_norm": 2.9294161796569824, "learning_rate": 7.359037541672706e-07, "loss": 0.369, "step": 5466 }, { "epoch": 0.2641445620138184, "grad_norm": 2.7287139892578125, "learning_rate": 7.358554379861816e-07, "loss": 0.3493, "step": 5467 }, { "epoch": 0.2641928781949075, "grad_norm": 2.063492774963379, "learning_rate": 7.358071218050925e-07, "loss": 0.2424, "step": 5468 }, { "epoch": 0.2642411943759965, "grad_norm": 2.522261381149292, "learning_rate": 7.357588056240035e-07, "loss": 0.3182, "step": 5469 }, { "epoch": 0.26428951055708555, "grad_norm": 2.6221182346343994, "learning_rate": 7.357104894429143e-07, "loss": 0.3208, "step": 5470 }, { "epoch": 0.26433782673817463, "grad_norm": 4.085443496704102, "learning_rate": 7.356621732618253e-07, "loss": 0.2609, "step": 5471 }, { "epoch": 0.26438614291926366, "grad_norm": 2.462399482727051, "learning_rate": 7.356138570807363e-07, "loss": 0.2484, "step": 5472 }, { "epoch": 0.2644344591003527, "grad_norm": 3.093832015991211, "learning_rate": 7.355655408996473e-07, "loss": 0.3019, "step": 5473 }, { "epoch": 0.26448277528144176, "grad_norm": 2.239300489425659, "learning_rate": 7.355172247185583e-07, "loss": 0.2717, "step": 5474 }, { "epoch": 0.2645310914625308, "grad_norm": 2.7632761001586914, "learning_rate": 7.354689085374692e-07, "loss": 0.3783, "step": 5475 }, { "epoch": 0.26457940764361987, "grad_norm": 2.472688674926758, "learning_rate": 7.354205923563801e-07, "loss": 0.2773, "step": 5476 }, { "epoch": 0.2646277238247089, "grad_norm": 2.6457889080047607, "learning_rate": 7.35372276175291e-07, "loss": 0.2792, "step": 5477 }, { "epoch": 0.2646760400057979, "grad_norm": 1.7741971015930176, "learning_rate": 7.35323959994202e-07, "loss": 0.2529, "step": 5478 }, { "epoch": 0.264724356186887, "grad_norm": 4.2188720703125, "learning_rate": 7.35275643813113e-07, "loss": 0.1914, "step": 5479 }, { "epoch": 0.264772672367976, "grad_norm": 2.8684024810791016, "learning_rate": 7.352273276320239e-07, "loss": 0.3208, "step": 5480 }, { "epoch": 0.2648209885490651, "grad_norm": 6.300963878631592, "learning_rate": 7.351790114509349e-07, "loss": 0.3691, "step": 5481 }, { "epoch": 0.26486930473015413, "grad_norm": 4.332812786102295, "learning_rate": 7.351306952698459e-07, "loss": 0.3115, "step": 5482 }, { "epoch": 0.26491762091124316, "grad_norm": 2.3248212337493896, "learning_rate": 7.350823790887568e-07, "loss": 0.237, "step": 5483 }, { "epoch": 0.26496593709233224, "grad_norm": 2.156045913696289, "learning_rate": 7.350340629076678e-07, "loss": 0.2191, "step": 5484 }, { "epoch": 0.26501425327342126, "grad_norm": 13.383769035339355, "learning_rate": 7.349857467265786e-07, "loss": 0.283, "step": 5485 }, { "epoch": 0.2650625694545103, "grad_norm": 2.4180850982666016, "learning_rate": 7.349374305454896e-07, "loss": 0.2147, "step": 5486 }, { "epoch": 0.26511088563559937, "grad_norm": 2.6181085109710693, "learning_rate": 7.348891143644006e-07, "loss": 0.2459, "step": 5487 }, { "epoch": 0.2651592018166884, "grad_norm": 2.6652653217315674, "learning_rate": 7.348407981833116e-07, "loss": 0.2824, "step": 5488 }, { "epoch": 0.2652075179977775, "grad_norm": 2.4190096855163574, "learning_rate": 7.347924820022226e-07, "loss": 0.3737, "step": 5489 }, { "epoch": 0.2652558341788665, "grad_norm": 2.6133527755737305, "learning_rate": 7.347441658211334e-07, "loss": 0.2625, "step": 5490 }, { "epoch": 0.2653041503599555, "grad_norm": 2.229365825653076, "learning_rate": 7.346958496400444e-07, "loss": 0.295, "step": 5491 }, { "epoch": 0.2653524665410446, "grad_norm": 2.1193363666534424, "learning_rate": 7.346475334589554e-07, "loss": 0.1789, "step": 5492 }, { "epoch": 0.26540078272213363, "grad_norm": 2.307711124420166, "learning_rate": 7.345992172778663e-07, "loss": 0.305, "step": 5493 }, { "epoch": 0.2654490989032227, "grad_norm": 2.9461612701416016, "learning_rate": 7.345509010967773e-07, "loss": 0.2572, "step": 5494 }, { "epoch": 0.26549741508431174, "grad_norm": 1.7352476119995117, "learning_rate": 7.345025849156883e-07, "loss": 0.2012, "step": 5495 }, { "epoch": 0.26554573126540076, "grad_norm": 3.894989490509033, "learning_rate": 7.344542687345991e-07, "loss": 0.2478, "step": 5496 }, { "epoch": 0.26559404744648984, "grad_norm": 3.0909202098846436, "learning_rate": 7.344059525535101e-07, "loss": 0.3703, "step": 5497 }, { "epoch": 0.26564236362757887, "grad_norm": 2.4518606662750244, "learning_rate": 7.343576363724211e-07, "loss": 0.2785, "step": 5498 }, { "epoch": 0.2656906798086679, "grad_norm": 4.870954990386963, "learning_rate": 7.343093201913321e-07, "loss": 0.2869, "step": 5499 }, { "epoch": 0.265738995989757, "grad_norm": 2.860729455947876, "learning_rate": 7.342610040102431e-07, "loss": 0.4056, "step": 5500 }, { "epoch": 0.265787312170846, "grad_norm": 2.433035135269165, "learning_rate": 7.34212687829154e-07, "loss": 0.2646, "step": 5501 }, { "epoch": 0.2658356283519351, "grad_norm": 1.8546574115753174, "learning_rate": 7.341643716480648e-07, "loss": 0.2518, "step": 5502 }, { "epoch": 0.2658839445330241, "grad_norm": 2.4776358604431152, "learning_rate": 7.341160554669758e-07, "loss": 0.3662, "step": 5503 }, { "epoch": 0.26593226071411313, "grad_norm": 2.4601707458496094, "learning_rate": 7.340677392858868e-07, "loss": 0.295, "step": 5504 }, { "epoch": 0.2659805768952022, "grad_norm": 2.9829394817352295, "learning_rate": 7.340194231047978e-07, "loss": 0.3626, "step": 5505 }, { "epoch": 0.26602889307629124, "grad_norm": 2.3043313026428223, "learning_rate": 7.339711069237087e-07, "loss": 0.3492, "step": 5506 }, { "epoch": 0.2660772092573803, "grad_norm": 3.2143940925598145, "learning_rate": 7.339227907426197e-07, "loss": 0.3935, "step": 5507 }, { "epoch": 0.26612552543846935, "grad_norm": 3.2361745834350586, "learning_rate": 7.338744745615307e-07, "loss": 0.3285, "step": 5508 }, { "epoch": 0.26617384161955837, "grad_norm": 3.204190731048584, "learning_rate": 7.338261583804416e-07, "loss": 0.2511, "step": 5509 }, { "epoch": 0.26622215780064745, "grad_norm": 2.1486823558807373, "learning_rate": 7.337778421993525e-07, "loss": 0.2708, "step": 5510 }, { "epoch": 0.2662704739817365, "grad_norm": 2.867424249649048, "learning_rate": 7.337295260182634e-07, "loss": 0.3746, "step": 5511 }, { "epoch": 0.2663187901628255, "grad_norm": 4.301700115203857, "learning_rate": 7.336812098371744e-07, "loss": 0.2144, "step": 5512 }, { "epoch": 0.2663671063439146, "grad_norm": 2.7864227294921875, "learning_rate": 7.336328936560854e-07, "loss": 0.3231, "step": 5513 }, { "epoch": 0.2664154225250036, "grad_norm": 3.3027403354644775, "learning_rate": 7.335845774749964e-07, "loss": 0.4043, "step": 5514 }, { "epoch": 0.2664637387060927, "grad_norm": 7.2923688888549805, "learning_rate": 7.335362612939073e-07, "loss": 0.2963, "step": 5515 }, { "epoch": 0.2665120548871817, "grad_norm": 3.1174979209899902, "learning_rate": 7.334879451128182e-07, "loss": 0.3879, "step": 5516 }, { "epoch": 0.26656037106827074, "grad_norm": 2.7916202545166016, "learning_rate": 7.334396289317292e-07, "loss": 0.3637, "step": 5517 }, { "epoch": 0.2666086872493598, "grad_norm": 2.4012458324432373, "learning_rate": 7.333913127506401e-07, "loss": 0.2693, "step": 5518 }, { "epoch": 0.26665700343044885, "grad_norm": 2.1517553329467773, "learning_rate": 7.333429965695511e-07, "loss": 0.2748, "step": 5519 }, { "epoch": 0.2667053196115379, "grad_norm": 5.0096893310546875, "learning_rate": 7.332946803884621e-07, "loss": 0.416, "step": 5520 }, { "epoch": 0.26675363579262695, "grad_norm": 3.057408571243286, "learning_rate": 7.332463642073731e-07, "loss": 0.2085, "step": 5521 }, { "epoch": 0.266801951973716, "grad_norm": 4.4046502113342285, "learning_rate": 7.331980480262839e-07, "loss": 0.5294, "step": 5522 }, { "epoch": 0.26685026815480506, "grad_norm": 2.0543737411499023, "learning_rate": 7.331497318451949e-07, "loss": 0.196, "step": 5523 }, { "epoch": 0.2668985843358941, "grad_norm": 2.783278226852417, "learning_rate": 7.331014156641059e-07, "loss": 0.3522, "step": 5524 }, { "epoch": 0.2669469005169831, "grad_norm": 2.280649423599243, "learning_rate": 7.330530994830169e-07, "loss": 0.2179, "step": 5525 }, { "epoch": 0.2669952166980722, "grad_norm": 3.2106432914733887, "learning_rate": 7.330047833019279e-07, "loss": 0.3087, "step": 5526 }, { "epoch": 0.2670435328791612, "grad_norm": 3.5565667152404785, "learning_rate": 7.329564671208387e-07, "loss": 0.4403, "step": 5527 }, { "epoch": 0.2670918490602503, "grad_norm": 2.7889316082000732, "learning_rate": 7.329081509397496e-07, "loss": 0.2968, "step": 5528 }, { "epoch": 0.2671401652413393, "grad_norm": 3.4785103797912598, "learning_rate": 7.328598347586606e-07, "loss": 0.349, "step": 5529 }, { "epoch": 0.26718848142242835, "grad_norm": 13.023584365844727, "learning_rate": 7.328115185775716e-07, "loss": 0.2773, "step": 5530 }, { "epoch": 0.26723679760351743, "grad_norm": 3.062781572341919, "learning_rate": 7.327632023964826e-07, "loss": 0.4151, "step": 5531 }, { "epoch": 0.26728511378460645, "grad_norm": 4.261837959289551, "learning_rate": 7.327148862153935e-07, "loss": 0.4461, "step": 5532 }, { "epoch": 0.26733342996569553, "grad_norm": 16.517230987548828, "learning_rate": 7.326665700343045e-07, "loss": 0.3247, "step": 5533 }, { "epoch": 0.26738174614678456, "grad_norm": 2.615764617919922, "learning_rate": 7.326182538532154e-07, "loss": 0.3305, "step": 5534 }, { "epoch": 0.2674300623278736, "grad_norm": 5.749774932861328, "learning_rate": 7.325699376721263e-07, "loss": 0.3142, "step": 5535 }, { "epoch": 0.26747837850896267, "grad_norm": 2.8119595050811768, "learning_rate": 7.325216214910373e-07, "loss": 0.3648, "step": 5536 }, { "epoch": 0.2675266946900517, "grad_norm": 3.117630958557129, "learning_rate": 7.324733053099482e-07, "loss": 0.4126, "step": 5537 }, { "epoch": 0.2675750108711408, "grad_norm": 3.3493244647979736, "learning_rate": 7.324249891288592e-07, "loss": 0.4237, "step": 5538 }, { "epoch": 0.2676233270522298, "grad_norm": 1.8703399896621704, "learning_rate": 7.323766729477702e-07, "loss": 0.2395, "step": 5539 }, { "epoch": 0.2676716432333188, "grad_norm": 2.846973180770874, "learning_rate": 7.323283567666812e-07, "loss": 0.3913, "step": 5540 }, { "epoch": 0.2677199594144079, "grad_norm": 2.843820095062256, "learning_rate": 7.322800405855921e-07, "loss": 0.2497, "step": 5541 }, { "epoch": 0.26776827559549693, "grad_norm": 2.9958746433258057, "learning_rate": 7.32231724404503e-07, "loss": 0.3547, "step": 5542 }, { "epoch": 0.26781659177658595, "grad_norm": 2.5028226375579834, "learning_rate": 7.32183408223414e-07, "loss": 0.3296, "step": 5543 }, { "epoch": 0.26786490795767504, "grad_norm": 2.9346694946289062, "learning_rate": 7.321350920423249e-07, "loss": 0.3899, "step": 5544 }, { "epoch": 0.26791322413876406, "grad_norm": 2.182996988296509, "learning_rate": 7.320867758612359e-07, "loss": 0.263, "step": 5545 }, { "epoch": 0.26796154031985314, "grad_norm": 3.3043510913848877, "learning_rate": 7.320384596801469e-07, "loss": 0.4075, "step": 5546 }, { "epoch": 0.26800985650094217, "grad_norm": 2.7688567638397217, "learning_rate": 7.319901434990578e-07, "loss": 0.3579, "step": 5547 }, { "epoch": 0.2680581726820312, "grad_norm": 2.718501091003418, "learning_rate": 7.319418273179687e-07, "loss": 0.3238, "step": 5548 }, { "epoch": 0.2681064888631203, "grad_norm": 2.6125664710998535, "learning_rate": 7.318935111368797e-07, "loss": 0.1932, "step": 5549 }, { "epoch": 0.2681548050442093, "grad_norm": 2.7318146228790283, "learning_rate": 7.318451949557907e-07, "loss": 0.2765, "step": 5550 }, { "epoch": 0.2682031212252984, "grad_norm": 4.26815128326416, "learning_rate": 7.317968787747017e-07, "loss": 0.3921, "step": 5551 }, { "epoch": 0.2682514374063874, "grad_norm": 1.795571208000183, "learning_rate": 7.317485625936126e-07, "loss": 0.2322, "step": 5552 }, { "epoch": 0.26829975358747643, "grad_norm": 2.160327911376953, "learning_rate": 7.317002464125234e-07, "loss": 0.2273, "step": 5553 }, { "epoch": 0.2683480697685655, "grad_norm": 2.8129236698150635, "learning_rate": 7.316519302314344e-07, "loss": 0.4259, "step": 5554 }, { "epoch": 0.26839638594965454, "grad_norm": 3.170245885848999, "learning_rate": 7.316036140503454e-07, "loss": 0.3449, "step": 5555 }, { "epoch": 0.26844470213074356, "grad_norm": 3.1577036380767822, "learning_rate": 7.315552978692564e-07, "loss": 0.4882, "step": 5556 }, { "epoch": 0.26849301831183264, "grad_norm": 2.171340227127075, "learning_rate": 7.315069816881674e-07, "loss": 0.2326, "step": 5557 }, { "epoch": 0.26854133449292167, "grad_norm": 2.7969882488250732, "learning_rate": 7.314586655070783e-07, "loss": 0.3235, "step": 5558 }, { "epoch": 0.26858965067401075, "grad_norm": 1.7451343536376953, "learning_rate": 7.314103493259893e-07, "loss": 0.1769, "step": 5559 }, { "epoch": 0.2686379668550998, "grad_norm": 2.18467378616333, "learning_rate": 7.313620331449001e-07, "loss": 0.2308, "step": 5560 }, { "epoch": 0.2686862830361888, "grad_norm": 4.545289039611816, "learning_rate": 7.313137169638111e-07, "loss": 0.4256, "step": 5561 }, { "epoch": 0.2687345992172779, "grad_norm": 1.4974523782730103, "learning_rate": 7.312654007827221e-07, "loss": 0.1783, "step": 5562 }, { "epoch": 0.2687829153983669, "grad_norm": 2.8551719188690186, "learning_rate": 7.31217084601633e-07, "loss": 0.3035, "step": 5563 }, { "epoch": 0.268831231579456, "grad_norm": 2.0106704235076904, "learning_rate": 7.31168768420544e-07, "loss": 0.1941, "step": 5564 }, { "epoch": 0.268879547760545, "grad_norm": 3.3724327087402344, "learning_rate": 7.31120452239455e-07, "loss": 0.3986, "step": 5565 }, { "epoch": 0.26892786394163404, "grad_norm": 2.7357683181762695, "learning_rate": 7.310721360583659e-07, "loss": 0.3132, "step": 5566 }, { "epoch": 0.2689761801227231, "grad_norm": 2.5959184169769287, "learning_rate": 7.310238198772769e-07, "loss": 0.3593, "step": 5567 }, { "epoch": 0.26902449630381214, "grad_norm": 2.1477532386779785, "learning_rate": 7.309755036961878e-07, "loss": 0.2535, "step": 5568 }, { "epoch": 0.26907281248490117, "grad_norm": 2.9156293869018555, "learning_rate": 7.309271875150987e-07, "loss": 0.2723, "step": 5569 }, { "epoch": 0.26912112866599025, "grad_norm": 3.1454215049743652, "learning_rate": 7.308788713340097e-07, "loss": 0.2409, "step": 5570 }, { "epoch": 0.2691694448470793, "grad_norm": 2.6431362628936768, "learning_rate": 7.308305551529207e-07, "loss": 0.3641, "step": 5571 }, { "epoch": 0.26921776102816836, "grad_norm": 2.4160993099212646, "learning_rate": 7.307822389718317e-07, "loss": 0.3358, "step": 5572 }, { "epoch": 0.2692660772092574, "grad_norm": 2.038423538208008, "learning_rate": 7.307339227907426e-07, "loss": 0.2091, "step": 5573 }, { "epoch": 0.2693143933903464, "grad_norm": 2.8114535808563232, "learning_rate": 7.306856066096535e-07, "loss": 0.4182, "step": 5574 }, { "epoch": 0.2693627095714355, "grad_norm": 2.471482038497925, "learning_rate": 7.306372904285645e-07, "loss": 0.2022, "step": 5575 }, { "epoch": 0.2694110257525245, "grad_norm": 2.8451287746429443, "learning_rate": 7.305889742474755e-07, "loss": 0.3338, "step": 5576 }, { "epoch": 0.2694593419336136, "grad_norm": 2.4531350135803223, "learning_rate": 7.305406580663865e-07, "loss": 0.2483, "step": 5577 }, { "epoch": 0.2695076581147026, "grad_norm": 1.597423791885376, "learning_rate": 7.304923418852974e-07, "loss": 0.1893, "step": 5578 }, { "epoch": 0.26955597429579164, "grad_norm": 2.6112372875213623, "learning_rate": 7.304440257042082e-07, "loss": 0.3573, "step": 5579 }, { "epoch": 0.2696042904768807, "grad_norm": 2.653864860534668, "learning_rate": 7.303957095231192e-07, "loss": 0.3586, "step": 5580 }, { "epoch": 0.26965260665796975, "grad_norm": 1.8443256616592407, "learning_rate": 7.303473933420302e-07, "loss": 0.2, "step": 5581 }, { "epoch": 0.2697009228390588, "grad_norm": 2.70110821723938, "learning_rate": 7.302990771609412e-07, "loss": 0.3198, "step": 5582 }, { "epoch": 0.26974923902014786, "grad_norm": 2.606379747390747, "learning_rate": 7.302507609798522e-07, "loss": 0.3251, "step": 5583 }, { "epoch": 0.2697975552012369, "grad_norm": 2.678798198699951, "learning_rate": 7.302024447987631e-07, "loss": 0.2596, "step": 5584 }, { "epoch": 0.26984587138232596, "grad_norm": 2.9112205505371094, "learning_rate": 7.30154128617674e-07, "loss": 0.3362, "step": 5585 }, { "epoch": 0.269894187563415, "grad_norm": 2.7676072120666504, "learning_rate": 7.301058124365849e-07, "loss": 0.3514, "step": 5586 }, { "epoch": 0.269942503744504, "grad_norm": 2.9445650577545166, "learning_rate": 7.300574962554959e-07, "loss": 0.3733, "step": 5587 }, { "epoch": 0.2699908199255931, "grad_norm": 1.9918104410171509, "learning_rate": 7.300091800744069e-07, "loss": 0.2944, "step": 5588 }, { "epoch": 0.2700391361066821, "grad_norm": 2.798555850982666, "learning_rate": 7.299608638933178e-07, "loss": 0.2909, "step": 5589 }, { "epoch": 0.2700874522877712, "grad_norm": 3.2104732990264893, "learning_rate": 7.299125477122288e-07, "loss": 0.3756, "step": 5590 }, { "epoch": 0.2701357684688602, "grad_norm": 1.7543859481811523, "learning_rate": 7.298642315311398e-07, "loss": 0.2154, "step": 5591 }, { "epoch": 0.27018408464994925, "grad_norm": 2.7054977416992188, "learning_rate": 7.298159153500507e-07, "loss": 0.3686, "step": 5592 }, { "epoch": 0.27023240083103833, "grad_norm": 8.330596923828125, "learning_rate": 7.297675991689617e-07, "loss": 0.3626, "step": 5593 }, { "epoch": 0.27028071701212736, "grad_norm": 2.788771390914917, "learning_rate": 7.297192829878725e-07, "loss": 0.3934, "step": 5594 }, { "epoch": 0.2703290331932164, "grad_norm": 2.1924424171447754, "learning_rate": 7.296709668067835e-07, "loss": 0.2609, "step": 5595 }, { "epoch": 0.27037734937430546, "grad_norm": 2.788250684738159, "learning_rate": 7.296226506256945e-07, "loss": 0.3497, "step": 5596 }, { "epoch": 0.2704256655553945, "grad_norm": 2.343231678009033, "learning_rate": 7.295743344446055e-07, "loss": 0.2812, "step": 5597 }, { "epoch": 0.27047398173648357, "grad_norm": 3.527162790298462, "learning_rate": 7.295260182635164e-07, "loss": 0.281, "step": 5598 }, { "epoch": 0.2705222979175726, "grad_norm": 4.192524433135986, "learning_rate": 7.294777020824274e-07, "loss": 0.3215, "step": 5599 }, { "epoch": 0.2705706140986616, "grad_norm": 4.426510810852051, "learning_rate": 7.294293859013383e-07, "loss": 0.3424, "step": 5600 }, { "epoch": 0.2706189302797507, "grad_norm": 2.5598626136779785, "learning_rate": 7.293810697202493e-07, "loss": 0.3741, "step": 5601 }, { "epoch": 0.2706672464608397, "grad_norm": 2.8573269844055176, "learning_rate": 7.293327535391603e-07, "loss": 0.2811, "step": 5602 }, { "epoch": 0.2707155626419288, "grad_norm": 6.09246301651001, "learning_rate": 7.292844373580712e-07, "loss": 0.2474, "step": 5603 }, { "epoch": 0.27076387882301783, "grad_norm": 13.139717102050781, "learning_rate": 7.292361211769822e-07, "loss": 0.26, "step": 5604 }, { "epoch": 0.27081219500410686, "grad_norm": 2.619027853012085, "learning_rate": 7.29187804995893e-07, "loss": 0.3262, "step": 5605 }, { "epoch": 0.27086051118519594, "grad_norm": 4.088361740112305, "learning_rate": 7.29139488814804e-07, "loss": 0.4519, "step": 5606 }, { "epoch": 0.27090882736628497, "grad_norm": 11.587895393371582, "learning_rate": 7.29091172633715e-07, "loss": 0.2657, "step": 5607 }, { "epoch": 0.270957143547374, "grad_norm": 1.3546454906463623, "learning_rate": 7.29042856452626e-07, "loss": 0.1813, "step": 5608 }, { "epoch": 0.27100545972846307, "grad_norm": 2.662557601928711, "learning_rate": 7.28994540271537e-07, "loss": 0.4007, "step": 5609 }, { "epoch": 0.2710537759095521, "grad_norm": 2.0594210624694824, "learning_rate": 7.289462240904479e-07, "loss": 0.2862, "step": 5610 }, { "epoch": 0.2711020920906412, "grad_norm": 1.8440403938293457, "learning_rate": 7.288979079093587e-07, "loss": 0.1323, "step": 5611 }, { "epoch": 0.2711504082717302, "grad_norm": 2.275489330291748, "learning_rate": 7.288495917282697e-07, "loss": 0.2957, "step": 5612 }, { "epoch": 0.27119872445281923, "grad_norm": 2.707792043685913, "learning_rate": 7.288012755471807e-07, "loss": 0.2233, "step": 5613 }, { "epoch": 0.2712470406339083, "grad_norm": 1.7114696502685547, "learning_rate": 7.287529593660917e-07, "loss": 0.1641, "step": 5614 }, { "epoch": 0.27129535681499733, "grad_norm": 2.039454698562622, "learning_rate": 7.287046431850026e-07, "loss": 0.2399, "step": 5615 }, { "epoch": 0.2713436729960864, "grad_norm": 2.546187400817871, "learning_rate": 7.286563270039136e-07, "loss": 0.2449, "step": 5616 }, { "epoch": 0.27139198917717544, "grad_norm": 3.6123836040496826, "learning_rate": 7.286080108228246e-07, "loss": 0.4798, "step": 5617 }, { "epoch": 0.27144030535826447, "grad_norm": 2.2977092266082764, "learning_rate": 7.285596946417355e-07, "loss": 0.2506, "step": 5618 }, { "epoch": 0.27148862153935355, "grad_norm": 3.065495014190674, "learning_rate": 7.285113784606465e-07, "loss": 0.3509, "step": 5619 }, { "epoch": 0.2715369377204426, "grad_norm": 3.05983304977417, "learning_rate": 7.284630622795573e-07, "loss": 0.394, "step": 5620 }, { "epoch": 0.2715852539015316, "grad_norm": 2.8948593139648438, "learning_rate": 7.284147460984683e-07, "loss": 0.3314, "step": 5621 }, { "epoch": 0.2716335700826207, "grad_norm": 2.7859742641448975, "learning_rate": 7.283664299173793e-07, "loss": 0.2584, "step": 5622 }, { "epoch": 0.2716818862637097, "grad_norm": 1.735910177230835, "learning_rate": 7.283181137362903e-07, "loss": 0.2236, "step": 5623 }, { "epoch": 0.2717302024447988, "grad_norm": 2.5213685035705566, "learning_rate": 7.282697975552012e-07, "loss": 0.3075, "step": 5624 }, { "epoch": 0.2717785186258878, "grad_norm": 3.7221908569335938, "learning_rate": 7.282214813741122e-07, "loss": 0.2761, "step": 5625 }, { "epoch": 0.27182683480697684, "grad_norm": 2.3149807453155518, "learning_rate": 7.281731651930231e-07, "loss": 0.3096, "step": 5626 }, { "epoch": 0.2718751509880659, "grad_norm": 2.0007054805755615, "learning_rate": 7.281248490119341e-07, "loss": 0.2558, "step": 5627 }, { "epoch": 0.27192346716915494, "grad_norm": 2.8939931392669678, "learning_rate": 7.28076532830845e-07, "loss": 0.2215, "step": 5628 }, { "epoch": 0.271971783350244, "grad_norm": 2.635939121246338, "learning_rate": 7.28028216649756e-07, "loss": 0.3575, "step": 5629 }, { "epoch": 0.27202009953133305, "grad_norm": 2.8389627933502197, "learning_rate": 7.279799004686669e-07, "loss": 0.3796, "step": 5630 }, { "epoch": 0.2720684157124221, "grad_norm": 2.945216417312622, "learning_rate": 7.279315842875778e-07, "loss": 0.2927, "step": 5631 }, { "epoch": 0.27211673189351115, "grad_norm": 4.040414810180664, "learning_rate": 7.278832681064888e-07, "loss": 0.3778, "step": 5632 }, { "epoch": 0.2721650480746002, "grad_norm": 3.2824885845184326, "learning_rate": 7.278349519253998e-07, "loss": 0.3932, "step": 5633 }, { "epoch": 0.2722133642556892, "grad_norm": 3.315014362335205, "learning_rate": 7.277866357443108e-07, "loss": 0.2072, "step": 5634 }, { "epoch": 0.2722616804367783, "grad_norm": 2.3413240909576416, "learning_rate": 7.277383195632218e-07, "loss": 0.2411, "step": 5635 }, { "epoch": 0.2723099966178673, "grad_norm": 2.9179117679595947, "learning_rate": 7.276900033821327e-07, "loss": 0.2912, "step": 5636 }, { "epoch": 0.2723583127989564, "grad_norm": 2.40753436088562, "learning_rate": 7.276416872010435e-07, "loss": 0.2985, "step": 5637 }, { "epoch": 0.2724066289800454, "grad_norm": 3.440934658050537, "learning_rate": 7.275933710199545e-07, "loss": 0.4512, "step": 5638 }, { "epoch": 0.27245494516113444, "grad_norm": 2.719888925552368, "learning_rate": 7.275450548388655e-07, "loss": 0.4073, "step": 5639 }, { "epoch": 0.2725032613422235, "grad_norm": 2.6574442386627197, "learning_rate": 7.274967386577765e-07, "loss": 0.3429, "step": 5640 }, { "epoch": 0.27255157752331255, "grad_norm": 2.730346441268921, "learning_rate": 7.274484224766874e-07, "loss": 0.2436, "step": 5641 }, { "epoch": 0.27259989370440163, "grad_norm": 2.4943583011627197, "learning_rate": 7.274001062955984e-07, "loss": 0.2969, "step": 5642 }, { "epoch": 0.27264820988549066, "grad_norm": 2.04289174079895, "learning_rate": 7.273517901145093e-07, "loss": 0.2382, "step": 5643 }, { "epoch": 0.2726965260665797, "grad_norm": 1.8435084819793701, "learning_rate": 7.273034739334203e-07, "loss": 0.1912, "step": 5644 }, { "epoch": 0.27274484224766876, "grad_norm": 2.3760571479797363, "learning_rate": 7.272551577523312e-07, "loss": 0.3214, "step": 5645 }, { "epoch": 0.2727931584287578, "grad_norm": 3.162795066833496, "learning_rate": 7.272068415712421e-07, "loss": 0.2972, "step": 5646 }, { "epoch": 0.2728414746098468, "grad_norm": 3.1098899841308594, "learning_rate": 7.271585253901531e-07, "loss": 0.362, "step": 5647 }, { "epoch": 0.2728897907909359, "grad_norm": 2.12324595451355, "learning_rate": 7.271102092090641e-07, "loss": 0.2984, "step": 5648 }, { "epoch": 0.2729381069720249, "grad_norm": 2.7347328662872314, "learning_rate": 7.270618930279751e-07, "loss": 0.2287, "step": 5649 }, { "epoch": 0.272986423153114, "grad_norm": 3.7554891109466553, "learning_rate": 7.27013576846886e-07, "loss": 0.443, "step": 5650 }, { "epoch": 0.273034739334203, "grad_norm": 4.389887809753418, "learning_rate": 7.26965260665797e-07, "loss": 0.3221, "step": 5651 }, { "epoch": 0.27308305551529205, "grad_norm": 2.1455278396606445, "learning_rate": 7.269169444847079e-07, "loss": 0.2432, "step": 5652 }, { "epoch": 0.27313137169638113, "grad_norm": 2.434767246246338, "learning_rate": 7.268686283036188e-07, "loss": 0.1932, "step": 5653 }, { "epoch": 0.27317968787747016, "grad_norm": 1.9159046411514282, "learning_rate": 7.268203121225298e-07, "loss": 0.3013, "step": 5654 }, { "epoch": 0.27322800405855924, "grad_norm": 3.3206229209899902, "learning_rate": 7.267719959414408e-07, "loss": 0.2804, "step": 5655 }, { "epoch": 0.27327632023964826, "grad_norm": 3.96596097946167, "learning_rate": 7.267236797603517e-07, "loss": 0.3301, "step": 5656 }, { "epoch": 0.2733246364207373, "grad_norm": 4.550795078277588, "learning_rate": 7.266753635792626e-07, "loss": 0.3315, "step": 5657 }, { "epoch": 0.27337295260182637, "grad_norm": 2.2887816429138184, "learning_rate": 7.266270473981736e-07, "loss": 0.3067, "step": 5658 }, { "epoch": 0.2734212687829154, "grad_norm": 1.988438606262207, "learning_rate": 7.265787312170846e-07, "loss": 0.1986, "step": 5659 }, { "epoch": 0.2734695849640044, "grad_norm": 2.5055785179138184, "learning_rate": 7.265304150359956e-07, "loss": 0.3431, "step": 5660 }, { "epoch": 0.2735179011450935, "grad_norm": 2.8578686714172363, "learning_rate": 7.264820988549066e-07, "loss": 0.3342, "step": 5661 }, { "epoch": 0.2735662173261825, "grad_norm": 2.6097311973571777, "learning_rate": 7.264337826738173e-07, "loss": 0.3659, "step": 5662 }, { "epoch": 0.2736145335072716, "grad_norm": 10.716559410095215, "learning_rate": 7.263854664927283e-07, "loss": 0.2464, "step": 5663 }, { "epoch": 0.27366284968836063, "grad_norm": 5.344036102294922, "learning_rate": 7.263371503116393e-07, "loss": 0.3658, "step": 5664 }, { "epoch": 0.27371116586944966, "grad_norm": 1.9150437116622925, "learning_rate": 7.262888341305503e-07, "loss": 0.1886, "step": 5665 }, { "epoch": 0.27375948205053874, "grad_norm": 2.368391990661621, "learning_rate": 7.262405179494613e-07, "loss": 0.2656, "step": 5666 }, { "epoch": 0.27380779823162776, "grad_norm": 2.231900930404663, "learning_rate": 7.261922017683722e-07, "loss": 0.3009, "step": 5667 }, { "epoch": 0.27385611441271684, "grad_norm": 2.5329225063323975, "learning_rate": 7.261438855872832e-07, "loss": 0.2565, "step": 5668 }, { "epoch": 0.27390443059380587, "grad_norm": 28.2957706451416, "learning_rate": 7.260955694061941e-07, "loss": 0.3545, "step": 5669 }, { "epoch": 0.2739527467748949, "grad_norm": 2.9687530994415283, "learning_rate": 7.26047253225105e-07, "loss": 0.3268, "step": 5670 }, { "epoch": 0.274001062955984, "grad_norm": 2.46923565864563, "learning_rate": 7.25998937044016e-07, "loss": 0.296, "step": 5671 }, { "epoch": 0.274049379137073, "grad_norm": 2.9007785320281982, "learning_rate": 7.259506208629269e-07, "loss": 0.3212, "step": 5672 }, { "epoch": 0.274097695318162, "grad_norm": 4.588761806488037, "learning_rate": 7.259023046818379e-07, "loss": 0.3239, "step": 5673 }, { "epoch": 0.2741460114992511, "grad_norm": 2.7351388931274414, "learning_rate": 7.258539885007489e-07, "loss": 0.3496, "step": 5674 }, { "epoch": 0.27419432768034013, "grad_norm": 3.24064564704895, "learning_rate": 7.258056723196598e-07, "loss": 0.1989, "step": 5675 }, { "epoch": 0.2742426438614292, "grad_norm": 2.296200752258301, "learning_rate": 7.257573561385708e-07, "loss": 0.2272, "step": 5676 }, { "epoch": 0.27429096004251824, "grad_norm": 3.1336793899536133, "learning_rate": 7.257090399574817e-07, "loss": 0.3394, "step": 5677 }, { "epoch": 0.27433927622360726, "grad_norm": 5.925944805145264, "learning_rate": 7.256607237763927e-07, "loss": 0.1827, "step": 5678 }, { "epoch": 0.27438759240469635, "grad_norm": 1.8881527185440063, "learning_rate": 7.256124075953036e-07, "loss": 0.2083, "step": 5679 }, { "epoch": 0.27443590858578537, "grad_norm": 2.128026008605957, "learning_rate": 7.255640914142146e-07, "loss": 0.2406, "step": 5680 }, { "epoch": 0.27448422476687445, "grad_norm": 2.75561785697937, "learning_rate": 7.255157752331256e-07, "loss": 0.2673, "step": 5681 }, { "epoch": 0.2745325409479635, "grad_norm": 3.3514604568481445, "learning_rate": 7.254674590520365e-07, "loss": 0.2362, "step": 5682 }, { "epoch": 0.2745808571290525, "grad_norm": 2.8215107917785645, "learning_rate": 7.254191428709474e-07, "loss": 0.3307, "step": 5683 }, { "epoch": 0.2746291733101416, "grad_norm": 6.364210605621338, "learning_rate": 7.253708266898584e-07, "loss": 0.2766, "step": 5684 }, { "epoch": 0.2746774894912306, "grad_norm": 8.61685848236084, "learning_rate": 7.253225105087694e-07, "loss": 0.3469, "step": 5685 }, { "epoch": 0.27472580567231963, "grad_norm": 2.7187774181365967, "learning_rate": 7.252741943276804e-07, "loss": 0.3444, "step": 5686 }, { "epoch": 0.2747741218534087, "grad_norm": 2.791522741317749, "learning_rate": 7.252258781465914e-07, "loss": 0.3781, "step": 5687 }, { "epoch": 0.27482243803449774, "grad_norm": 1.9341845512390137, "learning_rate": 7.251775619655021e-07, "loss": 0.2038, "step": 5688 }, { "epoch": 0.2748707542155868, "grad_norm": 3.8958654403686523, "learning_rate": 7.251292457844131e-07, "loss": 0.4619, "step": 5689 }, { "epoch": 0.27491907039667585, "grad_norm": 2.1247825622558594, "learning_rate": 7.250809296033241e-07, "loss": 0.2811, "step": 5690 }, { "epoch": 0.27496738657776487, "grad_norm": 3.6820836067199707, "learning_rate": 7.250326134222351e-07, "loss": 0.4014, "step": 5691 }, { "epoch": 0.27501570275885395, "grad_norm": 2.4998891353607178, "learning_rate": 7.249842972411461e-07, "loss": 0.2762, "step": 5692 }, { "epoch": 0.275064018939943, "grad_norm": 2.530841588973999, "learning_rate": 7.24935981060057e-07, "loss": 0.318, "step": 5693 }, { "epoch": 0.27511233512103206, "grad_norm": 2.408301830291748, "learning_rate": 7.248876648789679e-07, "loss": 0.2874, "step": 5694 }, { "epoch": 0.2751606513021211, "grad_norm": 5.391392707824707, "learning_rate": 7.248393486978789e-07, "loss": 0.5165, "step": 5695 }, { "epoch": 0.2752089674832101, "grad_norm": 2.418287515640259, "learning_rate": 7.247910325167898e-07, "loss": 0.2831, "step": 5696 }, { "epoch": 0.2752572836642992, "grad_norm": 2.44960355758667, "learning_rate": 7.247427163357008e-07, "loss": 0.2822, "step": 5697 }, { "epoch": 0.2753055998453882, "grad_norm": 3.04586124420166, "learning_rate": 7.246944001546117e-07, "loss": 0.2833, "step": 5698 }, { "epoch": 0.27535391602647724, "grad_norm": 2.9940133094787598, "learning_rate": 7.246460839735227e-07, "loss": 0.4088, "step": 5699 }, { "epoch": 0.2754022322075663, "grad_norm": 1.529040813446045, "learning_rate": 7.245977677924337e-07, "loss": 0.1485, "step": 5700 }, { "epoch": 0.27545054838865535, "grad_norm": 2.842090606689453, "learning_rate": 7.245494516113446e-07, "loss": 0.3872, "step": 5701 }, { "epoch": 0.27549886456974443, "grad_norm": 3.041517734527588, "learning_rate": 7.245011354302556e-07, "loss": 0.4495, "step": 5702 }, { "epoch": 0.27554718075083345, "grad_norm": 2.5561063289642334, "learning_rate": 7.244528192491665e-07, "loss": 0.3434, "step": 5703 }, { "epoch": 0.2755954969319225, "grad_norm": 6.518429756164551, "learning_rate": 7.244045030680774e-07, "loss": 0.3031, "step": 5704 }, { "epoch": 0.27564381311301156, "grad_norm": 2.9946889877319336, "learning_rate": 7.243561868869884e-07, "loss": 0.4648, "step": 5705 }, { "epoch": 0.2756921292941006, "grad_norm": 6.537413120269775, "learning_rate": 7.243078707058994e-07, "loss": 0.317, "step": 5706 }, { "epoch": 0.27574044547518967, "grad_norm": 3.471803903579712, "learning_rate": 7.242595545248103e-07, "loss": 0.3899, "step": 5707 }, { "epoch": 0.2757887616562787, "grad_norm": 1.6728492975234985, "learning_rate": 7.242112383437213e-07, "loss": 0.1751, "step": 5708 }, { "epoch": 0.2758370778373677, "grad_norm": 3.0012669563293457, "learning_rate": 7.241629221626322e-07, "loss": 0.3662, "step": 5709 }, { "epoch": 0.2758853940184568, "grad_norm": 2.348844051361084, "learning_rate": 7.241146059815432e-07, "loss": 0.3126, "step": 5710 }, { "epoch": 0.2759337101995458, "grad_norm": 2.7551631927490234, "learning_rate": 7.240662898004542e-07, "loss": 0.3771, "step": 5711 }, { "epoch": 0.27598202638063485, "grad_norm": 2.516176700592041, "learning_rate": 7.240179736193652e-07, "loss": 0.2762, "step": 5712 }, { "epoch": 0.27603034256172393, "grad_norm": 2.3622140884399414, "learning_rate": 7.239696574382761e-07, "loss": 0.275, "step": 5713 }, { "epoch": 0.27607865874281295, "grad_norm": 2.517094373703003, "learning_rate": 7.239213412571869e-07, "loss": 0.3099, "step": 5714 }, { "epoch": 0.27612697492390204, "grad_norm": 3.087549924850464, "learning_rate": 7.238730250760979e-07, "loss": 0.4384, "step": 5715 }, { "epoch": 0.27617529110499106, "grad_norm": 3.534679651260376, "learning_rate": 7.238247088950089e-07, "loss": 0.3287, "step": 5716 }, { "epoch": 0.2762236072860801, "grad_norm": 3.2960875034332275, "learning_rate": 7.237763927139199e-07, "loss": 0.4397, "step": 5717 }, { "epoch": 0.27627192346716917, "grad_norm": 2.3585150241851807, "learning_rate": 7.237280765328309e-07, "loss": 0.3852, "step": 5718 }, { "epoch": 0.2763202396482582, "grad_norm": 3.822040557861328, "learning_rate": 7.236797603517418e-07, "loss": 0.4209, "step": 5719 }, { "epoch": 0.2763685558293473, "grad_norm": 2.9991607666015625, "learning_rate": 7.236314441706527e-07, "loss": 0.2866, "step": 5720 }, { "epoch": 0.2764168720104363, "grad_norm": 3.2017812728881836, "learning_rate": 7.235831279895636e-07, "loss": 0.4575, "step": 5721 }, { "epoch": 0.2764651881915253, "grad_norm": 2.328756093978882, "learning_rate": 7.235348118084746e-07, "loss": 0.2733, "step": 5722 }, { "epoch": 0.2765135043726144, "grad_norm": 2.8523635864257812, "learning_rate": 7.234864956273856e-07, "loss": 0.4271, "step": 5723 }, { "epoch": 0.27656182055370343, "grad_norm": 2.7516841888427734, "learning_rate": 7.234381794462965e-07, "loss": 0.2238, "step": 5724 }, { "epoch": 0.27661013673479246, "grad_norm": 2.432417392730713, "learning_rate": 7.233898632652075e-07, "loss": 0.2633, "step": 5725 }, { "epoch": 0.27665845291588154, "grad_norm": 4.703615665435791, "learning_rate": 7.233415470841184e-07, "loss": 0.221, "step": 5726 }, { "epoch": 0.27670676909697056, "grad_norm": 6.903266429901123, "learning_rate": 7.232932309030294e-07, "loss": 0.4296, "step": 5727 }, { "epoch": 0.27675508527805964, "grad_norm": 2.7172327041625977, "learning_rate": 7.232449147219404e-07, "loss": 0.3786, "step": 5728 }, { "epoch": 0.27680340145914867, "grad_norm": 2.3803436756134033, "learning_rate": 7.231965985408512e-07, "loss": 0.2912, "step": 5729 }, { "epoch": 0.2768517176402377, "grad_norm": 2.392416477203369, "learning_rate": 7.231482823597622e-07, "loss": 0.3161, "step": 5730 }, { "epoch": 0.2769000338213268, "grad_norm": 3.5362679958343506, "learning_rate": 7.230999661786732e-07, "loss": 0.3829, "step": 5731 }, { "epoch": 0.2769483500024158, "grad_norm": 2.6927874088287354, "learning_rate": 7.230516499975842e-07, "loss": 0.3717, "step": 5732 }, { "epoch": 0.2769966661835049, "grad_norm": 3.499843120574951, "learning_rate": 7.230033338164951e-07, "loss": 0.3221, "step": 5733 }, { "epoch": 0.2770449823645939, "grad_norm": 3.456460475921631, "learning_rate": 7.229550176354061e-07, "loss": 0.4515, "step": 5734 }, { "epoch": 0.27709329854568293, "grad_norm": 2.0855677127838135, "learning_rate": 7.22906701454317e-07, "loss": 0.2559, "step": 5735 }, { "epoch": 0.277141614726772, "grad_norm": 34.02018737792969, "learning_rate": 7.22858385273228e-07, "loss": 0.2961, "step": 5736 }, { "epoch": 0.27718993090786104, "grad_norm": 3.164120674133301, "learning_rate": 7.22810069092139e-07, "loss": 0.3845, "step": 5737 }, { "epoch": 0.27723824708895006, "grad_norm": 1.2586225271224976, "learning_rate": 7.2276175291105e-07, "loss": 0.1289, "step": 5738 }, { "epoch": 0.27728656327003914, "grad_norm": 1.748826503753662, "learning_rate": 7.227134367299608e-07, "loss": 0.168, "step": 5739 }, { "epoch": 0.27733487945112817, "grad_norm": 4.582533359527588, "learning_rate": 7.226651205488717e-07, "loss": 0.274, "step": 5740 }, { "epoch": 0.27738319563221725, "grad_norm": 4.62335205078125, "learning_rate": 7.226168043677827e-07, "loss": 0.471, "step": 5741 }, { "epoch": 0.2774315118133063, "grad_norm": 2.258483409881592, "learning_rate": 7.225684881866937e-07, "loss": 0.2545, "step": 5742 }, { "epoch": 0.2774798279943953, "grad_norm": 54.710472106933594, "learning_rate": 7.225201720056047e-07, "loss": 0.3556, "step": 5743 }, { "epoch": 0.2775281441754844, "grad_norm": 2.5395076274871826, "learning_rate": 7.224718558245157e-07, "loss": 0.2742, "step": 5744 }, { "epoch": 0.2775764603565734, "grad_norm": 2.360557794570923, "learning_rate": 7.224235396434265e-07, "loss": 0.3079, "step": 5745 }, { "epoch": 0.2776247765376625, "grad_norm": 9.015292167663574, "learning_rate": 7.223752234623374e-07, "loss": 0.3157, "step": 5746 }, { "epoch": 0.2776730927187515, "grad_norm": 2.984849452972412, "learning_rate": 7.223269072812484e-07, "loss": 0.3865, "step": 5747 }, { "epoch": 0.27772140889984054, "grad_norm": 7.4762115478515625, "learning_rate": 7.222785911001594e-07, "loss": 0.2658, "step": 5748 }, { "epoch": 0.2777697250809296, "grad_norm": 2.377802848815918, "learning_rate": 7.222302749190704e-07, "loss": 0.2223, "step": 5749 }, { "epoch": 0.27781804126201864, "grad_norm": 2.622086763381958, "learning_rate": 7.221819587379813e-07, "loss": 0.3276, "step": 5750 }, { "epoch": 0.27786635744310767, "grad_norm": 2.002821445465088, "learning_rate": 7.221336425568923e-07, "loss": 0.1973, "step": 5751 }, { "epoch": 0.27791467362419675, "grad_norm": 2.4186630249023438, "learning_rate": 7.220853263758032e-07, "loss": 0.2366, "step": 5752 }, { "epoch": 0.2779629898052858, "grad_norm": 3.0000336170196533, "learning_rate": 7.220370101947142e-07, "loss": 0.3243, "step": 5753 }, { "epoch": 0.27801130598637486, "grad_norm": 5.195069789886475, "learning_rate": 7.219886940136252e-07, "loss": 0.3271, "step": 5754 }, { "epoch": 0.2780596221674639, "grad_norm": 1.9785826206207275, "learning_rate": 7.21940377832536e-07, "loss": 0.235, "step": 5755 }, { "epoch": 0.2781079383485529, "grad_norm": 3.365222215652466, "learning_rate": 7.21892061651447e-07, "loss": 0.4648, "step": 5756 }, { "epoch": 0.278156254529642, "grad_norm": 6.631811618804932, "learning_rate": 7.21843745470358e-07, "loss": 0.3804, "step": 5757 }, { "epoch": 0.278204570710731, "grad_norm": 1.8500347137451172, "learning_rate": 7.217954292892689e-07, "loss": 0.192, "step": 5758 }, { "epoch": 0.2782528868918201, "grad_norm": 2.050443410873413, "learning_rate": 7.217471131081799e-07, "loss": 0.1928, "step": 5759 }, { "epoch": 0.2783012030729091, "grad_norm": 2.919725179672241, "learning_rate": 7.216987969270909e-07, "loss": 0.2766, "step": 5760 }, { "epoch": 0.27834951925399815, "grad_norm": 2.100442886352539, "learning_rate": 7.216504807460018e-07, "loss": 0.2398, "step": 5761 }, { "epoch": 0.2783978354350872, "grad_norm": 2.7182564735412598, "learning_rate": 7.216021645649128e-07, "loss": 0.2675, "step": 5762 }, { "epoch": 0.27844615161617625, "grad_norm": 1.8481817245483398, "learning_rate": 7.215538483838238e-07, "loss": 0.1349, "step": 5763 }, { "epoch": 0.2784944677972653, "grad_norm": 3.8356826305389404, "learning_rate": 7.215055322027347e-07, "loss": 0.3408, "step": 5764 }, { "epoch": 0.27854278397835436, "grad_norm": 4.870777130126953, "learning_rate": 7.214572160216456e-07, "loss": 0.4471, "step": 5765 }, { "epoch": 0.2785911001594434, "grad_norm": 2.7797138690948486, "learning_rate": 7.214088998405565e-07, "loss": 0.4063, "step": 5766 }, { "epoch": 0.27863941634053246, "grad_norm": 3.7384889125823975, "learning_rate": 7.213605836594675e-07, "loss": 0.4559, "step": 5767 }, { "epoch": 0.2786877325216215, "grad_norm": 7.2810468673706055, "learning_rate": 7.213122674783785e-07, "loss": 0.1987, "step": 5768 }, { "epoch": 0.2787360487027105, "grad_norm": 3.797642707824707, "learning_rate": 7.212639512972895e-07, "loss": 0.3375, "step": 5769 }, { "epoch": 0.2787843648837996, "grad_norm": 2.655203342437744, "learning_rate": 7.212156351162005e-07, "loss": 0.2994, "step": 5770 }, { "epoch": 0.2788326810648886, "grad_norm": 2.904337167739868, "learning_rate": 7.211673189351112e-07, "loss": 0.248, "step": 5771 }, { "epoch": 0.2788809972459777, "grad_norm": 3.1878178119659424, "learning_rate": 7.211190027540222e-07, "loss": 0.3071, "step": 5772 }, { "epoch": 0.2789293134270667, "grad_norm": 3.306859016418457, "learning_rate": 7.210706865729332e-07, "loss": 0.3714, "step": 5773 }, { "epoch": 0.27897762960815575, "grad_norm": 2.9297564029693604, "learning_rate": 7.210223703918442e-07, "loss": 0.3989, "step": 5774 }, { "epoch": 0.27902594578924483, "grad_norm": 2.460211753845215, "learning_rate": 7.209740542107552e-07, "loss": 0.3485, "step": 5775 }, { "epoch": 0.27907426197033386, "grad_norm": 1.822777509689331, "learning_rate": 7.209257380296661e-07, "loss": 0.1762, "step": 5776 }, { "epoch": 0.2791225781514229, "grad_norm": 2.856743335723877, "learning_rate": 7.20877421848577e-07, "loss": 0.3904, "step": 5777 }, { "epoch": 0.27917089433251197, "grad_norm": 4.98947811126709, "learning_rate": 7.20829105667488e-07, "loss": 0.2632, "step": 5778 }, { "epoch": 0.279219210513601, "grad_norm": 2.6301825046539307, "learning_rate": 7.20780789486399e-07, "loss": 0.2561, "step": 5779 }, { "epoch": 0.27926752669469007, "grad_norm": 2.5696892738342285, "learning_rate": 7.2073247330531e-07, "loss": 0.2689, "step": 5780 }, { "epoch": 0.2793158428757791, "grad_norm": 1.8261555433273315, "learning_rate": 7.206841571242208e-07, "loss": 0.1683, "step": 5781 }, { "epoch": 0.2793641590568681, "grad_norm": 2.333294630050659, "learning_rate": 7.206358409431318e-07, "loss": 0.276, "step": 5782 }, { "epoch": 0.2794124752379572, "grad_norm": 2.960092544555664, "learning_rate": 7.205875247620428e-07, "loss": 0.2938, "step": 5783 }, { "epoch": 0.27946079141904623, "grad_norm": 1.567704200744629, "learning_rate": 7.205392085809537e-07, "loss": 0.2034, "step": 5784 }, { "epoch": 0.2795091076001353, "grad_norm": 3.5511159896850586, "learning_rate": 7.204908923998647e-07, "loss": 0.2453, "step": 5785 }, { "epoch": 0.27955742378122433, "grad_norm": 1.8408344984054565, "learning_rate": 7.204425762187757e-07, "loss": 0.2365, "step": 5786 }, { "epoch": 0.27960573996231336, "grad_norm": 4.952930450439453, "learning_rate": 7.203942600376866e-07, "loss": 0.3657, "step": 5787 }, { "epoch": 0.27965405614340244, "grad_norm": 2.372396469116211, "learning_rate": 7.203459438565976e-07, "loss": 0.2935, "step": 5788 }, { "epoch": 0.27970237232449147, "grad_norm": 2.0069801807403564, "learning_rate": 7.202976276755085e-07, "loss": 0.2772, "step": 5789 }, { "epoch": 0.2797506885055805, "grad_norm": 5.730746269226074, "learning_rate": 7.202493114944194e-07, "loss": 0.3678, "step": 5790 }, { "epoch": 0.2797990046866696, "grad_norm": 2.6620748043060303, "learning_rate": 7.202009953133304e-07, "loss": 0.3174, "step": 5791 }, { "epoch": 0.2798473208677586, "grad_norm": 3.5669069290161133, "learning_rate": 7.201526791322413e-07, "loss": 0.3597, "step": 5792 }, { "epoch": 0.2798956370488477, "grad_norm": 2.147409439086914, "learning_rate": 7.201043629511523e-07, "loss": 0.2492, "step": 5793 }, { "epoch": 0.2799439532299367, "grad_norm": 2.690227746963501, "learning_rate": 7.200560467700633e-07, "loss": 0.2488, "step": 5794 }, { "epoch": 0.27999226941102573, "grad_norm": 2.406442165374756, "learning_rate": 7.200077305889743e-07, "loss": 0.2566, "step": 5795 }, { "epoch": 0.2800405855921148, "grad_norm": 48.508056640625, "learning_rate": 7.199594144078853e-07, "loss": 0.2132, "step": 5796 }, { "epoch": 0.28008890177320384, "grad_norm": 2.50342059135437, "learning_rate": 7.19911098226796e-07, "loss": 0.2769, "step": 5797 }, { "epoch": 0.2801372179542929, "grad_norm": 2.506289482116699, "learning_rate": 7.19862782045707e-07, "loss": 0.2997, "step": 5798 }, { "epoch": 0.28018553413538194, "grad_norm": 3.0360794067382812, "learning_rate": 7.19814465864618e-07, "loss": 0.2821, "step": 5799 }, { "epoch": 0.28023385031647097, "grad_norm": 2.216510057449341, "learning_rate": 7.19766149683529e-07, "loss": 0.3021, "step": 5800 }, { "epoch": 0.28028216649756005, "grad_norm": 4.0853800773620605, "learning_rate": 7.1971783350244e-07, "loss": 0.3253, "step": 5801 }, { "epoch": 0.2803304826786491, "grad_norm": 3.0401015281677246, "learning_rate": 7.196695173213509e-07, "loss": 0.4183, "step": 5802 }, { "epoch": 0.2803787988597381, "grad_norm": 3.264885187149048, "learning_rate": 7.196212011402618e-07, "loss": 0.3248, "step": 5803 }, { "epoch": 0.2804271150408272, "grad_norm": 9.192429542541504, "learning_rate": 7.195728849591728e-07, "loss": 0.2286, "step": 5804 }, { "epoch": 0.2804754312219162, "grad_norm": 3.3019943237304688, "learning_rate": 7.195245687780838e-07, "loss": 0.2903, "step": 5805 }, { "epoch": 0.2805237474030053, "grad_norm": 2.477991819381714, "learning_rate": 7.194762525969947e-07, "loss": 0.2885, "step": 5806 }, { "epoch": 0.2805720635840943, "grad_norm": 2.7834558486938477, "learning_rate": 7.194279364159056e-07, "loss": 0.3719, "step": 5807 }, { "epoch": 0.28062037976518334, "grad_norm": 2.592545747756958, "learning_rate": 7.193796202348166e-07, "loss": 0.3851, "step": 5808 }, { "epoch": 0.2806686959462724, "grad_norm": 2.3422131538391113, "learning_rate": 7.193313040537275e-07, "loss": 0.2089, "step": 5809 }, { "epoch": 0.28071701212736144, "grad_norm": 2.7890026569366455, "learning_rate": 7.192829878726385e-07, "loss": 0.2632, "step": 5810 }, { "epoch": 0.2807653283084505, "grad_norm": 2.1838338375091553, "learning_rate": 7.192346716915495e-07, "loss": 0.2448, "step": 5811 }, { "epoch": 0.28081364448953955, "grad_norm": 3.19474196434021, "learning_rate": 7.191863555104605e-07, "loss": 0.3619, "step": 5812 }, { "epoch": 0.2808619606706286, "grad_norm": 3.9562909603118896, "learning_rate": 7.191380393293714e-07, "loss": 0.3324, "step": 5813 }, { "epoch": 0.28091027685171766, "grad_norm": 4.479877948760986, "learning_rate": 7.190897231482823e-07, "loss": 0.3273, "step": 5814 }, { "epoch": 0.2809585930328067, "grad_norm": 3.0520553588867188, "learning_rate": 7.190414069671933e-07, "loss": 0.2283, "step": 5815 }, { "epoch": 0.28100690921389576, "grad_norm": 3.1366991996765137, "learning_rate": 7.189930907861042e-07, "loss": 0.4063, "step": 5816 }, { "epoch": 0.2810552253949848, "grad_norm": 2.19531512260437, "learning_rate": 7.189447746050152e-07, "loss": 0.2643, "step": 5817 }, { "epoch": 0.2811035415760738, "grad_norm": 8.300058364868164, "learning_rate": 7.188964584239261e-07, "loss": 0.3091, "step": 5818 }, { "epoch": 0.2811518577571629, "grad_norm": 5.438379764556885, "learning_rate": 7.188481422428371e-07, "loss": 0.237, "step": 5819 }, { "epoch": 0.2812001739382519, "grad_norm": 2.5992846488952637, "learning_rate": 7.187998260617481e-07, "loss": 0.3, "step": 5820 }, { "epoch": 0.28124849011934094, "grad_norm": 2.8962197303771973, "learning_rate": 7.187515098806591e-07, "loss": 0.2928, "step": 5821 }, { "epoch": 0.28129680630043, "grad_norm": 2.1249489784240723, "learning_rate": 7.1870319369957e-07, "loss": 0.2861, "step": 5822 }, { "epoch": 0.28134512248151905, "grad_norm": 2.9516441822052, "learning_rate": 7.186548775184808e-07, "loss": 0.3468, "step": 5823 }, { "epoch": 0.28139343866260813, "grad_norm": 10.950422286987305, "learning_rate": 7.186065613373918e-07, "loss": 0.4065, "step": 5824 }, { "epoch": 0.28144175484369716, "grad_norm": 2.7512331008911133, "learning_rate": 7.185582451563028e-07, "loss": 0.3452, "step": 5825 }, { "epoch": 0.2814900710247862, "grad_norm": 4.729092121124268, "learning_rate": 7.185099289752138e-07, "loss": 0.4543, "step": 5826 }, { "epoch": 0.28153838720587526, "grad_norm": 3.311662197113037, "learning_rate": 7.184616127941248e-07, "loss": 0.5096, "step": 5827 }, { "epoch": 0.2815867033869643, "grad_norm": 3.3071048259735107, "learning_rate": 7.184132966130356e-07, "loss": 0.2965, "step": 5828 }, { "epoch": 0.28163501956805337, "grad_norm": 2.5274810791015625, "learning_rate": 7.183649804319466e-07, "loss": 0.2978, "step": 5829 }, { "epoch": 0.2816833357491424, "grad_norm": 3.685898542404175, "learning_rate": 7.183166642508576e-07, "loss": 0.2669, "step": 5830 }, { "epoch": 0.2817316519302314, "grad_norm": 2.841883897781372, "learning_rate": 7.182683480697685e-07, "loss": 0.314, "step": 5831 }, { "epoch": 0.2817799681113205, "grad_norm": 2.3651123046875, "learning_rate": 7.182200318886795e-07, "loss": 0.2919, "step": 5832 }, { "epoch": 0.2818282842924095, "grad_norm": 2.6419708728790283, "learning_rate": 7.181717157075904e-07, "loss": 0.2848, "step": 5833 }, { "epoch": 0.28187660047349855, "grad_norm": 2.7369775772094727, "learning_rate": 7.181233995265014e-07, "loss": 0.2816, "step": 5834 }, { "epoch": 0.28192491665458763, "grad_norm": 3.0274832248687744, "learning_rate": 7.180750833454123e-07, "loss": 0.3158, "step": 5835 }, { "epoch": 0.28197323283567666, "grad_norm": 3.32431960105896, "learning_rate": 7.180267671643233e-07, "loss": 0.3641, "step": 5836 }, { "epoch": 0.28202154901676574, "grad_norm": 3.0483453273773193, "learning_rate": 7.179784509832343e-07, "loss": 0.2174, "step": 5837 }, { "epoch": 0.28206986519785476, "grad_norm": 2.9584243297576904, "learning_rate": 7.179301348021453e-07, "loss": 0.2695, "step": 5838 }, { "epoch": 0.2821181813789438, "grad_norm": 1.9926693439483643, "learning_rate": 7.178818186210561e-07, "loss": 0.2488, "step": 5839 }, { "epoch": 0.28216649756003287, "grad_norm": 3.003523826599121, "learning_rate": 7.178335024399671e-07, "loss": 0.33, "step": 5840 }, { "epoch": 0.2822148137411219, "grad_norm": 33.941009521484375, "learning_rate": 7.17785186258878e-07, "loss": 0.3533, "step": 5841 }, { "epoch": 0.282263129922211, "grad_norm": 3.149259567260742, "learning_rate": 7.17736870077789e-07, "loss": 0.3517, "step": 5842 }, { "epoch": 0.2823114461033, "grad_norm": 2.4372639656066895, "learning_rate": 7.176885538967e-07, "loss": 0.2235, "step": 5843 }, { "epoch": 0.282359762284389, "grad_norm": 3.9537580013275146, "learning_rate": 7.176402377156109e-07, "loss": 0.4431, "step": 5844 }, { "epoch": 0.2824080784654781, "grad_norm": 2.734316825866699, "learning_rate": 7.175919215345219e-07, "loss": 0.2355, "step": 5845 }, { "epoch": 0.28245639464656713, "grad_norm": 2.04150390625, "learning_rate": 7.175436053534329e-07, "loss": 0.2025, "step": 5846 }, { "epoch": 0.28250471082765616, "grad_norm": 4.235065937042236, "learning_rate": 7.174952891723439e-07, "loss": 0.2339, "step": 5847 }, { "epoch": 0.28255302700874524, "grad_norm": 2.6859471797943115, "learning_rate": 7.174469729912547e-07, "loss": 0.3765, "step": 5848 }, { "epoch": 0.28260134318983426, "grad_norm": 3.2351772785186768, "learning_rate": 7.173986568101656e-07, "loss": 0.374, "step": 5849 }, { "epoch": 0.28264965937092335, "grad_norm": 1.9520090818405151, "learning_rate": 7.173503406290766e-07, "loss": 0.2745, "step": 5850 }, { "epoch": 0.28269797555201237, "grad_norm": 1.8862594366073608, "learning_rate": 7.173020244479876e-07, "loss": 0.2587, "step": 5851 }, { "epoch": 0.2827462917331014, "grad_norm": 2.4947805404663086, "learning_rate": 7.172537082668986e-07, "loss": 0.2367, "step": 5852 }, { "epoch": 0.2827946079141905, "grad_norm": 5.338348865509033, "learning_rate": 7.172053920858096e-07, "loss": 0.211, "step": 5853 }, { "epoch": 0.2828429240952795, "grad_norm": 2.339273691177368, "learning_rate": 7.171570759047204e-07, "loss": 0.3024, "step": 5854 }, { "epoch": 0.2828912402763686, "grad_norm": 2.5424857139587402, "learning_rate": 7.171087597236314e-07, "loss": 0.2696, "step": 5855 }, { "epoch": 0.2829395564574576, "grad_norm": 3.3411386013031006, "learning_rate": 7.170604435425423e-07, "loss": 0.2541, "step": 5856 }, { "epoch": 0.28298787263854663, "grad_norm": 6.021061897277832, "learning_rate": 7.170121273614533e-07, "loss": 0.3434, "step": 5857 }, { "epoch": 0.2830361888196357, "grad_norm": 2.7801647186279297, "learning_rate": 7.169638111803643e-07, "loss": 0.3422, "step": 5858 }, { "epoch": 0.28308450500072474, "grad_norm": 3.2711496353149414, "learning_rate": 7.169154949992752e-07, "loss": 0.2814, "step": 5859 }, { "epoch": 0.28313282118181377, "grad_norm": 2.2671890258789062, "learning_rate": 7.168671788181861e-07, "loss": 0.2542, "step": 5860 }, { "epoch": 0.28318113736290285, "grad_norm": 2.9136242866516113, "learning_rate": 7.168188626370971e-07, "loss": 0.3166, "step": 5861 }, { "epoch": 0.28322945354399187, "grad_norm": 2.710350751876831, "learning_rate": 7.167705464560081e-07, "loss": 0.3041, "step": 5862 }, { "epoch": 0.28327776972508095, "grad_norm": 1.8445448875427246, "learning_rate": 7.167222302749191e-07, "loss": 0.2143, "step": 5863 }, { "epoch": 0.28332608590617, "grad_norm": 2.611487865447998, "learning_rate": 7.166739140938301e-07, "loss": 0.4002, "step": 5864 }, { "epoch": 0.283374402087259, "grad_norm": 2.7652060985565186, "learning_rate": 7.166255979127409e-07, "loss": 0.2865, "step": 5865 }, { "epoch": 0.2834227182683481, "grad_norm": 2.462519407272339, "learning_rate": 7.165772817316519e-07, "loss": 0.2627, "step": 5866 }, { "epoch": 0.2834710344494371, "grad_norm": 2.297974109649658, "learning_rate": 7.165289655505628e-07, "loss": 0.2895, "step": 5867 }, { "epoch": 0.2835193506305262, "grad_norm": 1.8522536754608154, "learning_rate": 7.164806493694738e-07, "loss": 0.1407, "step": 5868 }, { "epoch": 0.2835676668116152, "grad_norm": 2.7484045028686523, "learning_rate": 7.164323331883848e-07, "loss": 0.315, "step": 5869 }, { "epoch": 0.28361598299270424, "grad_norm": 4.374518871307373, "learning_rate": 7.163840170072957e-07, "loss": 0.4215, "step": 5870 }, { "epoch": 0.2836642991737933, "grad_norm": 3.0615642070770264, "learning_rate": 7.163357008262067e-07, "loss": 0.2887, "step": 5871 }, { "epoch": 0.28371261535488235, "grad_norm": 3.128087043762207, "learning_rate": 7.162873846451177e-07, "loss": 0.2613, "step": 5872 }, { "epoch": 0.2837609315359714, "grad_norm": 2.4544942378997803, "learning_rate": 7.162390684640285e-07, "loss": 0.3063, "step": 5873 }, { "epoch": 0.28380924771706045, "grad_norm": 2.6726155281066895, "learning_rate": 7.161907522829395e-07, "loss": 0.2546, "step": 5874 }, { "epoch": 0.2838575638981495, "grad_norm": 4.445496559143066, "learning_rate": 7.161424361018504e-07, "loss": 0.2428, "step": 5875 }, { "epoch": 0.28390588007923856, "grad_norm": 2.4659242630004883, "learning_rate": 7.160941199207614e-07, "loss": 0.3705, "step": 5876 }, { "epoch": 0.2839541962603276, "grad_norm": 2.7012267112731934, "learning_rate": 7.160458037396724e-07, "loss": 0.4164, "step": 5877 }, { "epoch": 0.2840025124414166, "grad_norm": 13.587973594665527, "learning_rate": 7.159974875585834e-07, "loss": 0.3225, "step": 5878 }, { "epoch": 0.2840508286225057, "grad_norm": 4.638149738311768, "learning_rate": 7.159491713774944e-07, "loss": 0.2347, "step": 5879 }, { "epoch": 0.2840991448035947, "grad_norm": 2.353487014770508, "learning_rate": 7.159008551964052e-07, "loss": 0.2293, "step": 5880 }, { "epoch": 0.2841474609846838, "grad_norm": 2.1496052742004395, "learning_rate": 7.158525390153161e-07, "loss": 0.2759, "step": 5881 }, { "epoch": 0.2841957771657728, "grad_norm": 3.529665946960449, "learning_rate": 7.158042228342271e-07, "loss": 0.3462, "step": 5882 }, { "epoch": 0.28424409334686185, "grad_norm": 3.529881477355957, "learning_rate": 7.157559066531381e-07, "loss": 0.3048, "step": 5883 }, { "epoch": 0.28429240952795093, "grad_norm": 3.506096601486206, "learning_rate": 7.157075904720491e-07, "loss": 0.3798, "step": 5884 }, { "epoch": 0.28434072570903995, "grad_norm": 3.0350496768951416, "learning_rate": 7.1565927429096e-07, "loss": 0.3026, "step": 5885 }, { "epoch": 0.284389041890129, "grad_norm": 4.4427289962768555, "learning_rate": 7.156109581098709e-07, "loss": 0.3435, "step": 5886 }, { "epoch": 0.28443735807121806, "grad_norm": 3.649439811706543, "learning_rate": 7.155626419287819e-07, "loss": 0.435, "step": 5887 }, { "epoch": 0.2844856742523071, "grad_norm": 2.617979049682617, "learning_rate": 7.155143257476929e-07, "loss": 0.1954, "step": 5888 }, { "epoch": 0.28453399043339617, "grad_norm": 2.762108087539673, "learning_rate": 7.154660095666039e-07, "loss": 0.337, "step": 5889 }, { "epoch": 0.2845823066144852, "grad_norm": 2.1848440170288086, "learning_rate": 7.154176933855148e-07, "loss": 0.247, "step": 5890 }, { "epoch": 0.2846306227955742, "grad_norm": 2.621140956878662, "learning_rate": 7.153693772044257e-07, "loss": 0.2611, "step": 5891 }, { "epoch": 0.2846789389766633, "grad_norm": 1.8219801187515259, "learning_rate": 7.153210610233366e-07, "loss": 0.1861, "step": 5892 }, { "epoch": 0.2847272551577523, "grad_norm": 2.500878095626831, "learning_rate": 7.152727448422476e-07, "loss": 0.3655, "step": 5893 }, { "epoch": 0.2847755713388414, "grad_norm": 2.6225779056549072, "learning_rate": 7.152244286611586e-07, "loss": 0.3372, "step": 5894 }, { "epoch": 0.28482388751993043, "grad_norm": 2.164306640625, "learning_rate": 7.151761124800696e-07, "loss": 0.2631, "step": 5895 }, { "epoch": 0.28487220370101946, "grad_norm": 2.7691702842712402, "learning_rate": 7.151277962989805e-07, "loss": 0.2922, "step": 5896 }, { "epoch": 0.28492051988210854, "grad_norm": 2.6202642917633057, "learning_rate": 7.150794801178915e-07, "loss": 0.2664, "step": 5897 }, { "epoch": 0.28496883606319756, "grad_norm": 2.67655611038208, "learning_rate": 7.150311639368025e-07, "loss": 0.2487, "step": 5898 }, { "epoch": 0.2850171522442866, "grad_norm": 3.3999171257019043, "learning_rate": 7.149828477557133e-07, "loss": 0.3246, "step": 5899 }, { "epoch": 0.28506546842537567, "grad_norm": 3.9970972537994385, "learning_rate": 7.149345315746243e-07, "loss": 0.1562, "step": 5900 }, { "epoch": 0.2851137846064647, "grad_norm": 3.368536949157715, "learning_rate": 7.148862153935352e-07, "loss": 0.2737, "step": 5901 }, { "epoch": 0.2851621007875538, "grad_norm": 2.3964145183563232, "learning_rate": 7.148378992124462e-07, "loss": 0.3285, "step": 5902 }, { "epoch": 0.2852104169686428, "grad_norm": 3.4302098751068115, "learning_rate": 7.147895830313572e-07, "loss": 0.2901, "step": 5903 }, { "epoch": 0.2852587331497318, "grad_norm": 2.875532388687134, "learning_rate": 7.147412668502682e-07, "loss": 0.2523, "step": 5904 }, { "epoch": 0.2853070493308209, "grad_norm": 4.1955342292785645, "learning_rate": 7.146929506691791e-07, "loss": 0.3332, "step": 5905 }, { "epoch": 0.28535536551190993, "grad_norm": 3.2393720149993896, "learning_rate": 7.1464463448809e-07, "loss": 0.3572, "step": 5906 }, { "epoch": 0.285403681692999, "grad_norm": 3.658921480178833, "learning_rate": 7.145963183070009e-07, "loss": 0.2576, "step": 5907 }, { "epoch": 0.28545199787408804, "grad_norm": 3.1053950786590576, "learning_rate": 7.145480021259119e-07, "loss": 0.319, "step": 5908 }, { "epoch": 0.28550031405517706, "grad_norm": 2.9426515102386475, "learning_rate": 7.144996859448229e-07, "loss": 0.3991, "step": 5909 }, { "epoch": 0.28554863023626614, "grad_norm": 9.965659141540527, "learning_rate": 7.144513697637339e-07, "loss": 0.2263, "step": 5910 }, { "epoch": 0.28559694641735517, "grad_norm": 20.81161117553711, "learning_rate": 7.144030535826447e-07, "loss": 0.2904, "step": 5911 }, { "epoch": 0.2856452625984442, "grad_norm": 2.874967575073242, "learning_rate": 7.143547374015557e-07, "loss": 0.4204, "step": 5912 }, { "epoch": 0.2856935787795333, "grad_norm": 2.5506184101104736, "learning_rate": 7.143064212204667e-07, "loss": 0.2984, "step": 5913 }, { "epoch": 0.2857418949606223, "grad_norm": 2.389308452606201, "learning_rate": 7.142581050393777e-07, "loss": 0.3046, "step": 5914 }, { "epoch": 0.2857902111417114, "grad_norm": 2.715998888015747, "learning_rate": 7.142097888582887e-07, "loss": 0.3068, "step": 5915 }, { "epoch": 0.2858385273228004, "grad_norm": 3.5327565670013428, "learning_rate": 7.141614726771996e-07, "loss": 0.404, "step": 5916 }, { "epoch": 0.28588684350388943, "grad_norm": 2.521735429763794, "learning_rate": 7.141131564961105e-07, "loss": 0.2587, "step": 5917 }, { "epoch": 0.2859351596849785, "grad_norm": 2.1531288623809814, "learning_rate": 7.140648403150214e-07, "loss": 0.3297, "step": 5918 }, { "epoch": 0.28598347586606754, "grad_norm": 2.9642958641052246, "learning_rate": 7.140165241339324e-07, "loss": 0.392, "step": 5919 }, { "epoch": 0.2860317920471566, "grad_norm": 1.3459316492080688, "learning_rate": 7.139682079528434e-07, "loss": 0.1597, "step": 5920 }, { "epoch": 0.28608010822824564, "grad_norm": 2.7395401000976562, "learning_rate": 7.139198917717544e-07, "loss": 0.2343, "step": 5921 }, { "epoch": 0.28612842440933467, "grad_norm": 2.924112319946289, "learning_rate": 7.138715755906653e-07, "loss": 0.3683, "step": 5922 }, { "epoch": 0.28617674059042375, "grad_norm": 2.4461488723754883, "learning_rate": 7.138232594095763e-07, "loss": 0.269, "step": 5923 }, { "epoch": 0.2862250567715128, "grad_norm": 3.2202470302581787, "learning_rate": 7.137749432284871e-07, "loss": 0.3319, "step": 5924 }, { "epoch": 0.2862733729526018, "grad_norm": 2.8630902767181396, "learning_rate": 7.137266270473981e-07, "loss": 0.3019, "step": 5925 }, { "epoch": 0.2863216891336909, "grad_norm": 4.73801851272583, "learning_rate": 7.136783108663091e-07, "loss": 0.2406, "step": 5926 }, { "epoch": 0.2863700053147799, "grad_norm": 3.5032317638397217, "learning_rate": 7.1362999468522e-07, "loss": 0.3503, "step": 5927 }, { "epoch": 0.286418321495869, "grad_norm": 3.240323066711426, "learning_rate": 7.13581678504131e-07, "loss": 0.2948, "step": 5928 }, { "epoch": 0.286466637676958, "grad_norm": 2.6293914318084717, "learning_rate": 7.13533362323042e-07, "loss": 0.3208, "step": 5929 }, { "epoch": 0.28651495385804704, "grad_norm": 2.498591423034668, "learning_rate": 7.13485046141953e-07, "loss": 0.2629, "step": 5930 }, { "epoch": 0.2865632700391361, "grad_norm": 6.809144496917725, "learning_rate": 7.134367299608639e-07, "loss": 0.35, "step": 5931 }, { "epoch": 0.28661158622022515, "grad_norm": 3.2328684329986572, "learning_rate": 7.133884137797747e-07, "loss": 0.2984, "step": 5932 }, { "epoch": 0.2866599024013142, "grad_norm": 2.5356130599975586, "learning_rate": 7.133400975986857e-07, "loss": 0.3538, "step": 5933 }, { "epoch": 0.28670821858240325, "grad_norm": 1.6610510349273682, "learning_rate": 7.132917814175967e-07, "loss": 0.2043, "step": 5934 }, { "epoch": 0.2867565347634923, "grad_norm": 4.812193393707275, "learning_rate": 7.132434652365077e-07, "loss": 0.2005, "step": 5935 }, { "epoch": 0.28680485094458136, "grad_norm": 2.369961738586426, "learning_rate": 7.131951490554187e-07, "loss": 0.2619, "step": 5936 }, { "epoch": 0.2868531671256704, "grad_norm": 2.401146411895752, "learning_rate": 7.131468328743295e-07, "loss": 0.2075, "step": 5937 }, { "epoch": 0.2869014833067594, "grad_norm": 1.9776997566223145, "learning_rate": 7.130985166932405e-07, "loss": 0.1774, "step": 5938 }, { "epoch": 0.2869497994878485, "grad_norm": 6.606064796447754, "learning_rate": 7.130502005121515e-07, "loss": 0.196, "step": 5939 }, { "epoch": 0.2869981156689375, "grad_norm": 3.123227596282959, "learning_rate": 7.130018843310625e-07, "loss": 0.3259, "step": 5940 }, { "epoch": 0.2870464318500266, "grad_norm": 3.3807342052459717, "learning_rate": 7.129535681499734e-07, "loss": 0.2397, "step": 5941 }, { "epoch": 0.2870947480311156, "grad_norm": 2.996774673461914, "learning_rate": 7.129052519688844e-07, "loss": 0.4472, "step": 5942 }, { "epoch": 0.28714306421220465, "grad_norm": 4.22972297668457, "learning_rate": 7.128569357877953e-07, "loss": 0.2895, "step": 5943 }, { "epoch": 0.2871913803932937, "grad_norm": 2.1418628692626953, "learning_rate": 7.128086196067062e-07, "loss": 0.2393, "step": 5944 }, { "epoch": 0.28723969657438275, "grad_norm": 3.5660243034362793, "learning_rate": 7.127603034256172e-07, "loss": 0.2781, "step": 5945 }, { "epoch": 0.28728801275547183, "grad_norm": 2.4335949420928955, "learning_rate": 7.127119872445282e-07, "loss": 0.2195, "step": 5946 }, { "epoch": 0.28733632893656086, "grad_norm": 2.5510261058807373, "learning_rate": 7.126636710634392e-07, "loss": 0.2804, "step": 5947 }, { "epoch": 0.2873846451176499, "grad_norm": 9.999317169189453, "learning_rate": 7.126153548823501e-07, "loss": 0.2449, "step": 5948 }, { "epoch": 0.28743296129873896, "grad_norm": 1.4777193069458008, "learning_rate": 7.12567038701261e-07, "loss": 0.1635, "step": 5949 }, { "epoch": 0.287481277479828, "grad_norm": 2.1631343364715576, "learning_rate": 7.125187225201719e-07, "loss": 0.2318, "step": 5950 }, { "epoch": 0.287529593660917, "grad_norm": 2.071648120880127, "learning_rate": 7.124704063390829e-07, "loss": 0.2328, "step": 5951 }, { "epoch": 0.2875779098420061, "grad_norm": 3.356861114501953, "learning_rate": 7.124220901579939e-07, "loss": 0.2843, "step": 5952 }, { "epoch": 0.2876262260230951, "grad_norm": 2.854771375656128, "learning_rate": 7.123737739769048e-07, "loss": 0.3852, "step": 5953 }, { "epoch": 0.2876745422041842, "grad_norm": 5.499043941497803, "learning_rate": 7.123254577958158e-07, "loss": 0.2857, "step": 5954 }, { "epoch": 0.28772285838527323, "grad_norm": 2.482395648956299, "learning_rate": 7.122771416147268e-07, "loss": 0.3329, "step": 5955 }, { "epoch": 0.28777117456636225, "grad_norm": 6.953704833984375, "learning_rate": 7.122288254336377e-07, "loss": 0.2719, "step": 5956 }, { "epoch": 0.28781949074745133, "grad_norm": 2.9790709018707275, "learning_rate": 7.121805092525487e-07, "loss": 0.3477, "step": 5957 }, { "epoch": 0.28786780692854036, "grad_norm": 3.833235025405884, "learning_rate": 7.121321930714595e-07, "loss": 0.3723, "step": 5958 }, { "epoch": 0.28791612310962944, "grad_norm": 2.1286849975585938, "learning_rate": 7.120838768903705e-07, "loss": 0.1895, "step": 5959 }, { "epoch": 0.28796443929071847, "grad_norm": 1.5995287895202637, "learning_rate": 7.120355607092815e-07, "loss": 0.1803, "step": 5960 }, { "epoch": 0.2880127554718075, "grad_norm": 2.88047456741333, "learning_rate": 7.119872445281925e-07, "loss": 0.3667, "step": 5961 }, { "epoch": 0.28806107165289657, "grad_norm": 2.8609139919281006, "learning_rate": 7.119389283471035e-07, "loss": 0.3029, "step": 5962 }, { "epoch": 0.2881093878339856, "grad_norm": 5.449337005615234, "learning_rate": 7.118906121660143e-07, "loss": 0.2494, "step": 5963 }, { "epoch": 0.2881577040150746, "grad_norm": 3.4945473670959473, "learning_rate": 7.118422959849253e-07, "loss": 0.3387, "step": 5964 }, { "epoch": 0.2882060201961637, "grad_norm": 2.862534284591675, "learning_rate": 7.117939798038363e-07, "loss": 0.3418, "step": 5965 }, { "epoch": 0.28825433637725273, "grad_norm": 5.141637802124023, "learning_rate": 7.117456636227472e-07, "loss": 0.3253, "step": 5966 }, { "epoch": 0.2883026525583418, "grad_norm": 3.8530948162078857, "learning_rate": 7.116973474416582e-07, "loss": 0.392, "step": 5967 }, { "epoch": 0.28835096873943084, "grad_norm": 2.5220861434936523, "learning_rate": 7.116490312605691e-07, "loss": 0.2577, "step": 5968 }, { "epoch": 0.28839928492051986, "grad_norm": 3.287996530532837, "learning_rate": 7.1160071507948e-07, "loss": 0.33, "step": 5969 }, { "epoch": 0.28844760110160894, "grad_norm": 2.569141149520874, "learning_rate": 7.11552398898391e-07, "loss": 0.1721, "step": 5970 }, { "epoch": 0.28849591728269797, "grad_norm": 1.8466579914093018, "learning_rate": 7.11504082717302e-07, "loss": 0.1789, "step": 5971 }, { "epoch": 0.28854423346378705, "grad_norm": 2.3284690380096436, "learning_rate": 7.11455766536213e-07, "loss": 0.3138, "step": 5972 }, { "epoch": 0.2885925496448761, "grad_norm": 1.6384762525558472, "learning_rate": 7.11407450355124e-07, "loss": 0.1484, "step": 5973 }, { "epoch": 0.2886408658259651, "grad_norm": 2.183344602584839, "learning_rate": 7.113591341740349e-07, "loss": 0.1703, "step": 5974 }, { "epoch": 0.2886891820070542, "grad_norm": 44.301570892333984, "learning_rate": 7.113108179929458e-07, "loss": 0.3322, "step": 5975 }, { "epoch": 0.2887374981881432, "grad_norm": 2.5295157432556152, "learning_rate": 7.112625018118567e-07, "loss": 0.3175, "step": 5976 }, { "epoch": 0.28878581436923223, "grad_norm": 3.2659072875976562, "learning_rate": 7.112141856307677e-07, "loss": 0.4292, "step": 5977 }, { "epoch": 0.2888341305503213, "grad_norm": 2.4273264408111572, "learning_rate": 7.111658694496787e-07, "loss": 0.3143, "step": 5978 }, { "epoch": 0.28888244673141034, "grad_norm": 9.96707534790039, "learning_rate": 7.111175532685896e-07, "loss": 0.3537, "step": 5979 }, { "epoch": 0.2889307629124994, "grad_norm": 4.246527194976807, "learning_rate": 7.110692370875006e-07, "loss": 0.3494, "step": 5980 }, { "epoch": 0.28897907909358844, "grad_norm": 2.3092453479766846, "learning_rate": 7.110209209064116e-07, "loss": 0.2928, "step": 5981 }, { "epoch": 0.28902739527467747, "grad_norm": 3.9930927753448486, "learning_rate": 7.109726047253225e-07, "loss": 0.3248, "step": 5982 }, { "epoch": 0.28907571145576655, "grad_norm": 3.702775239944458, "learning_rate": 7.109242885442334e-07, "loss": 0.5064, "step": 5983 }, { "epoch": 0.2891240276368556, "grad_norm": 3.273423433303833, "learning_rate": 7.108759723631443e-07, "loss": 0.275, "step": 5984 }, { "epoch": 0.28917234381794465, "grad_norm": 3.2533066272735596, "learning_rate": 7.108276561820553e-07, "loss": 0.3239, "step": 5985 }, { "epoch": 0.2892206599990337, "grad_norm": 3.4763295650482178, "learning_rate": 7.107793400009663e-07, "loss": 0.489, "step": 5986 }, { "epoch": 0.2892689761801227, "grad_norm": 3.253608226776123, "learning_rate": 7.107310238198773e-07, "loss": 0.314, "step": 5987 }, { "epoch": 0.2893172923612118, "grad_norm": 3.318542718887329, "learning_rate": 7.106827076387882e-07, "loss": 0.2104, "step": 5988 }, { "epoch": 0.2893656085423008, "grad_norm": 2.709815263748169, "learning_rate": 7.106343914576991e-07, "loss": 0.3376, "step": 5989 }, { "epoch": 0.28941392472338984, "grad_norm": 4.607706069946289, "learning_rate": 7.105860752766101e-07, "loss": 0.4308, "step": 5990 }, { "epoch": 0.2894622409044789, "grad_norm": 4.101991176605225, "learning_rate": 7.10537759095521e-07, "loss": 0.3737, "step": 5991 }, { "epoch": 0.28951055708556794, "grad_norm": 2.374706983566284, "learning_rate": 7.10489442914432e-07, "loss": 0.2738, "step": 5992 }, { "epoch": 0.289558873266657, "grad_norm": 3.03489351272583, "learning_rate": 7.10441126733343e-07, "loss": 0.2933, "step": 5993 }, { "epoch": 0.28960718944774605, "grad_norm": 2.3017256259918213, "learning_rate": 7.103928105522539e-07, "loss": 0.2704, "step": 5994 }, { "epoch": 0.2896555056288351, "grad_norm": 3.2343897819519043, "learning_rate": 7.103444943711648e-07, "loss": 0.3289, "step": 5995 }, { "epoch": 0.28970382180992416, "grad_norm": 2.4900214672088623, "learning_rate": 7.102961781900758e-07, "loss": 0.2953, "step": 5996 }, { "epoch": 0.2897521379910132, "grad_norm": 3.190253734588623, "learning_rate": 7.102478620089868e-07, "loss": 0.307, "step": 5997 }, { "epoch": 0.28980045417210226, "grad_norm": 3.291490077972412, "learning_rate": 7.101995458278978e-07, "loss": 0.2881, "step": 5998 }, { "epoch": 0.2898487703531913, "grad_norm": 6.403573036193848, "learning_rate": 7.101512296468088e-07, "loss": 0.2133, "step": 5999 }, { "epoch": 0.2898970865342803, "grad_norm": 2.844146966934204, "learning_rate": 7.101029134657196e-07, "loss": 0.3305, "step": 6000 }, { "epoch": 0.2899454027153694, "grad_norm": 3.934555768966675, "learning_rate": 7.100545972846305e-07, "loss": 0.3663, "step": 6001 }, { "epoch": 0.2899937188964584, "grad_norm": 2.0226025581359863, "learning_rate": 7.100062811035415e-07, "loss": 0.2858, "step": 6002 }, { "epoch": 0.29004203507754744, "grad_norm": 3.424243450164795, "learning_rate": 7.099579649224525e-07, "loss": 0.3445, "step": 6003 }, { "epoch": 0.2900903512586365, "grad_norm": 2.203428268432617, "learning_rate": 7.099096487413635e-07, "loss": 0.2336, "step": 6004 }, { "epoch": 0.29013866743972555, "grad_norm": 2.3655643463134766, "learning_rate": 7.098613325602744e-07, "loss": 0.2151, "step": 6005 }, { "epoch": 0.29018698362081463, "grad_norm": 4.4908294677734375, "learning_rate": 7.098130163791854e-07, "loss": 0.4779, "step": 6006 }, { "epoch": 0.29023529980190366, "grad_norm": 3.1827566623687744, "learning_rate": 7.097647001980964e-07, "loss": 0.3236, "step": 6007 }, { "epoch": 0.2902836159829927, "grad_norm": 3.898169755935669, "learning_rate": 7.097163840170072e-07, "loss": 0.2643, "step": 6008 }, { "epoch": 0.29033193216408176, "grad_norm": 3.721299171447754, "learning_rate": 7.096680678359182e-07, "loss": 0.4299, "step": 6009 }, { "epoch": 0.2903802483451708, "grad_norm": 3.3732123374938965, "learning_rate": 7.096197516548291e-07, "loss": 0.3131, "step": 6010 }, { "epoch": 0.29042856452625987, "grad_norm": 5.743424892425537, "learning_rate": 7.095714354737401e-07, "loss": 0.3767, "step": 6011 }, { "epoch": 0.2904768807073489, "grad_norm": 2.7275054454803467, "learning_rate": 7.095231192926511e-07, "loss": 0.2914, "step": 6012 }, { "epoch": 0.2905251968884379, "grad_norm": 2.2002735137939453, "learning_rate": 7.094748031115621e-07, "loss": 0.2562, "step": 6013 }, { "epoch": 0.290573513069527, "grad_norm": 9.399189949035645, "learning_rate": 7.09426486930473e-07, "loss": 0.338, "step": 6014 }, { "epoch": 0.290621829250616, "grad_norm": 2.5127100944519043, "learning_rate": 7.093781707493839e-07, "loss": 0.2534, "step": 6015 }, { "epoch": 0.29067014543170505, "grad_norm": 2.328016519546509, "learning_rate": 7.093298545682949e-07, "loss": 0.2802, "step": 6016 }, { "epoch": 0.29071846161279413, "grad_norm": 3.3094842433929443, "learning_rate": 7.092815383872058e-07, "loss": 0.3382, "step": 6017 }, { "epoch": 0.29076677779388316, "grad_norm": 8.418359756469727, "learning_rate": 7.092332222061168e-07, "loss": 0.3347, "step": 6018 }, { "epoch": 0.29081509397497224, "grad_norm": 2.8760986328125, "learning_rate": 7.091849060250278e-07, "loss": 0.3032, "step": 6019 }, { "epoch": 0.29086341015606126, "grad_norm": 6.862001895904541, "learning_rate": 7.091365898439386e-07, "loss": 0.2203, "step": 6020 }, { "epoch": 0.2909117263371503, "grad_norm": 3.2630417346954346, "learning_rate": 7.090882736628496e-07, "loss": 0.3169, "step": 6021 }, { "epoch": 0.29096004251823937, "grad_norm": 3.518828868865967, "learning_rate": 7.090399574817606e-07, "loss": 0.2296, "step": 6022 }, { "epoch": 0.2910083586993284, "grad_norm": 2.8037919998168945, "learning_rate": 7.089916413006716e-07, "loss": 0.2993, "step": 6023 }, { "epoch": 0.2910566748804175, "grad_norm": 12.72506046295166, "learning_rate": 7.089433251195826e-07, "loss": 0.3061, "step": 6024 }, { "epoch": 0.2911049910615065, "grad_norm": 2.0532965660095215, "learning_rate": 7.088950089384936e-07, "loss": 0.2449, "step": 6025 }, { "epoch": 0.2911533072425955, "grad_norm": 3.0963079929351807, "learning_rate": 7.088466927574044e-07, "loss": 0.2897, "step": 6026 }, { "epoch": 0.2912016234236846, "grad_norm": 56.001556396484375, "learning_rate": 7.087983765763153e-07, "loss": 0.3647, "step": 6027 }, { "epoch": 0.29124993960477363, "grad_norm": 2.102738380432129, "learning_rate": 7.087500603952263e-07, "loss": 0.2089, "step": 6028 }, { "epoch": 0.29129825578586266, "grad_norm": 1.8016306161880493, "learning_rate": 7.087017442141373e-07, "loss": 0.1777, "step": 6029 }, { "epoch": 0.29134657196695174, "grad_norm": 3.016756057739258, "learning_rate": 7.086534280330483e-07, "loss": 0.2238, "step": 6030 }, { "epoch": 0.29139488814804076, "grad_norm": 3.4316251277923584, "learning_rate": 7.086051118519592e-07, "loss": 0.2934, "step": 6031 }, { "epoch": 0.29144320432912985, "grad_norm": 2.9616165161132812, "learning_rate": 7.085567956708702e-07, "loss": 0.355, "step": 6032 }, { "epoch": 0.29149152051021887, "grad_norm": 2.5609045028686523, "learning_rate": 7.08508479489781e-07, "loss": 0.2898, "step": 6033 }, { "epoch": 0.2915398366913079, "grad_norm": 2.692617654800415, "learning_rate": 7.08460163308692e-07, "loss": 0.2948, "step": 6034 }, { "epoch": 0.291588152872397, "grad_norm": 2.2198047637939453, "learning_rate": 7.08411847127603e-07, "loss": 0.2751, "step": 6035 }, { "epoch": 0.291636469053486, "grad_norm": 5.030613422393799, "learning_rate": 7.083635309465139e-07, "loss": 0.2905, "step": 6036 }, { "epoch": 0.2916847852345751, "grad_norm": 1.5782654285430908, "learning_rate": 7.083152147654249e-07, "loss": 0.1564, "step": 6037 }, { "epoch": 0.2917331014156641, "grad_norm": 2.3191630840301514, "learning_rate": 7.082668985843359e-07, "loss": 0.2558, "step": 6038 }, { "epoch": 0.29178141759675313, "grad_norm": 2.2563211917877197, "learning_rate": 7.082185824032469e-07, "loss": 0.2302, "step": 6039 }, { "epoch": 0.2918297337778422, "grad_norm": 2.5867810249328613, "learning_rate": 7.081702662221578e-07, "loss": 0.3296, "step": 6040 }, { "epoch": 0.29187804995893124, "grad_norm": 2.549683094024658, "learning_rate": 7.081219500410687e-07, "loss": 0.2962, "step": 6041 }, { "epoch": 0.29192636614002027, "grad_norm": 12.127269744873047, "learning_rate": 7.080736338599796e-07, "loss": 0.3587, "step": 6042 }, { "epoch": 0.29197468232110935, "grad_norm": 1.6476916074752808, "learning_rate": 7.080253176788906e-07, "loss": 0.1626, "step": 6043 }, { "epoch": 0.29202299850219837, "grad_norm": 3.314535140991211, "learning_rate": 7.079770014978016e-07, "loss": 0.3864, "step": 6044 }, { "epoch": 0.29207131468328745, "grad_norm": 39.86579132080078, "learning_rate": 7.079286853167126e-07, "loss": 0.4637, "step": 6045 }, { "epoch": 0.2921196308643765, "grad_norm": 2.244921922683716, "learning_rate": 7.078803691356234e-07, "loss": 0.2297, "step": 6046 }, { "epoch": 0.2921679470454655, "grad_norm": 2.0350394248962402, "learning_rate": 7.078320529545344e-07, "loss": 0.188, "step": 6047 }, { "epoch": 0.2922162632265546, "grad_norm": 3.3323755264282227, "learning_rate": 7.077837367734454e-07, "loss": 0.2258, "step": 6048 }, { "epoch": 0.2922645794076436, "grad_norm": 3.239753484725952, "learning_rate": 7.077354205923564e-07, "loss": 0.4511, "step": 6049 }, { "epoch": 0.2923128955887327, "grad_norm": 4.208090305328369, "learning_rate": 7.076871044112674e-07, "loss": 0.3976, "step": 6050 }, { "epoch": 0.2923612117698217, "grad_norm": 2.6180922985076904, "learning_rate": 7.076387882301783e-07, "loss": 0.2976, "step": 6051 }, { "epoch": 0.29240952795091074, "grad_norm": 1.8345575332641602, "learning_rate": 7.075904720490891e-07, "loss": 0.1994, "step": 6052 }, { "epoch": 0.2924578441319998, "grad_norm": 4.624633312225342, "learning_rate": 7.075421558680001e-07, "loss": 0.357, "step": 6053 }, { "epoch": 0.29250616031308885, "grad_norm": 1.917159914970398, "learning_rate": 7.074938396869111e-07, "loss": 0.1998, "step": 6054 }, { "epoch": 0.2925544764941779, "grad_norm": 1.941118597984314, "learning_rate": 7.074455235058221e-07, "loss": 0.241, "step": 6055 }, { "epoch": 0.29260279267526695, "grad_norm": 1.8684908151626587, "learning_rate": 7.073972073247331e-07, "loss": 0.2022, "step": 6056 }, { "epoch": 0.292651108856356, "grad_norm": 19.896116256713867, "learning_rate": 7.07348891143644e-07, "loss": 0.301, "step": 6057 }, { "epoch": 0.29269942503744506, "grad_norm": 4.152097225189209, "learning_rate": 7.07300574962555e-07, "loss": 0.396, "step": 6058 }, { "epoch": 0.2927477412185341, "grad_norm": 3.9832541942596436, "learning_rate": 7.072522587814658e-07, "loss": 0.4359, "step": 6059 }, { "epoch": 0.2927960573996231, "grad_norm": 2.6053786277770996, "learning_rate": 7.072039426003768e-07, "loss": 0.312, "step": 6060 }, { "epoch": 0.2928443735807122, "grad_norm": 3.1854984760284424, "learning_rate": 7.071556264192878e-07, "loss": 0.2137, "step": 6061 }, { "epoch": 0.2928926897618012, "grad_norm": 36.890567779541016, "learning_rate": 7.071073102381987e-07, "loss": 0.3548, "step": 6062 }, { "epoch": 0.2929410059428903, "grad_norm": 2.570204496383667, "learning_rate": 7.070589940571097e-07, "loss": 0.3087, "step": 6063 }, { "epoch": 0.2929893221239793, "grad_norm": 3.2010741233825684, "learning_rate": 7.070106778760207e-07, "loss": 0.2215, "step": 6064 }, { "epoch": 0.29303763830506835, "grad_norm": 2.990894317626953, "learning_rate": 7.069623616949316e-07, "loss": 0.3008, "step": 6065 }, { "epoch": 0.29308595448615743, "grad_norm": 2.516915798187256, "learning_rate": 7.069140455138426e-07, "loss": 0.284, "step": 6066 }, { "epoch": 0.29313427066724645, "grad_norm": 4.080555438995361, "learning_rate": 7.068657293327534e-07, "loss": 0.4423, "step": 6067 }, { "epoch": 0.2931825868483355, "grad_norm": 2.434152126312256, "learning_rate": 7.068174131516644e-07, "loss": 0.2716, "step": 6068 }, { "epoch": 0.29323090302942456, "grad_norm": 2.198712110519409, "learning_rate": 7.067690969705754e-07, "loss": 0.2652, "step": 6069 }, { "epoch": 0.2932792192105136, "grad_norm": 3.2771215438842773, "learning_rate": 7.067207807894864e-07, "loss": 0.297, "step": 6070 }, { "epoch": 0.29332753539160267, "grad_norm": 3.005678415298462, "learning_rate": 7.066724646083974e-07, "loss": 0.4027, "step": 6071 }, { "epoch": 0.2933758515726917, "grad_norm": 1.27694571018219, "learning_rate": 7.066241484273082e-07, "loss": 0.1442, "step": 6072 }, { "epoch": 0.2934241677537807, "grad_norm": 2.4456772804260254, "learning_rate": 7.065758322462192e-07, "loss": 0.2506, "step": 6073 }, { "epoch": 0.2934724839348698, "grad_norm": 2.6863820552825928, "learning_rate": 7.065275160651302e-07, "loss": 0.3835, "step": 6074 }, { "epoch": 0.2935208001159588, "grad_norm": 3.751950263977051, "learning_rate": 7.064791998840412e-07, "loss": 0.2229, "step": 6075 }, { "epoch": 0.2935691162970479, "grad_norm": 2.65375018119812, "learning_rate": 7.064308837029521e-07, "loss": 0.2694, "step": 6076 }, { "epoch": 0.29361743247813693, "grad_norm": 2.076730728149414, "learning_rate": 7.063825675218631e-07, "loss": 0.2602, "step": 6077 }, { "epoch": 0.29366574865922596, "grad_norm": 4.546097755432129, "learning_rate": 7.063342513407739e-07, "loss": 0.1955, "step": 6078 }, { "epoch": 0.29371406484031504, "grad_norm": 5.0106048583984375, "learning_rate": 7.062859351596849e-07, "loss": 0.2771, "step": 6079 }, { "epoch": 0.29376238102140406, "grad_norm": 2.0022637844085693, "learning_rate": 7.062376189785959e-07, "loss": 0.1783, "step": 6080 }, { "epoch": 0.2938106972024931, "grad_norm": 4.978405952453613, "learning_rate": 7.061893027975069e-07, "loss": 0.3098, "step": 6081 }, { "epoch": 0.29385901338358217, "grad_norm": 2.0463619232177734, "learning_rate": 7.061409866164179e-07, "loss": 0.2232, "step": 6082 }, { "epoch": 0.2939073295646712, "grad_norm": 3.045196771621704, "learning_rate": 7.060926704353288e-07, "loss": 0.2939, "step": 6083 }, { "epoch": 0.2939556457457603, "grad_norm": 2.4568557739257812, "learning_rate": 7.060443542542396e-07, "loss": 0.3221, "step": 6084 }, { "epoch": 0.2940039619268493, "grad_norm": 3.2027220726013184, "learning_rate": 7.059960380731506e-07, "loss": 0.359, "step": 6085 }, { "epoch": 0.2940522781079383, "grad_norm": 2.935694694519043, "learning_rate": 7.059477218920616e-07, "loss": 0.3348, "step": 6086 }, { "epoch": 0.2941005942890274, "grad_norm": 2.5994105339050293, "learning_rate": 7.058994057109726e-07, "loss": 0.1699, "step": 6087 }, { "epoch": 0.29414891047011643, "grad_norm": 4.352014541625977, "learning_rate": 7.058510895298835e-07, "loss": 0.4046, "step": 6088 }, { "epoch": 0.2941972266512055, "grad_norm": 2.9025816917419434, "learning_rate": 7.058027733487945e-07, "loss": 0.2876, "step": 6089 }, { "epoch": 0.29424554283229454, "grad_norm": 1.4908446073532104, "learning_rate": 7.057544571677055e-07, "loss": 0.1405, "step": 6090 }, { "epoch": 0.29429385901338356, "grad_norm": 3.317702054977417, "learning_rate": 7.057061409866164e-07, "loss": 0.2916, "step": 6091 }, { "epoch": 0.29434217519447264, "grad_norm": 2.422797918319702, "learning_rate": 7.056578248055274e-07, "loss": 0.2564, "step": 6092 }, { "epoch": 0.29439049137556167, "grad_norm": 5.4078521728515625, "learning_rate": 7.056095086244382e-07, "loss": 0.2719, "step": 6093 }, { "epoch": 0.2944388075566507, "grad_norm": 5.7914509773254395, "learning_rate": 7.055611924433492e-07, "loss": 0.2273, "step": 6094 }, { "epoch": 0.2944871237377398, "grad_norm": 3.5756189823150635, "learning_rate": 7.055128762622602e-07, "loss": 0.4272, "step": 6095 }, { "epoch": 0.2945354399188288, "grad_norm": 2.3354175090789795, "learning_rate": 7.054645600811712e-07, "loss": 0.3063, "step": 6096 }, { "epoch": 0.2945837560999179, "grad_norm": 2.844062566757202, "learning_rate": 7.054162439000821e-07, "loss": 0.2901, "step": 6097 }, { "epoch": 0.2946320722810069, "grad_norm": 1.8424947261810303, "learning_rate": 7.05367927718993e-07, "loss": 0.1975, "step": 6098 }, { "epoch": 0.29468038846209593, "grad_norm": 3.655275344848633, "learning_rate": 7.05319611537904e-07, "loss": 0.3316, "step": 6099 }, { "epoch": 0.294728704643185, "grad_norm": 1.9526625871658325, "learning_rate": 7.05271295356815e-07, "loss": 0.2405, "step": 6100 }, { "epoch": 0.29477702082427404, "grad_norm": 46.343387603759766, "learning_rate": 7.05222979175726e-07, "loss": 0.1941, "step": 6101 }, { "epoch": 0.2948253370053631, "grad_norm": 6.767411708831787, "learning_rate": 7.051746629946369e-07, "loss": 0.3857, "step": 6102 }, { "epoch": 0.29487365318645214, "grad_norm": 2.577828884124756, "learning_rate": 7.051263468135479e-07, "loss": 0.3208, "step": 6103 }, { "epoch": 0.29492196936754117, "grad_norm": 2.689929485321045, "learning_rate": 7.050780306324587e-07, "loss": 0.2411, "step": 6104 }, { "epoch": 0.29497028554863025, "grad_norm": 2.275359630584717, "learning_rate": 7.050297144513697e-07, "loss": 0.2879, "step": 6105 }, { "epoch": 0.2950186017297193, "grad_norm": 2.6863648891448975, "learning_rate": 7.049813982702807e-07, "loss": 0.2406, "step": 6106 }, { "epoch": 0.29506691791080836, "grad_norm": 3.727001190185547, "learning_rate": 7.049330820891917e-07, "loss": 0.439, "step": 6107 }, { "epoch": 0.2951152340918974, "grad_norm": 2.5392978191375732, "learning_rate": 7.048847659081027e-07, "loss": 0.3518, "step": 6108 }, { "epoch": 0.2951635502729864, "grad_norm": 2.7964892387390137, "learning_rate": 7.048364497270136e-07, "loss": 0.2776, "step": 6109 }, { "epoch": 0.2952118664540755, "grad_norm": 3.8158090114593506, "learning_rate": 7.047881335459244e-07, "loss": 0.3813, "step": 6110 }, { "epoch": 0.2952601826351645, "grad_norm": 3.847235679626465, "learning_rate": 7.047398173648354e-07, "loss": 0.1616, "step": 6111 }, { "epoch": 0.29530849881625354, "grad_norm": 2.37441349029541, "learning_rate": 7.046915011837464e-07, "loss": 0.2676, "step": 6112 }, { "epoch": 0.2953568149973426, "grad_norm": 2.1571762561798096, "learning_rate": 7.046431850026574e-07, "loss": 0.2965, "step": 6113 }, { "epoch": 0.29540513117843165, "grad_norm": 2.9216301441192627, "learning_rate": 7.045948688215683e-07, "loss": 0.4375, "step": 6114 }, { "epoch": 0.2954534473595207, "grad_norm": 3.9796814918518066, "learning_rate": 7.045465526404793e-07, "loss": 0.2323, "step": 6115 }, { "epoch": 0.29550176354060975, "grad_norm": 2.549321413040161, "learning_rate": 7.044982364593902e-07, "loss": 0.2507, "step": 6116 }, { "epoch": 0.2955500797216988, "grad_norm": 3.1628313064575195, "learning_rate": 7.044499202783012e-07, "loss": 0.3996, "step": 6117 }, { "epoch": 0.29559839590278786, "grad_norm": 2.1133322715759277, "learning_rate": 7.044016040972121e-07, "loss": 0.1909, "step": 6118 }, { "epoch": 0.2956467120838769, "grad_norm": 2.4546358585357666, "learning_rate": 7.04353287916123e-07, "loss": 0.2282, "step": 6119 }, { "epoch": 0.29569502826496596, "grad_norm": 2.483328342437744, "learning_rate": 7.04304971735034e-07, "loss": 0.2636, "step": 6120 }, { "epoch": 0.295743344446055, "grad_norm": 2.7594964504241943, "learning_rate": 7.04256655553945e-07, "loss": 0.3369, "step": 6121 }, { "epoch": 0.295791660627144, "grad_norm": 2.468266248703003, "learning_rate": 7.04208339372856e-07, "loss": 0.3359, "step": 6122 }, { "epoch": 0.2958399768082331, "grad_norm": 1.9553147554397583, "learning_rate": 7.041600231917669e-07, "loss": 0.207, "step": 6123 }, { "epoch": 0.2958882929893221, "grad_norm": 2.1389079093933105, "learning_rate": 7.041117070106778e-07, "loss": 0.2587, "step": 6124 }, { "epoch": 0.29593660917041115, "grad_norm": 3.6111457347869873, "learning_rate": 7.040633908295888e-07, "loss": 0.2882, "step": 6125 }, { "epoch": 0.2959849253515002, "grad_norm": 1.965486764907837, "learning_rate": 7.040150746484998e-07, "loss": 0.2182, "step": 6126 }, { "epoch": 0.29603324153258925, "grad_norm": 5.489132404327393, "learning_rate": 7.039667584674107e-07, "loss": 0.2284, "step": 6127 }, { "epoch": 0.29608155771367833, "grad_norm": 2.1185083389282227, "learning_rate": 7.039184422863217e-07, "loss": 0.3172, "step": 6128 }, { "epoch": 0.29612987389476736, "grad_norm": 3.7595667839050293, "learning_rate": 7.038701261052326e-07, "loss": 0.2855, "step": 6129 }, { "epoch": 0.2961781900758564, "grad_norm": 4.6772356033325195, "learning_rate": 7.038218099241435e-07, "loss": 0.2447, "step": 6130 }, { "epoch": 0.29622650625694547, "grad_norm": 3.195209264755249, "learning_rate": 7.037734937430545e-07, "loss": 0.4025, "step": 6131 }, { "epoch": 0.2962748224380345, "grad_norm": 2.6301891803741455, "learning_rate": 7.037251775619655e-07, "loss": 0.3098, "step": 6132 }, { "epoch": 0.29632313861912357, "grad_norm": 2.1670942306518555, "learning_rate": 7.036768613808765e-07, "loss": 0.2561, "step": 6133 }, { "epoch": 0.2963714548002126, "grad_norm": 2.469545841217041, "learning_rate": 7.036285451997875e-07, "loss": 0.2796, "step": 6134 }, { "epoch": 0.2964197709813016, "grad_norm": 2.283848285675049, "learning_rate": 7.035802290186982e-07, "loss": 0.1669, "step": 6135 }, { "epoch": 0.2964680871623907, "grad_norm": 10.474785804748535, "learning_rate": 7.035319128376092e-07, "loss": 0.4019, "step": 6136 }, { "epoch": 0.29651640334347973, "grad_norm": 1.882702350616455, "learning_rate": 7.034835966565202e-07, "loss": 0.2226, "step": 6137 }, { "epoch": 0.29656471952456875, "grad_norm": 2.1900813579559326, "learning_rate": 7.034352804754312e-07, "loss": 0.1863, "step": 6138 }, { "epoch": 0.29661303570565783, "grad_norm": 3.39322829246521, "learning_rate": 7.033869642943422e-07, "loss": 0.3133, "step": 6139 }, { "epoch": 0.29666135188674686, "grad_norm": 2.7216827869415283, "learning_rate": 7.033386481132531e-07, "loss": 0.2451, "step": 6140 }, { "epoch": 0.29670966806783594, "grad_norm": 2.7123258113861084, "learning_rate": 7.032903319321641e-07, "loss": 0.3452, "step": 6141 }, { "epoch": 0.29675798424892497, "grad_norm": 2.4516074657440186, "learning_rate": 7.03242015751075e-07, "loss": 0.297, "step": 6142 }, { "epoch": 0.296806300430014, "grad_norm": 6.516155242919922, "learning_rate": 7.03193699569986e-07, "loss": 0.317, "step": 6143 }, { "epoch": 0.2968546166111031, "grad_norm": 1.8947694301605225, "learning_rate": 7.031453833888969e-07, "loss": 0.1957, "step": 6144 }, { "epoch": 0.2969029327921921, "grad_norm": 2.8722286224365234, "learning_rate": 7.030970672078078e-07, "loss": 0.3031, "step": 6145 }, { "epoch": 0.2969512489732812, "grad_norm": 1.4148544073104858, "learning_rate": 7.030487510267188e-07, "loss": 0.1299, "step": 6146 }, { "epoch": 0.2969995651543702, "grad_norm": 2.67655873298645, "learning_rate": 7.030004348456298e-07, "loss": 0.3967, "step": 6147 }, { "epoch": 0.29704788133545923, "grad_norm": 4.2718281745910645, "learning_rate": 7.029521186645407e-07, "loss": 0.3752, "step": 6148 }, { "epoch": 0.2970961975165483, "grad_norm": 2.0939977169036865, "learning_rate": 7.029038024834517e-07, "loss": 0.2265, "step": 6149 }, { "epoch": 0.29714451369763734, "grad_norm": 6.673652648925781, "learning_rate": 7.028554863023626e-07, "loss": 0.3823, "step": 6150 }, { "epoch": 0.29719282987872636, "grad_norm": 2.9326276779174805, "learning_rate": 7.028071701212736e-07, "loss": 0.2267, "step": 6151 }, { "epoch": 0.29724114605981544, "grad_norm": 4.651679515838623, "learning_rate": 7.027588539401845e-07, "loss": 0.3382, "step": 6152 }, { "epoch": 0.29728946224090447, "grad_norm": 52.6235466003418, "learning_rate": 7.027105377590955e-07, "loss": 0.3371, "step": 6153 }, { "epoch": 0.29733777842199355, "grad_norm": 2.6192760467529297, "learning_rate": 7.026622215780065e-07, "loss": 0.3059, "step": 6154 }, { "epoch": 0.2973860946030826, "grad_norm": 2.7837741374969482, "learning_rate": 7.026139053969174e-07, "loss": 0.382, "step": 6155 }, { "epoch": 0.2974344107841716, "grad_norm": 2.0822994709014893, "learning_rate": 7.025655892158283e-07, "loss": 0.2336, "step": 6156 }, { "epoch": 0.2974827269652607, "grad_norm": 7.407377243041992, "learning_rate": 7.025172730347393e-07, "loss": 0.2071, "step": 6157 }, { "epoch": 0.2975310431463497, "grad_norm": 2.1619391441345215, "learning_rate": 7.024689568536503e-07, "loss": 0.2809, "step": 6158 }, { "epoch": 0.2975793593274388, "grad_norm": 2.2072629928588867, "learning_rate": 7.024206406725613e-07, "loss": 0.2588, "step": 6159 }, { "epoch": 0.2976276755085278, "grad_norm": 4.618122100830078, "learning_rate": 7.023723244914723e-07, "loss": 0.3696, "step": 6160 }, { "epoch": 0.29767599168961684, "grad_norm": 2.7807788848876953, "learning_rate": 7.02324008310383e-07, "loss": 0.3575, "step": 6161 }, { "epoch": 0.2977243078707059, "grad_norm": 2.6750264167785645, "learning_rate": 7.02275692129294e-07, "loss": 0.2406, "step": 6162 }, { "epoch": 0.29777262405179494, "grad_norm": 2.1540350914001465, "learning_rate": 7.02227375948205e-07, "loss": 0.2398, "step": 6163 }, { "epoch": 0.29782094023288397, "grad_norm": 3.3297464847564697, "learning_rate": 7.02179059767116e-07, "loss": 0.2141, "step": 6164 }, { "epoch": 0.29786925641397305, "grad_norm": 2.678334951400757, "learning_rate": 7.02130743586027e-07, "loss": 0.352, "step": 6165 }, { "epoch": 0.2979175725950621, "grad_norm": 3.555708169937134, "learning_rate": 7.020824274049379e-07, "loss": 0.2401, "step": 6166 }, { "epoch": 0.29796588877615116, "grad_norm": 2.3617560863494873, "learning_rate": 7.020341112238488e-07, "loss": 0.2686, "step": 6167 }, { "epoch": 0.2980142049572402, "grad_norm": 2.1968443393707275, "learning_rate": 7.019857950427598e-07, "loss": 0.2772, "step": 6168 }, { "epoch": 0.2980625211383292, "grad_norm": 2.4511868953704834, "learning_rate": 7.019374788616707e-07, "loss": 0.2478, "step": 6169 }, { "epoch": 0.2981108373194183, "grad_norm": 2.5858709812164307, "learning_rate": 7.018891626805817e-07, "loss": 0.3326, "step": 6170 }, { "epoch": 0.2981591535005073, "grad_norm": 2.9226491451263428, "learning_rate": 7.018408464994926e-07, "loss": 0.2551, "step": 6171 }, { "epoch": 0.2982074696815964, "grad_norm": 5.2215895652771, "learning_rate": 7.017925303184036e-07, "loss": 0.3041, "step": 6172 }, { "epoch": 0.2982557858626854, "grad_norm": 2.9822747707366943, "learning_rate": 7.017442141373146e-07, "loss": 0.1907, "step": 6173 }, { "epoch": 0.29830410204377444, "grad_norm": 2.0290586948394775, "learning_rate": 7.016958979562255e-07, "loss": 0.2523, "step": 6174 }, { "epoch": 0.2983524182248635, "grad_norm": 3.376718759536743, "learning_rate": 7.016475817751365e-07, "loss": 0.3291, "step": 6175 }, { "epoch": 0.29840073440595255, "grad_norm": 3.591017961502075, "learning_rate": 7.015992655940474e-07, "loss": 0.4203, "step": 6176 }, { "epoch": 0.2984490505870416, "grad_norm": 3.166452646255493, "learning_rate": 7.015509494129583e-07, "loss": 0.2627, "step": 6177 }, { "epoch": 0.29849736676813066, "grad_norm": 8.478401184082031, "learning_rate": 7.015026332318693e-07, "loss": 0.2761, "step": 6178 }, { "epoch": 0.2985456829492197, "grad_norm": 2.684239625930786, "learning_rate": 7.014543170507803e-07, "loss": 0.2385, "step": 6179 }, { "epoch": 0.29859399913030876, "grad_norm": 4.385095596313477, "learning_rate": 7.014060008696912e-07, "loss": 0.3026, "step": 6180 }, { "epoch": 0.2986423153113978, "grad_norm": 2.556506872177124, "learning_rate": 7.013576846886022e-07, "loss": 0.304, "step": 6181 }, { "epoch": 0.2986906314924868, "grad_norm": 3.134399890899658, "learning_rate": 7.013093685075131e-07, "loss": 0.2544, "step": 6182 }, { "epoch": 0.2987389476735759, "grad_norm": 1.9656827449798584, "learning_rate": 7.012610523264241e-07, "loss": 0.1877, "step": 6183 }, { "epoch": 0.2987872638546649, "grad_norm": 1.54001784324646, "learning_rate": 7.012127361453351e-07, "loss": 0.1901, "step": 6184 }, { "epoch": 0.298835580035754, "grad_norm": 3.1289446353912354, "learning_rate": 7.011644199642461e-07, "loss": 0.3811, "step": 6185 }, { "epoch": 0.298883896216843, "grad_norm": 4.3688225746154785, "learning_rate": 7.01116103783157e-07, "loss": 0.397, "step": 6186 }, { "epoch": 0.29893221239793205, "grad_norm": 1.9512791633605957, "learning_rate": 7.010677876020678e-07, "loss": 0.191, "step": 6187 }, { "epoch": 0.29898052857902113, "grad_norm": 2.6017613410949707, "learning_rate": 7.010194714209788e-07, "loss": 0.3425, "step": 6188 }, { "epoch": 0.29902884476011016, "grad_norm": 2.4814531803131104, "learning_rate": 7.009711552398898e-07, "loss": 0.285, "step": 6189 }, { "epoch": 0.2990771609411992, "grad_norm": 2.6647496223449707, "learning_rate": 7.009228390588008e-07, "loss": 0.2437, "step": 6190 }, { "epoch": 0.29912547712228826, "grad_norm": 3.2052500247955322, "learning_rate": 7.008745228777118e-07, "loss": 0.281, "step": 6191 }, { "epoch": 0.2991737933033773, "grad_norm": 2.456559419631958, "learning_rate": 7.008262066966227e-07, "loss": 0.2812, "step": 6192 }, { "epoch": 0.29922210948446637, "grad_norm": 2.496137857437134, "learning_rate": 7.007778905155336e-07, "loss": 0.2948, "step": 6193 }, { "epoch": 0.2992704256655554, "grad_norm": 3.532341957092285, "learning_rate": 7.007295743344445e-07, "loss": 0.1937, "step": 6194 }, { "epoch": 0.2993187418466444, "grad_norm": 2.5707831382751465, "learning_rate": 7.006812581533555e-07, "loss": 0.3231, "step": 6195 }, { "epoch": 0.2993670580277335, "grad_norm": 13.609048843383789, "learning_rate": 7.006329419722665e-07, "loss": 0.2865, "step": 6196 }, { "epoch": 0.2994153742088225, "grad_norm": 2.4864501953125, "learning_rate": 7.005846257911774e-07, "loss": 0.2859, "step": 6197 }, { "epoch": 0.2994636903899116, "grad_norm": 2.898625612258911, "learning_rate": 7.005363096100884e-07, "loss": 0.3363, "step": 6198 }, { "epoch": 0.29951200657100063, "grad_norm": 1.785927414894104, "learning_rate": 7.004879934289993e-07, "loss": 0.1953, "step": 6199 }, { "epoch": 0.29956032275208966, "grad_norm": 3.8884952068328857, "learning_rate": 7.004396772479103e-07, "loss": 0.2388, "step": 6200 }, { "epoch": 0.29960863893317874, "grad_norm": 4.384668827056885, "learning_rate": 7.003913610668213e-07, "loss": 0.2988, "step": 6201 }, { "epoch": 0.29965695511426776, "grad_norm": 1.8252284526824951, "learning_rate": 7.003430448857321e-07, "loss": 0.2088, "step": 6202 }, { "epoch": 0.2997052712953568, "grad_norm": 2.837613105773926, "learning_rate": 7.002947287046431e-07, "loss": 0.2738, "step": 6203 }, { "epoch": 0.29975358747644587, "grad_norm": 2.602146863937378, "learning_rate": 7.002464125235541e-07, "loss": 0.2411, "step": 6204 }, { "epoch": 0.2998019036575349, "grad_norm": 3.2298471927642822, "learning_rate": 7.001980963424651e-07, "loss": 0.2638, "step": 6205 }, { "epoch": 0.299850219838624, "grad_norm": 2.7267611026763916, "learning_rate": 7.00149780161376e-07, "loss": 0.2758, "step": 6206 }, { "epoch": 0.299898536019713, "grad_norm": 2.293646812438965, "learning_rate": 7.00101463980287e-07, "loss": 0.333, "step": 6207 }, { "epoch": 0.299946852200802, "grad_norm": 4.429770469665527, "learning_rate": 7.000531477991979e-07, "loss": 0.3702, "step": 6208 }, { "epoch": 0.2999951683818911, "grad_norm": 5.325848579406738, "learning_rate": 7.000048316181089e-07, "loss": 0.1946, "step": 6209 }, { "epoch": 0.30004348456298013, "grad_norm": 2.4001595973968506, "learning_rate": 6.999565154370199e-07, "loss": 0.2447, "step": 6210 }, { "epoch": 0.3000918007440692, "grad_norm": 6.068942070007324, "learning_rate": 6.999081992559308e-07, "loss": 0.208, "step": 6211 }, { "epoch": 0.30014011692515824, "grad_norm": 2.5187056064605713, "learning_rate": 6.998598830748417e-07, "loss": 0.3259, "step": 6212 }, { "epoch": 0.30018843310624727, "grad_norm": 2.3866639137268066, "learning_rate": 6.998115668937526e-07, "loss": 0.2858, "step": 6213 }, { "epoch": 0.30023674928733635, "grad_norm": 2.9621455669403076, "learning_rate": 6.997632507126636e-07, "loss": 0.2948, "step": 6214 }, { "epoch": 0.30028506546842537, "grad_norm": 14.830626487731934, "learning_rate": 6.997149345315746e-07, "loss": 0.4575, "step": 6215 }, { "epoch": 0.3003333816495144, "grad_norm": 2.7223191261291504, "learning_rate": 6.996666183504856e-07, "loss": 0.3069, "step": 6216 }, { "epoch": 0.3003816978306035, "grad_norm": 2.941781997680664, "learning_rate": 6.996183021693966e-07, "loss": 0.3707, "step": 6217 }, { "epoch": 0.3004300140116925, "grad_norm": 4.034793853759766, "learning_rate": 6.995699859883074e-07, "loss": 0.3896, "step": 6218 }, { "epoch": 0.3004783301927816, "grad_norm": 2.750422716140747, "learning_rate": 6.995216698072183e-07, "loss": 0.2257, "step": 6219 }, { "epoch": 0.3005266463738706, "grad_norm": 1.7767844200134277, "learning_rate": 6.994733536261293e-07, "loss": 0.1861, "step": 6220 }, { "epoch": 0.30057496255495963, "grad_norm": 6.207118988037109, "learning_rate": 6.994250374450403e-07, "loss": 0.2497, "step": 6221 }, { "epoch": 0.3006232787360487, "grad_norm": 6.826432228088379, "learning_rate": 6.993767212639513e-07, "loss": 0.325, "step": 6222 }, { "epoch": 0.30067159491713774, "grad_norm": 2.8685295581817627, "learning_rate": 6.993284050828622e-07, "loss": 0.4022, "step": 6223 }, { "epoch": 0.3007199110982268, "grad_norm": 2.7203619480133057, "learning_rate": 6.992800889017732e-07, "loss": 0.3116, "step": 6224 }, { "epoch": 0.30076822727931585, "grad_norm": 2.7907965183258057, "learning_rate": 6.992317727206841e-07, "loss": 0.3271, "step": 6225 }, { "epoch": 0.3008165434604049, "grad_norm": 5.275141716003418, "learning_rate": 6.991834565395951e-07, "loss": 0.3648, "step": 6226 }, { "epoch": 0.30086485964149395, "grad_norm": 3.975364923477173, "learning_rate": 6.991351403585061e-07, "loss": 0.2653, "step": 6227 }, { "epoch": 0.300913175822583, "grad_norm": 2.7271337509155273, "learning_rate": 6.990868241774169e-07, "loss": 0.2903, "step": 6228 }, { "epoch": 0.300961492003672, "grad_norm": 2.9849183559417725, "learning_rate": 6.990385079963279e-07, "loss": 0.3246, "step": 6229 }, { "epoch": 0.3010098081847611, "grad_norm": 3.1837639808654785, "learning_rate": 6.989901918152389e-07, "loss": 0.3095, "step": 6230 }, { "epoch": 0.3010581243658501, "grad_norm": 2.2546095848083496, "learning_rate": 6.989418756341498e-07, "loss": 0.2778, "step": 6231 }, { "epoch": 0.3011064405469392, "grad_norm": 3.1797053813934326, "learning_rate": 6.988935594530608e-07, "loss": 0.3657, "step": 6232 }, { "epoch": 0.3011547567280282, "grad_norm": 3.0402562618255615, "learning_rate": 6.988452432719718e-07, "loss": 0.2937, "step": 6233 }, { "epoch": 0.30120307290911724, "grad_norm": 3.7011592388153076, "learning_rate": 6.987969270908827e-07, "loss": 0.3685, "step": 6234 }, { "epoch": 0.3012513890902063, "grad_norm": 1.5169440507888794, "learning_rate": 6.987486109097937e-07, "loss": 0.1459, "step": 6235 }, { "epoch": 0.30129970527129535, "grad_norm": 4.367960453033447, "learning_rate": 6.987002947287047e-07, "loss": 0.2993, "step": 6236 }, { "epoch": 0.30134802145238443, "grad_norm": 2.124562978744507, "learning_rate": 6.986519785476156e-07, "loss": 0.2131, "step": 6237 }, { "epoch": 0.30139633763347345, "grad_norm": 14.124802589416504, "learning_rate": 6.986036623665265e-07, "loss": 0.3226, "step": 6238 }, { "epoch": 0.3014446538145625, "grad_norm": 2.5740556716918945, "learning_rate": 6.985553461854374e-07, "loss": 0.2612, "step": 6239 }, { "epoch": 0.30149296999565156, "grad_norm": 1.3612778186798096, "learning_rate": 6.985070300043484e-07, "loss": 0.1318, "step": 6240 }, { "epoch": 0.3015412861767406, "grad_norm": 1.6672827005386353, "learning_rate": 6.984587138232594e-07, "loss": 0.1841, "step": 6241 }, { "epoch": 0.3015896023578296, "grad_norm": 4.708919525146484, "learning_rate": 6.984103976421704e-07, "loss": 0.2667, "step": 6242 }, { "epoch": 0.3016379185389187, "grad_norm": 2.2917115688323975, "learning_rate": 6.983620814610814e-07, "loss": 0.2304, "step": 6243 }, { "epoch": 0.3016862347200077, "grad_norm": 4.260239124298096, "learning_rate": 6.983137652799922e-07, "loss": 0.3339, "step": 6244 }, { "epoch": 0.3017345509010968, "grad_norm": 1.5252089500427246, "learning_rate": 6.982654490989031e-07, "loss": 0.1527, "step": 6245 }, { "epoch": 0.3017828670821858, "grad_norm": 117.2906494140625, "learning_rate": 6.982171329178141e-07, "loss": 0.437, "step": 6246 }, { "epoch": 0.30183118326327485, "grad_norm": 2.0735206604003906, "learning_rate": 6.981688167367251e-07, "loss": 0.2263, "step": 6247 }, { "epoch": 0.30187949944436393, "grad_norm": 1.5261024236679077, "learning_rate": 6.981205005556361e-07, "loss": 0.15, "step": 6248 }, { "epoch": 0.30192781562545296, "grad_norm": 2.4002509117126465, "learning_rate": 6.98072184374547e-07, "loss": 0.2564, "step": 6249 }, { "epoch": 0.30197613180654204, "grad_norm": 2.2447123527526855, "learning_rate": 6.980238681934579e-07, "loss": 0.2743, "step": 6250 }, { "epoch": 0.30202444798763106, "grad_norm": 3.0575554370880127, "learning_rate": 6.979755520123689e-07, "loss": 0.3229, "step": 6251 }, { "epoch": 0.3020727641687201, "grad_norm": 3.106147527694702, "learning_rate": 6.979272358312799e-07, "loss": 0.4051, "step": 6252 }, { "epoch": 0.30212108034980917, "grad_norm": 2.501941680908203, "learning_rate": 6.978789196501909e-07, "loss": 0.3333, "step": 6253 }, { "epoch": 0.3021693965308982, "grad_norm": 3.5072340965270996, "learning_rate": 6.978306034691017e-07, "loss": 0.3446, "step": 6254 }, { "epoch": 0.3022177127119872, "grad_norm": 3.0243566036224365, "learning_rate": 6.977822872880127e-07, "loss": 0.3738, "step": 6255 }, { "epoch": 0.3022660288930763, "grad_norm": 3.427544116973877, "learning_rate": 6.977339711069237e-07, "loss": 0.2915, "step": 6256 }, { "epoch": 0.3023143450741653, "grad_norm": 4.590493202209473, "learning_rate": 6.976856549258346e-07, "loss": 0.4191, "step": 6257 }, { "epoch": 0.3023626612552544, "grad_norm": 2.3054704666137695, "learning_rate": 6.976373387447456e-07, "loss": 0.1931, "step": 6258 }, { "epoch": 0.30241097743634343, "grad_norm": 4.656085014343262, "learning_rate": 6.975890225636565e-07, "loss": 0.3093, "step": 6259 }, { "epoch": 0.30245929361743246, "grad_norm": 2.8725719451904297, "learning_rate": 6.975407063825675e-07, "loss": 0.2515, "step": 6260 }, { "epoch": 0.30250760979852154, "grad_norm": 3.8876194953918457, "learning_rate": 6.974923902014785e-07, "loss": 0.3809, "step": 6261 }, { "epoch": 0.30255592597961056, "grad_norm": 4.014631748199463, "learning_rate": 6.974440740203894e-07, "loss": 0.4826, "step": 6262 }, { "epoch": 0.30260424216069964, "grad_norm": 1.9832615852355957, "learning_rate": 6.973957578393003e-07, "loss": 0.1572, "step": 6263 }, { "epoch": 0.30265255834178867, "grad_norm": 2.7512879371643066, "learning_rate": 6.973474416582113e-07, "loss": 0.3656, "step": 6264 }, { "epoch": 0.3027008745228777, "grad_norm": 3.074352741241455, "learning_rate": 6.972991254771222e-07, "loss": 0.3085, "step": 6265 }, { "epoch": 0.3027491907039668, "grad_norm": 2.8248167037963867, "learning_rate": 6.972508092960332e-07, "loss": 0.3423, "step": 6266 }, { "epoch": 0.3027975068850558, "grad_norm": 1.3977251052856445, "learning_rate": 6.972024931149442e-07, "loss": 0.1513, "step": 6267 }, { "epoch": 0.3028458230661448, "grad_norm": 2.412484884262085, "learning_rate": 6.971541769338552e-07, "loss": 0.259, "step": 6268 }, { "epoch": 0.3028941392472339, "grad_norm": 1.8060312271118164, "learning_rate": 6.971058607527662e-07, "loss": 0.1949, "step": 6269 }, { "epoch": 0.30294245542832293, "grad_norm": 3.4688541889190674, "learning_rate": 6.970575445716769e-07, "loss": 0.2782, "step": 6270 }, { "epoch": 0.302990771609412, "grad_norm": 2.6684701442718506, "learning_rate": 6.970092283905879e-07, "loss": 0.317, "step": 6271 }, { "epoch": 0.30303908779050104, "grad_norm": 5.785958766937256, "learning_rate": 6.969609122094989e-07, "loss": 0.2638, "step": 6272 }, { "epoch": 0.30308740397159006, "grad_norm": 7.588430404663086, "learning_rate": 6.969125960284099e-07, "loss": 0.3562, "step": 6273 }, { "epoch": 0.30313572015267914, "grad_norm": 2.542410135269165, "learning_rate": 6.968642798473209e-07, "loss": 0.2933, "step": 6274 }, { "epoch": 0.30318403633376817, "grad_norm": 2.296121597290039, "learning_rate": 6.968159636662318e-07, "loss": 0.3197, "step": 6275 }, { "epoch": 0.30323235251485725, "grad_norm": 2.606783866882324, "learning_rate": 6.967676474851427e-07, "loss": 0.3045, "step": 6276 }, { "epoch": 0.3032806686959463, "grad_norm": 2.9110445976257324, "learning_rate": 6.967193313040537e-07, "loss": 0.4055, "step": 6277 }, { "epoch": 0.3033289848770353, "grad_norm": 1.9068865776062012, "learning_rate": 6.966710151229647e-07, "loss": 0.214, "step": 6278 }, { "epoch": 0.3033773010581244, "grad_norm": 2.7484822273254395, "learning_rate": 6.966226989418756e-07, "loss": 0.3002, "step": 6279 }, { "epoch": 0.3034256172392134, "grad_norm": 4.339433670043945, "learning_rate": 6.965743827607865e-07, "loss": 0.3418, "step": 6280 }, { "epoch": 0.30347393342030243, "grad_norm": 3.332321882247925, "learning_rate": 6.965260665796975e-07, "loss": 0.2174, "step": 6281 }, { "epoch": 0.3035222496013915, "grad_norm": 2.6006863117218018, "learning_rate": 6.964777503986084e-07, "loss": 0.2934, "step": 6282 }, { "epoch": 0.30357056578248054, "grad_norm": 2.623270034790039, "learning_rate": 6.964294342175194e-07, "loss": 0.3888, "step": 6283 }, { "epoch": 0.3036188819635696, "grad_norm": 4.595832347869873, "learning_rate": 6.963811180364304e-07, "loss": 0.3894, "step": 6284 }, { "epoch": 0.30366719814465865, "grad_norm": 2.761793613433838, "learning_rate": 6.963328018553413e-07, "loss": 0.2886, "step": 6285 }, { "epoch": 0.30371551432574767, "grad_norm": 2.2738213539123535, "learning_rate": 6.962844856742523e-07, "loss": 0.2975, "step": 6286 }, { "epoch": 0.30376383050683675, "grad_norm": 14.072726249694824, "learning_rate": 6.962361694931632e-07, "loss": 0.3264, "step": 6287 }, { "epoch": 0.3038121466879258, "grad_norm": 2.2373616695404053, "learning_rate": 6.961878533120742e-07, "loss": 0.2889, "step": 6288 }, { "epoch": 0.30386046286901486, "grad_norm": 2.3093676567077637, "learning_rate": 6.961395371309851e-07, "loss": 0.2765, "step": 6289 }, { "epoch": 0.3039087790501039, "grad_norm": 2.6909029483795166, "learning_rate": 6.960912209498961e-07, "loss": 0.3116, "step": 6290 }, { "epoch": 0.3039570952311929, "grad_norm": 4.534310817718506, "learning_rate": 6.96042904768807e-07, "loss": 0.1924, "step": 6291 }, { "epoch": 0.304005411412282, "grad_norm": 2.7573530673980713, "learning_rate": 6.95994588587718e-07, "loss": 0.1868, "step": 6292 }, { "epoch": 0.304053727593371, "grad_norm": 3.7468161582946777, "learning_rate": 6.95946272406629e-07, "loss": 0.3592, "step": 6293 }, { "epoch": 0.30410204377446004, "grad_norm": 2.484510898590088, "learning_rate": 6.9589795622554e-07, "loss": 0.3299, "step": 6294 }, { "epoch": 0.3041503599555491, "grad_norm": 4.204073429107666, "learning_rate": 6.958496400444509e-07, "loss": 0.3496, "step": 6295 }, { "epoch": 0.30419867613663815, "grad_norm": 2.7245264053344727, "learning_rate": 6.958013238633617e-07, "loss": 0.3193, "step": 6296 }, { "epoch": 0.3042469923177272, "grad_norm": 2.5023083686828613, "learning_rate": 6.957530076822727e-07, "loss": 0.3342, "step": 6297 }, { "epoch": 0.30429530849881625, "grad_norm": 3.8566370010375977, "learning_rate": 6.957046915011837e-07, "loss": 0.2422, "step": 6298 }, { "epoch": 0.3043436246799053, "grad_norm": 2.591689348220825, "learning_rate": 6.956563753200947e-07, "loss": 0.2867, "step": 6299 }, { "epoch": 0.30439194086099436, "grad_norm": 14.761902809143066, "learning_rate": 6.956080591390057e-07, "loss": 0.1987, "step": 6300 }, { "epoch": 0.3044402570420834, "grad_norm": 1.9599884748458862, "learning_rate": 6.955597429579166e-07, "loss": 0.1683, "step": 6301 }, { "epoch": 0.30448857322317247, "grad_norm": 2.0459089279174805, "learning_rate": 6.955114267768275e-07, "loss": 0.2726, "step": 6302 }, { "epoch": 0.3045368894042615, "grad_norm": 2.37336802482605, "learning_rate": 6.954631105957385e-07, "loss": 0.3031, "step": 6303 }, { "epoch": 0.3045852055853505, "grad_norm": 3.996201753616333, "learning_rate": 6.954147944146494e-07, "loss": 0.3147, "step": 6304 }, { "epoch": 0.3046335217664396, "grad_norm": 2.6774866580963135, "learning_rate": 6.953664782335604e-07, "loss": 0.3809, "step": 6305 }, { "epoch": 0.3046818379475286, "grad_norm": 2.823401927947998, "learning_rate": 6.953181620524713e-07, "loss": 0.4058, "step": 6306 }, { "epoch": 0.30473015412861765, "grad_norm": 7.501384258270264, "learning_rate": 6.952698458713823e-07, "loss": 0.4756, "step": 6307 }, { "epoch": 0.30477847030970673, "grad_norm": 3.755387306213379, "learning_rate": 6.952215296902932e-07, "loss": 0.2511, "step": 6308 }, { "epoch": 0.30482678649079575, "grad_norm": 2.0256154537200928, "learning_rate": 6.951732135092042e-07, "loss": 0.2307, "step": 6309 }, { "epoch": 0.30487510267188483, "grad_norm": 2.1732778549194336, "learning_rate": 6.951248973281152e-07, "loss": 0.2437, "step": 6310 }, { "epoch": 0.30492341885297386, "grad_norm": 2.7881083488464355, "learning_rate": 6.950765811470261e-07, "loss": 0.2491, "step": 6311 }, { "epoch": 0.3049717350340629, "grad_norm": 1.826314926147461, "learning_rate": 6.95028264965937e-07, "loss": 0.1454, "step": 6312 }, { "epoch": 0.30502005121515197, "grad_norm": 3.973106861114502, "learning_rate": 6.94979948784848e-07, "loss": 0.3594, "step": 6313 }, { "epoch": 0.305068367396241, "grad_norm": 2.6445116996765137, "learning_rate": 6.949316326037589e-07, "loss": 0.4341, "step": 6314 }, { "epoch": 0.3051166835773301, "grad_norm": 3.08105206489563, "learning_rate": 6.948833164226699e-07, "loss": 0.3019, "step": 6315 }, { "epoch": 0.3051649997584191, "grad_norm": 3.0583152770996094, "learning_rate": 6.948350002415809e-07, "loss": 0.2033, "step": 6316 }, { "epoch": 0.3052133159395081, "grad_norm": 1.928290843963623, "learning_rate": 6.947866840604918e-07, "loss": 0.183, "step": 6317 }, { "epoch": 0.3052616321205972, "grad_norm": 2.2322444915771484, "learning_rate": 6.947383678794028e-07, "loss": 0.2185, "step": 6318 }, { "epoch": 0.30530994830168623, "grad_norm": 4.683701038360596, "learning_rate": 6.946900516983138e-07, "loss": 0.3164, "step": 6319 }, { "epoch": 0.30535826448277525, "grad_norm": 2.0677478313446045, "learning_rate": 6.946417355172248e-07, "loss": 0.2549, "step": 6320 }, { "epoch": 0.30540658066386434, "grad_norm": 3.01991868019104, "learning_rate": 6.945934193361356e-07, "loss": 0.3049, "step": 6321 }, { "epoch": 0.30545489684495336, "grad_norm": 1.9995070695877075, "learning_rate": 6.945451031550465e-07, "loss": 0.2445, "step": 6322 }, { "epoch": 0.30550321302604244, "grad_norm": 3.5219619274139404, "learning_rate": 6.944967869739575e-07, "loss": 0.3987, "step": 6323 }, { "epoch": 0.30555152920713147, "grad_norm": 2.6266286373138428, "learning_rate": 6.944484707928685e-07, "loss": 0.2309, "step": 6324 }, { "epoch": 0.3055998453882205, "grad_norm": 1.7970519065856934, "learning_rate": 6.944001546117795e-07, "loss": 0.2239, "step": 6325 }, { "epoch": 0.3056481615693096, "grad_norm": 2.775352954864502, "learning_rate": 6.943518384306905e-07, "loss": 0.2974, "step": 6326 }, { "epoch": 0.3056964777503986, "grad_norm": 2.0135769844055176, "learning_rate": 6.943035222496013e-07, "loss": 0.2364, "step": 6327 }, { "epoch": 0.3057447939314877, "grad_norm": 3.1516051292419434, "learning_rate": 6.942552060685123e-07, "loss": 0.4032, "step": 6328 }, { "epoch": 0.3057931101125767, "grad_norm": 3.124671220779419, "learning_rate": 6.942068898874232e-07, "loss": 0.4655, "step": 6329 }, { "epoch": 0.30584142629366573, "grad_norm": 6.553169250488281, "learning_rate": 6.941585737063342e-07, "loss": 0.3049, "step": 6330 }, { "epoch": 0.3058897424747548, "grad_norm": 2.320936441421509, "learning_rate": 6.941102575252452e-07, "loss": 0.2302, "step": 6331 }, { "epoch": 0.30593805865584384, "grad_norm": 2.737757921218872, "learning_rate": 6.940619413441561e-07, "loss": 0.3506, "step": 6332 }, { "epoch": 0.30598637483693286, "grad_norm": 2.0376737117767334, "learning_rate": 6.940136251630671e-07, "loss": 0.2968, "step": 6333 }, { "epoch": 0.30603469101802194, "grad_norm": 3.2636964321136475, "learning_rate": 6.93965308981978e-07, "loss": 0.3568, "step": 6334 }, { "epoch": 0.30608300719911097, "grad_norm": 1.8727102279663086, "learning_rate": 6.93916992800889e-07, "loss": 0.2054, "step": 6335 }, { "epoch": 0.30613132338020005, "grad_norm": 2.413055658340454, "learning_rate": 6.938686766198e-07, "loss": 0.2837, "step": 6336 }, { "epoch": 0.3061796395612891, "grad_norm": 3.1568846702575684, "learning_rate": 6.938203604387109e-07, "loss": 0.3464, "step": 6337 }, { "epoch": 0.3062279557423781, "grad_norm": 2.7592954635620117, "learning_rate": 6.937720442576218e-07, "loss": 0.2673, "step": 6338 }, { "epoch": 0.3062762719234672, "grad_norm": 3.033292531967163, "learning_rate": 6.937237280765328e-07, "loss": 0.4262, "step": 6339 }, { "epoch": 0.3063245881045562, "grad_norm": 2.6463286876678467, "learning_rate": 6.936754118954437e-07, "loss": 0.3455, "step": 6340 }, { "epoch": 0.3063729042856453, "grad_norm": 2.3956174850463867, "learning_rate": 6.936270957143547e-07, "loss": 0.2651, "step": 6341 }, { "epoch": 0.3064212204667343, "grad_norm": 3.517956495285034, "learning_rate": 6.935787795332657e-07, "loss": 0.468, "step": 6342 }, { "epoch": 0.30646953664782334, "grad_norm": 4.949244022369385, "learning_rate": 6.935304633521766e-07, "loss": 0.2586, "step": 6343 }, { "epoch": 0.3065178528289124, "grad_norm": 6.109328269958496, "learning_rate": 6.934821471710876e-07, "loss": 0.2553, "step": 6344 }, { "epoch": 0.30656616901000144, "grad_norm": 2.4905433654785156, "learning_rate": 6.934338309899986e-07, "loss": 0.2935, "step": 6345 }, { "epoch": 0.30661448519109047, "grad_norm": 3.3654558658599854, "learning_rate": 6.933855148089096e-07, "loss": 0.2852, "step": 6346 }, { "epoch": 0.30666280137217955, "grad_norm": 2.6988823413848877, "learning_rate": 6.933371986278204e-07, "loss": 0.3043, "step": 6347 }, { "epoch": 0.3067111175532686, "grad_norm": 2.033407211303711, "learning_rate": 6.932888824467313e-07, "loss": 0.2197, "step": 6348 }, { "epoch": 0.30675943373435766, "grad_norm": 4.318305969238281, "learning_rate": 6.932405662656423e-07, "loss": 0.41, "step": 6349 }, { "epoch": 0.3068077499154467, "grad_norm": 2.6823675632476807, "learning_rate": 6.931922500845533e-07, "loss": 0.3192, "step": 6350 }, { "epoch": 0.3068560660965357, "grad_norm": 2.1437456607818604, "learning_rate": 6.931439339034643e-07, "loss": 0.2582, "step": 6351 }, { "epoch": 0.3069043822776248, "grad_norm": 3.1113452911376953, "learning_rate": 6.930956177223753e-07, "loss": 0.3799, "step": 6352 }, { "epoch": 0.3069526984587138, "grad_norm": 2.519660472869873, "learning_rate": 6.930473015412861e-07, "loss": 0.2512, "step": 6353 }, { "epoch": 0.3070010146398029, "grad_norm": 3.7566192150115967, "learning_rate": 6.92998985360197e-07, "loss": 0.3542, "step": 6354 }, { "epoch": 0.3070493308208919, "grad_norm": 6.700499057769775, "learning_rate": 6.92950669179108e-07, "loss": 0.2961, "step": 6355 }, { "epoch": 0.30709764700198094, "grad_norm": 2.360126495361328, "learning_rate": 6.92902352998019e-07, "loss": 0.2361, "step": 6356 }, { "epoch": 0.30714596318307, "grad_norm": 3.0073330402374268, "learning_rate": 6.9285403681693e-07, "loss": 0.3609, "step": 6357 }, { "epoch": 0.30719427936415905, "grad_norm": 3.069831371307373, "learning_rate": 6.928057206358409e-07, "loss": 0.2933, "step": 6358 }, { "epoch": 0.3072425955452481, "grad_norm": 7.781386375427246, "learning_rate": 6.927574044547518e-07, "loss": 0.2212, "step": 6359 }, { "epoch": 0.30729091172633716, "grad_norm": 4.181946754455566, "learning_rate": 6.927090882736628e-07, "loss": 0.3554, "step": 6360 }, { "epoch": 0.3073392279074262, "grad_norm": 2.324310541152954, "learning_rate": 6.926607720925738e-07, "loss": 0.233, "step": 6361 }, { "epoch": 0.30738754408851526, "grad_norm": 2.558651924133301, "learning_rate": 6.926124559114848e-07, "loss": 0.234, "step": 6362 }, { "epoch": 0.3074358602696043, "grad_norm": 3.562018632888794, "learning_rate": 6.925641397303956e-07, "loss": 0.4356, "step": 6363 }, { "epoch": 0.3074841764506933, "grad_norm": 6.661210536956787, "learning_rate": 6.925158235493066e-07, "loss": 0.2259, "step": 6364 }, { "epoch": 0.3075324926317824, "grad_norm": 3.043415069580078, "learning_rate": 6.924675073682176e-07, "loss": 0.3042, "step": 6365 }, { "epoch": 0.3075808088128714, "grad_norm": 8.054237365722656, "learning_rate": 6.924191911871285e-07, "loss": 0.335, "step": 6366 }, { "epoch": 0.3076291249939605, "grad_norm": 3.0561647415161133, "learning_rate": 6.923708750060395e-07, "loss": 0.346, "step": 6367 }, { "epoch": 0.3076774411750495, "grad_norm": 6.307343482971191, "learning_rate": 6.923225588249505e-07, "loss": 0.3689, "step": 6368 }, { "epoch": 0.30772575735613855, "grad_norm": 3.0868654251098633, "learning_rate": 6.922742426438614e-07, "loss": 0.2668, "step": 6369 }, { "epoch": 0.30777407353722763, "grad_norm": 2.803549289703369, "learning_rate": 6.922259264627724e-07, "loss": 0.3803, "step": 6370 }, { "epoch": 0.30782238971831666, "grad_norm": 2.2850840091705322, "learning_rate": 6.921776102816834e-07, "loss": 0.2071, "step": 6371 }, { "epoch": 0.3078707058994057, "grad_norm": 5.355499267578125, "learning_rate": 6.921292941005942e-07, "loss": 0.3112, "step": 6372 }, { "epoch": 0.30791902208049476, "grad_norm": 2.8118033409118652, "learning_rate": 6.920809779195052e-07, "loss": 0.3609, "step": 6373 }, { "epoch": 0.3079673382615838, "grad_norm": 2.122129201889038, "learning_rate": 6.920326617384161e-07, "loss": 0.2543, "step": 6374 }, { "epoch": 0.30801565444267287, "grad_norm": 3.086609363555908, "learning_rate": 6.919843455573271e-07, "loss": 0.2424, "step": 6375 }, { "epoch": 0.3080639706237619, "grad_norm": 2.5072271823883057, "learning_rate": 6.919360293762381e-07, "loss": 0.3314, "step": 6376 }, { "epoch": 0.3081122868048509, "grad_norm": 2.8215973377227783, "learning_rate": 6.918877131951491e-07, "loss": 0.2489, "step": 6377 }, { "epoch": 0.30816060298594, "grad_norm": 3.0024125576019287, "learning_rate": 6.918393970140601e-07, "loss": 0.3076, "step": 6378 }, { "epoch": 0.308208919167029, "grad_norm": 2.456871509552002, "learning_rate": 6.917910808329709e-07, "loss": 0.2759, "step": 6379 }, { "epoch": 0.3082572353481181, "grad_norm": 1.7858458757400513, "learning_rate": 6.917427646518818e-07, "loss": 0.2035, "step": 6380 }, { "epoch": 0.30830555152920713, "grad_norm": 3.3477706909179688, "learning_rate": 6.916944484707928e-07, "loss": 0.2337, "step": 6381 }, { "epoch": 0.30835386771029616, "grad_norm": 2.589138984680176, "learning_rate": 6.916461322897038e-07, "loss": 0.2242, "step": 6382 }, { "epoch": 0.30840218389138524, "grad_norm": 1.997058629989624, "learning_rate": 6.915978161086148e-07, "loss": 0.2056, "step": 6383 }, { "epoch": 0.30845050007247427, "grad_norm": 2.907754898071289, "learning_rate": 6.915494999275257e-07, "loss": 0.3605, "step": 6384 }, { "epoch": 0.30849881625356335, "grad_norm": 2.33213210105896, "learning_rate": 6.915011837464366e-07, "loss": 0.1971, "step": 6385 }, { "epoch": 0.30854713243465237, "grad_norm": 1.7450675964355469, "learning_rate": 6.914528675653476e-07, "loss": 0.2228, "step": 6386 }, { "epoch": 0.3085954486157414, "grad_norm": 3.2248783111572266, "learning_rate": 6.914045513842586e-07, "loss": 0.2859, "step": 6387 }, { "epoch": 0.3086437647968305, "grad_norm": 1.6688004732131958, "learning_rate": 6.913562352031696e-07, "loss": 0.1543, "step": 6388 }, { "epoch": 0.3086920809779195, "grad_norm": 2.8201398849487305, "learning_rate": 6.913079190220804e-07, "loss": 0.3421, "step": 6389 }, { "epoch": 0.30874039715900853, "grad_norm": 2.2199740409851074, "learning_rate": 6.912596028409914e-07, "loss": 0.2044, "step": 6390 }, { "epoch": 0.3087887133400976, "grad_norm": 2.0972511768341064, "learning_rate": 6.912112866599023e-07, "loss": 0.2628, "step": 6391 }, { "epoch": 0.30883702952118663, "grad_norm": 2.456634521484375, "learning_rate": 6.911629704788133e-07, "loss": 0.2625, "step": 6392 }, { "epoch": 0.3088853457022757, "grad_norm": 12.215535163879395, "learning_rate": 6.911146542977243e-07, "loss": 0.3549, "step": 6393 }, { "epoch": 0.30893366188336474, "grad_norm": 2.897120952606201, "learning_rate": 6.910663381166353e-07, "loss": 0.2765, "step": 6394 }, { "epoch": 0.30898197806445377, "grad_norm": 3.4460854530334473, "learning_rate": 6.910180219355462e-07, "loss": 0.38, "step": 6395 }, { "epoch": 0.30903029424554285, "grad_norm": 2.7749078273773193, "learning_rate": 6.909697057544572e-07, "loss": 0.3035, "step": 6396 }, { "epoch": 0.3090786104266319, "grad_norm": 2.6568429470062256, "learning_rate": 6.909213895733681e-07, "loss": 0.2486, "step": 6397 }, { "epoch": 0.30912692660772095, "grad_norm": 2.634843349456787, "learning_rate": 6.90873073392279e-07, "loss": 0.2989, "step": 6398 }, { "epoch": 0.30917524278881, "grad_norm": 2.389172315597534, "learning_rate": 6.9082475721119e-07, "loss": 0.2492, "step": 6399 }, { "epoch": 0.309223558969899, "grad_norm": 2.7197153568267822, "learning_rate": 6.907764410301009e-07, "loss": 0.3536, "step": 6400 }, { "epoch": 0.3092718751509881, "grad_norm": 2.759517192840576, "learning_rate": 6.907281248490119e-07, "loss": 0.3398, "step": 6401 }, { "epoch": 0.3093201913320771, "grad_norm": 5.28226375579834, "learning_rate": 6.906798086679229e-07, "loss": 0.3171, "step": 6402 }, { "epoch": 0.30936850751316614, "grad_norm": 6.300417423248291, "learning_rate": 6.906314924868339e-07, "loss": 0.3789, "step": 6403 }, { "epoch": 0.3094168236942552, "grad_norm": 2.4191298484802246, "learning_rate": 6.905831763057448e-07, "loss": 0.2217, "step": 6404 }, { "epoch": 0.30946513987534424, "grad_norm": 2.5739974975585938, "learning_rate": 6.905348601246556e-07, "loss": 0.2568, "step": 6405 }, { "epoch": 0.3095134560564333, "grad_norm": 2.4558448791503906, "learning_rate": 6.904865439435666e-07, "loss": 0.3142, "step": 6406 }, { "epoch": 0.30956177223752235, "grad_norm": 1.8933511972427368, "learning_rate": 6.904382277624776e-07, "loss": 0.1772, "step": 6407 }, { "epoch": 0.3096100884186114, "grad_norm": 2.1181044578552246, "learning_rate": 6.903899115813886e-07, "loss": 0.2379, "step": 6408 }, { "epoch": 0.30965840459970045, "grad_norm": 3.5567142963409424, "learning_rate": 6.903415954002996e-07, "loss": 0.4222, "step": 6409 }, { "epoch": 0.3097067207807895, "grad_norm": 1.9667762517929077, "learning_rate": 6.902932792192104e-07, "loss": 0.2226, "step": 6410 }, { "epoch": 0.30975503696187856, "grad_norm": 2.1448850631713867, "learning_rate": 6.902449630381214e-07, "loss": 0.2131, "step": 6411 }, { "epoch": 0.3098033531429676, "grad_norm": 2.656949758529663, "learning_rate": 6.901966468570324e-07, "loss": 0.3627, "step": 6412 }, { "epoch": 0.3098516693240566, "grad_norm": 3.6197516918182373, "learning_rate": 6.901483306759434e-07, "loss": 0.3312, "step": 6413 }, { "epoch": 0.3098999855051457, "grad_norm": 3.27093768119812, "learning_rate": 6.901000144948543e-07, "loss": 0.26, "step": 6414 }, { "epoch": 0.3099483016862347, "grad_norm": 2.6214020252227783, "learning_rate": 6.900516983137652e-07, "loss": 0.2649, "step": 6415 }, { "epoch": 0.30999661786732374, "grad_norm": 2.9661221504211426, "learning_rate": 6.900033821326762e-07, "loss": 0.3918, "step": 6416 }, { "epoch": 0.3100449340484128, "grad_norm": 2.2063217163085938, "learning_rate": 6.899550659515871e-07, "loss": 0.1647, "step": 6417 }, { "epoch": 0.31009325022950185, "grad_norm": 3.848154306411743, "learning_rate": 6.899067497704981e-07, "loss": 0.3249, "step": 6418 }, { "epoch": 0.31014156641059093, "grad_norm": 3.4303572177886963, "learning_rate": 6.898584335894091e-07, "loss": 0.3256, "step": 6419 }, { "epoch": 0.31018988259167996, "grad_norm": 3.232855796813965, "learning_rate": 6.898101174083201e-07, "loss": 0.4075, "step": 6420 }, { "epoch": 0.310238198772769, "grad_norm": 3.685128927230835, "learning_rate": 6.89761801227231e-07, "loss": 0.2408, "step": 6421 }, { "epoch": 0.31028651495385806, "grad_norm": 2.2420053482055664, "learning_rate": 6.89713485046142e-07, "loss": 0.2426, "step": 6422 }, { "epoch": 0.3103348311349471, "grad_norm": 2.6331851482391357, "learning_rate": 6.896651688650528e-07, "loss": 0.2092, "step": 6423 }, { "epoch": 0.31038314731603617, "grad_norm": 2.019416093826294, "learning_rate": 6.896168526839638e-07, "loss": 0.2415, "step": 6424 }, { "epoch": 0.3104314634971252, "grad_norm": 3.511101245880127, "learning_rate": 6.895685365028748e-07, "loss": 0.3217, "step": 6425 }, { "epoch": 0.3104797796782142, "grad_norm": 3.056760787963867, "learning_rate": 6.895202203217857e-07, "loss": 0.5068, "step": 6426 }, { "epoch": 0.3105280958593033, "grad_norm": 4.16926383972168, "learning_rate": 6.894719041406967e-07, "loss": 0.3017, "step": 6427 }, { "epoch": 0.3105764120403923, "grad_norm": 2.430054187774658, "learning_rate": 6.894235879596077e-07, "loss": 0.2578, "step": 6428 }, { "epoch": 0.31062472822148135, "grad_norm": 6.762131690979004, "learning_rate": 6.893752717785187e-07, "loss": 0.4469, "step": 6429 }, { "epoch": 0.31067304440257043, "grad_norm": 2.480316162109375, "learning_rate": 6.893269555974296e-07, "loss": 0.2639, "step": 6430 }, { "epoch": 0.31072136058365946, "grad_norm": 2.9169466495513916, "learning_rate": 6.892786394163404e-07, "loss": 0.1809, "step": 6431 }, { "epoch": 0.31076967676474854, "grad_norm": 2.7601263523101807, "learning_rate": 6.892303232352514e-07, "loss": 0.2833, "step": 6432 }, { "epoch": 0.31081799294583756, "grad_norm": 2.4631147384643555, "learning_rate": 6.891820070541624e-07, "loss": 0.3781, "step": 6433 }, { "epoch": 0.3108663091269266, "grad_norm": 2.5437989234924316, "learning_rate": 6.891336908730734e-07, "loss": 0.1909, "step": 6434 }, { "epoch": 0.31091462530801567, "grad_norm": 72.08605194091797, "learning_rate": 6.890853746919844e-07, "loss": 0.2943, "step": 6435 }, { "epoch": 0.3109629414891047, "grad_norm": 2.6697216033935547, "learning_rate": 6.890370585108952e-07, "loss": 0.3474, "step": 6436 }, { "epoch": 0.3110112576701938, "grad_norm": 2.9756431579589844, "learning_rate": 6.889887423298062e-07, "loss": 0.2068, "step": 6437 }, { "epoch": 0.3110595738512828, "grad_norm": 6.789895534515381, "learning_rate": 6.889404261487172e-07, "loss": 0.2387, "step": 6438 }, { "epoch": 0.3111078900323718, "grad_norm": 4.641596794128418, "learning_rate": 6.888921099676281e-07, "loss": 0.2396, "step": 6439 }, { "epoch": 0.3111562062134609, "grad_norm": 2.8941099643707275, "learning_rate": 6.888437937865391e-07, "loss": 0.3083, "step": 6440 }, { "epoch": 0.31120452239454993, "grad_norm": 3.7455742359161377, "learning_rate": 6.8879547760545e-07, "loss": 0.3517, "step": 6441 }, { "epoch": 0.31125283857563896, "grad_norm": 2.6816744804382324, "learning_rate": 6.887471614243609e-07, "loss": 0.245, "step": 6442 }, { "epoch": 0.31130115475672804, "grad_norm": 2.467866897583008, "learning_rate": 6.886988452432719e-07, "loss": 0.2492, "step": 6443 }, { "epoch": 0.31134947093781706, "grad_norm": 2.939253330230713, "learning_rate": 6.886505290621829e-07, "loss": 0.4069, "step": 6444 }, { "epoch": 0.31139778711890614, "grad_norm": 2.549523115158081, "learning_rate": 6.886022128810939e-07, "loss": 0.3531, "step": 6445 }, { "epoch": 0.31144610329999517, "grad_norm": 3.4453368186950684, "learning_rate": 6.885538967000049e-07, "loss": 0.3865, "step": 6446 }, { "epoch": 0.3114944194810842, "grad_norm": 3.3658149242401123, "learning_rate": 6.885055805189158e-07, "loss": 0.3617, "step": 6447 }, { "epoch": 0.3115427356621733, "grad_norm": 2.2341785430908203, "learning_rate": 6.884572643378267e-07, "loss": 0.2494, "step": 6448 }, { "epoch": 0.3115910518432623, "grad_norm": 7.132495880126953, "learning_rate": 6.884089481567376e-07, "loss": 0.3345, "step": 6449 }, { "epoch": 0.3116393680243514, "grad_norm": 3.0139477252960205, "learning_rate": 6.883606319756486e-07, "loss": 0.3843, "step": 6450 }, { "epoch": 0.3116876842054404, "grad_norm": 6.278153896331787, "learning_rate": 6.883123157945596e-07, "loss": 0.2916, "step": 6451 }, { "epoch": 0.31173600038652943, "grad_norm": 2.1251237392425537, "learning_rate": 6.882639996134705e-07, "loss": 0.2203, "step": 6452 }, { "epoch": 0.3117843165676185, "grad_norm": 2.588202714920044, "learning_rate": 6.882156834323815e-07, "loss": 0.3131, "step": 6453 }, { "epoch": 0.31183263274870754, "grad_norm": 2.9992995262145996, "learning_rate": 6.881673672512925e-07, "loss": 0.3251, "step": 6454 }, { "epoch": 0.31188094892979656, "grad_norm": 2.9288980960845947, "learning_rate": 6.881190510702034e-07, "loss": 0.3594, "step": 6455 }, { "epoch": 0.31192926511088565, "grad_norm": 3.2925546169281006, "learning_rate": 6.880707348891143e-07, "loss": 0.3826, "step": 6456 }, { "epoch": 0.31197758129197467, "grad_norm": 5.58563756942749, "learning_rate": 6.880224187080252e-07, "loss": 0.4243, "step": 6457 }, { "epoch": 0.31202589747306375, "grad_norm": 2.577775478363037, "learning_rate": 6.879741025269362e-07, "loss": 0.2385, "step": 6458 }, { "epoch": 0.3120742136541528, "grad_norm": 2.0601003170013428, "learning_rate": 6.879257863458472e-07, "loss": 0.2201, "step": 6459 }, { "epoch": 0.3121225298352418, "grad_norm": 3.399487257003784, "learning_rate": 6.878774701647582e-07, "loss": 0.3512, "step": 6460 }, { "epoch": 0.3121708460163309, "grad_norm": 2.5835208892822266, "learning_rate": 6.878291539836692e-07, "loss": 0.2841, "step": 6461 }, { "epoch": 0.3122191621974199, "grad_norm": 4.958135604858398, "learning_rate": 6.8778083780258e-07, "loss": 0.2803, "step": 6462 }, { "epoch": 0.312267478378509, "grad_norm": 1.7303798198699951, "learning_rate": 6.87732521621491e-07, "loss": 0.231, "step": 6463 }, { "epoch": 0.312315794559598, "grad_norm": 2.4363834857940674, "learning_rate": 6.87684205440402e-07, "loss": 0.2691, "step": 6464 }, { "epoch": 0.31236411074068704, "grad_norm": 3.8166394233703613, "learning_rate": 6.876358892593129e-07, "loss": 0.3954, "step": 6465 }, { "epoch": 0.3124124269217761, "grad_norm": 2.5583410263061523, "learning_rate": 6.875875730782239e-07, "loss": 0.3195, "step": 6466 }, { "epoch": 0.31246074310286515, "grad_norm": 2.815047264099121, "learning_rate": 6.875392568971348e-07, "loss": 0.2624, "step": 6467 }, { "epoch": 0.31250905928395417, "grad_norm": 2.914456844329834, "learning_rate": 6.874909407160457e-07, "loss": 0.2931, "step": 6468 }, { "epoch": 0.31255737546504325, "grad_norm": 2.060880422592163, "learning_rate": 6.874426245349567e-07, "loss": 0.2123, "step": 6469 }, { "epoch": 0.3126056916461323, "grad_norm": 4.471579551696777, "learning_rate": 6.873943083538677e-07, "loss": 0.3185, "step": 6470 }, { "epoch": 0.31265400782722136, "grad_norm": 2.3623337745666504, "learning_rate": 6.873459921727787e-07, "loss": 0.1619, "step": 6471 }, { "epoch": 0.3127023240083104, "grad_norm": 2.071828603744507, "learning_rate": 6.872976759916897e-07, "loss": 0.1991, "step": 6472 }, { "epoch": 0.3127506401893994, "grad_norm": 2.623769521713257, "learning_rate": 6.872493598106005e-07, "loss": 0.2975, "step": 6473 }, { "epoch": 0.3127989563704885, "grad_norm": 4.300243854522705, "learning_rate": 6.872010436295114e-07, "loss": 0.288, "step": 6474 }, { "epoch": 0.3128472725515775, "grad_norm": 2.4644296169281006, "learning_rate": 6.871527274484224e-07, "loss": 0.344, "step": 6475 }, { "epoch": 0.3128955887326666, "grad_norm": 2.723170757293701, "learning_rate": 6.871044112673334e-07, "loss": 0.4119, "step": 6476 }, { "epoch": 0.3129439049137556, "grad_norm": 2.685854911804199, "learning_rate": 6.870560950862444e-07, "loss": 0.2924, "step": 6477 }, { "epoch": 0.31299222109484465, "grad_norm": 2.387053966522217, "learning_rate": 6.870077789051553e-07, "loss": 0.292, "step": 6478 }, { "epoch": 0.31304053727593373, "grad_norm": 1.4357128143310547, "learning_rate": 6.869594627240663e-07, "loss": 0.1448, "step": 6479 }, { "epoch": 0.31308885345702275, "grad_norm": 6.1748433113098145, "learning_rate": 6.869111465429773e-07, "loss": 0.3304, "step": 6480 }, { "epoch": 0.3131371696381118, "grad_norm": 2.5152976512908936, "learning_rate": 6.868628303618881e-07, "loss": 0.3318, "step": 6481 }, { "epoch": 0.31318548581920086, "grad_norm": 3.5549116134643555, "learning_rate": 6.868145141807991e-07, "loss": 0.3231, "step": 6482 }, { "epoch": 0.3132338020002899, "grad_norm": 2.475739002227783, "learning_rate": 6.8676619799971e-07, "loss": 0.3152, "step": 6483 }, { "epoch": 0.31328211818137897, "grad_norm": 3.0947015285491943, "learning_rate": 6.86717881818621e-07, "loss": 0.4049, "step": 6484 }, { "epoch": 0.313330434362468, "grad_norm": 2.595628499984741, "learning_rate": 6.86669565637532e-07, "loss": 0.2801, "step": 6485 }, { "epoch": 0.313378750543557, "grad_norm": 2.3046727180480957, "learning_rate": 6.86621249456443e-07, "loss": 0.3198, "step": 6486 }, { "epoch": 0.3134270667246461, "grad_norm": 3.1853227615356445, "learning_rate": 6.865729332753539e-07, "loss": 0.3272, "step": 6487 }, { "epoch": 0.3134753829057351, "grad_norm": 3.647444009780884, "learning_rate": 6.865246170942648e-07, "loss": 0.1936, "step": 6488 }, { "epoch": 0.3135236990868242, "grad_norm": 2.1974036693573, "learning_rate": 6.864763009131758e-07, "loss": 0.2638, "step": 6489 }, { "epoch": 0.31357201526791323, "grad_norm": 2.6490790843963623, "learning_rate": 6.864279847320867e-07, "loss": 0.299, "step": 6490 }, { "epoch": 0.31362033144900225, "grad_norm": 3.1358931064605713, "learning_rate": 6.863796685509977e-07, "loss": 0.4041, "step": 6491 }, { "epoch": 0.31366864763009134, "grad_norm": 2.746626377105713, "learning_rate": 6.863313523699087e-07, "loss": 0.2046, "step": 6492 }, { "epoch": 0.31371696381118036, "grad_norm": 2.2716708183288574, "learning_rate": 6.862830361888195e-07, "loss": 0.1954, "step": 6493 }, { "epoch": 0.3137652799922694, "grad_norm": 1.9275484085083008, "learning_rate": 6.862347200077305e-07, "loss": 0.2151, "step": 6494 }, { "epoch": 0.31381359617335847, "grad_norm": 2.1238110065460205, "learning_rate": 6.861864038266415e-07, "loss": 0.27, "step": 6495 }, { "epoch": 0.3138619123544475, "grad_norm": 1.7568926811218262, "learning_rate": 6.861380876455525e-07, "loss": 0.2123, "step": 6496 }, { "epoch": 0.3139102285355366, "grad_norm": 1.9688966274261475, "learning_rate": 6.860897714644635e-07, "loss": 0.247, "step": 6497 }, { "epoch": 0.3139585447166256, "grad_norm": 2.6281533241271973, "learning_rate": 6.860414552833745e-07, "loss": 0.299, "step": 6498 }, { "epoch": 0.3140068608977146, "grad_norm": 2.537655830383301, "learning_rate": 6.859931391022853e-07, "loss": 0.376, "step": 6499 }, { "epoch": 0.3140551770788037, "grad_norm": 1.8246169090270996, "learning_rate": 6.859448229211962e-07, "loss": 0.1927, "step": 6500 }, { "epoch": 0.31410349325989273, "grad_norm": 2.7585716247558594, "learning_rate": 6.858965067401072e-07, "loss": 0.3464, "step": 6501 }, { "epoch": 0.3141518094409818, "grad_norm": 2.774679183959961, "learning_rate": 6.858481905590182e-07, "loss": 0.3197, "step": 6502 }, { "epoch": 0.31420012562207084, "grad_norm": 21.64972686767578, "learning_rate": 6.857998743779292e-07, "loss": 0.3828, "step": 6503 }, { "epoch": 0.31424844180315986, "grad_norm": 4.906699180603027, "learning_rate": 6.857515581968401e-07, "loss": 0.4166, "step": 6504 }, { "epoch": 0.31429675798424894, "grad_norm": 2.5172998905181885, "learning_rate": 6.857032420157511e-07, "loss": 0.2949, "step": 6505 }, { "epoch": 0.31434507416533797, "grad_norm": 7.15935754776001, "learning_rate": 6.85654925834662e-07, "loss": 0.2569, "step": 6506 }, { "epoch": 0.314393390346427, "grad_norm": 1.7219512462615967, "learning_rate": 6.856066096535729e-07, "loss": 0.24, "step": 6507 }, { "epoch": 0.3144417065275161, "grad_norm": 3.2979753017425537, "learning_rate": 6.855582934724839e-07, "loss": 0.311, "step": 6508 }, { "epoch": 0.3144900227086051, "grad_norm": 4.282379150390625, "learning_rate": 6.855099772913948e-07, "loss": 0.4073, "step": 6509 }, { "epoch": 0.3145383388896942, "grad_norm": 2.629115581512451, "learning_rate": 6.854616611103058e-07, "loss": 0.2892, "step": 6510 }, { "epoch": 0.3145866550707832, "grad_norm": 2.400757312774658, "learning_rate": 6.854133449292168e-07, "loss": 0.2381, "step": 6511 }, { "epoch": 0.31463497125187223, "grad_norm": 2.6024396419525146, "learning_rate": 6.853650287481278e-07, "loss": 0.297, "step": 6512 }, { "epoch": 0.3146832874329613, "grad_norm": 2.2237401008605957, "learning_rate": 6.853167125670387e-07, "loss": 0.2281, "step": 6513 }, { "epoch": 0.31473160361405034, "grad_norm": 2.5513954162597656, "learning_rate": 6.852683963859496e-07, "loss": 0.3098, "step": 6514 }, { "epoch": 0.3147799197951394, "grad_norm": 6.471248626708984, "learning_rate": 6.852200802048605e-07, "loss": 0.339, "step": 6515 }, { "epoch": 0.31482823597622844, "grad_norm": 2.7017202377319336, "learning_rate": 6.851717640237715e-07, "loss": 0.2444, "step": 6516 }, { "epoch": 0.31487655215731747, "grad_norm": 3.3032114505767822, "learning_rate": 6.851234478426825e-07, "loss": 0.2838, "step": 6517 }, { "epoch": 0.31492486833840655, "grad_norm": 2.690917491912842, "learning_rate": 6.850751316615935e-07, "loss": 0.4304, "step": 6518 }, { "epoch": 0.3149731845194956, "grad_norm": 2.6437056064605713, "learning_rate": 6.850268154805043e-07, "loss": 0.2545, "step": 6519 }, { "epoch": 0.3150215007005846, "grad_norm": 2.7381646633148193, "learning_rate": 6.849784992994153e-07, "loss": 0.2686, "step": 6520 }, { "epoch": 0.3150698168816737, "grad_norm": 2.5431644916534424, "learning_rate": 6.849301831183263e-07, "loss": 0.2713, "step": 6521 }, { "epoch": 0.3151181330627627, "grad_norm": 3.028066873550415, "learning_rate": 6.848818669372373e-07, "loss": 0.4899, "step": 6522 }, { "epoch": 0.3151664492438518, "grad_norm": 2.1334950923919678, "learning_rate": 6.848335507561483e-07, "loss": 0.2666, "step": 6523 }, { "epoch": 0.3152147654249408, "grad_norm": 2.66575288772583, "learning_rate": 6.847852345750591e-07, "loss": 0.3199, "step": 6524 }, { "epoch": 0.31526308160602984, "grad_norm": 2.9958689212799072, "learning_rate": 6.8473691839397e-07, "loss": 0.2966, "step": 6525 }, { "epoch": 0.3153113977871189, "grad_norm": 2.3582000732421875, "learning_rate": 6.84688602212881e-07, "loss": 0.2496, "step": 6526 }, { "epoch": 0.31535971396820794, "grad_norm": 2.73567271232605, "learning_rate": 6.84640286031792e-07, "loss": 0.2347, "step": 6527 }, { "epoch": 0.315408030149297, "grad_norm": 1.7684435844421387, "learning_rate": 6.84591969850703e-07, "loss": 0.2416, "step": 6528 }, { "epoch": 0.31545634633038605, "grad_norm": 2.5085580348968506, "learning_rate": 6.84543653669614e-07, "loss": 0.3347, "step": 6529 }, { "epoch": 0.3155046625114751, "grad_norm": 2.756004571914673, "learning_rate": 6.844953374885249e-07, "loss": 0.3418, "step": 6530 }, { "epoch": 0.31555297869256416, "grad_norm": 2.8462913036346436, "learning_rate": 6.844470213074359e-07, "loss": 0.3746, "step": 6531 }, { "epoch": 0.3156012948736532, "grad_norm": 1.8984800577163696, "learning_rate": 6.843987051263467e-07, "loss": 0.201, "step": 6532 }, { "epoch": 0.3156496110547422, "grad_norm": 3.646575689315796, "learning_rate": 6.843503889452577e-07, "loss": 0.2346, "step": 6533 }, { "epoch": 0.3156979272358313, "grad_norm": 2.2047767639160156, "learning_rate": 6.843020727641687e-07, "loss": 0.2203, "step": 6534 }, { "epoch": 0.3157462434169203, "grad_norm": 5.922506332397461, "learning_rate": 6.842537565830796e-07, "loss": 0.2514, "step": 6535 }, { "epoch": 0.3157945595980094, "grad_norm": 3.7788150310516357, "learning_rate": 6.842054404019906e-07, "loss": 0.3004, "step": 6536 }, { "epoch": 0.3158428757790984, "grad_norm": 8.816336631774902, "learning_rate": 6.841571242209016e-07, "loss": 0.2109, "step": 6537 }, { "epoch": 0.31589119196018745, "grad_norm": 3.2065072059631348, "learning_rate": 6.841088080398125e-07, "loss": 0.2489, "step": 6538 }, { "epoch": 0.3159395081412765, "grad_norm": 2.2969396114349365, "learning_rate": 6.840604918587235e-07, "loss": 0.2626, "step": 6539 }, { "epoch": 0.31598782432236555, "grad_norm": 1.9707006216049194, "learning_rate": 6.840121756776343e-07, "loss": 0.2106, "step": 6540 }, { "epoch": 0.31603614050345463, "grad_norm": 4.936793327331543, "learning_rate": 6.839638594965453e-07, "loss": 0.2684, "step": 6541 }, { "epoch": 0.31608445668454366, "grad_norm": 2.4240570068359375, "learning_rate": 6.839155433154563e-07, "loss": 0.2571, "step": 6542 }, { "epoch": 0.3161327728656327, "grad_norm": 1.9682667255401611, "learning_rate": 6.838672271343673e-07, "loss": 0.2464, "step": 6543 }, { "epoch": 0.31618108904672176, "grad_norm": 2.265091896057129, "learning_rate": 6.838189109532783e-07, "loss": 0.2335, "step": 6544 }, { "epoch": 0.3162294052278108, "grad_norm": 2.275751829147339, "learning_rate": 6.837705947721891e-07, "loss": 0.2203, "step": 6545 }, { "epoch": 0.3162777214088998, "grad_norm": 2.5817008018493652, "learning_rate": 6.837222785911001e-07, "loss": 0.4017, "step": 6546 }, { "epoch": 0.3163260375899889, "grad_norm": 2.685269594192505, "learning_rate": 6.836739624100111e-07, "loss": 0.2845, "step": 6547 }, { "epoch": 0.3163743537710779, "grad_norm": 2.2272462844848633, "learning_rate": 6.836256462289221e-07, "loss": 0.2318, "step": 6548 }, { "epoch": 0.316422669952167, "grad_norm": 2.145655632019043, "learning_rate": 6.83577330047833e-07, "loss": 0.2565, "step": 6549 }, { "epoch": 0.316470986133256, "grad_norm": 2.8859786987304688, "learning_rate": 6.835290138667439e-07, "loss": 0.337, "step": 6550 }, { "epoch": 0.31651930231434505, "grad_norm": 2.3937454223632812, "learning_rate": 6.834806976856548e-07, "loss": 0.3176, "step": 6551 }, { "epoch": 0.31656761849543413, "grad_norm": 2.288512706756592, "learning_rate": 6.834323815045658e-07, "loss": 0.2691, "step": 6552 }, { "epoch": 0.31661593467652316, "grad_norm": 2.5295848846435547, "learning_rate": 6.833840653234768e-07, "loss": 0.2933, "step": 6553 }, { "epoch": 0.31666425085761224, "grad_norm": 2.668426513671875, "learning_rate": 6.833357491423878e-07, "loss": 0.2946, "step": 6554 }, { "epoch": 0.31671256703870126, "grad_norm": 3.0872819423675537, "learning_rate": 6.832874329612988e-07, "loss": 0.3652, "step": 6555 }, { "epoch": 0.3167608832197903, "grad_norm": 3.0298666954040527, "learning_rate": 6.832391167802097e-07, "loss": 0.4057, "step": 6556 }, { "epoch": 0.31680919940087937, "grad_norm": 5.118081092834473, "learning_rate": 6.831908005991205e-07, "loss": 0.4088, "step": 6557 }, { "epoch": 0.3168575155819684, "grad_norm": 2.9724721908569336, "learning_rate": 6.831424844180315e-07, "loss": 0.3614, "step": 6558 }, { "epoch": 0.3169058317630574, "grad_norm": 2.731611490249634, "learning_rate": 6.830941682369425e-07, "loss": 0.3139, "step": 6559 }, { "epoch": 0.3169541479441465, "grad_norm": 3.4302852153778076, "learning_rate": 6.830458520558535e-07, "loss": 0.339, "step": 6560 }, { "epoch": 0.31700246412523553, "grad_norm": 2.7083723545074463, "learning_rate": 6.829975358747644e-07, "loss": 0.2319, "step": 6561 }, { "epoch": 0.3170507803063246, "grad_norm": 2.9007866382598877, "learning_rate": 6.829492196936754e-07, "loss": 0.3653, "step": 6562 }, { "epoch": 0.31709909648741363, "grad_norm": 2.128354549407959, "learning_rate": 6.829009035125864e-07, "loss": 0.223, "step": 6563 }, { "epoch": 0.31714741266850266, "grad_norm": 1.667664647102356, "learning_rate": 6.828525873314973e-07, "loss": 0.224, "step": 6564 }, { "epoch": 0.31719572884959174, "grad_norm": 2.6086106300354004, "learning_rate": 6.828042711504083e-07, "loss": 0.1971, "step": 6565 }, { "epoch": 0.31724404503068077, "grad_norm": 2.294822931289673, "learning_rate": 6.827559549693191e-07, "loss": 0.2618, "step": 6566 }, { "epoch": 0.31729236121176985, "grad_norm": 2.7390756607055664, "learning_rate": 6.827076387882301e-07, "loss": 0.1927, "step": 6567 }, { "epoch": 0.31734067739285887, "grad_norm": 2.3736255168914795, "learning_rate": 6.826593226071411e-07, "loss": 0.2638, "step": 6568 }, { "epoch": 0.3173889935739479, "grad_norm": 2.866205930709839, "learning_rate": 6.826110064260521e-07, "loss": 0.4996, "step": 6569 }, { "epoch": 0.317437309755037, "grad_norm": 3.6944375038146973, "learning_rate": 6.82562690244963e-07, "loss": 0.2368, "step": 6570 }, { "epoch": 0.317485625936126, "grad_norm": 2.825364351272583, "learning_rate": 6.825143740638739e-07, "loss": 0.3304, "step": 6571 }, { "epoch": 0.31753394211721503, "grad_norm": 2.109581708908081, "learning_rate": 6.824660578827849e-07, "loss": 0.218, "step": 6572 }, { "epoch": 0.3175822582983041, "grad_norm": 3.283600091934204, "learning_rate": 6.824177417016959e-07, "loss": 0.4452, "step": 6573 }, { "epoch": 0.31763057447939314, "grad_norm": 4.189059257507324, "learning_rate": 6.823694255206069e-07, "loss": 0.3166, "step": 6574 }, { "epoch": 0.3176788906604822, "grad_norm": 9.125405311584473, "learning_rate": 6.823211093395178e-07, "loss": 0.2502, "step": 6575 }, { "epoch": 0.31772720684157124, "grad_norm": 2.1660873889923096, "learning_rate": 6.822727931584286e-07, "loss": 0.2439, "step": 6576 }, { "epoch": 0.31777552302266027, "grad_norm": 4.591302394866943, "learning_rate": 6.822244769773396e-07, "loss": 0.1429, "step": 6577 }, { "epoch": 0.31782383920374935, "grad_norm": 2.9930014610290527, "learning_rate": 6.821761607962506e-07, "loss": 0.328, "step": 6578 }, { "epoch": 0.3178721553848384, "grad_norm": 2.6792807579040527, "learning_rate": 6.821278446151616e-07, "loss": 0.3175, "step": 6579 }, { "epoch": 0.31792047156592745, "grad_norm": 4.2983784675598145, "learning_rate": 6.820795284340726e-07, "loss": 0.2488, "step": 6580 }, { "epoch": 0.3179687877470165, "grad_norm": 2.837625026702881, "learning_rate": 6.820312122529836e-07, "loss": 0.2866, "step": 6581 }, { "epoch": 0.3180171039281055, "grad_norm": 2.4366588592529297, "learning_rate": 6.819828960718945e-07, "loss": 0.2815, "step": 6582 }, { "epoch": 0.3180654201091946, "grad_norm": 9.455078125, "learning_rate": 6.819345798908053e-07, "loss": 0.2173, "step": 6583 }, { "epoch": 0.3181137362902836, "grad_norm": 2.530686855316162, "learning_rate": 6.818862637097163e-07, "loss": 0.3292, "step": 6584 }, { "epoch": 0.31816205247137264, "grad_norm": 2.5122663974761963, "learning_rate": 6.818379475286273e-07, "loss": 0.2892, "step": 6585 }, { "epoch": 0.3182103686524617, "grad_norm": 2.6025214195251465, "learning_rate": 6.817896313475383e-07, "loss": 0.1873, "step": 6586 }, { "epoch": 0.31825868483355074, "grad_norm": 1.907953143119812, "learning_rate": 6.817413151664492e-07, "loss": 0.1369, "step": 6587 }, { "epoch": 0.3183070010146398, "grad_norm": 2.574023723602295, "learning_rate": 6.816929989853602e-07, "loss": 0.2662, "step": 6588 }, { "epoch": 0.31835531719572885, "grad_norm": 2.39522123336792, "learning_rate": 6.816446828042711e-07, "loss": 0.2727, "step": 6589 }, { "epoch": 0.3184036333768179, "grad_norm": 2.459639549255371, "learning_rate": 6.815963666231821e-07, "loss": 0.2845, "step": 6590 }, { "epoch": 0.31845194955790695, "grad_norm": 2.776329278945923, "learning_rate": 6.81548050442093e-07, "loss": 0.3598, "step": 6591 }, { "epoch": 0.318500265738996, "grad_norm": 3.5315604209899902, "learning_rate": 6.814997342610039e-07, "loss": 0.3493, "step": 6592 }, { "epoch": 0.31854858192008506, "grad_norm": 3.122756242752075, "learning_rate": 6.814514180799149e-07, "loss": 0.4327, "step": 6593 }, { "epoch": 0.3185968981011741, "grad_norm": 2.11612606048584, "learning_rate": 6.814031018988259e-07, "loss": 0.2137, "step": 6594 }, { "epoch": 0.3186452142822631, "grad_norm": 2.2372634410858154, "learning_rate": 6.813547857177369e-07, "loss": 0.2175, "step": 6595 }, { "epoch": 0.3186935304633522, "grad_norm": 2.4304661750793457, "learning_rate": 6.813064695366478e-07, "loss": 0.2836, "step": 6596 }, { "epoch": 0.3187418466444412, "grad_norm": 11.171211242675781, "learning_rate": 6.812581533555587e-07, "loss": 0.3238, "step": 6597 }, { "epoch": 0.31879016282553024, "grad_norm": 2.7114877700805664, "learning_rate": 6.812098371744697e-07, "loss": 0.341, "step": 6598 }, { "epoch": 0.3188384790066193, "grad_norm": 2.8715076446533203, "learning_rate": 6.811615209933807e-07, "loss": 0.3423, "step": 6599 }, { "epoch": 0.31888679518770835, "grad_norm": 2.518860340118408, "learning_rate": 6.811132048122916e-07, "loss": 0.3267, "step": 6600 }, { "epoch": 0.31893511136879743, "grad_norm": 1.6429004669189453, "learning_rate": 6.810648886312026e-07, "loss": 0.1896, "step": 6601 }, { "epoch": 0.31898342754988646, "grad_norm": 3.9447128772735596, "learning_rate": 6.810165724501134e-07, "loss": 0.2017, "step": 6602 }, { "epoch": 0.3190317437309755, "grad_norm": 5.150912284851074, "learning_rate": 6.809682562690244e-07, "loss": 0.3519, "step": 6603 }, { "epoch": 0.31908005991206456, "grad_norm": 4.556421279907227, "learning_rate": 6.809199400879354e-07, "loss": 0.2792, "step": 6604 }, { "epoch": 0.3191283760931536, "grad_norm": 2.510545253753662, "learning_rate": 6.808716239068464e-07, "loss": 0.2859, "step": 6605 }, { "epoch": 0.31917669227424267, "grad_norm": 2.5392045974731445, "learning_rate": 6.808233077257574e-07, "loss": 0.3423, "step": 6606 }, { "epoch": 0.3192250084553317, "grad_norm": 4.232660293579102, "learning_rate": 6.807749915446684e-07, "loss": 0.2221, "step": 6607 }, { "epoch": 0.3192733246364207, "grad_norm": 3.0592050552368164, "learning_rate": 6.807266753635791e-07, "loss": 0.2572, "step": 6608 }, { "epoch": 0.3193216408175098, "grad_norm": 2.3401577472686768, "learning_rate": 6.806783591824901e-07, "loss": 0.2401, "step": 6609 }, { "epoch": 0.3193699569985988, "grad_norm": 3.4659712314605713, "learning_rate": 6.806300430014011e-07, "loss": 0.3458, "step": 6610 }, { "epoch": 0.31941827317968785, "grad_norm": 4.5702290534973145, "learning_rate": 6.805817268203121e-07, "loss": 0.3597, "step": 6611 }, { "epoch": 0.31946658936077693, "grad_norm": 3.128282070159912, "learning_rate": 6.805334106392231e-07, "loss": 0.2807, "step": 6612 }, { "epoch": 0.31951490554186596, "grad_norm": 2.725004196166992, "learning_rate": 6.80485094458134e-07, "loss": 0.3904, "step": 6613 }, { "epoch": 0.31956322172295504, "grad_norm": 3.130519151687622, "learning_rate": 6.80436778277045e-07, "loss": 0.2307, "step": 6614 }, { "epoch": 0.31961153790404406, "grad_norm": 3.5473473072052, "learning_rate": 6.803884620959559e-07, "loss": 0.3916, "step": 6615 }, { "epoch": 0.3196598540851331, "grad_norm": 5.454367160797119, "learning_rate": 6.803401459148669e-07, "loss": 0.2748, "step": 6616 }, { "epoch": 0.31970817026622217, "grad_norm": 3.516096591949463, "learning_rate": 6.802918297337778e-07, "loss": 0.3545, "step": 6617 }, { "epoch": 0.3197564864473112, "grad_norm": 3.7764387130737305, "learning_rate": 6.802435135526887e-07, "loss": 0.3186, "step": 6618 }, { "epoch": 0.3198048026284003, "grad_norm": 5.879449367523193, "learning_rate": 6.801951973715997e-07, "loss": 0.265, "step": 6619 }, { "epoch": 0.3198531188094893, "grad_norm": 4.476685523986816, "learning_rate": 6.801468811905107e-07, "loss": 0.2544, "step": 6620 }, { "epoch": 0.3199014349905783, "grad_norm": 2.08978271484375, "learning_rate": 6.800985650094216e-07, "loss": 0.2107, "step": 6621 }, { "epoch": 0.3199497511716674, "grad_norm": 8.151162147521973, "learning_rate": 6.800502488283326e-07, "loss": 0.305, "step": 6622 }, { "epoch": 0.31999806735275643, "grad_norm": 8.584354400634766, "learning_rate": 6.800019326472435e-07, "loss": 0.3879, "step": 6623 }, { "epoch": 0.32004638353384546, "grad_norm": 3.6598353385925293, "learning_rate": 6.799536164661545e-07, "loss": 0.2327, "step": 6624 }, { "epoch": 0.32009469971493454, "grad_norm": 3.094312906265259, "learning_rate": 6.799053002850654e-07, "loss": 0.4399, "step": 6625 }, { "epoch": 0.32014301589602356, "grad_norm": 2.5779128074645996, "learning_rate": 6.798569841039764e-07, "loss": 0.2901, "step": 6626 }, { "epoch": 0.32019133207711264, "grad_norm": 6.687222480773926, "learning_rate": 6.798086679228874e-07, "loss": 0.3074, "step": 6627 }, { "epoch": 0.32023964825820167, "grad_norm": 2.9616005420684814, "learning_rate": 6.797603517417982e-07, "loss": 0.2443, "step": 6628 }, { "epoch": 0.3202879644392907, "grad_norm": 3.524958848953247, "learning_rate": 6.797120355607092e-07, "loss": 0.456, "step": 6629 }, { "epoch": 0.3203362806203798, "grad_norm": 7.234632968902588, "learning_rate": 6.796637193796202e-07, "loss": 0.2811, "step": 6630 }, { "epoch": 0.3203845968014688, "grad_norm": 3.938588857650757, "learning_rate": 6.796154031985312e-07, "loss": 0.2431, "step": 6631 }, { "epoch": 0.3204329129825579, "grad_norm": 2.651960611343384, "learning_rate": 6.795670870174422e-07, "loss": 0.3712, "step": 6632 }, { "epoch": 0.3204812291636469, "grad_norm": 2.1964869499206543, "learning_rate": 6.795187708363532e-07, "loss": 0.2395, "step": 6633 }, { "epoch": 0.32052954534473593, "grad_norm": 2.062511682510376, "learning_rate": 6.794704546552639e-07, "loss": 0.2082, "step": 6634 }, { "epoch": 0.320577861525825, "grad_norm": 2.479198932647705, "learning_rate": 6.794221384741749e-07, "loss": 0.2646, "step": 6635 }, { "epoch": 0.32062617770691404, "grad_norm": 3.8346168994903564, "learning_rate": 6.793738222930859e-07, "loss": 0.3302, "step": 6636 }, { "epoch": 0.32067449388800306, "grad_norm": 8.14591121673584, "learning_rate": 6.793255061119969e-07, "loss": 0.1932, "step": 6637 }, { "epoch": 0.32072281006909215, "grad_norm": 2.614570140838623, "learning_rate": 6.792771899309079e-07, "loss": 0.3738, "step": 6638 }, { "epoch": 0.32077112625018117, "grad_norm": 3.0200724601745605, "learning_rate": 6.792288737498188e-07, "loss": 0.403, "step": 6639 }, { "epoch": 0.32081944243127025, "grad_norm": 2.272001028060913, "learning_rate": 6.791805575687297e-07, "loss": 0.2516, "step": 6640 }, { "epoch": 0.3208677586123593, "grad_norm": 4.019650459289551, "learning_rate": 6.791322413876407e-07, "loss": 0.3393, "step": 6641 }, { "epoch": 0.3209160747934483, "grad_norm": 2.1832275390625, "learning_rate": 6.790839252065516e-07, "loss": 0.2295, "step": 6642 }, { "epoch": 0.3209643909745374, "grad_norm": 3.0573079586029053, "learning_rate": 6.790356090254626e-07, "loss": 0.2925, "step": 6643 }, { "epoch": 0.3210127071556264, "grad_norm": 3.156520366668701, "learning_rate": 6.789872928443735e-07, "loss": 0.3057, "step": 6644 }, { "epoch": 0.3210610233367155, "grad_norm": 2.7550477981567383, "learning_rate": 6.789389766632845e-07, "loss": 0.3241, "step": 6645 }, { "epoch": 0.3211093395178045, "grad_norm": 5.3537726402282715, "learning_rate": 6.788906604821955e-07, "loss": 0.3658, "step": 6646 }, { "epoch": 0.32115765569889354, "grad_norm": 6.127007961273193, "learning_rate": 6.788423443011064e-07, "loss": 0.2678, "step": 6647 }, { "epoch": 0.3212059718799826, "grad_norm": 5.716090679168701, "learning_rate": 6.787940281200174e-07, "loss": 0.395, "step": 6648 }, { "epoch": 0.32125428806107165, "grad_norm": 2.7271952629089355, "learning_rate": 6.787457119389283e-07, "loss": 0.2158, "step": 6649 }, { "epoch": 0.32130260424216067, "grad_norm": 1.8067494630813599, "learning_rate": 6.786973957578392e-07, "loss": 0.1735, "step": 6650 }, { "epoch": 0.32135092042324975, "grad_norm": 27.747102737426758, "learning_rate": 6.786490795767502e-07, "loss": 0.3176, "step": 6651 }, { "epoch": 0.3213992366043388, "grad_norm": 3.13624906539917, "learning_rate": 6.786007633956612e-07, "loss": 0.3549, "step": 6652 }, { "epoch": 0.32144755278542786, "grad_norm": 2.438138723373413, "learning_rate": 6.785524472145721e-07, "loss": 0.2683, "step": 6653 }, { "epoch": 0.3214958689665169, "grad_norm": 1.876954197883606, "learning_rate": 6.78504131033483e-07, "loss": 0.219, "step": 6654 }, { "epoch": 0.3215441851476059, "grad_norm": 1.937375545501709, "learning_rate": 6.78455814852394e-07, "loss": 0.2006, "step": 6655 }, { "epoch": 0.321592501328695, "grad_norm": 3.426793336868286, "learning_rate": 6.78407498671305e-07, "loss": 0.3838, "step": 6656 }, { "epoch": 0.321640817509784, "grad_norm": 2.26533579826355, "learning_rate": 6.78359182490216e-07, "loss": 0.2821, "step": 6657 }, { "epoch": 0.3216891336908731, "grad_norm": 2.6100666522979736, "learning_rate": 6.78310866309127e-07, "loss": 0.327, "step": 6658 }, { "epoch": 0.3217374498719621, "grad_norm": 6.476776123046875, "learning_rate": 6.78262550128038e-07, "loss": 0.292, "step": 6659 }, { "epoch": 0.32178576605305115, "grad_norm": 2.5768680572509766, "learning_rate": 6.782142339469487e-07, "loss": 0.1676, "step": 6660 }, { "epoch": 0.32183408223414023, "grad_norm": 3.0803158283233643, "learning_rate": 6.781659177658597e-07, "loss": 0.2272, "step": 6661 }, { "epoch": 0.32188239841522925, "grad_norm": 2.715620517730713, "learning_rate": 6.781176015847707e-07, "loss": 0.3419, "step": 6662 }, { "epoch": 0.32193071459631833, "grad_norm": 2.2705323696136475, "learning_rate": 6.780692854036817e-07, "loss": 0.2567, "step": 6663 }, { "epoch": 0.32197903077740736, "grad_norm": 2.8321025371551514, "learning_rate": 6.780209692225927e-07, "loss": 0.3514, "step": 6664 }, { "epoch": 0.3220273469584964, "grad_norm": 2.816873550415039, "learning_rate": 6.779726530415036e-07, "loss": 0.3328, "step": 6665 }, { "epoch": 0.32207566313958547, "grad_norm": 3.0627870559692383, "learning_rate": 6.779243368604145e-07, "loss": 0.3959, "step": 6666 }, { "epoch": 0.3221239793206745, "grad_norm": 1.8916740417480469, "learning_rate": 6.778760206793254e-07, "loss": 0.2272, "step": 6667 }, { "epoch": 0.3221722955017635, "grad_norm": 2.817099094390869, "learning_rate": 6.778277044982364e-07, "loss": 0.339, "step": 6668 }, { "epoch": 0.3222206116828526, "grad_norm": 3.090585231781006, "learning_rate": 6.777793883171474e-07, "loss": 0.2236, "step": 6669 }, { "epoch": 0.3222689278639416, "grad_norm": 4.688111305236816, "learning_rate": 6.777310721360583e-07, "loss": 0.3612, "step": 6670 }, { "epoch": 0.3223172440450307, "grad_norm": 2.4590113162994385, "learning_rate": 6.776827559549693e-07, "loss": 0.2832, "step": 6671 }, { "epoch": 0.32236556022611973, "grad_norm": 6.81027889251709, "learning_rate": 6.776344397738803e-07, "loss": 0.2939, "step": 6672 }, { "epoch": 0.32241387640720875, "grad_norm": 5.0549139976501465, "learning_rate": 6.775861235927912e-07, "loss": 0.3288, "step": 6673 }, { "epoch": 0.32246219258829784, "grad_norm": 3.006220817565918, "learning_rate": 6.775378074117022e-07, "loss": 0.2957, "step": 6674 }, { "epoch": 0.32251050876938686, "grad_norm": 3.3878583908081055, "learning_rate": 6.77489491230613e-07, "loss": 0.3008, "step": 6675 }, { "epoch": 0.32255882495047594, "grad_norm": 2.4017038345336914, "learning_rate": 6.77441175049524e-07, "loss": 0.2857, "step": 6676 }, { "epoch": 0.32260714113156497, "grad_norm": 2.592472553253174, "learning_rate": 6.77392858868435e-07, "loss": 0.3338, "step": 6677 }, { "epoch": 0.322655457312654, "grad_norm": 4.053649425506592, "learning_rate": 6.77344542687346e-07, "loss": 0.3904, "step": 6678 }, { "epoch": 0.3227037734937431, "grad_norm": 3.369624614715576, "learning_rate": 6.772962265062569e-07, "loss": 0.4414, "step": 6679 }, { "epoch": 0.3227520896748321, "grad_norm": 3.3752243518829346, "learning_rate": 6.772479103251678e-07, "loss": 0.3025, "step": 6680 }, { "epoch": 0.3228004058559211, "grad_norm": 3.987308979034424, "learning_rate": 6.771995941440788e-07, "loss": 0.2555, "step": 6681 }, { "epoch": 0.3228487220370102, "grad_norm": 2.277810573577881, "learning_rate": 6.771512779629898e-07, "loss": 0.3035, "step": 6682 }, { "epoch": 0.32289703821809923, "grad_norm": 4.741107940673828, "learning_rate": 6.771029617819008e-07, "loss": 0.3876, "step": 6683 }, { "epoch": 0.3229453543991883, "grad_norm": 55.62595748901367, "learning_rate": 6.770546456008118e-07, "loss": 0.2069, "step": 6684 }, { "epoch": 0.32299367058027734, "grad_norm": 2.2375833988189697, "learning_rate": 6.770063294197226e-07, "loss": 0.2286, "step": 6685 }, { "epoch": 0.32304198676136636, "grad_norm": 3.4151906967163086, "learning_rate": 6.769580132386335e-07, "loss": 0.3393, "step": 6686 }, { "epoch": 0.32309030294245544, "grad_norm": 10.18598747253418, "learning_rate": 6.769096970575445e-07, "loss": 0.2933, "step": 6687 }, { "epoch": 0.32313861912354447, "grad_norm": 4.661059856414795, "learning_rate": 6.768613808764555e-07, "loss": 0.4359, "step": 6688 }, { "epoch": 0.32318693530463355, "grad_norm": 2.380890369415283, "learning_rate": 6.768130646953665e-07, "loss": 0.2575, "step": 6689 }, { "epoch": 0.3232352514857226, "grad_norm": 7.791652202606201, "learning_rate": 6.767647485142775e-07, "loss": 0.277, "step": 6690 }, { "epoch": 0.3232835676668116, "grad_norm": 4.202347755432129, "learning_rate": 6.767164323331884e-07, "loss": 0.4105, "step": 6691 }, { "epoch": 0.3233318838479007, "grad_norm": 2.443525791168213, "learning_rate": 6.766681161520992e-07, "loss": 0.3198, "step": 6692 }, { "epoch": 0.3233802000289897, "grad_norm": 2.539618730545044, "learning_rate": 6.766197999710102e-07, "loss": 0.3153, "step": 6693 }, { "epoch": 0.32342851621007873, "grad_norm": 3.62263560295105, "learning_rate": 6.765714837899212e-07, "loss": 0.4594, "step": 6694 }, { "epoch": 0.3234768323911678, "grad_norm": 23.655057907104492, "learning_rate": 6.765231676088322e-07, "loss": 0.1722, "step": 6695 }, { "epoch": 0.32352514857225684, "grad_norm": 1.3489490747451782, "learning_rate": 6.764748514277431e-07, "loss": 0.1338, "step": 6696 }, { "epoch": 0.3235734647533459, "grad_norm": 1.5692003965377808, "learning_rate": 6.764265352466541e-07, "loss": 0.1727, "step": 6697 }, { "epoch": 0.32362178093443494, "grad_norm": 2.623440742492676, "learning_rate": 6.76378219065565e-07, "loss": 0.4227, "step": 6698 }, { "epoch": 0.32367009711552397, "grad_norm": 6.671947956085205, "learning_rate": 6.76329902884476e-07, "loss": 0.288, "step": 6699 }, { "epoch": 0.32371841329661305, "grad_norm": 5.018773078918457, "learning_rate": 6.76281586703387e-07, "loss": 0.349, "step": 6700 }, { "epoch": 0.3237667294777021, "grad_norm": 6.843771457672119, "learning_rate": 6.762332705222978e-07, "loss": 0.3499, "step": 6701 }, { "epoch": 0.32381504565879116, "grad_norm": 2.2867166996002197, "learning_rate": 6.761849543412088e-07, "loss": 0.2041, "step": 6702 }, { "epoch": 0.3238633618398802, "grad_norm": 2.3116583824157715, "learning_rate": 6.761366381601198e-07, "loss": 0.2808, "step": 6703 }, { "epoch": 0.3239116780209692, "grad_norm": 2.419471263885498, "learning_rate": 6.760883219790308e-07, "loss": 0.3278, "step": 6704 }, { "epoch": 0.3239599942020583, "grad_norm": 3.2513551712036133, "learning_rate": 6.760400057979417e-07, "loss": 0.3872, "step": 6705 }, { "epoch": 0.3240083103831473, "grad_norm": 4.942601680755615, "learning_rate": 6.759916896168526e-07, "loss": 0.3252, "step": 6706 }, { "epoch": 0.32405662656423634, "grad_norm": 2.444662094116211, "learning_rate": 6.759433734357636e-07, "loss": 0.3076, "step": 6707 }, { "epoch": 0.3241049427453254, "grad_norm": 4.220186233520508, "learning_rate": 6.758950572546746e-07, "loss": 0.2359, "step": 6708 }, { "epoch": 0.32415325892641444, "grad_norm": 2.5344719886779785, "learning_rate": 6.758467410735856e-07, "loss": 0.2745, "step": 6709 }, { "epoch": 0.3242015751075035, "grad_norm": 2.718372344970703, "learning_rate": 6.757984248924965e-07, "loss": 0.3861, "step": 6710 }, { "epoch": 0.32424989128859255, "grad_norm": 2.5546960830688477, "learning_rate": 6.757501087114074e-07, "loss": 0.211, "step": 6711 }, { "epoch": 0.3242982074696816, "grad_norm": 2.6396830081939697, "learning_rate": 6.757017925303183e-07, "loss": 0.3736, "step": 6712 }, { "epoch": 0.32434652365077066, "grad_norm": 3.870271921157837, "learning_rate": 6.756534763492293e-07, "loss": 0.2537, "step": 6713 }, { "epoch": 0.3243948398318597, "grad_norm": 2.756181240081787, "learning_rate": 6.756051601681403e-07, "loss": 0.302, "step": 6714 }, { "epoch": 0.32444315601294876, "grad_norm": 2.5663962364196777, "learning_rate": 6.755568439870513e-07, "loss": 0.2763, "step": 6715 }, { "epoch": 0.3244914721940378, "grad_norm": 2.460960626602173, "learning_rate": 6.755085278059623e-07, "loss": 0.2992, "step": 6716 }, { "epoch": 0.3245397883751268, "grad_norm": 3.502368450164795, "learning_rate": 6.75460211624873e-07, "loss": 0.2927, "step": 6717 }, { "epoch": 0.3245881045562159, "grad_norm": 3.2291712760925293, "learning_rate": 6.75411895443784e-07, "loss": 0.4357, "step": 6718 }, { "epoch": 0.3246364207373049, "grad_norm": 2.371022939682007, "learning_rate": 6.75363579262695e-07, "loss": 0.2657, "step": 6719 }, { "epoch": 0.32468473691839395, "grad_norm": 2.1234679222106934, "learning_rate": 6.75315263081606e-07, "loss": 0.2117, "step": 6720 }, { "epoch": 0.324733053099483, "grad_norm": 5.323822021484375, "learning_rate": 6.75266946900517e-07, "loss": 0.4652, "step": 6721 }, { "epoch": 0.32478136928057205, "grad_norm": 3.407726764678955, "learning_rate": 6.752186307194279e-07, "loss": 0.3923, "step": 6722 }, { "epoch": 0.32482968546166113, "grad_norm": 32.63219451904297, "learning_rate": 6.751703145383389e-07, "loss": 0.234, "step": 6723 }, { "epoch": 0.32487800164275016, "grad_norm": 2.3276050090789795, "learning_rate": 6.751219983572498e-07, "loss": 0.2871, "step": 6724 }, { "epoch": 0.3249263178238392, "grad_norm": 2.6700921058654785, "learning_rate": 6.750736821761608e-07, "loss": 0.3638, "step": 6725 }, { "epoch": 0.32497463400492826, "grad_norm": 2.5119926929473877, "learning_rate": 6.750253659950718e-07, "loss": 0.2751, "step": 6726 }, { "epoch": 0.3250229501860173, "grad_norm": 2.766007661819458, "learning_rate": 6.749770498139826e-07, "loss": 0.3278, "step": 6727 }, { "epoch": 0.32507126636710637, "grad_norm": 3.7642626762390137, "learning_rate": 6.749287336328936e-07, "loss": 0.3047, "step": 6728 }, { "epoch": 0.3251195825481954, "grad_norm": 2.527992010116577, "learning_rate": 6.748804174518046e-07, "loss": 0.3921, "step": 6729 }, { "epoch": 0.3251678987292844, "grad_norm": 21.144224166870117, "learning_rate": 6.748321012707155e-07, "loss": 0.408, "step": 6730 }, { "epoch": 0.3252162149103735, "grad_norm": 2.542579174041748, "learning_rate": 6.747837850896265e-07, "loss": 0.2849, "step": 6731 }, { "epoch": 0.3252645310914625, "grad_norm": 2.7260186672210693, "learning_rate": 6.747354689085374e-07, "loss": 0.2997, "step": 6732 }, { "epoch": 0.32531284727255155, "grad_norm": 2.332509994506836, "learning_rate": 6.746871527274484e-07, "loss": 0.3119, "step": 6733 }, { "epoch": 0.32536116345364063, "grad_norm": 3.0680534839630127, "learning_rate": 6.746388365463594e-07, "loss": 0.3968, "step": 6734 }, { "epoch": 0.32540947963472966, "grad_norm": 2.0133056640625, "learning_rate": 6.745905203652703e-07, "loss": 0.2352, "step": 6735 }, { "epoch": 0.32545779581581874, "grad_norm": 17.188743591308594, "learning_rate": 6.745422041841813e-07, "loss": 0.1987, "step": 6736 }, { "epoch": 0.32550611199690777, "grad_norm": 4.815965175628662, "learning_rate": 6.744938880030922e-07, "loss": 0.2844, "step": 6737 }, { "epoch": 0.3255544281779968, "grad_norm": 2.991755485534668, "learning_rate": 6.744455718220031e-07, "loss": 0.3282, "step": 6738 }, { "epoch": 0.32560274435908587, "grad_norm": 2.659172296524048, "learning_rate": 6.743972556409141e-07, "loss": 0.3458, "step": 6739 }, { "epoch": 0.3256510605401749, "grad_norm": 2.138284921646118, "learning_rate": 6.743489394598251e-07, "loss": 0.2546, "step": 6740 }, { "epoch": 0.325699376721264, "grad_norm": 1.939314842224121, "learning_rate": 6.743006232787361e-07, "loss": 0.2385, "step": 6741 }, { "epoch": 0.325747692902353, "grad_norm": 3.0234720706939697, "learning_rate": 6.742523070976471e-07, "loss": 0.375, "step": 6742 }, { "epoch": 0.32579600908344203, "grad_norm": 2.0217931270599365, "learning_rate": 6.742039909165578e-07, "loss": 0.2125, "step": 6743 }, { "epoch": 0.3258443252645311, "grad_norm": 2.4244465827941895, "learning_rate": 6.741556747354688e-07, "loss": 0.3319, "step": 6744 }, { "epoch": 0.32589264144562013, "grad_norm": 4.699285507202148, "learning_rate": 6.741073585543798e-07, "loss": 0.2936, "step": 6745 }, { "epoch": 0.32594095762670916, "grad_norm": 3.3071932792663574, "learning_rate": 6.740590423732908e-07, "loss": 0.3768, "step": 6746 }, { "epoch": 0.32598927380779824, "grad_norm": 2.737657308578491, "learning_rate": 6.740107261922018e-07, "loss": 0.3349, "step": 6747 }, { "epoch": 0.32603758998888727, "grad_norm": 3.3640036582946777, "learning_rate": 6.739624100111127e-07, "loss": 0.2419, "step": 6748 }, { "epoch": 0.32608590616997635, "grad_norm": 4.071180820465088, "learning_rate": 6.739140938300236e-07, "loss": 0.242, "step": 6749 }, { "epoch": 0.3261342223510654, "grad_norm": 1.4672837257385254, "learning_rate": 6.738657776489346e-07, "loss": 0.1413, "step": 6750 }, { "epoch": 0.3261825385321544, "grad_norm": 4.176219940185547, "learning_rate": 6.738174614678456e-07, "loss": 0.3986, "step": 6751 }, { "epoch": 0.3262308547132435, "grad_norm": 2.446570873260498, "learning_rate": 6.737691452867565e-07, "loss": 0.3394, "step": 6752 }, { "epoch": 0.3262791708943325, "grad_norm": 2.8390986919403076, "learning_rate": 6.737208291056674e-07, "loss": 0.3059, "step": 6753 }, { "epoch": 0.3263274870754216, "grad_norm": 4.761132717132568, "learning_rate": 6.736725129245784e-07, "loss": 0.3863, "step": 6754 }, { "epoch": 0.3263758032565106, "grad_norm": 2.9128122329711914, "learning_rate": 6.736241967434894e-07, "loss": 0.2704, "step": 6755 }, { "epoch": 0.32642411943759964, "grad_norm": 2.608093738555908, "learning_rate": 6.735758805624003e-07, "loss": 0.3052, "step": 6756 }, { "epoch": 0.3264724356186887, "grad_norm": 3.516012191772461, "learning_rate": 6.735275643813113e-07, "loss": 0.4855, "step": 6757 }, { "epoch": 0.32652075179977774, "grad_norm": 3.3600659370422363, "learning_rate": 6.734792482002222e-07, "loss": 0.2338, "step": 6758 }, { "epoch": 0.32656906798086677, "grad_norm": 2.439188241958618, "learning_rate": 6.734309320191332e-07, "loss": 0.2246, "step": 6759 }, { "epoch": 0.32661738416195585, "grad_norm": 2.037226438522339, "learning_rate": 6.733826158380441e-07, "loss": 0.2626, "step": 6760 }, { "epoch": 0.3266657003430449, "grad_norm": 2.7940828800201416, "learning_rate": 6.733342996569551e-07, "loss": 0.3106, "step": 6761 }, { "epoch": 0.32671401652413395, "grad_norm": 3.5439140796661377, "learning_rate": 6.73285983475866e-07, "loss": 0.3196, "step": 6762 }, { "epoch": 0.326762332705223, "grad_norm": 3.519228935241699, "learning_rate": 6.73237667294777e-07, "loss": 0.2539, "step": 6763 }, { "epoch": 0.326810648886312, "grad_norm": 2.7475459575653076, "learning_rate": 6.731893511136879e-07, "loss": 0.2768, "step": 6764 }, { "epoch": 0.3268589650674011, "grad_norm": 3.4388091564178467, "learning_rate": 6.731410349325989e-07, "loss": 0.2605, "step": 6765 }, { "epoch": 0.3269072812484901, "grad_norm": 2.769773006439209, "learning_rate": 6.730927187515099e-07, "loss": 0.3012, "step": 6766 }, { "epoch": 0.3269555974295792, "grad_norm": 2.0559699535369873, "learning_rate": 6.730444025704209e-07, "loss": 0.2569, "step": 6767 }, { "epoch": 0.3270039136106682, "grad_norm": 6.294211387634277, "learning_rate": 6.729960863893319e-07, "loss": 0.317, "step": 6768 }, { "epoch": 0.32705222979175724, "grad_norm": 2.2875988483428955, "learning_rate": 6.729477702082426e-07, "loss": 0.2854, "step": 6769 }, { "epoch": 0.3271005459728463, "grad_norm": 1.8720922470092773, "learning_rate": 6.728994540271536e-07, "loss": 0.1688, "step": 6770 }, { "epoch": 0.32714886215393535, "grad_norm": 2.549567937850952, "learning_rate": 6.728511378460646e-07, "loss": 0.3104, "step": 6771 }, { "epoch": 0.3271971783350244, "grad_norm": 3.672053575515747, "learning_rate": 6.728028216649756e-07, "loss": 0.1752, "step": 6772 }, { "epoch": 0.32724549451611346, "grad_norm": 2.3109772205352783, "learning_rate": 6.727545054838866e-07, "loss": 0.2866, "step": 6773 }, { "epoch": 0.3272938106972025, "grad_norm": 2.2366855144500732, "learning_rate": 6.727061893027975e-07, "loss": 0.2381, "step": 6774 }, { "epoch": 0.32734212687829156, "grad_norm": 1.9551446437835693, "learning_rate": 6.726578731217084e-07, "loss": 0.1815, "step": 6775 }, { "epoch": 0.3273904430593806, "grad_norm": 3.6031343936920166, "learning_rate": 6.726095569406194e-07, "loss": 0.3435, "step": 6776 }, { "epoch": 0.3274387592404696, "grad_norm": 1.6886014938354492, "learning_rate": 6.725612407595303e-07, "loss": 0.1753, "step": 6777 }, { "epoch": 0.3274870754215587, "grad_norm": 3.0012612342834473, "learning_rate": 6.725129245784413e-07, "loss": 0.3402, "step": 6778 }, { "epoch": 0.3275353916026477, "grad_norm": 2.184516191482544, "learning_rate": 6.724646083973522e-07, "loss": 0.2708, "step": 6779 }, { "epoch": 0.3275837077837368, "grad_norm": 3.191375255584717, "learning_rate": 6.724162922162632e-07, "loss": 0.2322, "step": 6780 }, { "epoch": 0.3276320239648258, "grad_norm": 2.9561283588409424, "learning_rate": 6.723679760351741e-07, "loss": 0.2501, "step": 6781 }, { "epoch": 0.32768034014591485, "grad_norm": 2.581148147583008, "learning_rate": 6.723196598540851e-07, "loss": 0.3296, "step": 6782 }, { "epoch": 0.32772865632700393, "grad_norm": 4.2170186042785645, "learning_rate": 6.722713436729961e-07, "loss": 0.4824, "step": 6783 }, { "epoch": 0.32777697250809296, "grad_norm": 16.6671142578125, "learning_rate": 6.72223027491907e-07, "loss": 0.2452, "step": 6784 }, { "epoch": 0.327825288689182, "grad_norm": 1.8542213439941406, "learning_rate": 6.72174711310818e-07, "loss": 0.1635, "step": 6785 }, { "epoch": 0.32787360487027106, "grad_norm": 3.43735933303833, "learning_rate": 6.721263951297289e-07, "loss": 0.4779, "step": 6786 }, { "epoch": 0.3279219210513601, "grad_norm": 2.6872949600219727, "learning_rate": 6.720780789486399e-07, "loss": 0.327, "step": 6787 }, { "epoch": 0.32797023723244917, "grad_norm": 2.466479539871216, "learning_rate": 6.720297627675508e-07, "loss": 0.2754, "step": 6788 }, { "epoch": 0.3280185534135382, "grad_norm": 10.222503662109375, "learning_rate": 6.719814465864618e-07, "loss": 0.3109, "step": 6789 }, { "epoch": 0.3280668695946272, "grad_norm": 2.240804672241211, "learning_rate": 6.719331304053727e-07, "loss": 0.2811, "step": 6790 }, { "epoch": 0.3281151857757163, "grad_norm": 2.083479166030884, "learning_rate": 6.718848142242837e-07, "loss": 0.2076, "step": 6791 }, { "epoch": 0.3281635019568053, "grad_norm": 2.6859350204467773, "learning_rate": 6.718364980431947e-07, "loss": 0.358, "step": 6792 }, { "epoch": 0.3282118181378944, "grad_norm": 2.5166687965393066, "learning_rate": 6.717881818621057e-07, "loss": 0.2079, "step": 6793 }, { "epoch": 0.32826013431898343, "grad_norm": 3.186245918273926, "learning_rate": 6.717398656810165e-07, "loss": 0.3131, "step": 6794 }, { "epoch": 0.32830845050007246, "grad_norm": 2.522991180419922, "learning_rate": 6.716915494999274e-07, "loss": 0.2721, "step": 6795 }, { "epoch": 0.32835676668116154, "grad_norm": 2.571791648864746, "learning_rate": 6.716432333188384e-07, "loss": 0.3025, "step": 6796 }, { "epoch": 0.32840508286225056, "grad_norm": 2.1533632278442383, "learning_rate": 6.715949171377494e-07, "loss": 0.2211, "step": 6797 }, { "epoch": 0.3284533990433396, "grad_norm": 2.757401704788208, "learning_rate": 6.715466009566604e-07, "loss": 0.324, "step": 6798 }, { "epoch": 0.32850171522442867, "grad_norm": 3.065781593322754, "learning_rate": 6.714982847755714e-07, "loss": 0.3877, "step": 6799 }, { "epoch": 0.3285500314055177, "grad_norm": 3.964665412902832, "learning_rate": 6.714499685944822e-07, "loss": 0.4675, "step": 6800 }, { "epoch": 0.3285983475866068, "grad_norm": 2.6072332859039307, "learning_rate": 6.714016524133932e-07, "loss": 0.3247, "step": 6801 }, { "epoch": 0.3286466637676958, "grad_norm": 9.157855987548828, "learning_rate": 6.713533362323042e-07, "loss": 0.366, "step": 6802 }, { "epoch": 0.3286949799487848, "grad_norm": 2.582153081893921, "learning_rate": 6.713050200512151e-07, "loss": 0.3453, "step": 6803 }, { "epoch": 0.3287432961298739, "grad_norm": 3.588557720184326, "learning_rate": 6.712567038701261e-07, "loss": 0.429, "step": 6804 }, { "epoch": 0.32879161231096293, "grad_norm": 3.54413104057312, "learning_rate": 6.71208387689037e-07, "loss": 0.2526, "step": 6805 }, { "epoch": 0.328839928492052, "grad_norm": 2.4875688552856445, "learning_rate": 6.71160071507948e-07, "loss": 0.3141, "step": 6806 }, { "epoch": 0.32888824467314104, "grad_norm": 9.935782432556152, "learning_rate": 6.711117553268589e-07, "loss": 0.2888, "step": 6807 }, { "epoch": 0.32893656085423006, "grad_norm": 3.7476699352264404, "learning_rate": 6.710634391457699e-07, "loss": 0.3297, "step": 6808 }, { "epoch": 0.32898487703531915, "grad_norm": 3.887017011642456, "learning_rate": 6.710151229646809e-07, "loss": 0.2083, "step": 6809 }, { "epoch": 0.32903319321640817, "grad_norm": 2.0327484607696533, "learning_rate": 6.709668067835918e-07, "loss": 0.1958, "step": 6810 }, { "epoch": 0.3290815093974972, "grad_norm": 2.630021572113037, "learning_rate": 6.709184906025027e-07, "loss": 0.3089, "step": 6811 }, { "epoch": 0.3291298255785863, "grad_norm": 2.356313467025757, "learning_rate": 6.708701744214137e-07, "loss": 0.2399, "step": 6812 }, { "epoch": 0.3291781417596753, "grad_norm": 3.9145524501800537, "learning_rate": 6.708218582403246e-07, "loss": 0.3405, "step": 6813 }, { "epoch": 0.3292264579407644, "grad_norm": 2.072715997695923, "learning_rate": 6.707735420592356e-07, "loss": 0.2783, "step": 6814 }, { "epoch": 0.3292747741218534, "grad_norm": 2.9965696334838867, "learning_rate": 6.707252258781465e-07, "loss": 0.472, "step": 6815 }, { "epoch": 0.32932309030294243, "grad_norm": 2.1324849128723145, "learning_rate": 6.706769096970575e-07, "loss": 0.2195, "step": 6816 }, { "epoch": 0.3293714064840315, "grad_norm": 3.1556243896484375, "learning_rate": 6.706285935159685e-07, "loss": 0.389, "step": 6817 }, { "epoch": 0.32941972266512054, "grad_norm": 3.5742292404174805, "learning_rate": 6.705802773348795e-07, "loss": 0.288, "step": 6818 }, { "epoch": 0.3294680388462096, "grad_norm": 2.730665683746338, "learning_rate": 6.705319611537905e-07, "loss": 0.315, "step": 6819 }, { "epoch": 0.32951635502729865, "grad_norm": 6.928828239440918, "learning_rate": 6.704836449727013e-07, "loss": 0.4071, "step": 6820 }, { "epoch": 0.32956467120838767, "grad_norm": 2.2750329971313477, "learning_rate": 6.704353287916122e-07, "loss": 0.3067, "step": 6821 }, { "epoch": 0.32961298738947675, "grad_norm": 2.879502058029175, "learning_rate": 6.703870126105232e-07, "loss": 0.2458, "step": 6822 }, { "epoch": 0.3296613035705658, "grad_norm": 2.6862096786499023, "learning_rate": 6.703386964294342e-07, "loss": 0.2123, "step": 6823 }, { "epoch": 0.3297096197516548, "grad_norm": 2.1290526390075684, "learning_rate": 6.702903802483452e-07, "loss": 0.2259, "step": 6824 }, { "epoch": 0.3297579359327439, "grad_norm": 3.2978110313415527, "learning_rate": 6.702420640672562e-07, "loss": 0.3292, "step": 6825 }, { "epoch": 0.3298062521138329, "grad_norm": 12.310513496398926, "learning_rate": 6.70193747886167e-07, "loss": 0.2188, "step": 6826 }, { "epoch": 0.329854568294922, "grad_norm": 5.41534423828125, "learning_rate": 6.70145431705078e-07, "loss": 0.2587, "step": 6827 }, { "epoch": 0.329902884476011, "grad_norm": 1.9630697965621948, "learning_rate": 6.700971155239889e-07, "loss": 0.2024, "step": 6828 }, { "epoch": 0.32995120065710004, "grad_norm": 3.7110908031463623, "learning_rate": 6.700487993428999e-07, "loss": 0.2984, "step": 6829 }, { "epoch": 0.3299995168381891, "grad_norm": 2.3561744689941406, "learning_rate": 6.700004831618109e-07, "loss": 0.3372, "step": 6830 }, { "epoch": 0.33004783301927815, "grad_norm": 2.6093854904174805, "learning_rate": 6.699521669807218e-07, "loss": 0.2143, "step": 6831 }, { "epoch": 0.33009614920036723, "grad_norm": 3.2505433559417725, "learning_rate": 6.699038507996327e-07, "loss": 0.3149, "step": 6832 }, { "epoch": 0.33014446538145625, "grad_norm": 3.98983097076416, "learning_rate": 6.698555346185437e-07, "loss": 0.3941, "step": 6833 }, { "epoch": 0.3301927815625453, "grad_norm": 2.9940526485443115, "learning_rate": 6.698072184374547e-07, "loss": 0.3652, "step": 6834 }, { "epoch": 0.33024109774363436, "grad_norm": 3.0355167388916016, "learning_rate": 6.697589022563657e-07, "loss": 0.2836, "step": 6835 }, { "epoch": 0.3302894139247234, "grad_norm": 3.3385255336761475, "learning_rate": 6.697105860752765e-07, "loss": 0.3556, "step": 6836 }, { "epoch": 0.3303377301058124, "grad_norm": 3.221544027328491, "learning_rate": 6.696622698941875e-07, "loss": 0.2332, "step": 6837 }, { "epoch": 0.3303860462869015, "grad_norm": 2.0555803775787354, "learning_rate": 6.696139537130985e-07, "loss": 0.2582, "step": 6838 }, { "epoch": 0.3304343624679905, "grad_norm": 2.526758909225464, "learning_rate": 6.695656375320094e-07, "loss": 0.2515, "step": 6839 }, { "epoch": 0.3304826786490796, "grad_norm": 2.194430112838745, "learning_rate": 6.695173213509204e-07, "loss": 0.1877, "step": 6840 }, { "epoch": 0.3305309948301686, "grad_norm": 2.433523178100586, "learning_rate": 6.694690051698313e-07, "loss": 0.2971, "step": 6841 }, { "epoch": 0.33057931101125765, "grad_norm": 1.8800559043884277, "learning_rate": 6.694206889887423e-07, "loss": 0.1992, "step": 6842 }, { "epoch": 0.33062762719234673, "grad_norm": 3.3641934394836426, "learning_rate": 6.693723728076533e-07, "loss": 0.3755, "step": 6843 }, { "epoch": 0.33067594337343575, "grad_norm": 2.829354763031006, "learning_rate": 6.693240566265643e-07, "loss": 0.3143, "step": 6844 }, { "epoch": 0.33072425955452484, "grad_norm": 4.719474792480469, "learning_rate": 6.692757404454751e-07, "loss": 0.3605, "step": 6845 }, { "epoch": 0.33077257573561386, "grad_norm": 2.3287758827209473, "learning_rate": 6.692274242643861e-07, "loss": 0.3985, "step": 6846 }, { "epoch": 0.3308208919167029, "grad_norm": 3.7440667152404785, "learning_rate": 6.69179108083297e-07, "loss": 0.3745, "step": 6847 }, { "epoch": 0.33086920809779197, "grad_norm": 2.417881965637207, "learning_rate": 6.69130791902208e-07, "loss": 0.3337, "step": 6848 }, { "epoch": 0.330917524278881, "grad_norm": 2.290985345840454, "learning_rate": 6.69082475721119e-07, "loss": 0.2797, "step": 6849 }, { "epoch": 0.33096584045997, "grad_norm": 1.8009109497070312, "learning_rate": 6.6903415954003e-07, "loss": 0.2005, "step": 6850 }, { "epoch": 0.3310141566410591, "grad_norm": 3.439746141433716, "learning_rate": 6.68985843358941e-07, "loss": 0.3518, "step": 6851 }, { "epoch": 0.3310624728221481, "grad_norm": 1.83834969997406, "learning_rate": 6.689375271778518e-07, "loss": 0.204, "step": 6852 }, { "epoch": 0.3311107890032372, "grad_norm": 2.8651282787323, "learning_rate": 6.688892109967627e-07, "loss": 0.2778, "step": 6853 }, { "epoch": 0.33115910518432623, "grad_norm": 3.1002185344696045, "learning_rate": 6.688408948156737e-07, "loss": 0.353, "step": 6854 }, { "epoch": 0.33120742136541526, "grad_norm": 2.331589698791504, "learning_rate": 6.687925786345847e-07, "loss": 0.255, "step": 6855 }, { "epoch": 0.33125573754650434, "grad_norm": 2.203242063522339, "learning_rate": 6.687442624534957e-07, "loss": 0.2554, "step": 6856 }, { "epoch": 0.33130405372759336, "grad_norm": 4.189525127410889, "learning_rate": 6.686959462724066e-07, "loss": 0.3089, "step": 6857 }, { "epoch": 0.33135236990868244, "grad_norm": 2.8524935245513916, "learning_rate": 6.686476300913175e-07, "loss": 0.3162, "step": 6858 }, { "epoch": 0.33140068608977147, "grad_norm": 3.295293092727661, "learning_rate": 6.685993139102285e-07, "loss": 0.2248, "step": 6859 }, { "epoch": 0.3314490022708605, "grad_norm": 2.4949848651885986, "learning_rate": 6.685509977291395e-07, "loss": 0.373, "step": 6860 }, { "epoch": 0.3314973184519496, "grad_norm": 2.7457942962646484, "learning_rate": 6.685026815480505e-07, "loss": 0.3029, "step": 6861 }, { "epoch": 0.3315456346330386, "grad_norm": 2.9610726833343506, "learning_rate": 6.684543653669613e-07, "loss": 0.3393, "step": 6862 }, { "epoch": 0.3315939508141276, "grad_norm": 2.534005880355835, "learning_rate": 6.684060491858723e-07, "loss": 0.2686, "step": 6863 }, { "epoch": 0.3316422669952167, "grad_norm": 2.0969951152801514, "learning_rate": 6.683577330047832e-07, "loss": 0.2338, "step": 6864 }, { "epoch": 0.33169058317630573, "grad_norm": 1.887775182723999, "learning_rate": 6.683094168236942e-07, "loss": 0.2421, "step": 6865 }, { "epoch": 0.3317388993573948, "grad_norm": 4.098283290863037, "learning_rate": 6.682611006426052e-07, "loss": 0.2964, "step": 6866 }, { "epoch": 0.33178721553848384, "grad_norm": 7.6543474197387695, "learning_rate": 6.682127844615161e-07, "loss": 0.3474, "step": 6867 }, { "epoch": 0.33183553171957286, "grad_norm": 2.1081886291503906, "learning_rate": 6.681644682804271e-07, "loss": 0.2237, "step": 6868 }, { "epoch": 0.33188384790066194, "grad_norm": 3.0220115184783936, "learning_rate": 6.681161520993381e-07, "loss": 0.3586, "step": 6869 }, { "epoch": 0.33193216408175097, "grad_norm": 2.003479242324829, "learning_rate": 6.68067835918249e-07, "loss": 0.1427, "step": 6870 }, { "epoch": 0.33198048026284005, "grad_norm": 2.719425678253174, "learning_rate": 6.680195197371599e-07, "loss": 0.3554, "step": 6871 }, { "epoch": 0.3320287964439291, "grad_norm": 2.2583370208740234, "learning_rate": 6.679712035560709e-07, "loss": 0.2362, "step": 6872 }, { "epoch": 0.3320771126250181, "grad_norm": 2.6124777793884277, "learning_rate": 6.679228873749818e-07, "loss": 0.3067, "step": 6873 }, { "epoch": 0.3321254288061072, "grad_norm": 3.298377513885498, "learning_rate": 6.678745711938928e-07, "loss": 0.2469, "step": 6874 }, { "epoch": 0.3321737449871962, "grad_norm": 2.6992154121398926, "learning_rate": 6.678262550128038e-07, "loss": 0.3023, "step": 6875 }, { "epoch": 0.33222206116828523, "grad_norm": 2.086456775665283, "learning_rate": 6.677779388317148e-07, "loss": 0.2339, "step": 6876 }, { "epoch": 0.3322703773493743, "grad_norm": 2.154913902282715, "learning_rate": 6.677296226506257e-07, "loss": 0.2048, "step": 6877 }, { "epoch": 0.33231869353046334, "grad_norm": 9.96303939819336, "learning_rate": 6.676813064695365e-07, "loss": 0.4996, "step": 6878 }, { "epoch": 0.3323670097115524, "grad_norm": 2.481559991836548, "learning_rate": 6.676329902884475e-07, "loss": 0.2711, "step": 6879 }, { "epoch": 0.33241532589264144, "grad_norm": 2.5544795989990234, "learning_rate": 6.675846741073585e-07, "loss": 0.2861, "step": 6880 }, { "epoch": 0.33246364207373047, "grad_norm": 2.214730739593506, "learning_rate": 6.675363579262695e-07, "loss": 0.3133, "step": 6881 }, { "epoch": 0.33251195825481955, "grad_norm": 2.531829833984375, "learning_rate": 6.674880417451805e-07, "loss": 0.2564, "step": 6882 }, { "epoch": 0.3325602744359086, "grad_norm": 2.2553353309631348, "learning_rate": 6.674397255640913e-07, "loss": 0.3333, "step": 6883 }, { "epoch": 0.33260859061699766, "grad_norm": 2.045792579650879, "learning_rate": 6.673914093830023e-07, "loss": 0.2907, "step": 6884 }, { "epoch": 0.3326569067980867, "grad_norm": 2.3999521732330322, "learning_rate": 6.673430932019133e-07, "loss": 0.2754, "step": 6885 }, { "epoch": 0.3327052229791757, "grad_norm": 3.2273519039154053, "learning_rate": 6.672947770208243e-07, "loss": 0.4608, "step": 6886 }, { "epoch": 0.3327535391602648, "grad_norm": 2.091379404067993, "learning_rate": 6.672464608397352e-07, "loss": 0.2381, "step": 6887 }, { "epoch": 0.3328018553413538, "grad_norm": 14.418025970458984, "learning_rate": 6.671981446586461e-07, "loss": 0.2581, "step": 6888 }, { "epoch": 0.33285017152244284, "grad_norm": 3.312788486480713, "learning_rate": 6.671498284775571e-07, "loss": 0.3009, "step": 6889 }, { "epoch": 0.3328984877035319, "grad_norm": 2.2979354858398438, "learning_rate": 6.67101512296468e-07, "loss": 0.2647, "step": 6890 }, { "epoch": 0.33294680388462095, "grad_norm": 7.539339065551758, "learning_rate": 6.67053196115379e-07, "loss": 0.3151, "step": 6891 }, { "epoch": 0.33299512006571, "grad_norm": 3.2449424266815186, "learning_rate": 6.6700487993429e-07, "loss": 0.2534, "step": 6892 }, { "epoch": 0.33304343624679905, "grad_norm": 2.1838457584381104, "learning_rate": 6.669565637532009e-07, "loss": 0.2408, "step": 6893 }, { "epoch": 0.3330917524278881, "grad_norm": 2.676335096359253, "learning_rate": 6.669082475721119e-07, "loss": 0.3165, "step": 6894 }, { "epoch": 0.33314006860897716, "grad_norm": 2.498453378677368, "learning_rate": 6.668599313910229e-07, "loss": 0.3179, "step": 6895 }, { "epoch": 0.3331883847900662, "grad_norm": 2.870866537094116, "learning_rate": 6.668116152099337e-07, "loss": 0.4382, "step": 6896 }, { "epoch": 0.33323670097115526, "grad_norm": 2.6380703449249268, "learning_rate": 6.667632990288447e-07, "loss": 0.3079, "step": 6897 }, { "epoch": 0.3332850171522443, "grad_norm": 3.9218058586120605, "learning_rate": 6.667149828477557e-07, "loss": 0.3786, "step": 6898 }, { "epoch": 0.3333333333333333, "grad_norm": 2.3509480953216553, "learning_rate": 6.666666666666666e-07, "loss": 0.2368, "step": 6899 }, { "epoch": 0.3333816495144224, "grad_norm": 2.8766515254974365, "learning_rate": 6.666183504855776e-07, "loss": 0.3891, "step": 6900 }, { "epoch": 0.3334299656955114, "grad_norm": 2.1150996685028076, "learning_rate": 6.665700343044886e-07, "loss": 0.2213, "step": 6901 }, { "epoch": 0.33347828187660045, "grad_norm": 3.3617286682128906, "learning_rate": 6.665217181233996e-07, "loss": 0.3756, "step": 6902 }, { "epoch": 0.3335265980576895, "grad_norm": 3.496738910675049, "learning_rate": 6.664734019423105e-07, "loss": 0.388, "step": 6903 }, { "epoch": 0.33357491423877855, "grad_norm": 2.730342388153076, "learning_rate": 6.664250857612213e-07, "loss": 0.2986, "step": 6904 }, { "epoch": 0.33362323041986763, "grad_norm": 2.7150261402130127, "learning_rate": 6.663767695801323e-07, "loss": 0.2558, "step": 6905 }, { "epoch": 0.33367154660095666, "grad_norm": 3.8792002201080322, "learning_rate": 6.663284533990433e-07, "loss": 0.3491, "step": 6906 }, { "epoch": 0.3337198627820457, "grad_norm": 4.509825706481934, "learning_rate": 6.662801372179543e-07, "loss": 0.3558, "step": 6907 }, { "epoch": 0.33376817896313477, "grad_norm": 2.3771793842315674, "learning_rate": 6.662318210368653e-07, "loss": 0.1607, "step": 6908 }, { "epoch": 0.3338164951442238, "grad_norm": 3.2096593379974365, "learning_rate": 6.661835048557761e-07, "loss": 0.2895, "step": 6909 }, { "epoch": 0.33386481132531287, "grad_norm": 1.8241424560546875, "learning_rate": 6.661351886746871e-07, "loss": 0.184, "step": 6910 }, { "epoch": 0.3339131275064019, "grad_norm": 4.068781852722168, "learning_rate": 6.660868724935981e-07, "loss": 0.4046, "step": 6911 }, { "epoch": 0.3339614436874909, "grad_norm": 3.186990261077881, "learning_rate": 6.66038556312509e-07, "loss": 0.2549, "step": 6912 }, { "epoch": 0.33400975986858, "grad_norm": 2.5700504779815674, "learning_rate": 6.6599024013142e-07, "loss": 0.3595, "step": 6913 }, { "epoch": 0.33405807604966903, "grad_norm": 5.053548336029053, "learning_rate": 6.659419239503309e-07, "loss": 0.328, "step": 6914 }, { "epoch": 0.33410639223075805, "grad_norm": 3.0195648670196533, "learning_rate": 6.658936077692418e-07, "loss": 0.3179, "step": 6915 }, { "epoch": 0.33415470841184713, "grad_norm": 2.3470637798309326, "learning_rate": 6.658452915881528e-07, "loss": 0.3493, "step": 6916 }, { "epoch": 0.33420302459293616, "grad_norm": 2.330324649810791, "learning_rate": 6.657969754070638e-07, "loss": 0.2532, "step": 6917 }, { "epoch": 0.33425134077402524, "grad_norm": 2.7534525394439697, "learning_rate": 6.657486592259748e-07, "loss": 0.2673, "step": 6918 }, { "epoch": 0.33429965695511427, "grad_norm": 2.7301573753356934, "learning_rate": 6.657003430448857e-07, "loss": 0.2363, "step": 6919 }, { "epoch": 0.3343479731362033, "grad_norm": 2.5043985843658447, "learning_rate": 6.656520268637967e-07, "loss": 0.3772, "step": 6920 }, { "epoch": 0.3343962893172924, "grad_norm": 1.4873188734054565, "learning_rate": 6.656037106827076e-07, "loss": 0.1696, "step": 6921 }, { "epoch": 0.3344446054983814, "grad_norm": 2.2966957092285156, "learning_rate": 6.655553945016185e-07, "loss": 0.2209, "step": 6922 }, { "epoch": 0.3344929216794705, "grad_norm": 2.6470210552215576, "learning_rate": 6.655070783205295e-07, "loss": 0.3048, "step": 6923 }, { "epoch": 0.3345412378605595, "grad_norm": 2.29430890083313, "learning_rate": 6.654587621394405e-07, "loss": 0.2362, "step": 6924 }, { "epoch": 0.33458955404164853, "grad_norm": 2.7379093170166016, "learning_rate": 6.654104459583514e-07, "loss": 0.1943, "step": 6925 }, { "epoch": 0.3346378702227376, "grad_norm": 4.8863301277160645, "learning_rate": 6.653621297772624e-07, "loss": 0.2951, "step": 6926 }, { "epoch": 0.33468618640382664, "grad_norm": 59.652767181396484, "learning_rate": 6.653138135961734e-07, "loss": 0.2607, "step": 6927 }, { "epoch": 0.33473450258491566, "grad_norm": 2.1375784873962402, "learning_rate": 6.652654974150843e-07, "loss": 0.235, "step": 6928 }, { "epoch": 0.33478281876600474, "grad_norm": 3.64099383354187, "learning_rate": 6.652171812339952e-07, "loss": 0.2628, "step": 6929 }, { "epoch": 0.33483113494709377, "grad_norm": 8.031665802001953, "learning_rate": 6.651688650529061e-07, "loss": 0.3078, "step": 6930 }, { "epoch": 0.33487945112818285, "grad_norm": 2.9177327156066895, "learning_rate": 6.651205488718171e-07, "loss": 0.2855, "step": 6931 }, { "epoch": 0.3349277673092719, "grad_norm": 2.7425639629364014, "learning_rate": 6.650722326907281e-07, "loss": 0.3596, "step": 6932 }, { "epoch": 0.3349760834903609, "grad_norm": 3.9098994731903076, "learning_rate": 6.650239165096391e-07, "loss": 0.4273, "step": 6933 }, { "epoch": 0.33502439967145, "grad_norm": 3.2009239196777344, "learning_rate": 6.649756003285501e-07, "loss": 0.3606, "step": 6934 }, { "epoch": 0.335072715852539, "grad_norm": 2.369799852371216, "learning_rate": 6.649272841474609e-07, "loss": 0.2731, "step": 6935 }, { "epoch": 0.3351210320336281, "grad_norm": 1.9930920600891113, "learning_rate": 6.648789679663719e-07, "loss": 0.2233, "step": 6936 }, { "epoch": 0.3351693482147171, "grad_norm": 2.432549476623535, "learning_rate": 6.648306517852829e-07, "loss": 0.3423, "step": 6937 }, { "epoch": 0.33521766439580614, "grad_norm": 2.228904962539673, "learning_rate": 6.647823356041938e-07, "loss": 0.2273, "step": 6938 }, { "epoch": 0.3352659805768952, "grad_norm": 2.1804654598236084, "learning_rate": 6.647340194231048e-07, "loss": 0.2296, "step": 6939 }, { "epoch": 0.33531429675798424, "grad_norm": 2.0619962215423584, "learning_rate": 6.646857032420157e-07, "loss": 0.227, "step": 6940 }, { "epoch": 0.33536261293907327, "grad_norm": 2.1844675540924072, "learning_rate": 6.646373870609266e-07, "loss": 0.2401, "step": 6941 }, { "epoch": 0.33541092912016235, "grad_norm": 2.3581056594848633, "learning_rate": 6.645890708798376e-07, "loss": 0.3054, "step": 6942 }, { "epoch": 0.3354592453012514, "grad_norm": 3.281402826309204, "learning_rate": 6.645407546987486e-07, "loss": 0.3786, "step": 6943 }, { "epoch": 0.33550756148234046, "grad_norm": 2.9581730365753174, "learning_rate": 6.644924385176596e-07, "loss": 0.3719, "step": 6944 }, { "epoch": 0.3355558776634295, "grad_norm": 2.7654521465301514, "learning_rate": 6.644441223365705e-07, "loss": 0.3553, "step": 6945 }, { "epoch": 0.3356041938445185, "grad_norm": 2.975106954574585, "learning_rate": 6.643958061554814e-07, "loss": 0.41, "step": 6946 }, { "epoch": 0.3356525100256076, "grad_norm": 2.1202783584594727, "learning_rate": 6.643474899743923e-07, "loss": 0.243, "step": 6947 }, { "epoch": 0.3357008262066966, "grad_norm": 205.70993041992188, "learning_rate": 6.642991737933033e-07, "loss": 0.434, "step": 6948 }, { "epoch": 0.3357491423877857, "grad_norm": 2.8666107654571533, "learning_rate": 6.642508576122143e-07, "loss": 0.3564, "step": 6949 }, { "epoch": 0.3357974585688747, "grad_norm": 4.503426551818848, "learning_rate": 6.642025414311253e-07, "loss": 0.3655, "step": 6950 }, { "epoch": 0.33584577474996374, "grad_norm": 3.1836390495300293, "learning_rate": 6.641542252500362e-07, "loss": 0.2657, "step": 6951 }, { "epoch": 0.3358940909310528, "grad_norm": 3.030379295349121, "learning_rate": 6.641059090689472e-07, "loss": 0.2919, "step": 6952 }, { "epoch": 0.33594240711214185, "grad_norm": 2.0533595085144043, "learning_rate": 6.640575928878582e-07, "loss": 0.2179, "step": 6953 }, { "epoch": 0.33599072329323093, "grad_norm": 3.1319406032562256, "learning_rate": 6.64009276706769e-07, "loss": 0.2272, "step": 6954 }, { "epoch": 0.33603903947431996, "grad_norm": 2.84694242477417, "learning_rate": 6.6396096052568e-07, "loss": 0.3246, "step": 6955 }, { "epoch": 0.336087355655409, "grad_norm": 2.834817886352539, "learning_rate": 6.639126443445909e-07, "loss": 0.3686, "step": 6956 }, { "epoch": 0.33613567183649806, "grad_norm": 3.2859973907470703, "learning_rate": 6.638643281635019e-07, "loss": 0.2658, "step": 6957 }, { "epoch": 0.3361839880175871, "grad_norm": 1.9957257509231567, "learning_rate": 6.638160119824129e-07, "loss": 0.1751, "step": 6958 }, { "epoch": 0.3362323041986761, "grad_norm": 2.9232800006866455, "learning_rate": 6.637676958013239e-07, "loss": 0.4519, "step": 6959 }, { "epoch": 0.3362806203797652, "grad_norm": 2.836625576019287, "learning_rate": 6.637193796202348e-07, "loss": 0.3548, "step": 6960 }, { "epoch": 0.3363289365608542, "grad_norm": 1.854440689086914, "learning_rate": 6.636710634391457e-07, "loss": 0.1933, "step": 6961 }, { "epoch": 0.3363772527419433, "grad_norm": 2.13336443901062, "learning_rate": 6.636227472580567e-07, "loss": 0.2171, "step": 6962 }, { "epoch": 0.3364255689230323, "grad_norm": 3.5702967643737793, "learning_rate": 6.635744310769676e-07, "loss": 0.413, "step": 6963 }, { "epoch": 0.33647388510412135, "grad_norm": 2.977893352508545, "learning_rate": 6.635261148958786e-07, "loss": 0.3998, "step": 6964 }, { "epoch": 0.33652220128521043, "grad_norm": 3.028182029724121, "learning_rate": 6.634777987147896e-07, "loss": 0.3242, "step": 6965 }, { "epoch": 0.33657051746629946, "grad_norm": 4.684749126434326, "learning_rate": 6.634294825337004e-07, "loss": 0.3597, "step": 6966 }, { "epoch": 0.33661883364738854, "grad_norm": 4.723290920257568, "learning_rate": 6.633811663526114e-07, "loss": 0.3226, "step": 6967 }, { "epoch": 0.33666714982847756, "grad_norm": 3.269601345062256, "learning_rate": 6.633328501715224e-07, "loss": 0.4271, "step": 6968 }, { "epoch": 0.3367154660095666, "grad_norm": 2.247022867202759, "learning_rate": 6.632845339904334e-07, "loss": 0.229, "step": 6969 }, { "epoch": 0.33676378219065567, "grad_norm": 1.5517210960388184, "learning_rate": 6.632362178093444e-07, "loss": 0.2419, "step": 6970 }, { "epoch": 0.3368120983717447, "grad_norm": 2.4146223068237305, "learning_rate": 6.631879016282552e-07, "loss": 0.2351, "step": 6971 }, { "epoch": 0.3368604145528337, "grad_norm": 4.24228048324585, "learning_rate": 6.631395854471662e-07, "loss": 0.2687, "step": 6972 }, { "epoch": 0.3369087307339228, "grad_norm": 3.09134840965271, "learning_rate": 6.630912692660771e-07, "loss": 0.274, "step": 6973 }, { "epoch": 0.3369570469150118, "grad_norm": 13.410717010498047, "learning_rate": 6.630429530849881e-07, "loss": 0.2962, "step": 6974 }, { "epoch": 0.3370053630961009, "grad_norm": 3.282397985458374, "learning_rate": 6.629946369038991e-07, "loss": 0.4249, "step": 6975 }, { "epoch": 0.33705367927718993, "grad_norm": 4.583127021789551, "learning_rate": 6.629463207228101e-07, "loss": 0.238, "step": 6976 }, { "epoch": 0.33710199545827896, "grad_norm": 2.8950893878936768, "learning_rate": 6.62898004541721e-07, "loss": 0.266, "step": 6977 }, { "epoch": 0.33715031163936804, "grad_norm": 3.0807535648345947, "learning_rate": 6.62849688360632e-07, "loss": 0.3129, "step": 6978 }, { "epoch": 0.33719862782045706, "grad_norm": 2.6819872856140137, "learning_rate": 6.628013721795429e-07, "loss": 0.3381, "step": 6979 }, { "epoch": 0.33724694400154615, "grad_norm": 2.212888479232788, "learning_rate": 6.627530559984538e-07, "loss": 0.2092, "step": 6980 }, { "epoch": 0.33729526018263517, "grad_norm": 2.75455379486084, "learning_rate": 6.627047398173648e-07, "loss": 0.2878, "step": 6981 }, { "epoch": 0.3373435763637242, "grad_norm": 3.0018138885498047, "learning_rate": 6.626564236362757e-07, "loss": 0.3398, "step": 6982 }, { "epoch": 0.3373918925448133, "grad_norm": 2.111922264099121, "learning_rate": 6.626081074551867e-07, "loss": 0.1573, "step": 6983 }, { "epoch": 0.3374402087259023, "grad_norm": 2.5480804443359375, "learning_rate": 6.625597912740977e-07, "loss": 0.3067, "step": 6984 }, { "epoch": 0.3374885249069913, "grad_norm": 1.7750335931777954, "learning_rate": 6.625114750930087e-07, "loss": 0.2296, "step": 6985 }, { "epoch": 0.3375368410880804, "grad_norm": 4.685779571533203, "learning_rate": 6.624631589119196e-07, "loss": 0.3291, "step": 6986 }, { "epoch": 0.33758515726916943, "grad_norm": 2.346651315689087, "learning_rate": 6.624148427308305e-07, "loss": 0.269, "step": 6987 }, { "epoch": 0.3376334734502585, "grad_norm": 3.71342396736145, "learning_rate": 6.623665265497414e-07, "loss": 0.2343, "step": 6988 }, { "epoch": 0.33768178963134754, "grad_norm": 5.162962913513184, "learning_rate": 6.623182103686524e-07, "loss": 0.3014, "step": 6989 }, { "epoch": 0.33773010581243657, "grad_norm": 3.1210761070251465, "learning_rate": 6.622698941875634e-07, "loss": 0.237, "step": 6990 }, { "epoch": 0.33777842199352565, "grad_norm": 2.2595958709716797, "learning_rate": 6.622215780064744e-07, "loss": 0.3127, "step": 6991 }, { "epoch": 0.33782673817461467, "grad_norm": 2.9269134998321533, "learning_rate": 6.621732618253852e-07, "loss": 0.2637, "step": 6992 }, { "epoch": 0.33787505435570375, "grad_norm": 5.161642551422119, "learning_rate": 6.621249456442962e-07, "loss": 0.2874, "step": 6993 }, { "epoch": 0.3379233705367928, "grad_norm": 4.302745819091797, "learning_rate": 6.620766294632072e-07, "loss": 0.3673, "step": 6994 }, { "epoch": 0.3379716867178818, "grad_norm": 5.122817039489746, "learning_rate": 6.620283132821182e-07, "loss": 0.4496, "step": 6995 }, { "epoch": 0.3380200028989709, "grad_norm": 2.369225025177002, "learning_rate": 6.619799971010292e-07, "loss": 0.2543, "step": 6996 }, { "epoch": 0.3380683190800599, "grad_norm": 6.951503753662109, "learning_rate": 6.6193168091994e-07, "loss": 0.3561, "step": 6997 }, { "epoch": 0.33811663526114893, "grad_norm": 2.8869996070861816, "learning_rate": 6.61883364738851e-07, "loss": 0.2543, "step": 6998 }, { "epoch": 0.338164951442238, "grad_norm": 2.3969998359680176, "learning_rate": 6.618350485577619e-07, "loss": 0.2972, "step": 6999 }, { "epoch": 0.33821326762332704, "grad_norm": 3.492366075515747, "learning_rate": 6.617867323766729e-07, "loss": 0.2841, "step": 7000 }, { "epoch": 0.3382615838044161, "grad_norm": 1.5861963033676147, "learning_rate": 6.617384161955839e-07, "loss": 0.1453, "step": 7001 }, { "epoch": 0.33830989998550515, "grad_norm": 2.5281925201416016, "learning_rate": 6.616901000144949e-07, "loss": 0.2934, "step": 7002 }, { "epoch": 0.3383582161665942, "grad_norm": 2.8550877571105957, "learning_rate": 6.616417838334058e-07, "loss": 0.3694, "step": 7003 }, { "epoch": 0.33840653234768325, "grad_norm": 2.1607913970947266, "learning_rate": 6.615934676523168e-07, "loss": 0.2633, "step": 7004 }, { "epoch": 0.3384548485287723, "grad_norm": 2.4071834087371826, "learning_rate": 6.615451514712276e-07, "loss": 0.2575, "step": 7005 }, { "epoch": 0.33850316470986136, "grad_norm": 8.587891578674316, "learning_rate": 6.614968352901386e-07, "loss": 0.3144, "step": 7006 }, { "epoch": 0.3385514808909504, "grad_norm": 2.671461820602417, "learning_rate": 6.614485191090496e-07, "loss": 0.3602, "step": 7007 }, { "epoch": 0.3385997970720394, "grad_norm": 3.2226924896240234, "learning_rate": 6.614002029279605e-07, "loss": 0.3353, "step": 7008 }, { "epoch": 0.3386481132531285, "grad_norm": 2.711047410964966, "learning_rate": 6.613518867468715e-07, "loss": 0.1983, "step": 7009 }, { "epoch": 0.3386964294342175, "grad_norm": 2.2321455478668213, "learning_rate": 6.613035705657825e-07, "loss": 0.2446, "step": 7010 }, { "epoch": 0.33874474561530654, "grad_norm": 3.432370901107788, "learning_rate": 6.612552543846934e-07, "loss": 0.3497, "step": 7011 }, { "epoch": 0.3387930617963956, "grad_norm": 4.031431674957275, "learning_rate": 6.612069382036044e-07, "loss": 0.311, "step": 7012 }, { "epoch": 0.33884137797748465, "grad_norm": 2.1151387691497803, "learning_rate": 6.611586220225153e-07, "loss": 0.2461, "step": 7013 }, { "epoch": 0.33888969415857373, "grad_norm": 4.446771621704102, "learning_rate": 6.611103058414262e-07, "loss": 0.4559, "step": 7014 }, { "epoch": 0.33893801033966275, "grad_norm": 2.829961061477661, "learning_rate": 6.610619896603372e-07, "loss": 0.3857, "step": 7015 }, { "epoch": 0.3389863265207518, "grad_norm": 2.1809020042419434, "learning_rate": 6.610136734792482e-07, "loss": 0.2298, "step": 7016 }, { "epoch": 0.33903464270184086, "grad_norm": 2.4113736152648926, "learning_rate": 6.609653572981592e-07, "loss": 0.3452, "step": 7017 }, { "epoch": 0.3390829588829299, "grad_norm": 1.6674377918243408, "learning_rate": 6.6091704111707e-07, "loss": 0.2193, "step": 7018 }, { "epoch": 0.33913127506401897, "grad_norm": 4.697422981262207, "learning_rate": 6.60868724935981e-07, "loss": 0.2092, "step": 7019 }, { "epoch": 0.339179591245108, "grad_norm": 2.7293190956115723, "learning_rate": 6.60820408754892e-07, "loss": 0.1832, "step": 7020 }, { "epoch": 0.339227907426197, "grad_norm": 2.936936378479004, "learning_rate": 6.60772092573803e-07, "loss": 0.2748, "step": 7021 }, { "epoch": 0.3392762236072861, "grad_norm": 2.2763473987579346, "learning_rate": 6.60723776392714e-07, "loss": 0.3064, "step": 7022 }, { "epoch": 0.3393245397883751, "grad_norm": 2.3718576431274414, "learning_rate": 6.606754602116248e-07, "loss": 0.2674, "step": 7023 }, { "epoch": 0.33937285596946415, "grad_norm": 2.5844297409057617, "learning_rate": 6.606271440305357e-07, "loss": 0.2233, "step": 7024 }, { "epoch": 0.33942117215055323, "grad_norm": 2.377019166946411, "learning_rate": 6.605788278494467e-07, "loss": 0.2417, "step": 7025 }, { "epoch": 0.33946948833164226, "grad_norm": 3.802980899810791, "learning_rate": 6.605305116683577e-07, "loss": 0.2872, "step": 7026 }, { "epoch": 0.33951780451273134, "grad_norm": 2.5810904502868652, "learning_rate": 6.604821954872687e-07, "loss": 0.3078, "step": 7027 }, { "epoch": 0.33956612069382036, "grad_norm": 2.157749652862549, "learning_rate": 6.604338793061797e-07, "loss": 0.2169, "step": 7028 }, { "epoch": 0.3396144368749094, "grad_norm": 2.2957119941711426, "learning_rate": 6.603855631250906e-07, "loss": 0.2825, "step": 7029 }, { "epoch": 0.33966275305599847, "grad_norm": 4.452141761779785, "learning_rate": 6.603372469440016e-07, "loss": 0.305, "step": 7030 }, { "epoch": 0.3397110692370875, "grad_norm": 2.5434935092926025, "learning_rate": 6.602889307629124e-07, "loss": 0.2639, "step": 7031 }, { "epoch": 0.3397593854181766, "grad_norm": 4.355024814605713, "learning_rate": 6.602406145818234e-07, "loss": 0.3927, "step": 7032 }, { "epoch": 0.3398077015992656, "grad_norm": 3.98626708984375, "learning_rate": 6.601922984007344e-07, "loss": 0.3798, "step": 7033 }, { "epoch": 0.3398560177803546, "grad_norm": 4.2989044189453125, "learning_rate": 6.601439822196453e-07, "loss": 0.3193, "step": 7034 }, { "epoch": 0.3399043339614437, "grad_norm": 3.317532539367676, "learning_rate": 6.600956660385563e-07, "loss": 0.4588, "step": 7035 }, { "epoch": 0.33995265014253273, "grad_norm": 3.3393402099609375, "learning_rate": 6.600473498574673e-07, "loss": 0.2886, "step": 7036 }, { "epoch": 0.34000096632362176, "grad_norm": 2.5816562175750732, "learning_rate": 6.599990336763782e-07, "loss": 0.2589, "step": 7037 }, { "epoch": 0.34004928250471084, "grad_norm": 2.704667091369629, "learning_rate": 6.599507174952892e-07, "loss": 0.3218, "step": 7038 }, { "epoch": 0.34009759868579986, "grad_norm": 4.282280445098877, "learning_rate": 6.599024013142e-07, "loss": 0.2947, "step": 7039 }, { "epoch": 0.34014591486688894, "grad_norm": 6.359865665435791, "learning_rate": 6.59854085133111e-07, "loss": 0.3658, "step": 7040 }, { "epoch": 0.34019423104797797, "grad_norm": 2.1674232482910156, "learning_rate": 6.59805768952022e-07, "loss": 0.2612, "step": 7041 }, { "epoch": 0.340242547229067, "grad_norm": 4.961465358734131, "learning_rate": 6.59757452770933e-07, "loss": 0.4565, "step": 7042 }, { "epoch": 0.3402908634101561, "grad_norm": 8.98433780670166, "learning_rate": 6.597091365898439e-07, "loss": 0.251, "step": 7043 }, { "epoch": 0.3403391795912451, "grad_norm": 2.4106922149658203, "learning_rate": 6.596608204087548e-07, "loss": 0.2417, "step": 7044 }, { "epoch": 0.3403874957723342, "grad_norm": 4.069691181182861, "learning_rate": 6.596125042276658e-07, "loss": 0.2601, "step": 7045 }, { "epoch": 0.3404358119534232, "grad_norm": 3.712254047393799, "learning_rate": 6.595641880465768e-07, "loss": 0.4057, "step": 7046 }, { "epoch": 0.34048412813451223, "grad_norm": 4.578030586242676, "learning_rate": 6.595158718654878e-07, "loss": 0.3711, "step": 7047 }, { "epoch": 0.3405324443156013, "grad_norm": 2.3358709812164307, "learning_rate": 6.594675556843987e-07, "loss": 0.3391, "step": 7048 }, { "epoch": 0.34058076049669034, "grad_norm": 2.6632840633392334, "learning_rate": 6.594192395033096e-07, "loss": 0.2631, "step": 7049 }, { "epoch": 0.34062907667777936, "grad_norm": 2.334252119064331, "learning_rate": 6.593709233222205e-07, "loss": 0.2395, "step": 7050 }, { "epoch": 0.34067739285886844, "grad_norm": 2.8533802032470703, "learning_rate": 6.593226071411315e-07, "loss": 0.3045, "step": 7051 }, { "epoch": 0.34072570903995747, "grad_norm": 2.578608512878418, "learning_rate": 6.592742909600425e-07, "loss": 0.3233, "step": 7052 }, { "epoch": 0.34077402522104655, "grad_norm": 2.0655007362365723, "learning_rate": 6.592259747789535e-07, "loss": 0.2802, "step": 7053 }, { "epoch": 0.3408223414021356, "grad_norm": 3.5509345531463623, "learning_rate": 6.591776585978645e-07, "loss": 0.4007, "step": 7054 }, { "epoch": 0.3408706575832246, "grad_norm": 2.4382681846618652, "learning_rate": 6.591293424167754e-07, "loss": 0.1817, "step": 7055 }, { "epoch": 0.3409189737643137, "grad_norm": 2.3901383876800537, "learning_rate": 6.590810262356862e-07, "loss": 0.2845, "step": 7056 }, { "epoch": 0.3409672899454027, "grad_norm": 3.4811720848083496, "learning_rate": 6.590327100545972e-07, "loss": 0.3078, "step": 7057 }, { "epoch": 0.3410156061264918, "grad_norm": 2.5493991374969482, "learning_rate": 6.589843938735082e-07, "loss": 0.2811, "step": 7058 }, { "epoch": 0.3410639223075808, "grad_norm": 1.8135101795196533, "learning_rate": 6.589360776924192e-07, "loss": 0.2007, "step": 7059 }, { "epoch": 0.34111223848866984, "grad_norm": 3.2449355125427246, "learning_rate": 6.588877615113301e-07, "loss": 0.2235, "step": 7060 }, { "epoch": 0.3411605546697589, "grad_norm": 1.6969585418701172, "learning_rate": 6.588394453302411e-07, "loss": 0.1656, "step": 7061 }, { "epoch": 0.34120887085084795, "grad_norm": 2.622168779373169, "learning_rate": 6.587911291491521e-07, "loss": 0.3356, "step": 7062 }, { "epoch": 0.34125718703193697, "grad_norm": 5.953306198120117, "learning_rate": 6.58742812968063e-07, "loss": 0.36, "step": 7063 }, { "epoch": 0.34130550321302605, "grad_norm": 2.188539505004883, "learning_rate": 6.58694496786974e-07, "loss": 0.2416, "step": 7064 }, { "epoch": 0.3413538193941151, "grad_norm": 2.3695287704467773, "learning_rate": 6.586461806058848e-07, "loss": 0.2645, "step": 7065 }, { "epoch": 0.34140213557520416, "grad_norm": 2.955674886703491, "learning_rate": 6.585978644247958e-07, "loss": 0.2455, "step": 7066 }, { "epoch": 0.3414504517562932, "grad_norm": 3.283731698989868, "learning_rate": 6.585495482437068e-07, "loss": 0.4177, "step": 7067 }, { "epoch": 0.3414987679373822, "grad_norm": 2.1308786869049072, "learning_rate": 6.585012320626178e-07, "loss": 0.2266, "step": 7068 }, { "epoch": 0.3415470841184713, "grad_norm": 3.0775461196899414, "learning_rate": 6.584529158815287e-07, "loss": 0.421, "step": 7069 }, { "epoch": 0.3415954002995603, "grad_norm": 1.8976026773452759, "learning_rate": 6.584045997004396e-07, "loss": 0.1693, "step": 7070 }, { "epoch": 0.3416437164806494, "grad_norm": 2.8324825763702393, "learning_rate": 6.583562835193506e-07, "loss": 0.4197, "step": 7071 }, { "epoch": 0.3416920326617384, "grad_norm": 4.009576797485352, "learning_rate": 6.583079673382616e-07, "loss": 0.4989, "step": 7072 }, { "epoch": 0.34174034884282745, "grad_norm": 1.9864013195037842, "learning_rate": 6.582596511571725e-07, "loss": 0.2008, "step": 7073 }, { "epoch": 0.3417886650239165, "grad_norm": 2.266451120376587, "learning_rate": 6.582113349760835e-07, "loss": 0.305, "step": 7074 }, { "epoch": 0.34183698120500555, "grad_norm": 3.071361780166626, "learning_rate": 6.581630187949943e-07, "loss": 0.3814, "step": 7075 }, { "epoch": 0.3418852973860946, "grad_norm": 2.2999677658081055, "learning_rate": 6.581147026139053e-07, "loss": 0.2196, "step": 7076 }, { "epoch": 0.34193361356718366, "grad_norm": 3.126711368560791, "learning_rate": 6.580663864328163e-07, "loss": 0.2783, "step": 7077 }, { "epoch": 0.3419819297482727, "grad_norm": 2.427781820297241, "learning_rate": 6.580180702517273e-07, "loss": 0.2052, "step": 7078 }, { "epoch": 0.34203024592936176, "grad_norm": 3.1698148250579834, "learning_rate": 6.579697540706383e-07, "loss": 0.2844, "step": 7079 }, { "epoch": 0.3420785621104508, "grad_norm": 2.3696672916412354, "learning_rate": 6.579214378895493e-07, "loss": 0.2397, "step": 7080 }, { "epoch": 0.3421268782915398, "grad_norm": 3.2357072830200195, "learning_rate": 6.578731217084602e-07, "loss": 0.3558, "step": 7081 }, { "epoch": 0.3421751944726289, "grad_norm": 4.12910270690918, "learning_rate": 6.57824805527371e-07, "loss": 0.1906, "step": 7082 }, { "epoch": 0.3422235106537179, "grad_norm": 11.563100814819336, "learning_rate": 6.57776489346282e-07, "loss": 0.4245, "step": 7083 }, { "epoch": 0.342271826834807, "grad_norm": 4.97351598739624, "learning_rate": 6.57728173165193e-07, "loss": 0.2119, "step": 7084 }, { "epoch": 0.34232014301589603, "grad_norm": 2.596299409866333, "learning_rate": 6.57679856984104e-07, "loss": 0.2742, "step": 7085 }, { "epoch": 0.34236845919698505, "grad_norm": 3.458930015563965, "learning_rate": 6.576315408030149e-07, "loss": 0.2782, "step": 7086 }, { "epoch": 0.34241677537807413, "grad_norm": 1.8440496921539307, "learning_rate": 6.575832246219259e-07, "loss": 0.2637, "step": 7087 }, { "epoch": 0.34246509155916316, "grad_norm": 1.9526779651641846, "learning_rate": 6.575349084408368e-07, "loss": 0.213, "step": 7088 }, { "epoch": 0.3425134077402522, "grad_norm": 3.3505172729492188, "learning_rate": 6.574865922597478e-07, "loss": 0.4195, "step": 7089 }, { "epoch": 0.34256172392134127, "grad_norm": 3.1441426277160645, "learning_rate": 6.574382760786587e-07, "loss": 0.3416, "step": 7090 }, { "epoch": 0.3426100401024303, "grad_norm": 2.4649600982666016, "learning_rate": 6.573899598975696e-07, "loss": 0.277, "step": 7091 }, { "epoch": 0.34265835628351937, "grad_norm": 2.6142616271972656, "learning_rate": 6.573416437164806e-07, "loss": 0.2674, "step": 7092 }, { "epoch": 0.3427066724646084, "grad_norm": 6.766477584838867, "learning_rate": 6.572933275353916e-07, "loss": 0.2782, "step": 7093 }, { "epoch": 0.3427549886456974, "grad_norm": 2.315415382385254, "learning_rate": 6.572450113543026e-07, "loss": 0.2664, "step": 7094 }, { "epoch": 0.3428033048267865, "grad_norm": 3.336756706237793, "learning_rate": 6.571966951732135e-07, "loss": 0.3203, "step": 7095 }, { "epoch": 0.34285162100787553, "grad_norm": 2.6971359252929688, "learning_rate": 6.571483789921244e-07, "loss": 0.3138, "step": 7096 }, { "epoch": 0.3428999371889646, "grad_norm": 2.861187219619751, "learning_rate": 6.571000628110354e-07, "loss": 0.3306, "step": 7097 }, { "epoch": 0.34294825337005364, "grad_norm": 3.4871747493743896, "learning_rate": 6.570517466299463e-07, "loss": 0.3466, "step": 7098 }, { "epoch": 0.34299656955114266, "grad_norm": 2.097536325454712, "learning_rate": 6.570034304488573e-07, "loss": 0.2176, "step": 7099 }, { "epoch": 0.34304488573223174, "grad_norm": 6.13468074798584, "learning_rate": 6.569551142677683e-07, "loss": 0.3062, "step": 7100 }, { "epoch": 0.34309320191332077, "grad_norm": 2.671536445617676, "learning_rate": 6.569067980866791e-07, "loss": 0.2526, "step": 7101 }, { "epoch": 0.3431415180944098, "grad_norm": 3.524784564971924, "learning_rate": 6.568584819055901e-07, "loss": 0.3262, "step": 7102 }, { "epoch": 0.3431898342754989, "grad_norm": 58.635528564453125, "learning_rate": 6.568101657245011e-07, "loss": 0.2569, "step": 7103 }, { "epoch": 0.3432381504565879, "grad_norm": 2.216920852661133, "learning_rate": 6.567618495434121e-07, "loss": 0.2627, "step": 7104 }, { "epoch": 0.343286466637677, "grad_norm": 2.5688040256500244, "learning_rate": 6.567135333623231e-07, "loss": 0.2701, "step": 7105 }, { "epoch": 0.343334782818766, "grad_norm": 3.9086685180664062, "learning_rate": 6.56665217181234e-07, "loss": 0.4956, "step": 7106 }, { "epoch": 0.34338309899985503, "grad_norm": 1.5707470178604126, "learning_rate": 6.566169010001448e-07, "loss": 0.1366, "step": 7107 }, { "epoch": 0.3434314151809441, "grad_norm": 2.013742685317993, "learning_rate": 6.565685848190558e-07, "loss": 0.2407, "step": 7108 }, { "epoch": 0.34347973136203314, "grad_norm": 2.7799367904663086, "learning_rate": 6.565202686379668e-07, "loss": 0.2645, "step": 7109 }, { "epoch": 0.3435280475431222, "grad_norm": 4.61886739730835, "learning_rate": 6.564719524568778e-07, "loss": 0.377, "step": 7110 }, { "epoch": 0.34357636372421124, "grad_norm": 3.0060276985168457, "learning_rate": 6.564236362757888e-07, "loss": 0.4014, "step": 7111 }, { "epoch": 0.34362467990530027, "grad_norm": 2.537027359008789, "learning_rate": 6.563753200946997e-07, "loss": 0.3435, "step": 7112 }, { "epoch": 0.34367299608638935, "grad_norm": 3.8480451107025146, "learning_rate": 6.563270039136107e-07, "loss": 0.3329, "step": 7113 }, { "epoch": 0.3437213122674784, "grad_norm": 7.298872947692871, "learning_rate": 6.562786877325216e-07, "loss": 0.3093, "step": 7114 }, { "epoch": 0.3437696284485674, "grad_norm": 3.4066286087036133, "learning_rate": 6.562303715514325e-07, "loss": 0.4358, "step": 7115 }, { "epoch": 0.3438179446296565, "grad_norm": 2.489006280899048, "learning_rate": 6.561820553703435e-07, "loss": 0.3233, "step": 7116 }, { "epoch": 0.3438662608107455, "grad_norm": 2.407294988632202, "learning_rate": 6.561337391892544e-07, "loss": 0.2907, "step": 7117 }, { "epoch": 0.3439145769918346, "grad_norm": 2.25249981880188, "learning_rate": 6.560854230081654e-07, "loss": 0.2153, "step": 7118 }, { "epoch": 0.3439628931729236, "grad_norm": 2.984835147857666, "learning_rate": 6.560371068270764e-07, "loss": 0.3784, "step": 7119 }, { "epoch": 0.34401120935401264, "grad_norm": 5.590671539306641, "learning_rate": 6.559887906459873e-07, "loss": 0.2272, "step": 7120 }, { "epoch": 0.3440595255351017, "grad_norm": 3.098445415496826, "learning_rate": 6.559404744648983e-07, "loss": 0.274, "step": 7121 }, { "epoch": 0.34410784171619074, "grad_norm": 2.2878291606903076, "learning_rate": 6.558921582838092e-07, "loss": 0.264, "step": 7122 }, { "epoch": 0.3441561578972798, "grad_norm": 4.280829906463623, "learning_rate": 6.558438421027202e-07, "loss": 0.3243, "step": 7123 }, { "epoch": 0.34420447407836885, "grad_norm": 12.62368392944336, "learning_rate": 6.557955259216311e-07, "loss": 0.3733, "step": 7124 }, { "epoch": 0.3442527902594579, "grad_norm": 3.2572009563446045, "learning_rate": 6.557472097405421e-07, "loss": 0.1956, "step": 7125 }, { "epoch": 0.34430110644054696, "grad_norm": 2.878680944442749, "learning_rate": 6.556988935594531e-07, "loss": 0.3944, "step": 7126 }, { "epoch": 0.344349422621636, "grad_norm": 2.414224147796631, "learning_rate": 6.556505773783639e-07, "loss": 0.2997, "step": 7127 }, { "epoch": 0.344397738802725, "grad_norm": 4.863565444946289, "learning_rate": 6.556022611972749e-07, "loss": 0.3387, "step": 7128 }, { "epoch": 0.3444460549838141, "grad_norm": 2.8724138736724854, "learning_rate": 6.555539450161859e-07, "loss": 0.3081, "step": 7129 }, { "epoch": 0.3444943711649031, "grad_norm": 2.529139995574951, "learning_rate": 6.555056288350969e-07, "loss": 0.3268, "step": 7130 }, { "epoch": 0.3445426873459922, "grad_norm": 2.8775126934051514, "learning_rate": 6.554573126540079e-07, "loss": 0.4082, "step": 7131 }, { "epoch": 0.3445910035270812, "grad_norm": 3.016849994659424, "learning_rate": 6.554089964729187e-07, "loss": 0.3727, "step": 7132 }, { "epoch": 0.34463931970817024, "grad_norm": 2.4851200580596924, "learning_rate": 6.553606802918296e-07, "loss": 0.3409, "step": 7133 }, { "epoch": 0.3446876358892593, "grad_norm": 3.4871723651885986, "learning_rate": 6.553123641107406e-07, "loss": 0.3507, "step": 7134 }, { "epoch": 0.34473595207034835, "grad_norm": 1.970779538154602, "learning_rate": 6.552640479296516e-07, "loss": 0.2071, "step": 7135 }, { "epoch": 0.34478426825143743, "grad_norm": 2.8310580253601074, "learning_rate": 6.552157317485626e-07, "loss": 0.3179, "step": 7136 }, { "epoch": 0.34483258443252646, "grad_norm": 2.761728286743164, "learning_rate": 6.551674155674736e-07, "loss": 0.1953, "step": 7137 }, { "epoch": 0.3448809006136155, "grad_norm": 2.085125684738159, "learning_rate": 6.551190993863845e-07, "loss": 0.1789, "step": 7138 }, { "epoch": 0.34492921679470456, "grad_norm": 2.2189688682556152, "learning_rate": 6.550707832052954e-07, "loss": 0.2426, "step": 7139 }, { "epoch": 0.3449775329757936, "grad_norm": 6.192595958709717, "learning_rate": 6.550224670242063e-07, "loss": 0.4849, "step": 7140 }, { "epoch": 0.3450258491568826, "grad_norm": 2.9431674480438232, "learning_rate": 6.549741508431173e-07, "loss": 0.2279, "step": 7141 }, { "epoch": 0.3450741653379717, "grad_norm": 2.432556629180908, "learning_rate": 6.549258346620283e-07, "loss": 0.2727, "step": 7142 }, { "epoch": 0.3451224815190607, "grad_norm": 4.145018100738525, "learning_rate": 6.548775184809392e-07, "loss": 0.3032, "step": 7143 }, { "epoch": 0.3451707977001498, "grad_norm": 1.9444801807403564, "learning_rate": 6.548292022998502e-07, "loss": 0.1933, "step": 7144 }, { "epoch": 0.3452191138812388, "grad_norm": 2.821579933166504, "learning_rate": 6.547808861187612e-07, "loss": 0.392, "step": 7145 }, { "epoch": 0.34526743006232785, "grad_norm": 2.466700792312622, "learning_rate": 6.547325699376721e-07, "loss": 0.2744, "step": 7146 }, { "epoch": 0.34531574624341693, "grad_norm": 3.009716510772705, "learning_rate": 6.546842537565831e-07, "loss": 0.3893, "step": 7147 }, { "epoch": 0.34536406242450596, "grad_norm": 8.442316055297852, "learning_rate": 6.54635937575494e-07, "loss": 0.452, "step": 7148 }, { "epoch": 0.34541237860559504, "grad_norm": 3.1757559776306152, "learning_rate": 6.545876213944049e-07, "loss": 0.3921, "step": 7149 }, { "epoch": 0.34546069478668406, "grad_norm": 3.585191249847412, "learning_rate": 6.545393052133159e-07, "loss": 0.4489, "step": 7150 }, { "epoch": 0.3455090109677731, "grad_norm": 1.5776597261428833, "learning_rate": 6.544909890322269e-07, "loss": 0.1635, "step": 7151 }, { "epoch": 0.34555732714886217, "grad_norm": 2.520625591278076, "learning_rate": 6.544426728511378e-07, "loss": 0.2992, "step": 7152 }, { "epoch": 0.3456056433299512, "grad_norm": 2.2294363975524902, "learning_rate": 6.543943566700487e-07, "loss": 0.2751, "step": 7153 }, { "epoch": 0.3456539595110402, "grad_norm": 2.7686116695404053, "learning_rate": 6.543460404889597e-07, "loss": 0.3555, "step": 7154 }, { "epoch": 0.3457022756921293, "grad_norm": 2.439690589904785, "learning_rate": 6.542977243078707e-07, "loss": 0.2769, "step": 7155 }, { "epoch": 0.3457505918732183, "grad_norm": 2.7962429523468018, "learning_rate": 6.542494081267817e-07, "loss": 0.326, "step": 7156 }, { "epoch": 0.3457989080543074, "grad_norm": 2.5589864253997803, "learning_rate": 6.542010919456927e-07, "loss": 0.3443, "step": 7157 }, { "epoch": 0.34584722423539643, "grad_norm": 5.180351257324219, "learning_rate": 6.541527757646034e-07, "loss": 0.4265, "step": 7158 }, { "epoch": 0.34589554041648546, "grad_norm": 2.9138386249542236, "learning_rate": 6.541044595835144e-07, "loss": 0.3512, "step": 7159 }, { "epoch": 0.34594385659757454, "grad_norm": 4.784990310668945, "learning_rate": 6.540561434024254e-07, "loss": 0.4181, "step": 7160 }, { "epoch": 0.34599217277866356, "grad_norm": 3.3210129737854004, "learning_rate": 6.540078272213364e-07, "loss": 0.4167, "step": 7161 }, { "epoch": 0.34604048895975265, "grad_norm": 106.25792694091797, "learning_rate": 6.539595110402474e-07, "loss": 0.2156, "step": 7162 }, { "epoch": 0.34608880514084167, "grad_norm": 4.282296657562256, "learning_rate": 6.539111948591584e-07, "loss": 0.3239, "step": 7163 }, { "epoch": 0.3461371213219307, "grad_norm": 2.975780963897705, "learning_rate": 6.538628786780693e-07, "loss": 0.3062, "step": 7164 }, { "epoch": 0.3461854375030198, "grad_norm": 3.0568737983703613, "learning_rate": 6.538145624969802e-07, "loss": 0.1578, "step": 7165 }, { "epoch": 0.3462337536841088, "grad_norm": 5.658298969268799, "learning_rate": 6.537662463158911e-07, "loss": 0.2879, "step": 7166 }, { "epoch": 0.34628206986519783, "grad_norm": 2.0903139114379883, "learning_rate": 6.537179301348021e-07, "loss": 0.2784, "step": 7167 }, { "epoch": 0.3463303860462869, "grad_norm": 2.4627952575683594, "learning_rate": 6.536696139537131e-07, "loss": 0.3514, "step": 7168 }, { "epoch": 0.34637870222737593, "grad_norm": 2.564100980758667, "learning_rate": 6.53621297772624e-07, "loss": 0.2803, "step": 7169 }, { "epoch": 0.346427018408465, "grad_norm": 4.497776985168457, "learning_rate": 6.53572981591535e-07, "loss": 0.4176, "step": 7170 }, { "epoch": 0.34647533458955404, "grad_norm": 4.533310413360596, "learning_rate": 6.535246654104459e-07, "loss": 0.3489, "step": 7171 }, { "epoch": 0.34652365077064307, "grad_norm": 2.43475341796875, "learning_rate": 6.534763492293569e-07, "loss": 0.353, "step": 7172 }, { "epoch": 0.34657196695173215, "grad_norm": 2.8312690258026123, "learning_rate": 6.534280330482679e-07, "loss": 0.4427, "step": 7173 }, { "epoch": 0.34662028313282117, "grad_norm": 2.497964382171631, "learning_rate": 6.533797168671787e-07, "loss": 0.3034, "step": 7174 }, { "epoch": 0.34666859931391025, "grad_norm": 2.669813394546509, "learning_rate": 6.533314006860897e-07, "loss": 0.3145, "step": 7175 }, { "epoch": 0.3467169154949993, "grad_norm": 2.757650136947632, "learning_rate": 6.532830845050007e-07, "loss": 0.2349, "step": 7176 }, { "epoch": 0.3467652316760883, "grad_norm": 2.4392049312591553, "learning_rate": 6.532347683239117e-07, "loss": 0.2851, "step": 7177 }, { "epoch": 0.3468135478571774, "grad_norm": 2.8695766925811768, "learning_rate": 6.531864521428226e-07, "loss": 0.3358, "step": 7178 }, { "epoch": 0.3468618640382664, "grad_norm": 2.9774303436279297, "learning_rate": 6.531381359617335e-07, "loss": 0.3627, "step": 7179 }, { "epoch": 0.34691018021935544, "grad_norm": 4.471482276916504, "learning_rate": 6.530898197806445e-07, "loss": 0.2965, "step": 7180 }, { "epoch": 0.3469584964004445, "grad_norm": 1.5740197896957397, "learning_rate": 6.530415035995555e-07, "loss": 0.1456, "step": 7181 }, { "epoch": 0.34700681258153354, "grad_norm": 1.6024256944656372, "learning_rate": 6.529931874184665e-07, "loss": 0.1796, "step": 7182 }, { "epoch": 0.3470551287626226, "grad_norm": 2.533632516860962, "learning_rate": 6.529448712373774e-07, "loss": 0.3405, "step": 7183 }, { "epoch": 0.34710344494371165, "grad_norm": 2.1813082695007324, "learning_rate": 6.528965550562882e-07, "loss": 0.1834, "step": 7184 }, { "epoch": 0.3471517611248007, "grad_norm": 3.2006030082702637, "learning_rate": 6.528482388751992e-07, "loss": 0.2998, "step": 7185 }, { "epoch": 0.34720007730588975, "grad_norm": 2.254542589187622, "learning_rate": 6.527999226941102e-07, "loss": 0.2412, "step": 7186 }, { "epoch": 0.3472483934869788, "grad_norm": 2.8499350547790527, "learning_rate": 6.527516065130212e-07, "loss": 0.3373, "step": 7187 }, { "epoch": 0.34729670966806786, "grad_norm": 4.244269371032715, "learning_rate": 6.527032903319322e-07, "loss": 0.1873, "step": 7188 }, { "epoch": 0.3473450258491569, "grad_norm": 2.308842897415161, "learning_rate": 6.526549741508432e-07, "loss": 0.2561, "step": 7189 }, { "epoch": 0.3473933420302459, "grad_norm": 3.4809634685516357, "learning_rate": 6.52606657969754e-07, "loss": 0.3771, "step": 7190 }, { "epoch": 0.347441658211335, "grad_norm": 2.42677903175354, "learning_rate": 6.525583417886649e-07, "loss": 0.2848, "step": 7191 }, { "epoch": 0.347489974392424, "grad_norm": 4.95301628112793, "learning_rate": 6.525100256075759e-07, "loss": 0.3793, "step": 7192 }, { "epoch": 0.34753829057351304, "grad_norm": 3.8611068725585938, "learning_rate": 6.524617094264869e-07, "loss": 0.4539, "step": 7193 }, { "epoch": 0.3475866067546021, "grad_norm": 2.8988475799560547, "learning_rate": 6.524133932453979e-07, "loss": 0.3138, "step": 7194 }, { "epoch": 0.34763492293569115, "grad_norm": 1.8919187784194946, "learning_rate": 6.523650770643088e-07, "loss": 0.2603, "step": 7195 }, { "epoch": 0.34768323911678023, "grad_norm": 3.6634557247161865, "learning_rate": 6.523167608832198e-07, "loss": 0.2264, "step": 7196 }, { "epoch": 0.34773155529786925, "grad_norm": 2.591921329498291, "learning_rate": 6.522684447021307e-07, "loss": 0.3471, "step": 7197 }, { "epoch": 0.3477798714789583, "grad_norm": 5.55391788482666, "learning_rate": 6.522201285210417e-07, "loss": 0.2787, "step": 7198 }, { "epoch": 0.34782818766004736, "grad_norm": 4.074551582336426, "learning_rate": 6.521718123399527e-07, "loss": 0.3069, "step": 7199 }, { "epoch": 0.3478765038411364, "grad_norm": 2.204968214035034, "learning_rate": 6.521234961588635e-07, "loss": 0.2634, "step": 7200 }, { "epoch": 0.34792482002222547, "grad_norm": 2.8117856979370117, "learning_rate": 6.520751799777745e-07, "loss": 0.343, "step": 7201 }, { "epoch": 0.3479731362033145, "grad_norm": 2.9359357357025146, "learning_rate": 6.520268637966855e-07, "loss": 0.3029, "step": 7202 }, { "epoch": 0.3480214523844035, "grad_norm": 3.3292596340179443, "learning_rate": 6.519785476155964e-07, "loss": 0.2515, "step": 7203 }, { "epoch": 0.3480697685654926, "grad_norm": 2.335554361343384, "learning_rate": 6.519302314345074e-07, "loss": 0.2342, "step": 7204 }, { "epoch": 0.3481180847465816, "grad_norm": 3.021153211593628, "learning_rate": 6.518819152534183e-07, "loss": 0.3235, "step": 7205 }, { "epoch": 0.34816640092767065, "grad_norm": 3.040269613265991, "learning_rate": 6.518335990723293e-07, "loss": 0.4876, "step": 7206 }, { "epoch": 0.34821471710875973, "grad_norm": 2.26585054397583, "learning_rate": 6.517852828912403e-07, "loss": 0.2867, "step": 7207 }, { "epoch": 0.34826303328984876, "grad_norm": 3.365788698196411, "learning_rate": 6.517369667101512e-07, "loss": 0.4237, "step": 7208 }, { "epoch": 0.34831134947093784, "grad_norm": 14.589447021484375, "learning_rate": 6.516886505290622e-07, "loss": 0.1988, "step": 7209 }, { "epoch": 0.34835966565202686, "grad_norm": 3.083463668823242, "learning_rate": 6.51640334347973e-07, "loss": 0.3072, "step": 7210 }, { "epoch": 0.3484079818331159, "grad_norm": 2.0435030460357666, "learning_rate": 6.51592018166884e-07, "loss": 0.2248, "step": 7211 }, { "epoch": 0.34845629801420497, "grad_norm": 2.8546926975250244, "learning_rate": 6.51543701985795e-07, "loss": 0.3022, "step": 7212 }, { "epoch": 0.348504614195294, "grad_norm": 3.527466297149658, "learning_rate": 6.51495385804706e-07, "loss": 0.203, "step": 7213 }, { "epoch": 0.3485529303763831, "grad_norm": 5.271095275878906, "learning_rate": 6.51447069623617e-07, "loss": 0.2815, "step": 7214 }, { "epoch": 0.3486012465574721, "grad_norm": 6.438145160675049, "learning_rate": 6.51398753442528e-07, "loss": 0.3177, "step": 7215 }, { "epoch": 0.3486495627385611, "grad_norm": 2.6131253242492676, "learning_rate": 6.513504372614387e-07, "loss": 0.3197, "step": 7216 }, { "epoch": 0.3486978789196502, "grad_norm": 2.408730983734131, "learning_rate": 6.513021210803497e-07, "loss": 0.3551, "step": 7217 }, { "epoch": 0.34874619510073923, "grad_norm": 2.4265010356903076, "learning_rate": 6.512538048992607e-07, "loss": 0.2716, "step": 7218 }, { "epoch": 0.34879451128182826, "grad_norm": 2.0183939933776855, "learning_rate": 6.512054887181717e-07, "loss": 0.1701, "step": 7219 }, { "epoch": 0.34884282746291734, "grad_norm": 2.5821540355682373, "learning_rate": 6.511571725370827e-07, "loss": 0.3001, "step": 7220 }, { "epoch": 0.34889114364400636, "grad_norm": 1.8253127336502075, "learning_rate": 6.511088563559936e-07, "loss": 0.2161, "step": 7221 }, { "epoch": 0.34893945982509544, "grad_norm": 2.207174301147461, "learning_rate": 6.510605401749045e-07, "loss": 0.242, "step": 7222 }, { "epoch": 0.34898777600618447, "grad_norm": 2.0696611404418945, "learning_rate": 6.510122239938155e-07, "loss": 0.2257, "step": 7223 }, { "epoch": 0.3490360921872735, "grad_norm": 2.850651502609253, "learning_rate": 6.509639078127265e-07, "loss": 0.3278, "step": 7224 }, { "epoch": 0.3490844083683626, "grad_norm": 2.445638656616211, "learning_rate": 6.509155916316374e-07, "loss": 0.3603, "step": 7225 }, { "epoch": 0.3491327245494516, "grad_norm": 2.933257818222046, "learning_rate": 6.508672754505483e-07, "loss": 0.3507, "step": 7226 }, { "epoch": 0.3491810407305407, "grad_norm": 2.380038022994995, "learning_rate": 6.508189592694593e-07, "loss": 0.2667, "step": 7227 }, { "epoch": 0.3492293569116297, "grad_norm": 2.2476587295532227, "learning_rate": 6.507706430883703e-07, "loss": 0.265, "step": 7228 }, { "epoch": 0.34927767309271873, "grad_norm": 2.5476438999176025, "learning_rate": 6.507223269072812e-07, "loss": 0.2673, "step": 7229 }, { "epoch": 0.3493259892738078, "grad_norm": 6.421016693115234, "learning_rate": 6.506740107261922e-07, "loss": 0.2409, "step": 7230 }, { "epoch": 0.34937430545489684, "grad_norm": 2.965137004852295, "learning_rate": 6.506256945451031e-07, "loss": 0.2567, "step": 7231 }, { "epoch": 0.3494226216359859, "grad_norm": 3.2038376331329346, "learning_rate": 6.505773783640141e-07, "loss": 0.1653, "step": 7232 }, { "epoch": 0.34947093781707494, "grad_norm": 4.393511772155762, "learning_rate": 6.50529062182925e-07, "loss": 0.3307, "step": 7233 }, { "epoch": 0.34951925399816397, "grad_norm": 2.067654609680176, "learning_rate": 6.50480746001836e-07, "loss": 0.1743, "step": 7234 }, { "epoch": 0.34956757017925305, "grad_norm": 2.3099498748779297, "learning_rate": 6.504324298207469e-07, "loss": 0.2551, "step": 7235 }, { "epoch": 0.3496158863603421, "grad_norm": 2.768059492111206, "learning_rate": 6.503841136396578e-07, "loss": 0.2451, "step": 7236 }, { "epoch": 0.3496642025414311, "grad_norm": 2.1975953578948975, "learning_rate": 6.503357974585688e-07, "loss": 0.2851, "step": 7237 }, { "epoch": 0.3497125187225202, "grad_norm": 2.810148239135742, "learning_rate": 6.502874812774798e-07, "loss": 0.267, "step": 7238 }, { "epoch": 0.3497608349036092, "grad_norm": 5.8370585441589355, "learning_rate": 6.502391650963908e-07, "loss": 0.2923, "step": 7239 }, { "epoch": 0.3498091510846983, "grad_norm": 5.597393989562988, "learning_rate": 6.501908489153018e-07, "loss": 0.3999, "step": 7240 }, { "epoch": 0.3498574672657873, "grad_norm": 3.822725296020508, "learning_rate": 6.501425327342128e-07, "loss": 0.3068, "step": 7241 }, { "epoch": 0.34990578344687634, "grad_norm": 2.190681219100952, "learning_rate": 6.500942165531235e-07, "loss": 0.275, "step": 7242 }, { "epoch": 0.3499540996279654, "grad_norm": 3.114225149154663, "learning_rate": 6.500459003720345e-07, "loss": 0.3877, "step": 7243 }, { "epoch": 0.35000241580905445, "grad_norm": 2.5013933181762695, "learning_rate": 6.499975841909455e-07, "loss": 0.254, "step": 7244 }, { "epoch": 0.3500507319901435, "grad_norm": 5.322737693786621, "learning_rate": 6.499492680098565e-07, "loss": 0.4421, "step": 7245 }, { "epoch": 0.35009904817123255, "grad_norm": 4.664974689483643, "learning_rate": 6.499009518287675e-07, "loss": 0.5147, "step": 7246 }, { "epoch": 0.3501473643523216, "grad_norm": 2.906722068786621, "learning_rate": 6.498526356476784e-07, "loss": 0.3775, "step": 7247 }, { "epoch": 0.35019568053341066, "grad_norm": 9.634842872619629, "learning_rate": 6.498043194665893e-07, "loss": 0.2354, "step": 7248 }, { "epoch": 0.3502439967144997, "grad_norm": 3.0766923427581787, "learning_rate": 6.497560032855003e-07, "loss": 0.2272, "step": 7249 }, { "epoch": 0.3502923128955887, "grad_norm": 2.667397975921631, "learning_rate": 6.497076871044112e-07, "loss": 0.2572, "step": 7250 }, { "epoch": 0.3503406290766778, "grad_norm": 1.3809531927108765, "learning_rate": 6.496593709233222e-07, "loss": 0.1719, "step": 7251 }, { "epoch": 0.3503889452577668, "grad_norm": 3.3257009983062744, "learning_rate": 6.496110547422331e-07, "loss": 0.3327, "step": 7252 }, { "epoch": 0.3504372614388559, "grad_norm": 2.534202814102173, "learning_rate": 6.495627385611441e-07, "loss": 0.307, "step": 7253 }, { "epoch": 0.3504855776199449, "grad_norm": 2.4391613006591797, "learning_rate": 6.49514422380055e-07, "loss": 0.2338, "step": 7254 }, { "epoch": 0.35053389380103395, "grad_norm": 5.567183494567871, "learning_rate": 6.49466106198966e-07, "loss": 0.3168, "step": 7255 }, { "epoch": 0.350582209982123, "grad_norm": 3.3319199085235596, "learning_rate": 6.49417790017877e-07, "loss": 0.3867, "step": 7256 }, { "epoch": 0.35063052616321205, "grad_norm": 2.002600908279419, "learning_rate": 6.493694738367879e-07, "loss": 0.2428, "step": 7257 }, { "epoch": 0.35067884234430113, "grad_norm": 3.923593044281006, "learning_rate": 6.493211576556989e-07, "loss": 0.3241, "step": 7258 }, { "epoch": 0.35072715852539016, "grad_norm": 3.7177114486694336, "learning_rate": 6.492728414746098e-07, "loss": 0.3036, "step": 7259 }, { "epoch": 0.3507754747064792, "grad_norm": 2.1653456687927246, "learning_rate": 6.492245252935208e-07, "loss": 0.2359, "step": 7260 }, { "epoch": 0.35082379088756827, "grad_norm": 8.94600772857666, "learning_rate": 6.491762091124317e-07, "loss": 0.3532, "step": 7261 }, { "epoch": 0.3508721070686573, "grad_norm": 2.372523069381714, "learning_rate": 6.491278929313426e-07, "loss": 0.2808, "step": 7262 }, { "epoch": 0.3509204232497463, "grad_norm": 2.6592023372650146, "learning_rate": 6.490795767502536e-07, "loss": 0.2587, "step": 7263 }, { "epoch": 0.3509687394308354, "grad_norm": 2.955264091491699, "learning_rate": 6.490312605691646e-07, "loss": 0.2837, "step": 7264 }, { "epoch": 0.3510170556119244, "grad_norm": 2.4419796466827393, "learning_rate": 6.489829443880756e-07, "loss": 0.2962, "step": 7265 }, { "epoch": 0.3510653717930135, "grad_norm": 2.7811386585235596, "learning_rate": 6.489346282069866e-07, "loss": 0.3706, "step": 7266 }, { "epoch": 0.35111368797410253, "grad_norm": 2.4785032272338867, "learning_rate": 6.488863120258974e-07, "loss": 0.256, "step": 7267 }, { "epoch": 0.35116200415519155, "grad_norm": 1.9617887735366821, "learning_rate": 6.488379958448083e-07, "loss": 0.2427, "step": 7268 }, { "epoch": 0.35121032033628063, "grad_norm": 3.382272481918335, "learning_rate": 6.487896796637193e-07, "loss": 0.3007, "step": 7269 }, { "epoch": 0.35125863651736966, "grad_norm": 2.4340856075286865, "learning_rate": 6.487413634826303e-07, "loss": 0.354, "step": 7270 }, { "epoch": 0.35130695269845874, "grad_norm": 2.2445929050445557, "learning_rate": 6.486930473015413e-07, "loss": 0.3038, "step": 7271 }, { "epoch": 0.35135526887954777, "grad_norm": 5.46887731552124, "learning_rate": 6.486447311204523e-07, "loss": 0.2483, "step": 7272 }, { "epoch": 0.3514035850606368, "grad_norm": 2.943952798843384, "learning_rate": 6.485964149393631e-07, "loss": 0.3166, "step": 7273 }, { "epoch": 0.3514519012417259, "grad_norm": 2.3970799446105957, "learning_rate": 6.485480987582741e-07, "loss": 0.2672, "step": 7274 }, { "epoch": 0.3515002174228149, "grad_norm": 2.2217867374420166, "learning_rate": 6.48499782577185e-07, "loss": 0.256, "step": 7275 }, { "epoch": 0.3515485336039039, "grad_norm": 2.1833136081695557, "learning_rate": 6.48451466396096e-07, "loss": 0.2767, "step": 7276 }, { "epoch": 0.351596849784993, "grad_norm": 3.294130802154541, "learning_rate": 6.48403150215007e-07, "loss": 0.2507, "step": 7277 }, { "epoch": 0.35164516596608203, "grad_norm": 2.9932329654693604, "learning_rate": 6.483548340339179e-07, "loss": 0.3939, "step": 7278 }, { "epoch": 0.3516934821471711, "grad_norm": 5.416886806488037, "learning_rate": 6.483065178528289e-07, "loss": 0.3446, "step": 7279 }, { "epoch": 0.35174179832826014, "grad_norm": 2.74025297164917, "learning_rate": 6.482582016717398e-07, "loss": 0.3924, "step": 7280 }, { "epoch": 0.35179011450934916, "grad_norm": 2.9131391048431396, "learning_rate": 6.482098854906508e-07, "loss": 0.3634, "step": 7281 }, { "epoch": 0.35183843069043824, "grad_norm": 2.873852491378784, "learning_rate": 6.481615693095618e-07, "loss": 0.2377, "step": 7282 }, { "epoch": 0.35188674687152727, "grad_norm": 1.407894492149353, "learning_rate": 6.481132531284727e-07, "loss": 0.1855, "step": 7283 }, { "epoch": 0.35193506305261635, "grad_norm": 3.0009605884552, "learning_rate": 6.480649369473836e-07, "loss": 0.3181, "step": 7284 }, { "epoch": 0.3519833792337054, "grad_norm": 2.2203755378723145, "learning_rate": 6.480166207662946e-07, "loss": 0.3158, "step": 7285 }, { "epoch": 0.3520316954147944, "grad_norm": 2.3984076976776123, "learning_rate": 6.479683045852055e-07, "loss": 0.3183, "step": 7286 }, { "epoch": 0.3520800115958835, "grad_norm": 3.324401617050171, "learning_rate": 6.479199884041165e-07, "loss": 0.3301, "step": 7287 }, { "epoch": 0.3521283277769725, "grad_norm": 1.9442683458328247, "learning_rate": 6.478716722230274e-07, "loss": 0.2359, "step": 7288 }, { "epoch": 0.35217664395806153, "grad_norm": 2.6384029388427734, "learning_rate": 6.478233560419384e-07, "loss": 0.2735, "step": 7289 }, { "epoch": 0.3522249601391506, "grad_norm": 1.8765250444412231, "learning_rate": 6.477750398608494e-07, "loss": 0.1986, "step": 7290 }, { "epoch": 0.35227327632023964, "grad_norm": 3.5969157218933105, "learning_rate": 6.477267236797604e-07, "loss": 0.3773, "step": 7291 }, { "epoch": 0.3523215925013287, "grad_norm": 2.7073121070861816, "learning_rate": 6.476784074986714e-07, "loss": 0.1876, "step": 7292 }, { "epoch": 0.35236990868241774, "grad_norm": 1.495192050933838, "learning_rate": 6.476300913175822e-07, "loss": 0.1631, "step": 7293 }, { "epoch": 0.35241822486350677, "grad_norm": 3.3451027870178223, "learning_rate": 6.475817751364931e-07, "loss": 0.3068, "step": 7294 }, { "epoch": 0.35246654104459585, "grad_norm": 2.5739152431488037, "learning_rate": 6.475334589554041e-07, "loss": 0.2391, "step": 7295 }, { "epoch": 0.3525148572256849, "grad_norm": 1.9587067365646362, "learning_rate": 6.474851427743151e-07, "loss": 0.2317, "step": 7296 }, { "epoch": 0.35256317340677396, "grad_norm": 14.953944206237793, "learning_rate": 6.474368265932261e-07, "loss": 0.241, "step": 7297 }, { "epoch": 0.352611489587863, "grad_norm": 3.0966761112213135, "learning_rate": 6.473885104121371e-07, "loss": 0.426, "step": 7298 }, { "epoch": 0.352659805768952, "grad_norm": 2.5833630561828613, "learning_rate": 6.473401942310479e-07, "loss": 0.334, "step": 7299 }, { "epoch": 0.3527081219500411, "grad_norm": 4.127659797668457, "learning_rate": 6.472918780499589e-07, "loss": 0.3471, "step": 7300 }, { "epoch": 0.3527564381311301, "grad_norm": 7.76121187210083, "learning_rate": 6.472435618688698e-07, "loss": 0.2705, "step": 7301 }, { "epoch": 0.35280475431221914, "grad_norm": 3.475163221359253, "learning_rate": 6.471952456877808e-07, "loss": 0.2841, "step": 7302 }, { "epoch": 0.3528530704933082, "grad_norm": 4.878878593444824, "learning_rate": 6.471469295066918e-07, "loss": 0.457, "step": 7303 }, { "epoch": 0.35290138667439724, "grad_norm": 2.468528985977173, "learning_rate": 6.470986133256027e-07, "loss": 0.3135, "step": 7304 }, { "epoch": 0.3529497028554863, "grad_norm": 3.0778253078460693, "learning_rate": 6.470502971445136e-07, "loss": 0.4708, "step": 7305 }, { "epoch": 0.35299801903657535, "grad_norm": 2.1479368209838867, "learning_rate": 6.470019809634246e-07, "loss": 0.2193, "step": 7306 }, { "epoch": 0.3530463352176644, "grad_norm": 3.3976991176605225, "learning_rate": 6.469536647823356e-07, "loss": 0.483, "step": 7307 }, { "epoch": 0.35309465139875346, "grad_norm": 3.072080373764038, "learning_rate": 6.469053486012466e-07, "loss": 0.3284, "step": 7308 }, { "epoch": 0.3531429675798425, "grad_norm": 3.230201244354248, "learning_rate": 6.468570324201574e-07, "loss": 0.2849, "step": 7309 }, { "epoch": 0.35319128376093156, "grad_norm": 2.925673723220825, "learning_rate": 6.468087162390684e-07, "loss": 0.3269, "step": 7310 }, { "epoch": 0.3532395999420206, "grad_norm": 3.6491665840148926, "learning_rate": 6.467604000579794e-07, "loss": 0.2525, "step": 7311 }, { "epoch": 0.3532879161231096, "grad_norm": 2.7579541206359863, "learning_rate": 6.467120838768903e-07, "loss": 0.2848, "step": 7312 }, { "epoch": 0.3533362323041987, "grad_norm": 2.8499844074249268, "learning_rate": 6.466637676958013e-07, "loss": 0.3508, "step": 7313 }, { "epoch": 0.3533845484852877, "grad_norm": 2.2252535820007324, "learning_rate": 6.466154515147122e-07, "loss": 0.2026, "step": 7314 }, { "epoch": 0.35343286466637674, "grad_norm": 3.5668468475341797, "learning_rate": 6.465671353336232e-07, "loss": 0.2394, "step": 7315 }, { "epoch": 0.3534811808474658, "grad_norm": 2.8877365589141846, "learning_rate": 6.465188191525342e-07, "loss": 0.359, "step": 7316 }, { "epoch": 0.35352949702855485, "grad_norm": 17.74365234375, "learning_rate": 6.464705029714452e-07, "loss": 0.3325, "step": 7317 }, { "epoch": 0.35357781320964393, "grad_norm": 2.2294387817382812, "learning_rate": 6.46422186790356e-07, "loss": 0.2257, "step": 7318 }, { "epoch": 0.35362612939073296, "grad_norm": 2.1622517108917236, "learning_rate": 6.46373870609267e-07, "loss": 0.2411, "step": 7319 }, { "epoch": 0.353674445571822, "grad_norm": 2.8487162590026855, "learning_rate": 6.463255544281779e-07, "loss": 0.3113, "step": 7320 }, { "epoch": 0.35372276175291106, "grad_norm": 2.5249948501586914, "learning_rate": 6.462772382470889e-07, "loss": 0.3054, "step": 7321 }, { "epoch": 0.3537710779340001, "grad_norm": 3.8175601959228516, "learning_rate": 6.462289220659999e-07, "loss": 0.2197, "step": 7322 }, { "epoch": 0.35381939411508917, "grad_norm": 1.974196195602417, "learning_rate": 6.461806058849109e-07, "loss": 0.234, "step": 7323 }, { "epoch": 0.3538677102961782, "grad_norm": 2.570575714111328, "learning_rate": 6.461322897038219e-07, "loss": 0.2561, "step": 7324 }, { "epoch": 0.3539160264772672, "grad_norm": 2.53346848487854, "learning_rate": 6.460839735227327e-07, "loss": 0.3189, "step": 7325 }, { "epoch": 0.3539643426583563, "grad_norm": 3.4546308517456055, "learning_rate": 6.460356573416436e-07, "loss": 0.2245, "step": 7326 }, { "epoch": 0.3540126588394453, "grad_norm": 4.118886947631836, "learning_rate": 6.459873411605546e-07, "loss": 0.2084, "step": 7327 }, { "epoch": 0.35406097502053435, "grad_norm": 2.420694351196289, "learning_rate": 6.459390249794656e-07, "loss": 0.2987, "step": 7328 }, { "epoch": 0.35410929120162343, "grad_norm": 3.651624917984009, "learning_rate": 6.458907087983766e-07, "loss": 0.265, "step": 7329 }, { "epoch": 0.35415760738271246, "grad_norm": 3.19307541847229, "learning_rate": 6.458423926172875e-07, "loss": 0.3527, "step": 7330 }, { "epoch": 0.35420592356380154, "grad_norm": 2.194199800491333, "learning_rate": 6.457940764361984e-07, "loss": 0.298, "step": 7331 }, { "epoch": 0.35425423974489056, "grad_norm": 4.8545989990234375, "learning_rate": 6.457457602551094e-07, "loss": 0.3426, "step": 7332 }, { "epoch": 0.3543025559259796, "grad_norm": 2.7547690868377686, "learning_rate": 6.456974440740204e-07, "loss": 0.3704, "step": 7333 }, { "epoch": 0.35435087210706867, "grad_norm": 2.137544631958008, "learning_rate": 6.456491278929314e-07, "loss": 0.2293, "step": 7334 }, { "epoch": 0.3543991882881577, "grad_norm": 3.031907081604004, "learning_rate": 6.456008117118422e-07, "loss": 0.3046, "step": 7335 }, { "epoch": 0.3544475044692468, "grad_norm": 8.663005828857422, "learning_rate": 6.455524955307532e-07, "loss": 0.2746, "step": 7336 }, { "epoch": 0.3544958206503358, "grad_norm": 2.4729297161102295, "learning_rate": 6.455041793496641e-07, "loss": 0.3237, "step": 7337 }, { "epoch": 0.3545441368314248, "grad_norm": 2.3244946002960205, "learning_rate": 6.454558631685751e-07, "loss": 0.2517, "step": 7338 }, { "epoch": 0.3545924530125139, "grad_norm": 2.2365853786468506, "learning_rate": 6.454075469874861e-07, "loss": 0.2361, "step": 7339 }, { "epoch": 0.35464076919360293, "grad_norm": 5.036532402038574, "learning_rate": 6.45359230806397e-07, "loss": 0.2288, "step": 7340 }, { "epoch": 0.35468908537469196, "grad_norm": 2.5161044597625732, "learning_rate": 6.45310914625308e-07, "loss": 0.2909, "step": 7341 }, { "epoch": 0.35473740155578104, "grad_norm": 2.776123285293579, "learning_rate": 6.45262598444219e-07, "loss": 0.2038, "step": 7342 }, { "epoch": 0.35478571773687007, "grad_norm": 1.9227310419082642, "learning_rate": 6.4521428226313e-07, "loss": 0.2354, "step": 7343 }, { "epoch": 0.35483403391795915, "grad_norm": 2.1127779483795166, "learning_rate": 6.451659660820408e-07, "loss": 0.2955, "step": 7344 }, { "epoch": 0.35488235009904817, "grad_norm": 5.005458354949951, "learning_rate": 6.451176499009518e-07, "loss": 0.3556, "step": 7345 }, { "epoch": 0.3549306662801372, "grad_norm": 2.4610931873321533, "learning_rate": 6.450693337198627e-07, "loss": 0.3417, "step": 7346 }, { "epoch": 0.3549789824612263, "grad_norm": 2.927849531173706, "learning_rate": 6.450210175387737e-07, "loss": 0.2837, "step": 7347 }, { "epoch": 0.3550272986423153, "grad_norm": 2.787153482437134, "learning_rate": 6.449727013576847e-07, "loss": 0.3422, "step": 7348 }, { "epoch": 0.3550756148234044, "grad_norm": 2.799286127090454, "learning_rate": 6.449243851765957e-07, "loss": 0.2802, "step": 7349 }, { "epoch": 0.3551239310044934, "grad_norm": 2.9673352241516113, "learning_rate": 6.448760689955066e-07, "loss": 0.4953, "step": 7350 }, { "epoch": 0.35517224718558243, "grad_norm": 4.297056198120117, "learning_rate": 6.448277528144174e-07, "loss": 0.4365, "step": 7351 }, { "epoch": 0.3552205633666715, "grad_norm": 3.177494764328003, "learning_rate": 6.447794366333284e-07, "loss": 0.4017, "step": 7352 }, { "epoch": 0.35526887954776054, "grad_norm": 4.959262847900391, "learning_rate": 6.447311204522394e-07, "loss": 0.2314, "step": 7353 }, { "epoch": 0.35531719572884957, "grad_norm": 3.134812831878662, "learning_rate": 6.446828042711504e-07, "loss": 0.345, "step": 7354 }, { "epoch": 0.35536551190993865, "grad_norm": 3.7646379470825195, "learning_rate": 6.446344880900614e-07, "loss": 0.3604, "step": 7355 }, { "epoch": 0.3554138280910277, "grad_norm": 2.198996067047119, "learning_rate": 6.445861719089723e-07, "loss": 0.2149, "step": 7356 }, { "epoch": 0.35546214427211675, "grad_norm": 2.2046289443969727, "learning_rate": 6.445378557278832e-07, "loss": 0.1713, "step": 7357 }, { "epoch": 0.3555104604532058, "grad_norm": 3.1629233360290527, "learning_rate": 6.444895395467942e-07, "loss": 0.3074, "step": 7358 }, { "epoch": 0.3555587766342948, "grad_norm": 2.2108755111694336, "learning_rate": 6.444412233657052e-07, "loss": 0.2503, "step": 7359 }, { "epoch": 0.3556070928153839, "grad_norm": 2.7716469764709473, "learning_rate": 6.443929071846161e-07, "loss": 0.4299, "step": 7360 }, { "epoch": 0.3556554089964729, "grad_norm": 3.9148850440979004, "learning_rate": 6.44344591003527e-07, "loss": 0.2226, "step": 7361 }, { "epoch": 0.355703725177562, "grad_norm": 2.3040027618408203, "learning_rate": 6.44296274822438e-07, "loss": 0.3155, "step": 7362 }, { "epoch": 0.355752041358651, "grad_norm": 3.5134313106536865, "learning_rate": 6.442479586413489e-07, "loss": 0.2406, "step": 7363 }, { "epoch": 0.35580035753974004, "grad_norm": 2.186046838760376, "learning_rate": 6.441996424602599e-07, "loss": 0.177, "step": 7364 }, { "epoch": 0.3558486737208291, "grad_norm": 1.9141911268234253, "learning_rate": 6.441513262791709e-07, "loss": 0.1842, "step": 7365 }, { "epoch": 0.35589698990191815, "grad_norm": 2.594985008239746, "learning_rate": 6.441030100980818e-07, "loss": 0.2313, "step": 7366 }, { "epoch": 0.3559453060830072, "grad_norm": 2.625788688659668, "learning_rate": 6.440546939169928e-07, "loss": 0.2403, "step": 7367 }, { "epoch": 0.35599362226409625, "grad_norm": 2.5428755283355713, "learning_rate": 6.440063777359038e-07, "loss": 0.1832, "step": 7368 }, { "epoch": 0.3560419384451853, "grad_norm": 1.8750920295715332, "learning_rate": 6.439580615548146e-07, "loss": 0.1647, "step": 7369 }, { "epoch": 0.35609025462627436, "grad_norm": 1.638006567955017, "learning_rate": 6.439097453737256e-07, "loss": 0.1519, "step": 7370 }, { "epoch": 0.3561385708073634, "grad_norm": 3.240064859390259, "learning_rate": 6.438614291926366e-07, "loss": 0.2724, "step": 7371 }, { "epoch": 0.3561868869884524, "grad_norm": 2.9038217067718506, "learning_rate": 6.438131130115475e-07, "loss": 0.2376, "step": 7372 }, { "epoch": 0.3562352031695415, "grad_norm": 3.455610513687134, "learning_rate": 6.437647968304585e-07, "loss": 0.2646, "step": 7373 }, { "epoch": 0.3562835193506305, "grad_norm": 2.978278398513794, "learning_rate": 6.437164806493695e-07, "loss": 0.3277, "step": 7374 }, { "epoch": 0.3563318355317196, "grad_norm": 3.727440595626831, "learning_rate": 6.436681644682805e-07, "loss": 0.3459, "step": 7375 }, { "epoch": 0.3563801517128086, "grad_norm": 3.4783430099487305, "learning_rate": 6.436198482871914e-07, "loss": 0.2229, "step": 7376 }, { "epoch": 0.35642846789389765, "grad_norm": 2.606182813644409, "learning_rate": 6.435715321061022e-07, "loss": 0.2987, "step": 7377 }, { "epoch": 0.35647678407498673, "grad_norm": 2.7510039806365967, "learning_rate": 6.435232159250132e-07, "loss": 0.3513, "step": 7378 }, { "epoch": 0.35652510025607576, "grad_norm": 2.5338237285614014, "learning_rate": 6.434748997439242e-07, "loss": 0.2728, "step": 7379 }, { "epoch": 0.3565734164371648, "grad_norm": 1.8216936588287354, "learning_rate": 6.434265835628352e-07, "loss": 0.187, "step": 7380 }, { "epoch": 0.35662173261825386, "grad_norm": 3.288583517074585, "learning_rate": 6.433782673817462e-07, "loss": 0.3665, "step": 7381 }, { "epoch": 0.3566700487993429, "grad_norm": 2.1013002395629883, "learning_rate": 6.43329951200657e-07, "loss": 0.2274, "step": 7382 }, { "epoch": 0.35671836498043197, "grad_norm": 4.033844947814941, "learning_rate": 6.43281635019568e-07, "loss": 0.3215, "step": 7383 }, { "epoch": 0.356766681161521, "grad_norm": 3.76253604888916, "learning_rate": 6.43233318838479e-07, "loss": 0.2901, "step": 7384 }, { "epoch": 0.35681499734261, "grad_norm": 2.3696036338806152, "learning_rate": 6.4318500265739e-07, "loss": 0.2888, "step": 7385 }, { "epoch": 0.3568633135236991, "grad_norm": 3.043036699295044, "learning_rate": 6.431366864763009e-07, "loss": 0.4319, "step": 7386 }, { "epoch": 0.3569116297047881, "grad_norm": 12.350430488586426, "learning_rate": 6.430883702952118e-07, "loss": 0.3137, "step": 7387 }, { "epoch": 0.3569599458858772, "grad_norm": 12.93999195098877, "learning_rate": 6.430400541141228e-07, "loss": 0.4269, "step": 7388 }, { "epoch": 0.35700826206696623, "grad_norm": 2.027325391769409, "learning_rate": 6.429917379330337e-07, "loss": 0.2183, "step": 7389 }, { "epoch": 0.35705657824805526, "grad_norm": 2.7728376388549805, "learning_rate": 6.429434217519447e-07, "loss": 0.3409, "step": 7390 }, { "epoch": 0.35710489442914434, "grad_norm": 2.809605360031128, "learning_rate": 6.428951055708557e-07, "loss": 0.3703, "step": 7391 }, { "epoch": 0.35715321061023336, "grad_norm": 1.770079255104065, "learning_rate": 6.428467893897666e-07, "loss": 0.1978, "step": 7392 }, { "epoch": 0.3572015267913224, "grad_norm": 2.324152946472168, "learning_rate": 6.427984732086776e-07, "loss": 0.2384, "step": 7393 }, { "epoch": 0.35724984297241147, "grad_norm": 6.935979843139648, "learning_rate": 6.427501570275885e-07, "loss": 0.3086, "step": 7394 }, { "epoch": 0.3572981591535005, "grad_norm": 3.642543077468872, "learning_rate": 6.427018408464994e-07, "loss": 0.2994, "step": 7395 }, { "epoch": 0.3573464753345896, "grad_norm": 3.235311269760132, "learning_rate": 6.426535246654104e-07, "loss": 0.2908, "step": 7396 }, { "epoch": 0.3573947915156786, "grad_norm": 1.385637879371643, "learning_rate": 6.426052084843213e-07, "loss": 0.1826, "step": 7397 }, { "epoch": 0.3574431076967676, "grad_norm": 2.7547638416290283, "learning_rate": 6.425568923032323e-07, "loss": 0.4052, "step": 7398 }, { "epoch": 0.3574914238778567, "grad_norm": 3.4153807163238525, "learning_rate": 6.425085761221433e-07, "loss": 0.3263, "step": 7399 }, { "epoch": 0.35753974005894573, "grad_norm": 2.881265640258789, "learning_rate": 6.424602599410543e-07, "loss": 0.423, "step": 7400 }, { "epoch": 0.3575880562400348, "grad_norm": 3.55488920211792, "learning_rate": 6.424119437599653e-07, "loss": 0.357, "step": 7401 }, { "epoch": 0.35763637242112384, "grad_norm": 2.355898380279541, "learning_rate": 6.423636275788762e-07, "loss": 0.2433, "step": 7402 }, { "epoch": 0.35768468860221286, "grad_norm": 2.3972675800323486, "learning_rate": 6.42315311397787e-07, "loss": 0.2368, "step": 7403 }, { "epoch": 0.35773300478330194, "grad_norm": 4.107419967651367, "learning_rate": 6.42266995216698e-07, "loss": 0.2749, "step": 7404 }, { "epoch": 0.35778132096439097, "grad_norm": 2.937589168548584, "learning_rate": 6.42218679035609e-07, "loss": 0.2529, "step": 7405 }, { "epoch": 0.35782963714548, "grad_norm": 3.3352081775665283, "learning_rate": 6.4217036285452e-07, "loss": 0.3493, "step": 7406 }, { "epoch": 0.3578779533265691, "grad_norm": 2.1336116790771484, "learning_rate": 6.42122046673431e-07, "loss": 0.2906, "step": 7407 }, { "epoch": 0.3579262695076581, "grad_norm": 2.225026845932007, "learning_rate": 6.420737304923418e-07, "loss": 0.2244, "step": 7408 }, { "epoch": 0.3579745856887472, "grad_norm": 3.06424617767334, "learning_rate": 6.420254143112528e-07, "loss": 0.5242, "step": 7409 }, { "epoch": 0.3580229018698362, "grad_norm": 2.052734613418579, "learning_rate": 6.419770981301638e-07, "loss": 0.1931, "step": 7410 }, { "epoch": 0.35807121805092523, "grad_norm": 2.2030997276306152, "learning_rate": 6.419287819490747e-07, "loss": 0.249, "step": 7411 }, { "epoch": 0.3581195342320143, "grad_norm": 9.00113296508789, "learning_rate": 6.418804657679857e-07, "loss": 0.3177, "step": 7412 }, { "epoch": 0.35816785041310334, "grad_norm": 3.74117374420166, "learning_rate": 6.418321495868966e-07, "loss": 0.3944, "step": 7413 }, { "epoch": 0.3582161665941924, "grad_norm": 1.2372077703475952, "learning_rate": 6.417838334058075e-07, "loss": 0.1115, "step": 7414 }, { "epoch": 0.35826448277528145, "grad_norm": 3.70670223236084, "learning_rate": 6.417355172247185e-07, "loss": 0.3007, "step": 7415 }, { "epoch": 0.35831279895637047, "grad_norm": 2.1728994846343994, "learning_rate": 6.416872010436295e-07, "loss": 0.2605, "step": 7416 }, { "epoch": 0.35836111513745955, "grad_norm": 2.4503140449523926, "learning_rate": 6.416388848625405e-07, "loss": 0.2139, "step": 7417 }, { "epoch": 0.3584094313185486, "grad_norm": 2.7702813148498535, "learning_rate": 6.415905686814514e-07, "loss": 0.2869, "step": 7418 }, { "epoch": 0.3584577474996376, "grad_norm": 2.6978797912597656, "learning_rate": 6.415422525003623e-07, "loss": 0.2556, "step": 7419 }, { "epoch": 0.3585060636807267, "grad_norm": 3.2277846336364746, "learning_rate": 6.414939363192733e-07, "loss": 0.2898, "step": 7420 }, { "epoch": 0.3585543798618157, "grad_norm": 2.275761365890503, "learning_rate": 6.414456201381842e-07, "loss": 0.2609, "step": 7421 }, { "epoch": 0.3586026960429048, "grad_norm": 2.379434823989868, "learning_rate": 6.413973039570952e-07, "loss": 0.2376, "step": 7422 }, { "epoch": 0.3586510122239938, "grad_norm": 2.4209914207458496, "learning_rate": 6.413489877760061e-07, "loss": 0.267, "step": 7423 }, { "epoch": 0.35869932840508284, "grad_norm": 4.925489902496338, "learning_rate": 6.413006715949171e-07, "loss": 0.4875, "step": 7424 }, { "epoch": 0.3587476445861719, "grad_norm": 3.195601224899292, "learning_rate": 6.412523554138281e-07, "loss": 0.3867, "step": 7425 }, { "epoch": 0.35879596076726095, "grad_norm": 4.568935871124268, "learning_rate": 6.412040392327391e-07, "loss": 0.2991, "step": 7426 }, { "epoch": 0.35884427694835, "grad_norm": 2.59616756439209, "learning_rate": 6.4115572305165e-07, "loss": 0.2542, "step": 7427 }, { "epoch": 0.35889259312943905, "grad_norm": 1.7774839401245117, "learning_rate": 6.411074068705609e-07, "loss": 0.1827, "step": 7428 }, { "epoch": 0.3589409093105281, "grad_norm": 2.8094260692596436, "learning_rate": 6.410590906894718e-07, "loss": 0.3308, "step": 7429 }, { "epoch": 0.35898922549161716, "grad_norm": 2.4236505031585693, "learning_rate": 6.410107745083828e-07, "loss": 0.2907, "step": 7430 }, { "epoch": 0.3590375416727062, "grad_norm": 1.1661896705627441, "learning_rate": 6.409624583272938e-07, "loss": 0.1412, "step": 7431 }, { "epoch": 0.3590858578537952, "grad_norm": 3.100567579269409, "learning_rate": 6.409141421462048e-07, "loss": 0.3764, "step": 7432 }, { "epoch": 0.3591341740348843, "grad_norm": 4.9639739990234375, "learning_rate": 6.408658259651158e-07, "loss": 0.3452, "step": 7433 }, { "epoch": 0.3591824902159733, "grad_norm": 4.721507549285889, "learning_rate": 6.408175097840266e-07, "loss": 0.1857, "step": 7434 }, { "epoch": 0.3592308063970624, "grad_norm": 1.9987435340881348, "learning_rate": 6.407691936029376e-07, "loss": 0.2039, "step": 7435 }, { "epoch": 0.3592791225781514, "grad_norm": 4.3849968910217285, "learning_rate": 6.407208774218485e-07, "loss": 0.3112, "step": 7436 }, { "epoch": 0.35932743875924045, "grad_norm": 2.1774468421936035, "learning_rate": 6.406725612407595e-07, "loss": 0.1866, "step": 7437 }, { "epoch": 0.35937575494032953, "grad_norm": 2.474829912185669, "learning_rate": 6.406242450596705e-07, "loss": 0.3071, "step": 7438 }, { "epoch": 0.35942407112141855, "grad_norm": 3.6585559844970703, "learning_rate": 6.405759288785814e-07, "loss": 0.4124, "step": 7439 }, { "epoch": 0.35947238730250763, "grad_norm": 5.315946578979492, "learning_rate": 6.405276126974923e-07, "loss": 0.2361, "step": 7440 }, { "epoch": 0.35952070348359666, "grad_norm": 2.770658493041992, "learning_rate": 6.404792965164033e-07, "loss": 0.3418, "step": 7441 }, { "epoch": 0.3595690196646857, "grad_norm": 2.0275213718414307, "learning_rate": 6.404309803353143e-07, "loss": 0.2122, "step": 7442 }, { "epoch": 0.35961733584577477, "grad_norm": 2.9214608669281006, "learning_rate": 6.403826641542253e-07, "loss": 0.3852, "step": 7443 }, { "epoch": 0.3596656520268638, "grad_norm": 2.374377489089966, "learning_rate": 6.403343479731362e-07, "loss": 0.2207, "step": 7444 }, { "epoch": 0.3597139682079528, "grad_norm": 3.820801019668579, "learning_rate": 6.402860317920471e-07, "loss": 0.3281, "step": 7445 }, { "epoch": 0.3597622843890419, "grad_norm": 5.200429439544678, "learning_rate": 6.40237715610958e-07, "loss": 0.2817, "step": 7446 }, { "epoch": 0.3598106005701309, "grad_norm": 3.0107784271240234, "learning_rate": 6.40189399429869e-07, "loss": 0.2945, "step": 7447 }, { "epoch": 0.35985891675122, "grad_norm": 3.601196527481079, "learning_rate": 6.4014108324878e-07, "loss": 0.3045, "step": 7448 }, { "epoch": 0.35990723293230903, "grad_norm": 2.6843645572662354, "learning_rate": 6.400927670676909e-07, "loss": 0.3128, "step": 7449 }, { "epoch": 0.35995554911339805, "grad_norm": 4.737784385681152, "learning_rate": 6.400444508866019e-07, "loss": 0.5211, "step": 7450 }, { "epoch": 0.36000386529448714, "grad_norm": 3.049553632736206, "learning_rate": 6.399961347055129e-07, "loss": 0.3422, "step": 7451 }, { "epoch": 0.36005218147557616, "grad_norm": 2.73783802986145, "learning_rate": 6.399478185244239e-07, "loss": 0.355, "step": 7452 }, { "epoch": 0.36010049765666524, "grad_norm": 2.392421245574951, "learning_rate": 6.398995023433347e-07, "loss": 0.282, "step": 7453 }, { "epoch": 0.36014881383775427, "grad_norm": 3.721139669418335, "learning_rate": 6.398511861622457e-07, "loss": 0.2729, "step": 7454 }, { "epoch": 0.3601971300188433, "grad_norm": 5.257205486297607, "learning_rate": 6.398028699811566e-07, "loss": 0.3337, "step": 7455 }, { "epoch": 0.3602454461999324, "grad_norm": 1.7837610244750977, "learning_rate": 6.397545538000676e-07, "loss": 0.2319, "step": 7456 }, { "epoch": 0.3602937623810214, "grad_norm": 3.195951223373413, "learning_rate": 6.397062376189786e-07, "loss": 0.4922, "step": 7457 }, { "epoch": 0.3603420785621104, "grad_norm": 2.709191083908081, "learning_rate": 6.396579214378896e-07, "loss": 0.3782, "step": 7458 }, { "epoch": 0.3603903947431995, "grad_norm": 3.3680498600006104, "learning_rate": 6.396096052568005e-07, "loss": 0.4615, "step": 7459 }, { "epoch": 0.36043871092428853, "grad_norm": 12.176624298095703, "learning_rate": 6.395612890757114e-07, "loss": 0.3942, "step": 7460 }, { "epoch": 0.3604870271053776, "grad_norm": 2.465710401535034, "learning_rate": 6.395129728946224e-07, "loss": 0.2125, "step": 7461 }, { "epoch": 0.36053534328646664, "grad_norm": 2.6108837127685547, "learning_rate": 6.394646567135333e-07, "loss": 0.2235, "step": 7462 }, { "epoch": 0.36058365946755566, "grad_norm": 2.7115328311920166, "learning_rate": 6.394163405324443e-07, "loss": 0.2186, "step": 7463 }, { "epoch": 0.36063197564864474, "grad_norm": 4.738903045654297, "learning_rate": 6.393680243513553e-07, "loss": 0.2435, "step": 7464 }, { "epoch": 0.36068029182973377, "grad_norm": 3.4881627559661865, "learning_rate": 6.393197081702661e-07, "loss": 0.3668, "step": 7465 }, { "epoch": 0.36072860801082285, "grad_norm": 4.08364200592041, "learning_rate": 6.392713919891771e-07, "loss": 0.1624, "step": 7466 }, { "epoch": 0.3607769241919119, "grad_norm": 2.0691769123077393, "learning_rate": 6.392230758080881e-07, "loss": 0.2333, "step": 7467 }, { "epoch": 0.3608252403730009, "grad_norm": 2.544264316558838, "learning_rate": 6.391747596269991e-07, "loss": 0.3128, "step": 7468 }, { "epoch": 0.36087355655409, "grad_norm": 15.515605926513672, "learning_rate": 6.391264434459101e-07, "loss": 0.3953, "step": 7469 }, { "epoch": 0.360921872735179, "grad_norm": 2.1367664337158203, "learning_rate": 6.390781272648209e-07, "loss": 0.2034, "step": 7470 }, { "epoch": 0.36097018891626803, "grad_norm": 2.632399797439575, "learning_rate": 6.390298110837319e-07, "loss": 0.2531, "step": 7471 }, { "epoch": 0.3610185050973571, "grad_norm": 2.556818723678589, "learning_rate": 6.389814949026428e-07, "loss": 0.2618, "step": 7472 }, { "epoch": 0.36106682127844614, "grad_norm": 3.1119391918182373, "learning_rate": 6.389331787215538e-07, "loss": 0.3682, "step": 7473 }, { "epoch": 0.3611151374595352, "grad_norm": 4.198526859283447, "learning_rate": 6.388848625404648e-07, "loss": 0.2356, "step": 7474 }, { "epoch": 0.36116345364062424, "grad_norm": 2.4163753986358643, "learning_rate": 6.388365463593757e-07, "loss": 0.3172, "step": 7475 }, { "epoch": 0.36121176982171327, "grad_norm": 10.333229064941406, "learning_rate": 6.387882301782867e-07, "loss": 0.3148, "step": 7476 }, { "epoch": 0.36126008600280235, "grad_norm": 2.6714565753936768, "learning_rate": 6.387399139971977e-07, "loss": 0.3085, "step": 7477 }, { "epoch": 0.3613084021838914, "grad_norm": 2.031233072280884, "learning_rate": 6.386915978161085e-07, "loss": 0.2241, "step": 7478 }, { "epoch": 0.36135671836498046, "grad_norm": 2.2362258434295654, "learning_rate": 6.386432816350195e-07, "loss": 0.2605, "step": 7479 }, { "epoch": 0.3614050345460695, "grad_norm": 3.6981160640716553, "learning_rate": 6.385949654539305e-07, "loss": 0.2892, "step": 7480 }, { "epoch": 0.3614533507271585, "grad_norm": 8.48836612701416, "learning_rate": 6.385466492728414e-07, "loss": 0.3223, "step": 7481 }, { "epoch": 0.3615016669082476, "grad_norm": 3.4144132137298584, "learning_rate": 6.384983330917524e-07, "loss": 0.2958, "step": 7482 }, { "epoch": 0.3615499830893366, "grad_norm": 2.7848596572875977, "learning_rate": 6.384500169106634e-07, "loss": 0.2421, "step": 7483 }, { "epoch": 0.36159829927042564, "grad_norm": 2.9623847007751465, "learning_rate": 6.384017007295744e-07, "loss": 0.2801, "step": 7484 }, { "epoch": 0.3616466154515147, "grad_norm": 2.632014513015747, "learning_rate": 6.383533845484853e-07, "loss": 0.3065, "step": 7485 }, { "epoch": 0.36169493163260374, "grad_norm": 4.560904502868652, "learning_rate": 6.383050683673962e-07, "loss": 0.3485, "step": 7486 }, { "epoch": 0.3617432478136928, "grad_norm": 1.9191458225250244, "learning_rate": 6.382567521863071e-07, "loss": 0.1832, "step": 7487 }, { "epoch": 0.36179156399478185, "grad_norm": 8.749679565429688, "learning_rate": 6.382084360052181e-07, "loss": 0.2309, "step": 7488 }, { "epoch": 0.3618398801758709, "grad_norm": 3.6602275371551514, "learning_rate": 6.381601198241291e-07, "loss": 0.2818, "step": 7489 }, { "epoch": 0.36188819635695996, "grad_norm": 2.2970237731933594, "learning_rate": 6.381118036430401e-07, "loss": 0.2237, "step": 7490 }, { "epoch": 0.361936512538049, "grad_norm": 1.6196730136871338, "learning_rate": 6.380634874619509e-07, "loss": 0.1701, "step": 7491 }, { "epoch": 0.36198482871913806, "grad_norm": 2.870973587036133, "learning_rate": 6.380151712808619e-07, "loss": 0.3369, "step": 7492 }, { "epoch": 0.3620331449002271, "grad_norm": 3.909574031829834, "learning_rate": 6.379668550997729e-07, "loss": 0.3848, "step": 7493 }, { "epoch": 0.3620814610813161, "grad_norm": 2.4202685356140137, "learning_rate": 6.379185389186839e-07, "loss": 0.2861, "step": 7494 }, { "epoch": 0.3621297772624052, "grad_norm": 2.8338210582733154, "learning_rate": 6.378702227375949e-07, "loss": 0.3066, "step": 7495 }, { "epoch": 0.3621780934434942, "grad_norm": 3.1968982219696045, "learning_rate": 6.378219065565057e-07, "loss": 0.3047, "step": 7496 }, { "epoch": 0.36222640962458325, "grad_norm": 1.6185839176177979, "learning_rate": 6.377735903754166e-07, "loss": 0.1774, "step": 7497 }, { "epoch": 0.3622747258056723, "grad_norm": 3.1820461750030518, "learning_rate": 6.377252741943276e-07, "loss": 0.3119, "step": 7498 }, { "epoch": 0.36232304198676135, "grad_norm": 3.836113929748535, "learning_rate": 6.376769580132386e-07, "loss": 0.2517, "step": 7499 }, { "epoch": 0.36237135816785043, "grad_norm": 3.1380856037139893, "learning_rate": 6.376286418321496e-07, "loss": 0.2217, "step": 7500 }, { "epoch": 0.36241967434893946, "grad_norm": 2.732520580291748, "learning_rate": 6.375803256510605e-07, "loss": 0.3566, "step": 7501 }, { "epoch": 0.3624679905300285, "grad_norm": 2.7948594093322754, "learning_rate": 6.375320094699715e-07, "loss": 0.411, "step": 7502 }, { "epoch": 0.36251630671111756, "grad_norm": 1.9788156747817993, "learning_rate": 6.374836932888825e-07, "loss": 0.2615, "step": 7503 }, { "epoch": 0.3625646228922066, "grad_norm": 3.3941702842712402, "learning_rate": 6.374353771077933e-07, "loss": 0.2348, "step": 7504 }, { "epoch": 0.36261293907329567, "grad_norm": 3.179654121398926, "learning_rate": 6.373870609267043e-07, "loss": 0.3011, "step": 7505 }, { "epoch": 0.3626612552543847, "grad_norm": 3.6625595092773438, "learning_rate": 6.373387447456153e-07, "loss": 0.343, "step": 7506 }, { "epoch": 0.3627095714354737, "grad_norm": 4.699652194976807, "learning_rate": 6.372904285645262e-07, "loss": 0.5761, "step": 7507 }, { "epoch": 0.3627578876165628, "grad_norm": 13.020191192626953, "learning_rate": 6.372421123834372e-07, "loss": 0.2798, "step": 7508 }, { "epoch": 0.3628062037976518, "grad_norm": 2.5480949878692627, "learning_rate": 6.371937962023482e-07, "loss": 0.3374, "step": 7509 }, { "epoch": 0.36285451997874085, "grad_norm": 3.09251070022583, "learning_rate": 6.371454800212591e-07, "loss": 0.3743, "step": 7510 }, { "epoch": 0.36290283615982993, "grad_norm": 2.6712498664855957, "learning_rate": 6.370971638401701e-07, "loss": 0.2958, "step": 7511 }, { "epoch": 0.36295115234091896, "grad_norm": 2.361687421798706, "learning_rate": 6.370488476590809e-07, "loss": 0.3156, "step": 7512 }, { "epoch": 0.36299946852200804, "grad_norm": 18.544864654541016, "learning_rate": 6.370005314779919e-07, "loss": 0.2864, "step": 7513 }, { "epoch": 0.36304778470309707, "grad_norm": 2.3395895957946777, "learning_rate": 6.369522152969029e-07, "loss": 0.2851, "step": 7514 }, { "epoch": 0.3630961008841861, "grad_norm": 1.5613043308258057, "learning_rate": 6.369038991158139e-07, "loss": 0.1859, "step": 7515 }, { "epoch": 0.36314441706527517, "grad_norm": 2.009978771209717, "learning_rate": 6.368555829347249e-07, "loss": 0.2261, "step": 7516 }, { "epoch": 0.3631927332463642, "grad_norm": 3.024763822555542, "learning_rate": 6.368072667536357e-07, "loss": 0.4341, "step": 7517 }, { "epoch": 0.3632410494274533, "grad_norm": 2.7017626762390137, "learning_rate": 6.367589505725467e-07, "loss": 0.2433, "step": 7518 }, { "epoch": 0.3632893656085423, "grad_norm": 2.983921766281128, "learning_rate": 6.367106343914577e-07, "loss": 0.3162, "step": 7519 }, { "epoch": 0.36333768178963133, "grad_norm": 2.566035032272339, "learning_rate": 6.366623182103687e-07, "loss": 0.3147, "step": 7520 }, { "epoch": 0.3633859979707204, "grad_norm": 4.056722164154053, "learning_rate": 6.366140020292796e-07, "loss": 0.3302, "step": 7521 }, { "epoch": 0.36343431415180943, "grad_norm": 2.8213393688201904, "learning_rate": 6.365656858481905e-07, "loss": 0.4241, "step": 7522 }, { "epoch": 0.3634826303328985, "grad_norm": 2.509009599685669, "learning_rate": 6.365173696671014e-07, "loss": 0.2546, "step": 7523 }, { "epoch": 0.36353094651398754, "grad_norm": 2.619103193283081, "learning_rate": 6.364690534860124e-07, "loss": 0.3058, "step": 7524 }, { "epoch": 0.36357926269507657, "grad_norm": 3.1672935485839844, "learning_rate": 6.364207373049234e-07, "loss": 0.3399, "step": 7525 }, { "epoch": 0.36362757887616565, "grad_norm": 2.508892297744751, "learning_rate": 6.363724211238344e-07, "loss": 0.2899, "step": 7526 }, { "epoch": 0.3636758950572547, "grad_norm": 9.138351440429688, "learning_rate": 6.363241049427453e-07, "loss": 0.2675, "step": 7527 }, { "epoch": 0.3637242112383437, "grad_norm": 2.8952667713165283, "learning_rate": 6.362757887616563e-07, "loss": 0.3125, "step": 7528 }, { "epoch": 0.3637725274194328, "grad_norm": 4.278069496154785, "learning_rate": 6.362274725805671e-07, "loss": 0.3116, "step": 7529 }, { "epoch": 0.3638208436005218, "grad_norm": 2.8961875438690186, "learning_rate": 6.361791563994781e-07, "loss": 0.402, "step": 7530 }, { "epoch": 0.3638691597816109, "grad_norm": 2.682615280151367, "learning_rate": 6.361308402183891e-07, "loss": 0.2684, "step": 7531 }, { "epoch": 0.3639174759626999, "grad_norm": 3.1038224697113037, "learning_rate": 6.360825240373001e-07, "loss": 0.3204, "step": 7532 }, { "epoch": 0.36396579214378894, "grad_norm": 2.0011696815490723, "learning_rate": 6.36034207856211e-07, "loss": 0.2374, "step": 7533 }, { "epoch": 0.364014108324878, "grad_norm": 1.6343814134597778, "learning_rate": 6.35985891675122e-07, "loss": 0.19, "step": 7534 }, { "epoch": 0.36406242450596704, "grad_norm": 2.590639114379883, "learning_rate": 6.35937575494033e-07, "loss": 0.2867, "step": 7535 }, { "epoch": 0.3641107406870561, "grad_norm": 54.10991287231445, "learning_rate": 6.358892593129439e-07, "loss": 0.2467, "step": 7536 }, { "epoch": 0.36415905686814515, "grad_norm": 7.3423380851745605, "learning_rate": 6.358409431318549e-07, "loss": 0.2755, "step": 7537 }, { "epoch": 0.3642073730492342, "grad_norm": 2.1959333419799805, "learning_rate": 6.357926269507657e-07, "loss": 0.228, "step": 7538 }, { "epoch": 0.36425568923032325, "grad_norm": 5.776888370513916, "learning_rate": 6.357443107696767e-07, "loss": 0.3454, "step": 7539 }, { "epoch": 0.3643040054114123, "grad_norm": 3.081357002258301, "learning_rate": 6.356959945885877e-07, "loss": 0.4218, "step": 7540 }, { "epoch": 0.3643523215925013, "grad_norm": 3.9256365299224854, "learning_rate": 6.356476784074987e-07, "loss": 0.3119, "step": 7541 }, { "epoch": 0.3644006377735904, "grad_norm": 2.063112258911133, "learning_rate": 6.355993622264096e-07, "loss": 0.2368, "step": 7542 }, { "epoch": 0.3644489539546794, "grad_norm": 10.55091667175293, "learning_rate": 6.355510460453205e-07, "loss": 0.3667, "step": 7543 }, { "epoch": 0.3644972701357685, "grad_norm": 2.1051132678985596, "learning_rate": 6.355027298642315e-07, "loss": 0.1672, "step": 7544 }, { "epoch": 0.3645455863168575, "grad_norm": 6.63192081451416, "learning_rate": 6.354544136831425e-07, "loss": 0.3395, "step": 7545 }, { "epoch": 0.36459390249794654, "grad_norm": 3.1842215061187744, "learning_rate": 6.354060975020534e-07, "loss": 0.3301, "step": 7546 }, { "epoch": 0.3646422186790356, "grad_norm": 2.4012598991394043, "learning_rate": 6.353577813209644e-07, "loss": 0.1983, "step": 7547 }, { "epoch": 0.36469053486012465, "grad_norm": 3.8961005210876465, "learning_rate": 6.353094651398752e-07, "loss": 0.3012, "step": 7548 }, { "epoch": 0.36473885104121373, "grad_norm": 4.135560512542725, "learning_rate": 6.352611489587862e-07, "loss": 0.3127, "step": 7549 }, { "epoch": 0.36478716722230276, "grad_norm": 3.143643617630005, "learning_rate": 6.352128327776972e-07, "loss": 0.2841, "step": 7550 }, { "epoch": 0.3648354834033918, "grad_norm": 2.356917381286621, "learning_rate": 6.351645165966082e-07, "loss": 0.2988, "step": 7551 }, { "epoch": 0.36488379958448086, "grad_norm": 2.4731760025024414, "learning_rate": 6.351162004155192e-07, "loss": 0.3734, "step": 7552 }, { "epoch": 0.3649321157655699, "grad_norm": 1.832797884941101, "learning_rate": 6.350678842344301e-07, "loss": 0.1774, "step": 7553 }, { "epoch": 0.3649804319466589, "grad_norm": 2.244081497192383, "learning_rate": 6.35019568053341e-07, "loss": 0.2229, "step": 7554 }, { "epoch": 0.365028748127748, "grad_norm": 7.036667346954346, "learning_rate": 6.349712518722519e-07, "loss": 0.1965, "step": 7555 }, { "epoch": 0.365077064308837, "grad_norm": 4.458613395690918, "learning_rate": 6.349229356911629e-07, "loss": 0.3378, "step": 7556 }, { "epoch": 0.3651253804899261, "grad_norm": 2.4605484008789062, "learning_rate": 6.348746195100739e-07, "loss": 0.2426, "step": 7557 }, { "epoch": 0.3651736966710151, "grad_norm": 27.577619552612305, "learning_rate": 6.348263033289849e-07, "loss": 0.2451, "step": 7558 }, { "epoch": 0.36522201285210415, "grad_norm": 2.204249382019043, "learning_rate": 6.347779871478958e-07, "loss": 0.2289, "step": 7559 }, { "epoch": 0.36527032903319323, "grad_norm": 1.9449864625930786, "learning_rate": 6.347296709668068e-07, "loss": 0.1935, "step": 7560 }, { "epoch": 0.36531864521428226, "grad_norm": 2.423056125640869, "learning_rate": 6.346813547857177e-07, "loss": 0.2484, "step": 7561 }, { "epoch": 0.36536696139537134, "grad_norm": 2.4749534130096436, "learning_rate": 6.346330386046287e-07, "loss": 0.2765, "step": 7562 }, { "epoch": 0.36541527757646036, "grad_norm": 2.921679735183716, "learning_rate": 6.345847224235396e-07, "loss": 0.4208, "step": 7563 }, { "epoch": 0.3654635937575494, "grad_norm": 26.8132266998291, "learning_rate": 6.345364062424505e-07, "loss": 0.3212, "step": 7564 }, { "epoch": 0.36551190993863847, "grad_norm": 4.87395715713501, "learning_rate": 6.344880900613615e-07, "loss": 0.3022, "step": 7565 }, { "epoch": 0.3655602261197275, "grad_norm": 2.5116260051727295, "learning_rate": 6.344397738802725e-07, "loss": 0.2172, "step": 7566 }, { "epoch": 0.3656085423008165, "grad_norm": 2.5931267738342285, "learning_rate": 6.343914576991835e-07, "loss": 0.2109, "step": 7567 }, { "epoch": 0.3656568584819056, "grad_norm": 3.201385259628296, "learning_rate": 6.343431415180944e-07, "loss": 0.4254, "step": 7568 }, { "epoch": 0.3657051746629946, "grad_norm": 4.086209774017334, "learning_rate": 6.342948253370053e-07, "loss": 0.3697, "step": 7569 }, { "epoch": 0.3657534908440837, "grad_norm": 2.1312825679779053, "learning_rate": 6.342465091559163e-07, "loss": 0.2428, "step": 7570 }, { "epoch": 0.36580180702517273, "grad_norm": 7.0464701652526855, "learning_rate": 6.341981929748273e-07, "loss": 0.3141, "step": 7571 }, { "epoch": 0.36585012320626176, "grad_norm": 2.7274417877197266, "learning_rate": 6.341498767937382e-07, "loss": 0.3263, "step": 7572 }, { "epoch": 0.36589843938735084, "grad_norm": 2.019535541534424, "learning_rate": 6.341015606126492e-07, "loss": 0.2532, "step": 7573 }, { "epoch": 0.36594675556843986, "grad_norm": 3.3969814777374268, "learning_rate": 6.3405324443156e-07, "loss": 0.2487, "step": 7574 }, { "epoch": 0.36599507174952894, "grad_norm": 2.5580923557281494, "learning_rate": 6.34004928250471e-07, "loss": 0.3394, "step": 7575 }, { "epoch": 0.36604338793061797, "grad_norm": 2.965519905090332, "learning_rate": 6.33956612069382e-07, "loss": 0.4203, "step": 7576 }, { "epoch": 0.366091704111707, "grad_norm": 2.7150678634643555, "learning_rate": 6.33908295888293e-07, "loss": 0.3361, "step": 7577 }, { "epoch": 0.3661400202927961, "grad_norm": 2.240072727203369, "learning_rate": 6.33859979707204e-07, "loss": 0.3128, "step": 7578 }, { "epoch": 0.3661883364738851, "grad_norm": 3.0997109413146973, "learning_rate": 6.338116635261149e-07, "loss": 0.4313, "step": 7579 }, { "epoch": 0.3662366526549741, "grad_norm": 2.4068715572357178, "learning_rate": 6.337633473450257e-07, "loss": 0.2667, "step": 7580 }, { "epoch": 0.3662849688360632, "grad_norm": 2.791170597076416, "learning_rate": 6.337150311639367e-07, "loss": 0.2523, "step": 7581 }, { "epoch": 0.36633328501715223, "grad_norm": 2.4144797325134277, "learning_rate": 6.336667149828477e-07, "loss": 0.2692, "step": 7582 }, { "epoch": 0.3663816011982413, "grad_norm": 2.7858242988586426, "learning_rate": 6.336183988017587e-07, "loss": 0.2904, "step": 7583 }, { "epoch": 0.36642991737933034, "grad_norm": 2.5770628452301025, "learning_rate": 6.335700826206697e-07, "loss": 0.2451, "step": 7584 }, { "epoch": 0.36647823356041936, "grad_norm": 1.3903772830963135, "learning_rate": 6.335217664395806e-07, "loss": 0.1225, "step": 7585 }, { "epoch": 0.36652654974150845, "grad_norm": 5.550978660583496, "learning_rate": 6.334734502584916e-07, "loss": 0.3355, "step": 7586 }, { "epoch": 0.36657486592259747, "grad_norm": 2.471668004989624, "learning_rate": 6.334251340774025e-07, "loss": 0.3245, "step": 7587 }, { "epoch": 0.36662318210368655, "grad_norm": 4.359007835388184, "learning_rate": 6.333768178963134e-07, "loss": 0.2639, "step": 7588 }, { "epoch": 0.3666714982847756, "grad_norm": 3.0668399333953857, "learning_rate": 6.333285017152244e-07, "loss": 0.3895, "step": 7589 }, { "epoch": 0.3667198144658646, "grad_norm": 2.248041868209839, "learning_rate": 6.332801855341353e-07, "loss": 0.2779, "step": 7590 }, { "epoch": 0.3667681306469537, "grad_norm": 2.3577980995178223, "learning_rate": 6.332318693530463e-07, "loss": 0.2362, "step": 7591 }, { "epoch": 0.3668164468280427, "grad_norm": 4.338862419128418, "learning_rate": 6.331835531719573e-07, "loss": 0.3301, "step": 7592 }, { "epoch": 0.36686476300913173, "grad_norm": 5.977138042449951, "learning_rate": 6.331352369908682e-07, "loss": 0.2736, "step": 7593 }, { "epoch": 0.3669130791902208, "grad_norm": 9.963186264038086, "learning_rate": 6.330869208097792e-07, "loss": 0.3482, "step": 7594 }, { "epoch": 0.36696139537130984, "grad_norm": 5.043726444244385, "learning_rate": 6.330386046286901e-07, "loss": 0.3124, "step": 7595 }, { "epoch": 0.3670097115523989, "grad_norm": 3.728426218032837, "learning_rate": 6.32990288447601e-07, "loss": 0.3073, "step": 7596 }, { "epoch": 0.36705802773348795, "grad_norm": 2.0372109413146973, "learning_rate": 6.32941972266512e-07, "loss": 0.278, "step": 7597 }, { "epoch": 0.36710634391457697, "grad_norm": 3.1413214206695557, "learning_rate": 6.32893656085423e-07, "loss": 0.331, "step": 7598 }, { "epoch": 0.36715466009566605, "grad_norm": 3.2813711166381836, "learning_rate": 6.32845339904334e-07, "loss": 0.264, "step": 7599 }, { "epoch": 0.3672029762767551, "grad_norm": 3.6990702152252197, "learning_rate": 6.327970237232448e-07, "loss": 0.3248, "step": 7600 }, { "epoch": 0.36725129245784416, "grad_norm": 2.009430170059204, "learning_rate": 6.327487075421558e-07, "loss": 0.2153, "step": 7601 }, { "epoch": 0.3672996086389332, "grad_norm": 4.5195112228393555, "learning_rate": 6.327003913610668e-07, "loss": 0.2593, "step": 7602 }, { "epoch": 0.3673479248200222, "grad_norm": 2.7899346351623535, "learning_rate": 6.326520751799778e-07, "loss": 0.3189, "step": 7603 }, { "epoch": 0.3673962410011113, "grad_norm": 3.3866026401519775, "learning_rate": 6.326037589988888e-07, "loss": 0.3089, "step": 7604 }, { "epoch": 0.3674445571822003, "grad_norm": 3.9520182609558105, "learning_rate": 6.325554428177996e-07, "loss": 0.3497, "step": 7605 }, { "epoch": 0.36749287336328934, "grad_norm": 14.159972190856934, "learning_rate": 6.325071266367105e-07, "loss": 0.3551, "step": 7606 }, { "epoch": 0.3675411895443784, "grad_norm": 2.5388903617858887, "learning_rate": 6.324588104556215e-07, "loss": 0.3108, "step": 7607 }, { "epoch": 0.36758950572546745, "grad_norm": 2.1748533248901367, "learning_rate": 6.324104942745325e-07, "loss": 0.2621, "step": 7608 }, { "epoch": 0.36763782190655653, "grad_norm": 2.504967451095581, "learning_rate": 6.323621780934435e-07, "loss": 0.2706, "step": 7609 }, { "epoch": 0.36768613808764555, "grad_norm": 2.1406493186950684, "learning_rate": 6.323138619123545e-07, "loss": 0.2815, "step": 7610 }, { "epoch": 0.3677344542687346, "grad_norm": 2.4189748764038086, "learning_rate": 6.322655457312654e-07, "loss": 0.3355, "step": 7611 }, { "epoch": 0.36778277044982366, "grad_norm": 5.338949680328369, "learning_rate": 6.322172295501763e-07, "loss": 0.3539, "step": 7612 }, { "epoch": 0.3678310866309127, "grad_norm": 2.8892300128936768, "learning_rate": 6.321689133690873e-07, "loss": 0.342, "step": 7613 }, { "epoch": 0.36787940281200177, "grad_norm": 2.621631383895874, "learning_rate": 6.321205971879982e-07, "loss": 0.3426, "step": 7614 }, { "epoch": 0.3679277189930908, "grad_norm": 3.1561689376831055, "learning_rate": 6.320722810069092e-07, "loss": 0.38, "step": 7615 }, { "epoch": 0.3679760351741798, "grad_norm": 1.999981164932251, "learning_rate": 6.320239648258201e-07, "loss": 0.2082, "step": 7616 }, { "epoch": 0.3680243513552689, "grad_norm": 1.506656289100647, "learning_rate": 6.319756486447311e-07, "loss": 0.2016, "step": 7617 }, { "epoch": 0.3680726675363579, "grad_norm": 2.8793551921844482, "learning_rate": 6.319273324636421e-07, "loss": 0.2922, "step": 7618 }, { "epoch": 0.36812098371744695, "grad_norm": 2.5166618824005127, "learning_rate": 6.31879016282553e-07, "loss": 0.3022, "step": 7619 }, { "epoch": 0.36816929989853603, "grad_norm": 3.2415294647216797, "learning_rate": 6.31830700101464e-07, "loss": 0.2699, "step": 7620 }, { "epoch": 0.36821761607962505, "grad_norm": 2.23176908493042, "learning_rate": 6.317823839203749e-07, "loss": 0.2917, "step": 7621 }, { "epoch": 0.36826593226071414, "grad_norm": 4.350091934204102, "learning_rate": 6.317340677392858e-07, "loss": 0.3016, "step": 7622 }, { "epoch": 0.36831424844180316, "grad_norm": 3.554424524307251, "learning_rate": 6.316857515581968e-07, "loss": 0.28, "step": 7623 }, { "epoch": 0.3683625646228922, "grad_norm": 2.398632526397705, "learning_rate": 6.316374353771078e-07, "loss": 0.3533, "step": 7624 }, { "epoch": 0.36841088080398127, "grad_norm": 3.0427298545837402, "learning_rate": 6.315891191960187e-07, "loss": 0.3611, "step": 7625 }, { "epoch": 0.3684591969850703, "grad_norm": 2.5874760150909424, "learning_rate": 6.315408030149296e-07, "loss": 0.2915, "step": 7626 }, { "epoch": 0.3685075131661594, "grad_norm": 2.630497455596924, "learning_rate": 6.314924868338406e-07, "loss": 0.3179, "step": 7627 }, { "epoch": 0.3685558293472484, "grad_norm": 48.4274787902832, "learning_rate": 6.314441706527516e-07, "loss": 0.3884, "step": 7628 }, { "epoch": 0.3686041455283374, "grad_norm": 2.71028470993042, "learning_rate": 6.313958544716626e-07, "loss": 0.2745, "step": 7629 }, { "epoch": 0.3686524617094265, "grad_norm": 20.1790771484375, "learning_rate": 6.313475382905736e-07, "loss": 0.3296, "step": 7630 }, { "epoch": 0.36870077789051553, "grad_norm": 3.44915509223938, "learning_rate": 6.312992221094843e-07, "loss": 0.4309, "step": 7631 }, { "epoch": 0.36874909407160456, "grad_norm": 2.4738430976867676, "learning_rate": 6.312509059283953e-07, "loss": 0.1971, "step": 7632 }, { "epoch": 0.36879741025269364, "grad_norm": 2.8831794261932373, "learning_rate": 6.312025897473063e-07, "loss": 0.3112, "step": 7633 }, { "epoch": 0.36884572643378266, "grad_norm": 4.370701789855957, "learning_rate": 6.311542735662173e-07, "loss": 0.3469, "step": 7634 }, { "epoch": 0.36889404261487174, "grad_norm": 2.181540012359619, "learning_rate": 6.311059573851283e-07, "loss": 0.2726, "step": 7635 }, { "epoch": 0.36894235879596077, "grad_norm": 1.9659793376922607, "learning_rate": 6.310576412040393e-07, "loss": 0.207, "step": 7636 }, { "epoch": 0.3689906749770498, "grad_norm": 2.313539981842041, "learning_rate": 6.310093250229502e-07, "loss": 0.2981, "step": 7637 }, { "epoch": 0.3690389911581389, "grad_norm": 3.077256202697754, "learning_rate": 6.309610088418611e-07, "loss": 0.2744, "step": 7638 }, { "epoch": 0.3690873073392279, "grad_norm": 2.393510103225708, "learning_rate": 6.30912692660772e-07, "loss": 0.3225, "step": 7639 }, { "epoch": 0.369135623520317, "grad_norm": 4.728759765625, "learning_rate": 6.30864376479683e-07, "loss": 0.2006, "step": 7640 }, { "epoch": 0.369183939701406, "grad_norm": 3.186736583709717, "learning_rate": 6.30816060298594e-07, "loss": 0.3288, "step": 7641 }, { "epoch": 0.36923225588249503, "grad_norm": 1.9695260524749756, "learning_rate": 6.307677441175049e-07, "loss": 0.2094, "step": 7642 }, { "epoch": 0.3692805720635841, "grad_norm": 3.958692789077759, "learning_rate": 6.307194279364159e-07, "loss": 0.2865, "step": 7643 }, { "epoch": 0.36932888824467314, "grad_norm": 3.8926615715026855, "learning_rate": 6.306711117553268e-07, "loss": 0.3155, "step": 7644 }, { "epoch": 0.36937720442576216, "grad_norm": 2.6399571895599365, "learning_rate": 6.306227955742378e-07, "loss": 0.3305, "step": 7645 }, { "epoch": 0.36942552060685124, "grad_norm": 3.058748960494995, "learning_rate": 6.305744793931488e-07, "loss": 0.3156, "step": 7646 }, { "epoch": 0.36947383678794027, "grad_norm": 2.96661114692688, "learning_rate": 6.305261632120596e-07, "loss": 0.312, "step": 7647 }, { "epoch": 0.36952215296902935, "grad_norm": 2.82334041595459, "learning_rate": 6.304778470309706e-07, "loss": 0.2671, "step": 7648 }, { "epoch": 0.3695704691501184, "grad_norm": 2.8513948917388916, "learning_rate": 6.304295308498816e-07, "loss": 0.3306, "step": 7649 }, { "epoch": 0.3696187853312074, "grad_norm": 2.859117031097412, "learning_rate": 6.303812146687926e-07, "loss": 0.3047, "step": 7650 }, { "epoch": 0.3696671015122965, "grad_norm": 2.155107021331787, "learning_rate": 6.303328984877035e-07, "loss": 0.2149, "step": 7651 }, { "epoch": 0.3697154176933855, "grad_norm": 2.925036668777466, "learning_rate": 6.302845823066144e-07, "loss": 0.4359, "step": 7652 }, { "epoch": 0.3697637338744746, "grad_norm": 2.974640369415283, "learning_rate": 6.302362661255254e-07, "loss": 0.3246, "step": 7653 }, { "epoch": 0.3698120500555636, "grad_norm": 1.598028540611267, "learning_rate": 6.301879499444364e-07, "loss": 0.2089, "step": 7654 }, { "epoch": 0.36986036623665264, "grad_norm": 4.001721382141113, "learning_rate": 6.301396337633474e-07, "loss": 0.2417, "step": 7655 }, { "epoch": 0.3699086824177417, "grad_norm": 4.2321457862854, "learning_rate": 6.300913175822583e-07, "loss": 0.4298, "step": 7656 }, { "epoch": 0.36995699859883074, "grad_norm": 1.96381413936615, "learning_rate": 6.300430014011691e-07, "loss": 0.2239, "step": 7657 }, { "epoch": 0.37000531477991977, "grad_norm": 3.687352418899536, "learning_rate": 6.299946852200801e-07, "loss": 0.3205, "step": 7658 }, { "epoch": 0.37005363096100885, "grad_norm": 1.7967069149017334, "learning_rate": 6.299463690389911e-07, "loss": 0.1959, "step": 7659 }, { "epoch": 0.3701019471420979, "grad_norm": 2.795631170272827, "learning_rate": 6.298980528579021e-07, "loss": 0.2878, "step": 7660 }, { "epoch": 0.37015026332318696, "grad_norm": 2.894338846206665, "learning_rate": 6.298497366768131e-07, "loss": 0.3707, "step": 7661 }, { "epoch": 0.370198579504276, "grad_norm": 1.832094669342041, "learning_rate": 6.29801420495724e-07, "loss": 0.1623, "step": 7662 }, { "epoch": 0.370246895685365, "grad_norm": 2.221402645111084, "learning_rate": 6.297531043146349e-07, "loss": 0.2293, "step": 7663 }, { "epoch": 0.3702952118664541, "grad_norm": 2.470489740371704, "learning_rate": 6.297047881335458e-07, "loss": 0.3008, "step": 7664 }, { "epoch": 0.3703435280475431, "grad_norm": 2.2496228218078613, "learning_rate": 6.296564719524568e-07, "loss": 0.284, "step": 7665 }, { "epoch": 0.3703918442286322, "grad_norm": 2.069730043411255, "learning_rate": 6.296081557713678e-07, "loss": 0.2285, "step": 7666 }, { "epoch": 0.3704401604097212, "grad_norm": 2.478271722793579, "learning_rate": 6.295598395902788e-07, "loss": 0.2715, "step": 7667 }, { "epoch": 0.37048847659081025, "grad_norm": 2.2763638496398926, "learning_rate": 6.295115234091897e-07, "loss": 0.2591, "step": 7668 }, { "epoch": 0.3705367927718993, "grad_norm": 2.347564697265625, "learning_rate": 6.294632072281007e-07, "loss": 0.2554, "step": 7669 }, { "epoch": 0.37058510895298835, "grad_norm": 5.335231781005859, "learning_rate": 6.294148910470116e-07, "loss": 0.3711, "step": 7670 }, { "epoch": 0.3706334251340774, "grad_norm": 2.6306891441345215, "learning_rate": 6.293665748659226e-07, "loss": 0.246, "step": 7671 }, { "epoch": 0.37068174131516646, "grad_norm": 2.561984062194824, "learning_rate": 6.293182586848336e-07, "loss": 0.3065, "step": 7672 }, { "epoch": 0.3707300574962555, "grad_norm": 4.6201982498168945, "learning_rate": 6.292699425037444e-07, "loss": 0.2812, "step": 7673 }, { "epoch": 0.37077837367734456, "grad_norm": 16.842763900756836, "learning_rate": 6.292216263226554e-07, "loss": 0.4675, "step": 7674 }, { "epoch": 0.3708266898584336, "grad_norm": 1.9948513507843018, "learning_rate": 6.291733101415664e-07, "loss": 0.2041, "step": 7675 }, { "epoch": 0.3708750060395226, "grad_norm": 2.5048043727874756, "learning_rate": 6.291249939604773e-07, "loss": 0.2652, "step": 7676 }, { "epoch": 0.3709233222206117, "grad_norm": 4.101357460021973, "learning_rate": 6.290766777793883e-07, "loss": 0.4276, "step": 7677 }, { "epoch": 0.3709716384017007, "grad_norm": 2.4283554553985596, "learning_rate": 6.290283615982992e-07, "loss": 0.2956, "step": 7678 }, { "epoch": 0.3710199545827898, "grad_norm": 3.8990862369537354, "learning_rate": 6.289800454172102e-07, "loss": 0.4008, "step": 7679 }, { "epoch": 0.3710682707638788, "grad_norm": 14.255115509033203, "learning_rate": 6.289317292361212e-07, "loss": 0.3606, "step": 7680 }, { "epoch": 0.37111658694496785, "grad_norm": 3.6622772216796875, "learning_rate": 6.288834130550322e-07, "loss": 0.4745, "step": 7681 }, { "epoch": 0.37116490312605693, "grad_norm": 2.2682573795318604, "learning_rate": 6.288350968739431e-07, "loss": 0.2651, "step": 7682 }, { "epoch": 0.37121321930714596, "grad_norm": 2.1522486209869385, "learning_rate": 6.287867806928539e-07, "loss": 0.2545, "step": 7683 }, { "epoch": 0.371261535488235, "grad_norm": 2.7773165702819824, "learning_rate": 6.287384645117649e-07, "loss": 0.3489, "step": 7684 }, { "epoch": 0.37130985166932406, "grad_norm": 2.442194938659668, "learning_rate": 6.286901483306759e-07, "loss": 0.2689, "step": 7685 }, { "epoch": 0.3713581678504131, "grad_norm": 4.274221897125244, "learning_rate": 6.286418321495869e-07, "loss": 0.3012, "step": 7686 }, { "epoch": 0.37140648403150217, "grad_norm": 3.987947463989258, "learning_rate": 6.285935159684979e-07, "loss": 0.5243, "step": 7687 }, { "epoch": 0.3714548002125912, "grad_norm": 2.171252489089966, "learning_rate": 6.285451997874088e-07, "loss": 0.2485, "step": 7688 }, { "epoch": 0.3715031163936802, "grad_norm": 3.303544282913208, "learning_rate": 6.284968836063196e-07, "loss": 0.2175, "step": 7689 }, { "epoch": 0.3715514325747693, "grad_norm": 5.308825969696045, "learning_rate": 6.284485674252306e-07, "loss": 0.364, "step": 7690 }, { "epoch": 0.37159974875585833, "grad_norm": 3.64514422416687, "learning_rate": 6.284002512441416e-07, "loss": 0.3512, "step": 7691 }, { "epoch": 0.3716480649369474, "grad_norm": 2.1742565631866455, "learning_rate": 6.283519350630526e-07, "loss": 0.2739, "step": 7692 }, { "epoch": 0.37169638111803643, "grad_norm": 2.94539737701416, "learning_rate": 6.283036188819636e-07, "loss": 0.3863, "step": 7693 }, { "epoch": 0.37174469729912546, "grad_norm": 3.4567835330963135, "learning_rate": 6.282553027008745e-07, "loss": 0.4251, "step": 7694 }, { "epoch": 0.37179301348021454, "grad_norm": 3.016641139984131, "learning_rate": 6.282069865197854e-07, "loss": 0.3616, "step": 7695 }, { "epoch": 0.37184132966130357, "grad_norm": 2.1481990814208984, "learning_rate": 6.281586703386964e-07, "loss": 0.1957, "step": 7696 }, { "epoch": 0.3718896458423926, "grad_norm": 2.991736888885498, "learning_rate": 6.281103541576074e-07, "loss": 0.2836, "step": 7697 }, { "epoch": 0.37193796202348167, "grad_norm": 3.664795398712158, "learning_rate": 6.280620379765183e-07, "loss": 0.2235, "step": 7698 }, { "epoch": 0.3719862782045707, "grad_norm": 2.501133680343628, "learning_rate": 6.280137217954292e-07, "loss": 0.2459, "step": 7699 }, { "epoch": 0.3720345943856598, "grad_norm": 4.265063762664795, "learning_rate": 6.279654056143402e-07, "loss": 0.3622, "step": 7700 }, { "epoch": 0.3720829105667488, "grad_norm": 2.2429933547973633, "learning_rate": 6.279170894332512e-07, "loss": 0.2149, "step": 7701 }, { "epoch": 0.37213122674783783, "grad_norm": 1.9753848314285278, "learning_rate": 6.278687732521621e-07, "loss": 0.2223, "step": 7702 }, { "epoch": 0.3721795429289269, "grad_norm": 5.322939872741699, "learning_rate": 6.278204570710731e-07, "loss": 0.2568, "step": 7703 }, { "epoch": 0.37222785911001594, "grad_norm": 2.8474855422973633, "learning_rate": 6.27772140889984e-07, "loss": 0.2728, "step": 7704 }, { "epoch": 0.372276175291105, "grad_norm": 2.3589959144592285, "learning_rate": 6.27723824708895e-07, "loss": 0.298, "step": 7705 }, { "epoch": 0.37232449147219404, "grad_norm": 1.7155098915100098, "learning_rate": 6.27675508527806e-07, "loss": 0.1888, "step": 7706 }, { "epoch": 0.37237280765328307, "grad_norm": 2.1546037197113037, "learning_rate": 6.276271923467169e-07, "loss": 0.2854, "step": 7707 }, { "epoch": 0.37242112383437215, "grad_norm": 3.3245298862457275, "learning_rate": 6.275788761656278e-07, "loss": 0.2835, "step": 7708 }, { "epoch": 0.3724694400154612, "grad_norm": 2.4339632987976074, "learning_rate": 6.275305599845387e-07, "loss": 0.2937, "step": 7709 }, { "epoch": 0.3725177561965502, "grad_norm": 5.221601486206055, "learning_rate": 6.274822438034497e-07, "loss": 0.2617, "step": 7710 }, { "epoch": 0.3725660723776393, "grad_norm": 6.002857208251953, "learning_rate": 6.274339276223607e-07, "loss": 0.192, "step": 7711 }, { "epoch": 0.3726143885587283, "grad_norm": 3.047536611557007, "learning_rate": 6.273856114412717e-07, "loss": 0.3816, "step": 7712 }, { "epoch": 0.3726627047398174, "grad_norm": 3.228604793548584, "learning_rate": 6.273372952601827e-07, "loss": 0.4259, "step": 7713 }, { "epoch": 0.3727110209209064, "grad_norm": 2.3242833614349365, "learning_rate": 6.272889790790936e-07, "loss": 0.3054, "step": 7714 }, { "epoch": 0.37275933710199544, "grad_norm": 2.3870630264282227, "learning_rate": 6.272406628980044e-07, "loss": 0.2781, "step": 7715 }, { "epoch": 0.3728076532830845, "grad_norm": 3.362283945083618, "learning_rate": 6.271923467169154e-07, "loss": 0.2867, "step": 7716 }, { "epoch": 0.37285596946417354, "grad_norm": 2.18704891204834, "learning_rate": 6.271440305358264e-07, "loss": 0.2429, "step": 7717 }, { "epoch": 0.3729042856452626, "grad_norm": 2.220109224319458, "learning_rate": 6.270957143547374e-07, "loss": 0.1972, "step": 7718 }, { "epoch": 0.37295260182635165, "grad_norm": 3.0372791290283203, "learning_rate": 6.270473981736484e-07, "loss": 0.4463, "step": 7719 }, { "epoch": 0.3730009180074407, "grad_norm": 2.7625892162323, "learning_rate": 6.269990819925593e-07, "loss": 0.2895, "step": 7720 }, { "epoch": 0.37304923418852975, "grad_norm": 2.8379805088043213, "learning_rate": 6.269507658114702e-07, "loss": 0.3714, "step": 7721 }, { "epoch": 0.3730975503696188, "grad_norm": 2.641214370727539, "learning_rate": 6.269024496303812e-07, "loss": 0.2922, "step": 7722 }, { "epoch": 0.3731458665507078, "grad_norm": 2.1940910816192627, "learning_rate": 6.268541334492922e-07, "loss": 0.2156, "step": 7723 }, { "epoch": 0.3731941827317969, "grad_norm": 4.915323257446289, "learning_rate": 6.268058172682031e-07, "loss": 0.2279, "step": 7724 }, { "epoch": 0.3732424989128859, "grad_norm": 1.8182554244995117, "learning_rate": 6.26757501087114e-07, "loss": 0.2227, "step": 7725 }, { "epoch": 0.373290815093975, "grad_norm": 2.4881675243377686, "learning_rate": 6.26709184906025e-07, "loss": 0.3101, "step": 7726 }, { "epoch": 0.373339131275064, "grad_norm": 3.776782512664795, "learning_rate": 6.26660868724936e-07, "loss": 0.3755, "step": 7727 }, { "epoch": 0.37338744745615304, "grad_norm": 10.840484619140625, "learning_rate": 6.266125525438469e-07, "loss": 0.3263, "step": 7728 }, { "epoch": 0.3734357636372421, "grad_norm": 2.245147466659546, "learning_rate": 6.265642363627579e-07, "loss": 0.2421, "step": 7729 }, { "epoch": 0.37348407981833115, "grad_norm": 1.864135980606079, "learning_rate": 6.265159201816688e-07, "loss": 0.2153, "step": 7730 }, { "epoch": 0.37353239599942023, "grad_norm": 3.528806447982788, "learning_rate": 6.264676040005798e-07, "loss": 0.277, "step": 7731 }, { "epoch": 0.37358071218050926, "grad_norm": 2.2649688720703125, "learning_rate": 6.264192878194907e-07, "loss": 0.2081, "step": 7732 }, { "epoch": 0.3736290283615983, "grad_norm": 3.241319179534912, "learning_rate": 6.263709716384017e-07, "loss": 0.384, "step": 7733 }, { "epoch": 0.37367734454268736, "grad_norm": 2.6658706665039062, "learning_rate": 6.263226554573126e-07, "loss": 0.2862, "step": 7734 }, { "epoch": 0.3737256607237764, "grad_norm": 2.5547523498535156, "learning_rate": 6.262743392762235e-07, "loss": 0.3036, "step": 7735 }, { "epoch": 0.3737739769048654, "grad_norm": 2.786004066467285, "learning_rate": 6.262260230951345e-07, "loss": 0.3164, "step": 7736 }, { "epoch": 0.3738222930859545, "grad_norm": 2.717785120010376, "learning_rate": 6.261777069140455e-07, "loss": 0.3803, "step": 7737 }, { "epoch": 0.3738706092670435, "grad_norm": 2.37684965133667, "learning_rate": 6.261293907329565e-07, "loss": 0.2529, "step": 7738 }, { "epoch": 0.3739189254481326, "grad_norm": 2.854299545288086, "learning_rate": 6.260810745518675e-07, "loss": 0.3747, "step": 7739 }, { "epoch": 0.3739672416292216, "grad_norm": 1.8255623579025269, "learning_rate": 6.260327583707782e-07, "loss": 0.2369, "step": 7740 }, { "epoch": 0.37401555781031065, "grad_norm": 4.462403774261475, "learning_rate": 6.259844421896892e-07, "loss": 0.2987, "step": 7741 }, { "epoch": 0.37406387399139973, "grad_norm": 2.4614524841308594, "learning_rate": 6.259361260086002e-07, "loss": 0.3329, "step": 7742 }, { "epoch": 0.37411219017248876, "grad_norm": 4.999231815338135, "learning_rate": 6.258878098275112e-07, "loss": 0.3114, "step": 7743 }, { "epoch": 0.37416050635357784, "grad_norm": 2.838318347930908, "learning_rate": 6.258394936464222e-07, "loss": 0.2886, "step": 7744 }, { "epoch": 0.37420882253466686, "grad_norm": 2.468155860900879, "learning_rate": 6.257911774653332e-07, "loss": 0.2413, "step": 7745 }, { "epoch": 0.3742571387157559, "grad_norm": 3.4787278175354004, "learning_rate": 6.257428612842441e-07, "loss": 0.2, "step": 7746 }, { "epoch": 0.37430545489684497, "grad_norm": 4.155656814575195, "learning_rate": 6.25694545103155e-07, "loss": 0.3565, "step": 7747 }, { "epoch": 0.374353771077934, "grad_norm": 2.4217686653137207, "learning_rate": 6.25646228922066e-07, "loss": 0.2234, "step": 7748 }, { "epoch": 0.374402087259023, "grad_norm": 3.1226043701171875, "learning_rate": 6.255979127409769e-07, "loss": 0.3689, "step": 7749 }, { "epoch": 0.3744504034401121, "grad_norm": 2.952249765396118, "learning_rate": 6.255495965598879e-07, "loss": 0.3867, "step": 7750 }, { "epoch": 0.3744987196212011, "grad_norm": 2.027156114578247, "learning_rate": 6.255012803787988e-07, "loss": 0.2099, "step": 7751 }, { "epoch": 0.3745470358022902, "grad_norm": 4.145044326782227, "learning_rate": 6.254529641977098e-07, "loss": 0.2182, "step": 7752 }, { "epoch": 0.37459535198337923, "grad_norm": 2.144716739654541, "learning_rate": 6.254046480166207e-07, "loss": 0.1888, "step": 7753 }, { "epoch": 0.37464366816446826, "grad_norm": 3.327441453933716, "learning_rate": 6.253563318355317e-07, "loss": 0.3365, "step": 7754 }, { "epoch": 0.37469198434555734, "grad_norm": 4.716085433959961, "learning_rate": 6.253080156544427e-07, "loss": 0.2412, "step": 7755 }, { "epoch": 0.37474030052664636, "grad_norm": 2.8590545654296875, "learning_rate": 6.252596994733536e-07, "loss": 0.3737, "step": 7756 }, { "epoch": 0.37478861670773544, "grad_norm": 3.8725011348724365, "learning_rate": 6.252113832922645e-07, "loss": 0.2064, "step": 7757 }, { "epoch": 0.37483693288882447, "grad_norm": 3.0503435134887695, "learning_rate": 6.251630671111755e-07, "loss": 0.3356, "step": 7758 }, { "epoch": 0.3748852490699135, "grad_norm": 9.277654647827148, "learning_rate": 6.251147509300865e-07, "loss": 0.3342, "step": 7759 }, { "epoch": 0.3749335652510026, "grad_norm": 3.0401194095611572, "learning_rate": 6.250664347489974e-07, "loss": 0.3203, "step": 7760 }, { "epoch": 0.3749818814320916, "grad_norm": 6.737254619598389, "learning_rate": 6.250181185679083e-07, "loss": 0.2741, "step": 7761 }, { "epoch": 0.3750301976131806, "grad_norm": 2.9228644371032715, "learning_rate": 6.249698023868193e-07, "loss": 0.2756, "step": 7762 }, { "epoch": 0.3750785137942697, "grad_norm": 2.7996256351470947, "learning_rate": 6.249214862057303e-07, "loss": 0.2718, "step": 7763 }, { "epoch": 0.37512682997535873, "grad_norm": 2.6171536445617676, "learning_rate": 6.248731700246413e-07, "loss": 0.3361, "step": 7764 }, { "epoch": 0.3751751461564478, "grad_norm": 2.5314438343048096, "learning_rate": 6.248248538435523e-07, "loss": 0.2934, "step": 7765 }, { "epoch": 0.37522346233753684, "grad_norm": 2.892690658569336, "learning_rate": 6.24776537662463e-07, "loss": 0.4089, "step": 7766 }, { "epoch": 0.37527177851862586, "grad_norm": 2.3260819911956787, "learning_rate": 6.24728221481374e-07, "loss": 0.2951, "step": 7767 }, { "epoch": 0.37532009469971495, "grad_norm": 2.816687822341919, "learning_rate": 6.24679905300285e-07, "loss": 0.2504, "step": 7768 }, { "epoch": 0.37536841088080397, "grad_norm": 3.2065589427948, "learning_rate": 6.24631589119196e-07, "loss": 0.325, "step": 7769 }, { "epoch": 0.37541672706189305, "grad_norm": 1.9256387948989868, "learning_rate": 6.24583272938107e-07, "loss": 0.241, "step": 7770 }, { "epoch": 0.3754650432429821, "grad_norm": 3.0020065307617188, "learning_rate": 6.24534956757018e-07, "loss": 0.3013, "step": 7771 }, { "epoch": 0.3755133594240711, "grad_norm": 2.6856300830841064, "learning_rate": 6.244866405759288e-07, "loss": 0.2624, "step": 7772 }, { "epoch": 0.3755616756051602, "grad_norm": 2.3169615268707275, "learning_rate": 6.244383243948398e-07, "loss": 0.227, "step": 7773 }, { "epoch": 0.3756099917862492, "grad_norm": 7.187067985534668, "learning_rate": 6.243900082137507e-07, "loss": 0.2629, "step": 7774 }, { "epoch": 0.37565830796733823, "grad_norm": 2.613407850265503, "learning_rate": 6.243416920326617e-07, "loss": 0.311, "step": 7775 }, { "epoch": 0.3757066241484273, "grad_norm": 2.1213061809539795, "learning_rate": 6.242933758515727e-07, "loss": 0.2198, "step": 7776 }, { "epoch": 0.37575494032951634, "grad_norm": 3.457521677017212, "learning_rate": 6.242450596704836e-07, "loss": 0.2886, "step": 7777 }, { "epoch": 0.3758032565106054, "grad_norm": 2.5146284103393555, "learning_rate": 6.241967434893946e-07, "loss": 0.2918, "step": 7778 }, { "epoch": 0.37585157269169445, "grad_norm": 6.758513927459717, "learning_rate": 6.241484273083055e-07, "loss": 0.3586, "step": 7779 }, { "epoch": 0.37589988887278347, "grad_norm": 3.154977798461914, "learning_rate": 6.241001111272165e-07, "loss": 0.307, "step": 7780 }, { "epoch": 0.37594820505387255, "grad_norm": 2.021510124206543, "learning_rate": 6.240517949461275e-07, "loss": 0.1969, "step": 7781 }, { "epoch": 0.3759965212349616, "grad_norm": 2.6454708576202393, "learning_rate": 6.240034787650384e-07, "loss": 0.3815, "step": 7782 }, { "epoch": 0.37604483741605066, "grad_norm": 2.9016308784484863, "learning_rate": 6.239551625839493e-07, "loss": 0.333, "step": 7783 }, { "epoch": 0.3760931535971397, "grad_norm": 1.8021761178970337, "learning_rate": 6.239068464028603e-07, "loss": 0.2113, "step": 7784 }, { "epoch": 0.3761414697782287, "grad_norm": 2.4101507663726807, "learning_rate": 6.238585302217712e-07, "loss": 0.3277, "step": 7785 }, { "epoch": 0.3761897859593178, "grad_norm": 2.779165029525757, "learning_rate": 6.238102140406822e-07, "loss": 0.3608, "step": 7786 }, { "epoch": 0.3762381021404068, "grad_norm": 2.057527542114258, "learning_rate": 6.237618978595931e-07, "loss": 0.2777, "step": 7787 }, { "epoch": 0.37628641832149584, "grad_norm": 2.3895912170410156, "learning_rate": 6.237135816785041e-07, "loss": 0.3131, "step": 7788 }, { "epoch": 0.3763347345025849, "grad_norm": 1.7111272811889648, "learning_rate": 6.236652654974151e-07, "loss": 0.1996, "step": 7789 }, { "epoch": 0.37638305068367395, "grad_norm": 4.405965328216553, "learning_rate": 6.236169493163261e-07, "loss": 0.247, "step": 7790 }, { "epoch": 0.37643136686476303, "grad_norm": 2.001225709915161, "learning_rate": 6.23568633135237e-07, "loss": 0.2092, "step": 7791 }, { "epoch": 0.37647968304585205, "grad_norm": 3.024402618408203, "learning_rate": 6.235203169541478e-07, "loss": 0.3218, "step": 7792 }, { "epoch": 0.3765279992269411, "grad_norm": 1.795276403427124, "learning_rate": 6.234720007730588e-07, "loss": 0.217, "step": 7793 }, { "epoch": 0.37657631540803016, "grad_norm": 1.9702119827270508, "learning_rate": 6.234236845919698e-07, "loss": 0.2945, "step": 7794 }, { "epoch": 0.3766246315891192, "grad_norm": 3.302452564239502, "learning_rate": 6.233753684108808e-07, "loss": 0.3541, "step": 7795 }, { "epoch": 0.37667294777020827, "grad_norm": 1.8167932033538818, "learning_rate": 6.233270522297918e-07, "loss": 0.1948, "step": 7796 }, { "epoch": 0.3767212639512973, "grad_norm": 4.65809965133667, "learning_rate": 6.232787360487028e-07, "loss": 0.4807, "step": 7797 }, { "epoch": 0.3767695801323863, "grad_norm": 4.224972248077393, "learning_rate": 6.232304198676136e-07, "loss": 0.4072, "step": 7798 }, { "epoch": 0.3768178963134754, "grad_norm": 2.7575244903564453, "learning_rate": 6.231821036865245e-07, "loss": 0.331, "step": 7799 }, { "epoch": 0.3768662124945644, "grad_norm": 3.050229072570801, "learning_rate": 6.231337875054355e-07, "loss": 0.2955, "step": 7800 }, { "epoch": 0.3769145286756535, "grad_norm": 5.422780990600586, "learning_rate": 6.230854713243465e-07, "loss": 0.3601, "step": 7801 }, { "epoch": 0.37696284485674253, "grad_norm": 1.836916446685791, "learning_rate": 6.230371551432575e-07, "loss": 0.1682, "step": 7802 }, { "epoch": 0.37701116103783155, "grad_norm": 1.9697160720825195, "learning_rate": 6.229888389621684e-07, "loss": 0.1762, "step": 7803 }, { "epoch": 0.37705947721892064, "grad_norm": 7.536994934082031, "learning_rate": 6.229405227810793e-07, "loss": 0.2871, "step": 7804 }, { "epoch": 0.37710779340000966, "grad_norm": 2.257082939147949, "learning_rate": 6.228922065999903e-07, "loss": 0.2926, "step": 7805 }, { "epoch": 0.3771561095810987, "grad_norm": 3.6470162868499756, "learning_rate": 6.228438904189013e-07, "loss": 0.3763, "step": 7806 }, { "epoch": 0.37720442576218777, "grad_norm": 3.132491111755371, "learning_rate": 6.227955742378123e-07, "loss": 0.233, "step": 7807 }, { "epoch": 0.3772527419432768, "grad_norm": 5.320314407348633, "learning_rate": 6.227472580567231e-07, "loss": 0.312, "step": 7808 }, { "epoch": 0.3773010581243659, "grad_norm": 2.082667827606201, "learning_rate": 6.226989418756341e-07, "loss": 0.2553, "step": 7809 }, { "epoch": 0.3773493743054549, "grad_norm": 2.7039954662323, "learning_rate": 6.226506256945451e-07, "loss": 0.2823, "step": 7810 }, { "epoch": 0.3773976904865439, "grad_norm": 8.966959953308105, "learning_rate": 6.22602309513456e-07, "loss": 0.55, "step": 7811 }, { "epoch": 0.377446006667633, "grad_norm": 4.825913906097412, "learning_rate": 6.22553993332367e-07, "loss": 0.4177, "step": 7812 }, { "epoch": 0.37749432284872203, "grad_norm": 2.252011299133301, "learning_rate": 6.225056771512779e-07, "loss": 0.2148, "step": 7813 }, { "epoch": 0.3775426390298111, "grad_norm": 1.6739895343780518, "learning_rate": 6.224573609701889e-07, "loss": 0.1659, "step": 7814 }, { "epoch": 0.37759095521090014, "grad_norm": 2.793640375137329, "learning_rate": 6.224090447890999e-07, "loss": 0.4115, "step": 7815 }, { "epoch": 0.37763927139198916, "grad_norm": 2.7061829566955566, "learning_rate": 6.223607286080109e-07, "loss": 0.3407, "step": 7816 }, { "epoch": 0.37768758757307824, "grad_norm": 2.9845199584960938, "learning_rate": 6.223124124269217e-07, "loss": 0.3428, "step": 7817 }, { "epoch": 0.37773590375416727, "grad_norm": 3.365985870361328, "learning_rate": 6.222640962458326e-07, "loss": 0.4335, "step": 7818 }, { "epoch": 0.3777842199352563, "grad_norm": 7.614884376525879, "learning_rate": 6.222157800647436e-07, "loss": 0.3194, "step": 7819 }, { "epoch": 0.3778325361163454, "grad_norm": 2.646641731262207, "learning_rate": 6.221674638836546e-07, "loss": 0.2815, "step": 7820 }, { "epoch": 0.3778808522974344, "grad_norm": 2.632622480392456, "learning_rate": 6.221191477025656e-07, "loss": 0.3088, "step": 7821 }, { "epoch": 0.3779291684785235, "grad_norm": 2.7078592777252197, "learning_rate": 6.220708315214766e-07, "loss": 0.3353, "step": 7822 }, { "epoch": 0.3779774846596125, "grad_norm": 2.1720285415649414, "learning_rate": 6.220225153403876e-07, "loss": 0.2422, "step": 7823 }, { "epoch": 0.37802580084070153, "grad_norm": 2.8324971199035645, "learning_rate": 6.219741991592984e-07, "loss": 0.3538, "step": 7824 }, { "epoch": 0.3780741170217906, "grad_norm": 2.472733497619629, "learning_rate": 6.219258829782093e-07, "loss": 0.2693, "step": 7825 }, { "epoch": 0.37812243320287964, "grad_norm": 2.6609063148498535, "learning_rate": 6.218775667971203e-07, "loss": 0.3161, "step": 7826 }, { "epoch": 0.3781707493839687, "grad_norm": 4.260881423950195, "learning_rate": 6.218292506160313e-07, "loss": 0.2915, "step": 7827 }, { "epoch": 0.37821906556505774, "grad_norm": 13.038721084594727, "learning_rate": 6.217809344349423e-07, "loss": 0.2122, "step": 7828 }, { "epoch": 0.37826738174614677, "grad_norm": 2.7051126956939697, "learning_rate": 6.217326182538532e-07, "loss": 0.3393, "step": 7829 }, { "epoch": 0.37831569792723585, "grad_norm": 14.599666595458984, "learning_rate": 6.216843020727641e-07, "loss": 0.5015, "step": 7830 }, { "epoch": 0.3783640141083249, "grad_norm": 2.318321704864502, "learning_rate": 6.216359858916751e-07, "loss": 0.2391, "step": 7831 }, { "epoch": 0.3784123302894139, "grad_norm": 3.7936055660247803, "learning_rate": 6.215876697105861e-07, "loss": 0.4153, "step": 7832 }, { "epoch": 0.378460646470503, "grad_norm": 2.187422752380371, "learning_rate": 6.21539353529497e-07, "loss": 0.209, "step": 7833 }, { "epoch": 0.378508962651592, "grad_norm": 2.6441683769226074, "learning_rate": 6.214910373484079e-07, "loss": 0.3333, "step": 7834 }, { "epoch": 0.3785572788326811, "grad_norm": 3.0732247829437256, "learning_rate": 6.214427211673189e-07, "loss": 0.2668, "step": 7835 }, { "epoch": 0.3786055950137701, "grad_norm": 3.2273037433624268, "learning_rate": 6.213944049862298e-07, "loss": 0.2649, "step": 7836 }, { "epoch": 0.37865391119485914, "grad_norm": 3.215737819671631, "learning_rate": 6.213460888051408e-07, "loss": 0.3211, "step": 7837 }, { "epoch": 0.3787022273759482, "grad_norm": 2.0571935176849365, "learning_rate": 6.212977726240518e-07, "loss": 0.2699, "step": 7838 }, { "epoch": 0.37875054355703724, "grad_norm": 2.4094362258911133, "learning_rate": 6.212494564429627e-07, "loss": 0.2991, "step": 7839 }, { "epoch": 0.3787988597381263, "grad_norm": 3.4310619831085205, "learning_rate": 6.212011402618737e-07, "loss": 0.186, "step": 7840 }, { "epoch": 0.37884717591921535, "grad_norm": 2.9492554664611816, "learning_rate": 6.211528240807847e-07, "loss": 0.3909, "step": 7841 }, { "epoch": 0.3788954921003044, "grad_norm": 2.336463689804077, "learning_rate": 6.211045078996956e-07, "loss": 0.2971, "step": 7842 }, { "epoch": 0.37894380828139346, "grad_norm": 2.2450973987579346, "learning_rate": 6.210561917186065e-07, "loss": 0.2107, "step": 7843 }, { "epoch": 0.3789921244624825, "grad_norm": 2.1825459003448486, "learning_rate": 6.210078755375174e-07, "loss": 0.2013, "step": 7844 }, { "epoch": 0.3790404406435715, "grad_norm": 3.0552475452423096, "learning_rate": 6.209595593564284e-07, "loss": 0.3018, "step": 7845 }, { "epoch": 0.3790887568246606, "grad_norm": 3.1750247478485107, "learning_rate": 6.209112431753394e-07, "loss": 0.3625, "step": 7846 }, { "epoch": 0.3791370730057496, "grad_norm": 2.47267746925354, "learning_rate": 6.208629269942504e-07, "loss": 0.3361, "step": 7847 }, { "epoch": 0.3791853891868387, "grad_norm": 3.3668503761291504, "learning_rate": 6.208146108131614e-07, "loss": 0.2069, "step": 7848 }, { "epoch": 0.3792337053679277, "grad_norm": 3.075093984603882, "learning_rate": 6.207662946320723e-07, "loss": 0.2811, "step": 7849 }, { "epoch": 0.37928202154901675, "grad_norm": 2.0650758743286133, "learning_rate": 6.207179784509831e-07, "loss": 0.2481, "step": 7850 }, { "epoch": 0.3793303377301058, "grad_norm": 1.675197720527649, "learning_rate": 6.206696622698941e-07, "loss": 0.1851, "step": 7851 }, { "epoch": 0.37937865391119485, "grad_norm": 3.107363224029541, "learning_rate": 6.206213460888051e-07, "loss": 0.405, "step": 7852 }, { "epoch": 0.37942697009228393, "grad_norm": 3.0673983097076416, "learning_rate": 6.205730299077161e-07, "loss": 0.2775, "step": 7853 }, { "epoch": 0.37947528627337296, "grad_norm": 2.729527711868286, "learning_rate": 6.205247137266271e-07, "loss": 0.2977, "step": 7854 }, { "epoch": 0.379523602454462, "grad_norm": 2.029479742050171, "learning_rate": 6.204763975455379e-07, "loss": 0.273, "step": 7855 }, { "epoch": 0.37957191863555106, "grad_norm": 4.803487300872803, "learning_rate": 6.204280813644489e-07, "loss": 0.2559, "step": 7856 }, { "epoch": 0.3796202348166401, "grad_norm": 2.6808011531829834, "learning_rate": 6.203797651833599e-07, "loss": 0.3353, "step": 7857 }, { "epoch": 0.3796685509977291, "grad_norm": 3.2524235248565674, "learning_rate": 6.203314490022709e-07, "loss": 0.2905, "step": 7858 }, { "epoch": 0.3797168671788182, "grad_norm": 2.6700429916381836, "learning_rate": 6.202831328211818e-07, "loss": 0.3506, "step": 7859 }, { "epoch": 0.3797651833599072, "grad_norm": 3.0364866256713867, "learning_rate": 6.202348166400927e-07, "loss": 0.4259, "step": 7860 }, { "epoch": 0.3798134995409963, "grad_norm": 2.647264003753662, "learning_rate": 6.201865004590037e-07, "loss": 0.3002, "step": 7861 }, { "epoch": 0.3798618157220853, "grad_norm": 1.9725977182388306, "learning_rate": 6.201381842779146e-07, "loss": 0.2191, "step": 7862 }, { "epoch": 0.37991013190317435, "grad_norm": 7.757933139801025, "learning_rate": 6.200898680968256e-07, "loss": 0.1937, "step": 7863 }, { "epoch": 0.37995844808426343, "grad_norm": 2.4535999298095703, "learning_rate": 6.200415519157366e-07, "loss": 0.3164, "step": 7864 }, { "epoch": 0.38000676426535246, "grad_norm": 2.823056697845459, "learning_rate": 6.199932357346475e-07, "loss": 0.3255, "step": 7865 }, { "epoch": 0.38005508044644154, "grad_norm": 2.279597520828247, "learning_rate": 6.199449195535585e-07, "loss": 0.3114, "step": 7866 }, { "epoch": 0.38010339662753057, "grad_norm": 2.0702261924743652, "learning_rate": 6.198966033724694e-07, "loss": 0.2347, "step": 7867 }, { "epoch": 0.3801517128086196, "grad_norm": 2.377145528793335, "learning_rate": 6.198482871913803e-07, "loss": 0.2941, "step": 7868 }, { "epoch": 0.38020002898970867, "grad_norm": 6.857827663421631, "learning_rate": 6.197999710102913e-07, "loss": 0.2545, "step": 7869 }, { "epoch": 0.3802483451707977, "grad_norm": 2.408527135848999, "learning_rate": 6.197516548292022e-07, "loss": 0.2506, "step": 7870 }, { "epoch": 0.3802966613518867, "grad_norm": 4.278520584106445, "learning_rate": 6.197033386481132e-07, "loss": 0.2973, "step": 7871 }, { "epoch": 0.3803449775329758, "grad_norm": 1.9679412841796875, "learning_rate": 6.196550224670242e-07, "loss": 0.2789, "step": 7872 }, { "epoch": 0.38039329371406483, "grad_norm": 4.531424045562744, "learning_rate": 6.196067062859352e-07, "loss": 0.387, "step": 7873 }, { "epoch": 0.3804416098951539, "grad_norm": 3.2273712158203125, "learning_rate": 6.195583901048462e-07, "loss": 0.3793, "step": 7874 }, { "epoch": 0.38048992607624293, "grad_norm": 4.5621466636657715, "learning_rate": 6.19510073923757e-07, "loss": 0.3597, "step": 7875 }, { "epoch": 0.38053824225733196, "grad_norm": 4.081773281097412, "learning_rate": 6.194617577426679e-07, "loss": 0.2569, "step": 7876 }, { "epoch": 0.38058655843842104, "grad_norm": 3.310943126678467, "learning_rate": 6.194134415615789e-07, "loss": 0.4086, "step": 7877 }, { "epoch": 0.38063487461951007, "grad_norm": 4.779696464538574, "learning_rate": 6.193651253804899e-07, "loss": 0.4025, "step": 7878 }, { "epoch": 0.38068319080059915, "grad_norm": 2.7689828872680664, "learning_rate": 6.193168091994009e-07, "loss": 0.2813, "step": 7879 }, { "epoch": 0.3807315069816882, "grad_norm": 4.287344932556152, "learning_rate": 6.192684930183119e-07, "loss": 0.2932, "step": 7880 }, { "epoch": 0.3807798231627772, "grad_norm": 1.8819282054901123, "learning_rate": 6.192201768372227e-07, "loss": 0.2153, "step": 7881 }, { "epoch": 0.3808281393438663, "grad_norm": 2.762807846069336, "learning_rate": 6.191718606561337e-07, "loss": 0.315, "step": 7882 }, { "epoch": 0.3808764555249553, "grad_norm": 3.596043109893799, "learning_rate": 6.191235444750447e-07, "loss": 0.3, "step": 7883 }, { "epoch": 0.38092477170604433, "grad_norm": 2.131875991821289, "learning_rate": 6.190752282939556e-07, "loss": 0.1773, "step": 7884 }, { "epoch": 0.3809730878871334, "grad_norm": 1.7585536241531372, "learning_rate": 6.190269121128666e-07, "loss": 0.1494, "step": 7885 }, { "epoch": 0.38102140406822244, "grad_norm": 3.6299610137939453, "learning_rate": 6.189785959317775e-07, "loss": 0.3477, "step": 7886 }, { "epoch": 0.3810697202493115, "grad_norm": 2.5232110023498535, "learning_rate": 6.189302797506884e-07, "loss": 0.2865, "step": 7887 }, { "epoch": 0.38111803643040054, "grad_norm": 4.46598482131958, "learning_rate": 6.188819635695994e-07, "loss": 0.2932, "step": 7888 }, { "epoch": 0.38116635261148957, "grad_norm": 2.3629086017608643, "learning_rate": 6.188336473885104e-07, "loss": 0.223, "step": 7889 }, { "epoch": 0.38121466879257865, "grad_norm": 4.616438865661621, "learning_rate": 6.187853312074214e-07, "loss": 0.3306, "step": 7890 }, { "epoch": 0.3812629849736677, "grad_norm": 3.0829715728759766, "learning_rate": 6.187370150263323e-07, "loss": 0.3381, "step": 7891 }, { "epoch": 0.38131130115475675, "grad_norm": 2.2331767082214355, "learning_rate": 6.186886988452433e-07, "loss": 0.2328, "step": 7892 }, { "epoch": 0.3813596173358458, "grad_norm": 2.98294997215271, "learning_rate": 6.186403826641542e-07, "loss": 0.3515, "step": 7893 }, { "epoch": 0.3814079335169348, "grad_norm": 2.3708510398864746, "learning_rate": 6.185920664830651e-07, "loss": 0.2621, "step": 7894 }, { "epoch": 0.3814562496980239, "grad_norm": 8.716567039489746, "learning_rate": 6.185437503019761e-07, "loss": 0.3199, "step": 7895 }, { "epoch": 0.3815045658791129, "grad_norm": 2.4666764736175537, "learning_rate": 6.18495434120887e-07, "loss": 0.2735, "step": 7896 }, { "epoch": 0.38155288206020194, "grad_norm": 2.726475954055786, "learning_rate": 6.18447117939798e-07, "loss": 0.3011, "step": 7897 }, { "epoch": 0.381601198241291, "grad_norm": 1.7050169706344604, "learning_rate": 6.18398801758709e-07, "loss": 0.1729, "step": 7898 }, { "epoch": 0.38164951442238004, "grad_norm": 3.2202823162078857, "learning_rate": 6.1835048557762e-07, "loss": 0.3022, "step": 7899 }, { "epoch": 0.3816978306034691, "grad_norm": 5.12712287902832, "learning_rate": 6.183021693965309e-07, "loss": 0.3263, "step": 7900 }, { "epoch": 0.38174614678455815, "grad_norm": 2.7157492637634277, "learning_rate": 6.182538532154418e-07, "loss": 0.2528, "step": 7901 }, { "epoch": 0.3817944629656472, "grad_norm": 2.5638952255249023, "learning_rate": 6.182055370343527e-07, "loss": 0.3115, "step": 7902 }, { "epoch": 0.38184277914673626, "grad_norm": 3.566253662109375, "learning_rate": 6.181572208532637e-07, "loss": 0.3805, "step": 7903 }, { "epoch": 0.3818910953278253, "grad_norm": 1.459179162979126, "learning_rate": 6.181089046721747e-07, "loss": 0.1342, "step": 7904 }, { "epoch": 0.38193941150891436, "grad_norm": 2.557196617126465, "learning_rate": 6.180605884910857e-07, "loss": 0.2901, "step": 7905 }, { "epoch": 0.3819877276900034, "grad_norm": 3.16359806060791, "learning_rate": 6.180122723099967e-07, "loss": 0.476, "step": 7906 }, { "epoch": 0.3820360438710924, "grad_norm": 3.0899922847747803, "learning_rate": 6.179639561289075e-07, "loss": 0.3431, "step": 7907 }, { "epoch": 0.3820843600521815, "grad_norm": 11.68493366241455, "learning_rate": 6.179156399478185e-07, "loss": 0.3487, "step": 7908 }, { "epoch": 0.3821326762332705, "grad_norm": 3.4361515045166016, "learning_rate": 6.178673237667294e-07, "loss": 0.2551, "step": 7909 }, { "epoch": 0.38218099241435954, "grad_norm": 2.757338523864746, "learning_rate": 6.178190075856404e-07, "loss": 0.2445, "step": 7910 }, { "epoch": 0.3822293085954486, "grad_norm": 14.31757640838623, "learning_rate": 6.177706914045514e-07, "loss": 0.3714, "step": 7911 }, { "epoch": 0.38227762477653765, "grad_norm": 3.2743382453918457, "learning_rate": 6.177223752234623e-07, "loss": 0.2619, "step": 7912 }, { "epoch": 0.38232594095762673, "grad_norm": 1.8474056720733643, "learning_rate": 6.176740590423732e-07, "loss": 0.2127, "step": 7913 }, { "epoch": 0.38237425713871576, "grad_norm": 2.6433908939361572, "learning_rate": 6.176257428612842e-07, "loss": 0.2743, "step": 7914 }, { "epoch": 0.3824225733198048, "grad_norm": 7.013390064239502, "learning_rate": 6.175774266801952e-07, "loss": 0.2346, "step": 7915 }, { "epoch": 0.38247088950089386, "grad_norm": 2.10294246673584, "learning_rate": 6.175291104991062e-07, "loss": 0.3018, "step": 7916 }, { "epoch": 0.3825192056819829, "grad_norm": 2.8972971439361572, "learning_rate": 6.174807943180171e-07, "loss": 0.2696, "step": 7917 }, { "epoch": 0.38256752186307197, "grad_norm": 2.6150424480438232, "learning_rate": 6.17432478136928e-07, "loss": 0.2529, "step": 7918 }, { "epoch": 0.382615838044161, "grad_norm": 3.0743794441223145, "learning_rate": 6.173841619558389e-07, "loss": 0.3819, "step": 7919 }, { "epoch": 0.38266415422525, "grad_norm": 2.897200345993042, "learning_rate": 6.173358457747499e-07, "loss": 0.3253, "step": 7920 }, { "epoch": 0.3827124704063391, "grad_norm": 2.678272008895874, "learning_rate": 6.172875295936609e-07, "loss": 0.2597, "step": 7921 }, { "epoch": 0.3827607865874281, "grad_norm": 2.0025830268859863, "learning_rate": 6.172392134125718e-07, "loss": 0.2241, "step": 7922 }, { "epoch": 0.38280910276851715, "grad_norm": 2.355175018310547, "learning_rate": 6.171908972314828e-07, "loss": 0.3059, "step": 7923 }, { "epoch": 0.38285741894960623, "grad_norm": 2.6798999309539795, "learning_rate": 6.171425810503938e-07, "loss": 0.195, "step": 7924 }, { "epoch": 0.38290573513069526, "grad_norm": 3.233032464981079, "learning_rate": 6.170942648693048e-07, "loss": 0.387, "step": 7925 }, { "epoch": 0.38295405131178434, "grad_norm": 3.1059529781341553, "learning_rate": 6.170459486882156e-07, "loss": 0.32, "step": 7926 }, { "epoch": 0.38300236749287336, "grad_norm": 3.2233004570007324, "learning_rate": 6.169976325071266e-07, "loss": 0.3933, "step": 7927 }, { "epoch": 0.3830506836739624, "grad_norm": 3.222989082336426, "learning_rate": 6.169493163260375e-07, "loss": 0.5026, "step": 7928 }, { "epoch": 0.38309899985505147, "grad_norm": 2.7163960933685303, "learning_rate": 6.169010001449485e-07, "loss": 0.3819, "step": 7929 }, { "epoch": 0.3831473160361405, "grad_norm": 2.983686685562134, "learning_rate": 6.168526839638595e-07, "loss": 0.3363, "step": 7930 }, { "epoch": 0.3831956322172296, "grad_norm": 3.3744709491729736, "learning_rate": 6.168043677827705e-07, "loss": 0.3225, "step": 7931 }, { "epoch": 0.3832439483983186, "grad_norm": 3.209365129470825, "learning_rate": 6.167560516016814e-07, "loss": 0.3751, "step": 7932 }, { "epoch": 0.3832922645794076, "grad_norm": 3.4747366905212402, "learning_rate": 6.167077354205923e-07, "loss": 0.3295, "step": 7933 }, { "epoch": 0.3833405807604967, "grad_norm": 3.2068276405334473, "learning_rate": 6.166594192395033e-07, "loss": 0.2891, "step": 7934 }, { "epoch": 0.38338889694158573, "grad_norm": 3.6057636737823486, "learning_rate": 6.166111030584142e-07, "loss": 0.395, "step": 7935 }, { "epoch": 0.38343721312267476, "grad_norm": 2.8448972702026367, "learning_rate": 6.165627868773252e-07, "loss": 0.2574, "step": 7936 }, { "epoch": 0.38348552930376384, "grad_norm": 3.5528836250305176, "learning_rate": 6.165144706962362e-07, "loss": 0.4082, "step": 7937 }, { "epoch": 0.38353384548485286, "grad_norm": 6.527021408081055, "learning_rate": 6.16466154515147e-07, "loss": 0.2839, "step": 7938 }, { "epoch": 0.38358216166594195, "grad_norm": 2.2618978023529053, "learning_rate": 6.16417838334058e-07, "loss": 0.2865, "step": 7939 }, { "epoch": 0.38363047784703097, "grad_norm": 2.7965493202209473, "learning_rate": 6.16369522152969e-07, "loss": 0.3444, "step": 7940 }, { "epoch": 0.38367879402812, "grad_norm": 2.777768611907959, "learning_rate": 6.1632120597188e-07, "loss": 0.3136, "step": 7941 }, { "epoch": 0.3837271102092091, "grad_norm": 2.0665032863616943, "learning_rate": 6.16272889790791e-07, "loss": 0.2284, "step": 7942 }, { "epoch": 0.3837754263902981, "grad_norm": 2.9880213737487793, "learning_rate": 6.162245736097018e-07, "loss": 0.345, "step": 7943 }, { "epoch": 0.3838237425713872, "grad_norm": 2.7568411827087402, "learning_rate": 6.161762574286128e-07, "loss": 0.2933, "step": 7944 }, { "epoch": 0.3838720587524762, "grad_norm": 4.040649890899658, "learning_rate": 6.161279412475237e-07, "loss": 0.3522, "step": 7945 }, { "epoch": 0.38392037493356523, "grad_norm": 4.279577732086182, "learning_rate": 6.160796250664347e-07, "loss": 0.4044, "step": 7946 }, { "epoch": 0.3839686911146543, "grad_norm": 2.583362579345703, "learning_rate": 6.160313088853457e-07, "loss": 0.3072, "step": 7947 }, { "epoch": 0.38401700729574334, "grad_norm": 2.342144012451172, "learning_rate": 6.159829927042566e-07, "loss": 0.2581, "step": 7948 }, { "epoch": 0.38406532347683237, "grad_norm": 2.270343780517578, "learning_rate": 6.159346765231676e-07, "loss": 0.3198, "step": 7949 }, { "epoch": 0.38411363965792145, "grad_norm": 2.67863392829895, "learning_rate": 6.158863603420786e-07, "loss": 0.1833, "step": 7950 }, { "epoch": 0.38416195583901047, "grad_norm": 3.008103847503662, "learning_rate": 6.158380441609895e-07, "loss": 0.3772, "step": 7951 }, { "epoch": 0.38421027202009955, "grad_norm": 1.5183382034301758, "learning_rate": 6.157897279799004e-07, "loss": 0.1818, "step": 7952 }, { "epoch": 0.3842585882011886, "grad_norm": 3.9950177669525146, "learning_rate": 6.157414117988113e-07, "loss": 0.3425, "step": 7953 }, { "epoch": 0.3843069043822776, "grad_norm": 3.230715751647949, "learning_rate": 6.156930956177223e-07, "loss": 0.4108, "step": 7954 }, { "epoch": 0.3843552205633667, "grad_norm": 3.1301846504211426, "learning_rate": 6.156447794366333e-07, "loss": 0.3514, "step": 7955 }, { "epoch": 0.3844035367444557, "grad_norm": 3.5694847106933594, "learning_rate": 6.155964632555443e-07, "loss": 0.361, "step": 7956 }, { "epoch": 0.3844518529255448, "grad_norm": 2.749812126159668, "learning_rate": 6.155481470744553e-07, "loss": 0.3238, "step": 7957 }, { "epoch": 0.3845001691066338, "grad_norm": 2.045917272567749, "learning_rate": 6.154998308933662e-07, "loss": 0.1876, "step": 7958 }, { "epoch": 0.38454848528772284, "grad_norm": 3.4071125984191895, "learning_rate": 6.154515147122771e-07, "loss": 0.4797, "step": 7959 }, { "epoch": 0.3845968014688119, "grad_norm": 5.164445877075195, "learning_rate": 6.15403198531188e-07, "loss": 0.32, "step": 7960 }, { "epoch": 0.38464511764990095, "grad_norm": 2.075381278991699, "learning_rate": 6.15354882350099e-07, "loss": 0.2399, "step": 7961 }, { "epoch": 0.38469343383099, "grad_norm": 2.168090581893921, "learning_rate": 6.1530656616901e-07, "loss": 0.1918, "step": 7962 }, { "epoch": 0.38474175001207905, "grad_norm": 2.610682725906372, "learning_rate": 6.15258249987921e-07, "loss": 0.3575, "step": 7963 }, { "epoch": 0.3847900661931681, "grad_norm": 3.7780754566192627, "learning_rate": 6.152099338068318e-07, "loss": 0.3127, "step": 7964 }, { "epoch": 0.38483838237425716, "grad_norm": 2.778228759765625, "learning_rate": 6.151616176257428e-07, "loss": 0.3115, "step": 7965 }, { "epoch": 0.3848866985553462, "grad_norm": 1.813413143157959, "learning_rate": 6.151133014446538e-07, "loss": 0.1769, "step": 7966 }, { "epoch": 0.3849350147364352, "grad_norm": 2.409069776535034, "learning_rate": 6.150649852635648e-07, "loss": 0.2191, "step": 7967 }, { "epoch": 0.3849833309175243, "grad_norm": 2.4017343521118164, "learning_rate": 6.150166690824758e-07, "loss": 0.2763, "step": 7968 }, { "epoch": 0.3850316470986133, "grad_norm": 2.361377716064453, "learning_rate": 6.149683529013866e-07, "loss": 0.2903, "step": 7969 }, { "epoch": 0.3850799632797024, "grad_norm": 2.455559015274048, "learning_rate": 6.149200367202975e-07, "loss": 0.2939, "step": 7970 }, { "epoch": 0.3851282794607914, "grad_norm": 2.1554019451141357, "learning_rate": 6.148717205392085e-07, "loss": 0.243, "step": 7971 }, { "epoch": 0.38517659564188045, "grad_norm": 5.78952693939209, "learning_rate": 6.148234043581195e-07, "loss": 0.2143, "step": 7972 }, { "epoch": 0.38522491182296953, "grad_norm": 2.612053632736206, "learning_rate": 6.147750881770305e-07, "loss": 0.31, "step": 7973 }, { "epoch": 0.38527322800405855, "grad_norm": 11.60499382019043, "learning_rate": 6.147267719959414e-07, "loss": 0.3749, "step": 7974 }, { "epoch": 0.3853215441851476, "grad_norm": 2.890728235244751, "learning_rate": 6.146784558148524e-07, "loss": 0.2025, "step": 7975 }, { "epoch": 0.38536986036623666, "grad_norm": 2.7070980072021484, "learning_rate": 6.146301396337634e-07, "loss": 0.1914, "step": 7976 }, { "epoch": 0.3854181765473257, "grad_norm": 4.024579048156738, "learning_rate": 6.145818234526742e-07, "loss": 0.3871, "step": 7977 }, { "epoch": 0.38546649272841477, "grad_norm": 3.032095432281494, "learning_rate": 6.145335072715852e-07, "loss": 0.3766, "step": 7978 }, { "epoch": 0.3855148089095038, "grad_norm": 1.7598869800567627, "learning_rate": 6.144851910904961e-07, "loss": 0.17, "step": 7979 }, { "epoch": 0.3855631250905928, "grad_norm": 2.7620763778686523, "learning_rate": 6.144368749094071e-07, "loss": 0.3038, "step": 7980 }, { "epoch": 0.3856114412716819, "grad_norm": 3.2723381519317627, "learning_rate": 6.143885587283181e-07, "loss": 0.3052, "step": 7981 }, { "epoch": 0.3856597574527709, "grad_norm": 7.391265869140625, "learning_rate": 6.143402425472291e-07, "loss": 0.2339, "step": 7982 }, { "epoch": 0.38570807363386, "grad_norm": 4.623235702514648, "learning_rate": 6.1429192636614e-07, "loss": 0.3949, "step": 7983 }, { "epoch": 0.38575638981494903, "grad_norm": 2.675386905670166, "learning_rate": 6.14243610185051e-07, "loss": 0.2718, "step": 7984 }, { "epoch": 0.38580470599603806, "grad_norm": 3.3752567768096924, "learning_rate": 6.141952940039618e-07, "loss": 0.3598, "step": 7985 }, { "epoch": 0.38585302217712714, "grad_norm": 3.9415054321289062, "learning_rate": 6.141469778228728e-07, "loss": 0.193, "step": 7986 }, { "epoch": 0.38590133835821616, "grad_norm": 2.6497373580932617, "learning_rate": 6.140986616417838e-07, "loss": 0.1477, "step": 7987 }, { "epoch": 0.3859496545393052, "grad_norm": 1.9101988077163696, "learning_rate": 6.140503454606948e-07, "loss": 0.1248, "step": 7988 }, { "epoch": 0.38599797072039427, "grad_norm": 3.7057113647460938, "learning_rate": 6.140020292796058e-07, "loss": 0.3662, "step": 7989 }, { "epoch": 0.3860462869014833, "grad_norm": 1.9092721939086914, "learning_rate": 6.139537130985166e-07, "loss": 0.2006, "step": 7990 }, { "epoch": 0.3860946030825724, "grad_norm": 2.224515676498413, "learning_rate": 6.139053969174276e-07, "loss": 0.2153, "step": 7991 }, { "epoch": 0.3861429192636614, "grad_norm": 5.80797004699707, "learning_rate": 6.138570807363386e-07, "loss": 0.2931, "step": 7992 }, { "epoch": 0.3861912354447504, "grad_norm": 2.7848434448242188, "learning_rate": 6.138087645552496e-07, "loss": 0.3048, "step": 7993 }, { "epoch": 0.3862395516258395, "grad_norm": 2.89190673828125, "learning_rate": 6.137604483741605e-07, "loss": 0.2604, "step": 7994 }, { "epoch": 0.38628786780692853, "grad_norm": 2.6063294410705566, "learning_rate": 6.137121321930714e-07, "loss": 0.3114, "step": 7995 }, { "epoch": 0.3863361839880176, "grad_norm": 3.671736478805542, "learning_rate": 6.136638160119823e-07, "loss": 0.3596, "step": 7996 }, { "epoch": 0.38638450016910664, "grad_norm": 4.847419261932373, "learning_rate": 6.136154998308933e-07, "loss": 0.325, "step": 7997 }, { "epoch": 0.38643281635019566, "grad_norm": 2.659241199493408, "learning_rate": 6.135671836498043e-07, "loss": 0.2789, "step": 7998 }, { "epoch": 0.38648113253128474, "grad_norm": 2.4253013134002686, "learning_rate": 6.135188674687153e-07, "loss": 0.3398, "step": 7999 }, { "epoch": 0.38652944871237377, "grad_norm": 1.9046270847320557, "learning_rate": 6.134705512876262e-07, "loss": 0.2231, "step": 8000 }, { "epoch": 0.3865777648934628, "grad_norm": 2.9086689949035645, "learning_rate": 6.134222351065372e-07, "loss": 0.3327, "step": 8001 }, { "epoch": 0.3866260810745519, "grad_norm": 2.506897211074829, "learning_rate": 6.13373918925448e-07, "loss": 0.3229, "step": 8002 }, { "epoch": 0.3866743972556409, "grad_norm": 2.207519292831421, "learning_rate": 6.13325602744359e-07, "loss": 0.2709, "step": 8003 }, { "epoch": 0.38672271343673, "grad_norm": 2.3430466651916504, "learning_rate": 6.1327728656327e-07, "loss": 0.3251, "step": 8004 }, { "epoch": 0.386771029617819, "grad_norm": 3.382690191268921, "learning_rate": 6.132289703821809e-07, "loss": 0.2746, "step": 8005 }, { "epoch": 0.38681934579890803, "grad_norm": 2.143256425857544, "learning_rate": 6.131806542010919e-07, "loss": 0.2537, "step": 8006 }, { "epoch": 0.3868676619799971, "grad_norm": 2.1219286918640137, "learning_rate": 6.131323380200029e-07, "loss": 0.2631, "step": 8007 }, { "epoch": 0.38691597816108614, "grad_norm": 3.5069775581359863, "learning_rate": 6.130840218389139e-07, "loss": 0.4301, "step": 8008 }, { "epoch": 0.3869642943421752, "grad_norm": 4.698582649230957, "learning_rate": 6.130357056578248e-07, "loss": 0.2988, "step": 8009 }, { "epoch": 0.38701261052326424, "grad_norm": 2.422621965408325, "learning_rate": 6.129873894767358e-07, "loss": 0.3215, "step": 8010 }, { "epoch": 0.38706092670435327, "grad_norm": 2.5421900749206543, "learning_rate": 6.129390732956466e-07, "loss": 0.2903, "step": 8011 }, { "epoch": 0.38710924288544235, "grad_norm": 8.951262474060059, "learning_rate": 6.128907571145576e-07, "loss": 0.2528, "step": 8012 }, { "epoch": 0.3871575590665314, "grad_norm": 2.711925745010376, "learning_rate": 6.128424409334686e-07, "loss": 0.3845, "step": 8013 }, { "epoch": 0.3872058752476204, "grad_norm": 2.9535930156707764, "learning_rate": 6.127941247523796e-07, "loss": 0.4342, "step": 8014 }, { "epoch": 0.3872541914287095, "grad_norm": 3.061645984649658, "learning_rate": 6.127458085712905e-07, "loss": 0.2149, "step": 8015 }, { "epoch": 0.3873025076097985, "grad_norm": 13.578996658325195, "learning_rate": 6.126974923902014e-07, "loss": 0.346, "step": 8016 }, { "epoch": 0.3873508237908876, "grad_norm": 2.910628318786621, "learning_rate": 6.126491762091124e-07, "loss": 0.2701, "step": 8017 }, { "epoch": 0.3873991399719766, "grad_norm": 2.9204187393188477, "learning_rate": 6.126008600280234e-07, "loss": 0.2633, "step": 8018 }, { "epoch": 0.38744745615306564, "grad_norm": 4.090304851531982, "learning_rate": 6.125525438469343e-07, "loss": 0.3282, "step": 8019 }, { "epoch": 0.3874957723341547, "grad_norm": 3.2709038257598877, "learning_rate": 6.125042276658453e-07, "loss": 0.4239, "step": 8020 }, { "epoch": 0.38754408851524375, "grad_norm": 4.026391506195068, "learning_rate": 6.124559114847561e-07, "loss": 0.2962, "step": 8021 }, { "epoch": 0.3875924046963328, "grad_norm": 3.1718876361846924, "learning_rate": 6.124075953036671e-07, "loss": 0.2553, "step": 8022 }, { "epoch": 0.38764072087742185, "grad_norm": 2.5485756397247314, "learning_rate": 6.123592791225781e-07, "loss": 0.3418, "step": 8023 }, { "epoch": 0.3876890370585109, "grad_norm": 3.209624767303467, "learning_rate": 6.123109629414891e-07, "loss": 0.3132, "step": 8024 }, { "epoch": 0.38773735323959996, "grad_norm": 2.866022825241089, "learning_rate": 6.122626467604001e-07, "loss": 0.3842, "step": 8025 }, { "epoch": 0.387785669420689, "grad_norm": 3.0667924880981445, "learning_rate": 6.12214330579311e-07, "loss": 0.2572, "step": 8026 }, { "epoch": 0.387833985601778, "grad_norm": 2.920043706893921, "learning_rate": 6.12166014398222e-07, "loss": 0.3074, "step": 8027 }, { "epoch": 0.3878823017828671, "grad_norm": 4.072113990783691, "learning_rate": 6.121176982171328e-07, "loss": 0.3205, "step": 8028 }, { "epoch": 0.3879306179639561, "grad_norm": 1.7083629369735718, "learning_rate": 6.120693820360438e-07, "loss": 0.1764, "step": 8029 }, { "epoch": 0.3879789341450452, "grad_norm": 2.4889779090881348, "learning_rate": 6.120210658549548e-07, "loss": 0.2776, "step": 8030 }, { "epoch": 0.3880272503261342, "grad_norm": 2.405118227005005, "learning_rate": 6.119727496738657e-07, "loss": 0.251, "step": 8031 }, { "epoch": 0.38807556650722325, "grad_norm": 3.8725383281707764, "learning_rate": 6.119244334927767e-07, "loss": 0.3132, "step": 8032 }, { "epoch": 0.3881238826883123, "grad_norm": 1.6044217348098755, "learning_rate": 6.118761173116877e-07, "loss": 0.218, "step": 8033 }, { "epoch": 0.38817219886940135, "grad_norm": 110.6502914428711, "learning_rate": 6.118278011305986e-07, "loss": 0.2704, "step": 8034 }, { "epoch": 0.38822051505049043, "grad_norm": 1.0990160703659058, "learning_rate": 6.117794849495096e-07, "loss": 0.1232, "step": 8035 }, { "epoch": 0.38826883123157946, "grad_norm": 3.4742109775543213, "learning_rate": 6.117311687684205e-07, "loss": 0.3419, "step": 8036 }, { "epoch": 0.3883171474126685, "grad_norm": 2.4235923290252686, "learning_rate": 6.116828525873314e-07, "loss": 0.2822, "step": 8037 }, { "epoch": 0.38836546359375756, "grad_norm": 2.92215895652771, "learning_rate": 6.116345364062424e-07, "loss": 0.3423, "step": 8038 }, { "epoch": 0.3884137797748466, "grad_norm": 2.034390687942505, "learning_rate": 6.115862202251534e-07, "loss": 0.2154, "step": 8039 }, { "epoch": 0.3884620959559356, "grad_norm": 2.342379093170166, "learning_rate": 6.115379040440644e-07, "loss": 0.2466, "step": 8040 }, { "epoch": 0.3885104121370247, "grad_norm": 37.673309326171875, "learning_rate": 6.114895878629753e-07, "loss": 0.433, "step": 8041 }, { "epoch": 0.3885587283181137, "grad_norm": 36.500614166259766, "learning_rate": 6.114412716818862e-07, "loss": 0.3233, "step": 8042 }, { "epoch": 0.3886070444992028, "grad_norm": 5.878522872924805, "learning_rate": 6.113929555007972e-07, "loss": 0.451, "step": 8043 }, { "epoch": 0.38865536068029183, "grad_norm": 2.5038740634918213, "learning_rate": 6.113446393197082e-07, "loss": 0.317, "step": 8044 }, { "epoch": 0.38870367686138085, "grad_norm": 3.930948495864868, "learning_rate": 6.112963231386191e-07, "loss": 0.2486, "step": 8045 }, { "epoch": 0.38875199304246993, "grad_norm": 2.1289467811584473, "learning_rate": 6.112480069575301e-07, "loss": 0.251, "step": 8046 }, { "epoch": 0.38880030922355896, "grad_norm": 3.889235019683838, "learning_rate": 6.111996907764409e-07, "loss": 0.3524, "step": 8047 }, { "epoch": 0.38884862540464804, "grad_norm": 2.6009726524353027, "learning_rate": 6.111513745953519e-07, "loss": 0.2833, "step": 8048 }, { "epoch": 0.38889694158573707, "grad_norm": 2.698451042175293, "learning_rate": 6.111030584142629e-07, "loss": 0.3872, "step": 8049 }, { "epoch": 0.3889452577668261, "grad_norm": 1.630339503288269, "learning_rate": 6.110547422331739e-07, "loss": 0.1699, "step": 8050 }, { "epoch": 0.38899357394791517, "grad_norm": 2.628741502761841, "learning_rate": 6.110064260520849e-07, "loss": 0.2745, "step": 8051 }, { "epoch": 0.3890418901290042, "grad_norm": 2.483696460723877, "learning_rate": 6.109581098709958e-07, "loss": 0.336, "step": 8052 }, { "epoch": 0.3890902063100932, "grad_norm": 1.8361560106277466, "learning_rate": 6.109097936899067e-07, "loss": 0.222, "step": 8053 }, { "epoch": 0.3891385224911823, "grad_norm": 2.508436679840088, "learning_rate": 6.108614775088176e-07, "loss": 0.2901, "step": 8054 }, { "epoch": 0.38918683867227133, "grad_norm": 2.9677700996398926, "learning_rate": 6.108131613277286e-07, "loss": 0.3752, "step": 8055 }, { "epoch": 0.3892351548533604, "grad_norm": 2.6779961585998535, "learning_rate": 6.107648451466396e-07, "loss": 0.218, "step": 8056 }, { "epoch": 0.38928347103444944, "grad_norm": 3.0355417728424072, "learning_rate": 6.107165289655505e-07, "loss": 0.3157, "step": 8057 }, { "epoch": 0.38933178721553846, "grad_norm": 11.355367660522461, "learning_rate": 6.106682127844615e-07, "loss": 0.3668, "step": 8058 }, { "epoch": 0.38938010339662754, "grad_norm": 2.5924618244171143, "learning_rate": 6.106198966033725e-07, "loss": 0.3177, "step": 8059 }, { "epoch": 0.38942841957771657, "grad_norm": 2.653371810913086, "learning_rate": 6.105715804222834e-07, "loss": 0.2225, "step": 8060 }, { "epoch": 0.38947673575880565, "grad_norm": 2.650688648223877, "learning_rate": 6.105232642411944e-07, "loss": 0.3454, "step": 8061 }, { "epoch": 0.3895250519398947, "grad_norm": 5.4156293869018555, "learning_rate": 6.104749480601053e-07, "loss": 0.352, "step": 8062 }, { "epoch": 0.3895733681209837, "grad_norm": 3.227952241897583, "learning_rate": 6.104266318790162e-07, "loss": 0.372, "step": 8063 }, { "epoch": 0.3896216843020728, "grad_norm": 2.040980100631714, "learning_rate": 6.103783156979272e-07, "loss": 0.2022, "step": 8064 }, { "epoch": 0.3896700004831618, "grad_norm": 2.779730796813965, "learning_rate": 6.103299995168382e-07, "loss": 0.328, "step": 8065 }, { "epoch": 0.38971831666425083, "grad_norm": 3.337454080581665, "learning_rate": 6.102816833357491e-07, "loss": 0.2928, "step": 8066 }, { "epoch": 0.3897666328453399, "grad_norm": 2.1910438537597656, "learning_rate": 6.102333671546601e-07, "loss": 0.2194, "step": 8067 }, { "epoch": 0.38981494902642894, "grad_norm": 2.7848381996154785, "learning_rate": 6.10185050973571e-07, "loss": 0.3158, "step": 8068 }, { "epoch": 0.389863265207518, "grad_norm": 2.876098871231079, "learning_rate": 6.10136734792482e-07, "loss": 0.1999, "step": 8069 }, { "epoch": 0.38991158138860704, "grad_norm": 3.276040554046631, "learning_rate": 6.100884186113929e-07, "loss": 0.3954, "step": 8070 }, { "epoch": 0.38995989756969607, "grad_norm": 4.087518692016602, "learning_rate": 6.100401024303039e-07, "loss": 0.2575, "step": 8071 }, { "epoch": 0.39000821375078515, "grad_norm": 2.3131697177886963, "learning_rate": 6.099917862492149e-07, "loss": 0.1937, "step": 8072 }, { "epoch": 0.3900565299318742, "grad_norm": 2.268101453781128, "learning_rate": 6.099434700681257e-07, "loss": 0.2415, "step": 8073 }, { "epoch": 0.39010484611296325, "grad_norm": 2.7285361289978027, "learning_rate": 6.098951538870367e-07, "loss": 0.2754, "step": 8074 }, { "epoch": 0.3901531622940523, "grad_norm": 2.5087826251983643, "learning_rate": 6.098468377059477e-07, "loss": 0.2449, "step": 8075 }, { "epoch": 0.3902014784751413, "grad_norm": 3.2891597747802734, "learning_rate": 6.097985215248587e-07, "loss": 0.3929, "step": 8076 }, { "epoch": 0.3902497946562304, "grad_norm": 3.3644089698791504, "learning_rate": 6.097502053437697e-07, "loss": 0.3447, "step": 8077 }, { "epoch": 0.3902981108373194, "grad_norm": 10.868871688842773, "learning_rate": 6.097018891626805e-07, "loss": 0.2674, "step": 8078 }, { "epoch": 0.39034642701840844, "grad_norm": 3.299731969833374, "learning_rate": 6.096535729815914e-07, "loss": 0.2357, "step": 8079 }, { "epoch": 0.3903947431994975, "grad_norm": 5.148679733276367, "learning_rate": 6.096052568005024e-07, "loss": 0.3644, "step": 8080 }, { "epoch": 0.39044305938058654, "grad_norm": 2.6402857303619385, "learning_rate": 6.095569406194134e-07, "loss": 0.3365, "step": 8081 }, { "epoch": 0.3904913755616756, "grad_norm": 2.9420223236083984, "learning_rate": 6.095086244383244e-07, "loss": 0.2913, "step": 8082 }, { "epoch": 0.39053969174276465, "grad_norm": 3.2239444255828857, "learning_rate": 6.094603082572353e-07, "loss": 0.3671, "step": 8083 }, { "epoch": 0.3905880079238537, "grad_norm": 4.007575511932373, "learning_rate": 6.094119920761463e-07, "loss": 0.285, "step": 8084 }, { "epoch": 0.39063632410494276, "grad_norm": 2.0982251167297363, "learning_rate": 6.093636758950573e-07, "loss": 0.2677, "step": 8085 }, { "epoch": 0.3906846402860318, "grad_norm": 3.767857074737549, "learning_rate": 6.093153597139682e-07, "loss": 0.3373, "step": 8086 }, { "epoch": 0.39073295646712086, "grad_norm": 2.981391668319702, "learning_rate": 6.092670435328791e-07, "loss": 0.3638, "step": 8087 }, { "epoch": 0.3907812726482099, "grad_norm": 2.6358556747436523, "learning_rate": 6.092187273517901e-07, "loss": 0.3479, "step": 8088 }, { "epoch": 0.3908295888292989, "grad_norm": 3.038862943649292, "learning_rate": 6.09170411170701e-07, "loss": 0.3793, "step": 8089 }, { "epoch": 0.390877905010388, "grad_norm": 5.464182376861572, "learning_rate": 6.09122094989612e-07, "loss": 0.2835, "step": 8090 }, { "epoch": 0.390926221191477, "grad_norm": 3.033998489379883, "learning_rate": 6.09073778808523e-07, "loss": 0.4282, "step": 8091 }, { "epoch": 0.3909745373725661, "grad_norm": 3.3888916969299316, "learning_rate": 6.090254626274339e-07, "loss": 0.3692, "step": 8092 }, { "epoch": 0.3910228535536551, "grad_norm": 2.143803119659424, "learning_rate": 6.089771464463449e-07, "loss": 0.2377, "step": 8093 }, { "epoch": 0.39107116973474415, "grad_norm": 5.681087017059326, "learning_rate": 6.089288302652558e-07, "loss": 0.4964, "step": 8094 }, { "epoch": 0.39111948591583323, "grad_norm": 2.695466995239258, "learning_rate": 6.088805140841667e-07, "loss": 0.3096, "step": 8095 }, { "epoch": 0.39116780209692226, "grad_norm": 2.9145963191986084, "learning_rate": 6.088321979030777e-07, "loss": 0.3388, "step": 8096 }, { "epoch": 0.3912161182780113, "grad_norm": 2.2805733680725098, "learning_rate": 6.087838817219887e-07, "loss": 0.2998, "step": 8097 }, { "epoch": 0.39126443445910036, "grad_norm": 3.5644032955169678, "learning_rate": 6.087355655408996e-07, "loss": 0.3412, "step": 8098 }, { "epoch": 0.3913127506401894, "grad_norm": 2.6437861919403076, "learning_rate": 6.086872493598105e-07, "loss": 0.2631, "step": 8099 }, { "epoch": 0.39136106682127847, "grad_norm": 3.049672842025757, "learning_rate": 6.086389331787215e-07, "loss": 0.34, "step": 8100 }, { "epoch": 0.3914093830023675, "grad_norm": 5.268834114074707, "learning_rate": 6.085906169976325e-07, "loss": 0.2363, "step": 8101 }, { "epoch": 0.3914576991834565, "grad_norm": 11.000006675720215, "learning_rate": 6.085423008165435e-07, "loss": 0.5673, "step": 8102 }, { "epoch": 0.3915060153645456, "grad_norm": 1.9679774045944214, "learning_rate": 6.084939846354545e-07, "loss": 0.1974, "step": 8103 }, { "epoch": 0.3915543315456346, "grad_norm": 4.093286991119385, "learning_rate": 6.084456684543653e-07, "loss": 0.3212, "step": 8104 }, { "epoch": 0.3916026477267237, "grad_norm": 2.743610143661499, "learning_rate": 6.083973522732762e-07, "loss": 0.2564, "step": 8105 }, { "epoch": 0.39165096390781273, "grad_norm": 2.2614223957061768, "learning_rate": 6.083490360921872e-07, "loss": 0.2434, "step": 8106 }, { "epoch": 0.39169928008890176, "grad_norm": 3.569514751434326, "learning_rate": 6.083007199110982e-07, "loss": 0.23, "step": 8107 }, { "epoch": 0.39174759626999084, "grad_norm": 3.6151928901672363, "learning_rate": 6.082524037300092e-07, "loss": 0.2251, "step": 8108 }, { "epoch": 0.39179591245107986, "grad_norm": 2.5693774223327637, "learning_rate": 6.082040875489201e-07, "loss": 0.26, "step": 8109 }, { "epoch": 0.3918442286321689, "grad_norm": 1.838145136833191, "learning_rate": 6.081557713678311e-07, "loss": 0.1996, "step": 8110 }, { "epoch": 0.39189254481325797, "grad_norm": 4.736192226409912, "learning_rate": 6.08107455186742e-07, "loss": 0.3788, "step": 8111 }, { "epoch": 0.391940860994347, "grad_norm": 2.371056318283081, "learning_rate": 6.080591390056529e-07, "loss": 0.3013, "step": 8112 }, { "epoch": 0.3919891771754361, "grad_norm": 3.2885303497314453, "learning_rate": 6.080108228245639e-07, "loss": 0.2652, "step": 8113 }, { "epoch": 0.3920374933565251, "grad_norm": 2.6320083141326904, "learning_rate": 6.079625066434749e-07, "loss": 0.2096, "step": 8114 }, { "epoch": 0.3920858095376141, "grad_norm": 12.52080249786377, "learning_rate": 6.079141904623858e-07, "loss": 0.3353, "step": 8115 }, { "epoch": 0.3921341257187032, "grad_norm": 3.6777894496917725, "learning_rate": 6.078658742812968e-07, "loss": 0.3264, "step": 8116 }, { "epoch": 0.39218244189979223, "grad_norm": 3.243203639984131, "learning_rate": 6.078175581002078e-07, "loss": 0.2368, "step": 8117 }, { "epoch": 0.3922307580808813, "grad_norm": 2.688218832015991, "learning_rate": 6.077692419191187e-07, "loss": 0.2965, "step": 8118 }, { "epoch": 0.39227907426197034, "grad_norm": 2.1443121433258057, "learning_rate": 6.077209257380297e-07, "loss": 0.1903, "step": 8119 }, { "epoch": 0.39232739044305937, "grad_norm": 3.322556734085083, "learning_rate": 6.076726095569405e-07, "loss": 0.3968, "step": 8120 }, { "epoch": 0.39237570662414845, "grad_norm": 3.139798641204834, "learning_rate": 6.076242933758515e-07, "loss": 0.3817, "step": 8121 }, { "epoch": 0.39242402280523747, "grad_norm": 3.433927297592163, "learning_rate": 6.075759771947625e-07, "loss": 0.3876, "step": 8122 }, { "epoch": 0.3924723389863265, "grad_norm": 2.7942655086517334, "learning_rate": 6.075276610136735e-07, "loss": 0.3161, "step": 8123 }, { "epoch": 0.3925206551674156, "grad_norm": 2.465876579284668, "learning_rate": 6.074793448325844e-07, "loss": 0.3578, "step": 8124 }, { "epoch": 0.3925689713485046, "grad_norm": 2.7917640209198, "learning_rate": 6.074310286514953e-07, "loss": 0.3069, "step": 8125 }, { "epoch": 0.3926172875295937, "grad_norm": 2.3845019340515137, "learning_rate": 6.073827124704063e-07, "loss": 0.2791, "step": 8126 }, { "epoch": 0.3926656037106827, "grad_norm": 2.4793314933776855, "learning_rate": 6.073343962893173e-07, "loss": 0.2804, "step": 8127 }, { "epoch": 0.39271391989177173, "grad_norm": 3.1565191745758057, "learning_rate": 6.072860801082283e-07, "loss": 0.3757, "step": 8128 }, { "epoch": 0.3927622360728608, "grad_norm": 14.582358360290527, "learning_rate": 6.072377639271393e-07, "loss": 0.2254, "step": 8129 }, { "epoch": 0.39281055225394984, "grad_norm": 2.805562734603882, "learning_rate": 6.0718944774605e-07, "loss": 0.318, "step": 8130 }, { "epoch": 0.3928588684350389, "grad_norm": 3.920825958251953, "learning_rate": 6.07141131564961e-07, "loss": 0.2445, "step": 8131 }, { "epoch": 0.39290718461612795, "grad_norm": 3.006401538848877, "learning_rate": 6.07092815383872e-07, "loss": 0.2868, "step": 8132 }, { "epoch": 0.39295550079721697, "grad_norm": 4.877512454986572, "learning_rate": 6.07044499202783e-07, "loss": 0.3941, "step": 8133 }, { "epoch": 0.39300381697830605, "grad_norm": 3.0962276458740234, "learning_rate": 6.06996183021694e-07, "loss": 0.2648, "step": 8134 }, { "epoch": 0.3930521331593951, "grad_norm": 2.183382034301758, "learning_rate": 6.069478668406049e-07, "loss": 0.2288, "step": 8135 }, { "epoch": 0.3931004493404841, "grad_norm": 3.478630781173706, "learning_rate": 6.068995506595159e-07, "loss": 0.2691, "step": 8136 }, { "epoch": 0.3931487655215732, "grad_norm": 5.828724384307861, "learning_rate": 6.068512344784267e-07, "loss": 0.2372, "step": 8137 }, { "epoch": 0.3931970817026622, "grad_norm": 7.893082618713379, "learning_rate": 6.068029182973377e-07, "loss": 0.3598, "step": 8138 }, { "epoch": 0.3932453978837513, "grad_norm": 4.8085408210754395, "learning_rate": 6.067546021162487e-07, "loss": 0.4027, "step": 8139 }, { "epoch": 0.3932937140648403, "grad_norm": 2.8111698627471924, "learning_rate": 6.067062859351597e-07, "loss": 0.3145, "step": 8140 }, { "epoch": 0.39334203024592934, "grad_norm": 2.5347976684570312, "learning_rate": 6.066579697540706e-07, "loss": 0.2621, "step": 8141 }, { "epoch": 0.3933903464270184, "grad_norm": 2.8961334228515625, "learning_rate": 6.066096535729816e-07, "loss": 0.2642, "step": 8142 }, { "epoch": 0.39343866260810745, "grad_norm": 2.654336452484131, "learning_rate": 6.065613373918925e-07, "loss": 0.3576, "step": 8143 }, { "epoch": 0.39348697878919653, "grad_norm": 3.3761966228485107, "learning_rate": 6.065130212108035e-07, "loss": 0.3242, "step": 8144 }, { "epoch": 0.39353529497028555, "grad_norm": 2.4110326766967773, "learning_rate": 6.064647050297145e-07, "loss": 0.3531, "step": 8145 }, { "epoch": 0.3935836111513746, "grad_norm": 5.970695972442627, "learning_rate": 6.064163888486253e-07, "loss": 0.386, "step": 8146 }, { "epoch": 0.39363192733246366, "grad_norm": 4.446112155914307, "learning_rate": 6.063680726675363e-07, "loss": 0.4784, "step": 8147 }, { "epoch": 0.3936802435135527, "grad_norm": 2.3859596252441406, "learning_rate": 6.063197564864473e-07, "loss": 0.2799, "step": 8148 }, { "epoch": 0.3937285596946417, "grad_norm": 2.7595322132110596, "learning_rate": 6.062714403053583e-07, "loss": 0.4449, "step": 8149 }, { "epoch": 0.3937768758757308, "grad_norm": 2.6174066066741943, "learning_rate": 6.062231241242692e-07, "loss": 0.3349, "step": 8150 }, { "epoch": 0.3938251920568198, "grad_norm": 3.220430612564087, "learning_rate": 6.061748079431801e-07, "loss": 0.2957, "step": 8151 }, { "epoch": 0.3938735082379089, "grad_norm": 2.682990550994873, "learning_rate": 6.061264917620911e-07, "loss": 0.3107, "step": 8152 }, { "epoch": 0.3939218244189979, "grad_norm": 2.682218074798584, "learning_rate": 6.060781755810021e-07, "loss": 0.3985, "step": 8153 }, { "epoch": 0.39397014060008695, "grad_norm": 2.868870735168457, "learning_rate": 6.06029859399913e-07, "loss": 0.3744, "step": 8154 }, { "epoch": 0.39401845678117603, "grad_norm": 2.811225652694702, "learning_rate": 6.05981543218824e-07, "loss": 0.3044, "step": 8155 }, { "epoch": 0.39406677296226506, "grad_norm": 3.5441243648529053, "learning_rate": 6.059332270377348e-07, "loss": 0.3752, "step": 8156 }, { "epoch": 0.39411508914335414, "grad_norm": 2.42596697807312, "learning_rate": 6.058849108566458e-07, "loss": 0.3548, "step": 8157 }, { "epoch": 0.39416340532444316, "grad_norm": 2.638951539993286, "learning_rate": 6.058365946755568e-07, "loss": 0.3342, "step": 8158 }, { "epoch": 0.3942117215055322, "grad_norm": 1.9732348918914795, "learning_rate": 6.057882784944678e-07, "loss": 0.2448, "step": 8159 }, { "epoch": 0.39426003768662127, "grad_norm": 2.2686052322387695, "learning_rate": 6.057399623133788e-07, "loss": 0.1893, "step": 8160 }, { "epoch": 0.3943083538677103, "grad_norm": 2.765049934387207, "learning_rate": 6.056916461322897e-07, "loss": 0.3242, "step": 8161 }, { "epoch": 0.3943566700487993, "grad_norm": 1.8492894172668457, "learning_rate": 6.056433299512006e-07, "loss": 0.2244, "step": 8162 }, { "epoch": 0.3944049862298884, "grad_norm": 3.2592153549194336, "learning_rate": 6.055950137701115e-07, "loss": 0.2156, "step": 8163 }, { "epoch": 0.3944533024109774, "grad_norm": 3.635697841644287, "learning_rate": 6.055466975890225e-07, "loss": 0.3461, "step": 8164 }, { "epoch": 0.3945016185920665, "grad_norm": 6.867447376251221, "learning_rate": 6.054983814079335e-07, "loss": 0.2538, "step": 8165 }, { "epoch": 0.39454993477315553, "grad_norm": 2.6645469665527344, "learning_rate": 6.054500652268445e-07, "loss": 0.3908, "step": 8166 }, { "epoch": 0.39459825095424456, "grad_norm": 6.165863513946533, "learning_rate": 6.054017490457554e-07, "loss": 0.342, "step": 8167 }, { "epoch": 0.39464656713533364, "grad_norm": 2.0845437049865723, "learning_rate": 6.053534328646664e-07, "loss": 0.2381, "step": 8168 }, { "epoch": 0.39469488331642266, "grad_norm": 3.8193421363830566, "learning_rate": 6.053051166835773e-07, "loss": 0.2904, "step": 8169 }, { "epoch": 0.39474319949751174, "grad_norm": 3.1768627166748047, "learning_rate": 6.052568005024883e-07, "loss": 0.307, "step": 8170 }, { "epoch": 0.39479151567860077, "grad_norm": 1.4835309982299805, "learning_rate": 6.052084843213993e-07, "loss": 0.1606, "step": 8171 }, { "epoch": 0.3948398318596898, "grad_norm": 2.7296626567840576, "learning_rate": 6.051601681403101e-07, "loss": 0.2847, "step": 8172 }, { "epoch": 0.3948881480407789, "grad_norm": 2.558135509490967, "learning_rate": 6.051118519592211e-07, "loss": 0.2837, "step": 8173 }, { "epoch": 0.3949364642218679, "grad_norm": 2.2719223499298096, "learning_rate": 6.050635357781321e-07, "loss": 0.2837, "step": 8174 }, { "epoch": 0.3949847804029569, "grad_norm": 11.187175750732422, "learning_rate": 6.05015219597043e-07, "loss": 0.5273, "step": 8175 }, { "epoch": 0.395033096584046, "grad_norm": 3.5624547004699707, "learning_rate": 6.04966903415954e-07, "loss": 0.2477, "step": 8176 }, { "epoch": 0.39508141276513503, "grad_norm": 2.1468541622161865, "learning_rate": 6.049185872348649e-07, "loss": 0.2356, "step": 8177 }, { "epoch": 0.3951297289462241, "grad_norm": 33.36189270019531, "learning_rate": 6.048702710537759e-07, "loss": 0.1775, "step": 8178 }, { "epoch": 0.39517804512731314, "grad_norm": 2.5744924545288086, "learning_rate": 6.048219548726869e-07, "loss": 0.2567, "step": 8179 }, { "epoch": 0.39522636130840216, "grad_norm": 2.0487887859344482, "learning_rate": 6.047736386915978e-07, "loss": 0.2481, "step": 8180 }, { "epoch": 0.39527467748949124, "grad_norm": 4.217556953430176, "learning_rate": 6.047253225105088e-07, "loss": 0.5149, "step": 8181 }, { "epoch": 0.39532299367058027, "grad_norm": 3.766066551208496, "learning_rate": 6.046770063294196e-07, "loss": 0.2555, "step": 8182 }, { "epoch": 0.39537130985166935, "grad_norm": 1.8436017036437988, "learning_rate": 6.046286901483306e-07, "loss": 0.239, "step": 8183 }, { "epoch": 0.3954196260327584, "grad_norm": 3.186204195022583, "learning_rate": 6.045803739672416e-07, "loss": 0.3119, "step": 8184 }, { "epoch": 0.3954679422138474, "grad_norm": 2.9342005252838135, "learning_rate": 6.045320577861526e-07, "loss": 0.3811, "step": 8185 }, { "epoch": 0.3955162583949365, "grad_norm": 7.474488735198975, "learning_rate": 6.044837416050636e-07, "loss": 0.2751, "step": 8186 }, { "epoch": 0.3955645745760255, "grad_norm": 2.1447954177856445, "learning_rate": 6.044354254239745e-07, "loss": 0.2297, "step": 8187 }, { "epoch": 0.39561289075711453, "grad_norm": 4.118062973022461, "learning_rate": 6.043871092428853e-07, "loss": 0.3952, "step": 8188 }, { "epoch": 0.3956612069382036, "grad_norm": 3.7559878826141357, "learning_rate": 6.043387930617963e-07, "loss": 0.3622, "step": 8189 }, { "epoch": 0.39570952311929264, "grad_norm": 3.780409574508667, "learning_rate": 6.042904768807073e-07, "loss": 0.3256, "step": 8190 }, { "epoch": 0.3957578393003817, "grad_norm": 3.2472517490386963, "learning_rate": 6.042421606996183e-07, "loss": 0.3131, "step": 8191 }, { "epoch": 0.39580615548147075, "grad_norm": 3.8462226390838623, "learning_rate": 6.041938445185293e-07, "loss": 0.3412, "step": 8192 }, { "epoch": 0.39585447166255977, "grad_norm": 4.31989049911499, "learning_rate": 6.041455283374402e-07, "loss": 0.3556, "step": 8193 }, { "epoch": 0.39590278784364885, "grad_norm": 2.4745523929595947, "learning_rate": 6.040972121563511e-07, "loss": 0.3439, "step": 8194 }, { "epoch": 0.3959511040247379, "grad_norm": 3.475884437561035, "learning_rate": 6.040488959752621e-07, "loss": 0.422, "step": 8195 }, { "epoch": 0.39599942020582696, "grad_norm": 2.927426815032959, "learning_rate": 6.040005797941731e-07, "loss": 0.3281, "step": 8196 }, { "epoch": 0.396047736386916, "grad_norm": 2.5477652549743652, "learning_rate": 6.03952263613084e-07, "loss": 0.335, "step": 8197 }, { "epoch": 0.396096052568005, "grad_norm": 9.282820701599121, "learning_rate": 6.039039474319949e-07, "loss": 0.2708, "step": 8198 }, { "epoch": 0.3961443687490941, "grad_norm": 4.972078800201416, "learning_rate": 6.038556312509059e-07, "loss": 0.2818, "step": 8199 }, { "epoch": 0.3961926849301831, "grad_norm": 3.2070770263671875, "learning_rate": 6.038073150698169e-07, "loss": 0.291, "step": 8200 }, { "epoch": 0.39624100111127214, "grad_norm": 5.326044082641602, "learning_rate": 6.037589988887278e-07, "loss": 0.4274, "step": 8201 }, { "epoch": 0.3962893172923612, "grad_norm": 8.293339729309082, "learning_rate": 6.037106827076388e-07, "loss": 0.3007, "step": 8202 }, { "epoch": 0.39633763347345025, "grad_norm": 2.2249181270599365, "learning_rate": 6.036623665265497e-07, "loss": 0.3282, "step": 8203 }, { "epoch": 0.3963859496545393, "grad_norm": 3.2755372524261475, "learning_rate": 6.036140503454607e-07, "loss": 0.3344, "step": 8204 }, { "epoch": 0.39643426583562835, "grad_norm": 3.712867021560669, "learning_rate": 6.035657341643716e-07, "loss": 0.2282, "step": 8205 }, { "epoch": 0.3964825820167174, "grad_norm": 2.1892752647399902, "learning_rate": 6.035174179832826e-07, "loss": 0.2571, "step": 8206 }, { "epoch": 0.39653089819780646, "grad_norm": 2.966574192047119, "learning_rate": 6.034691018021935e-07, "loss": 0.2694, "step": 8207 }, { "epoch": 0.3965792143788955, "grad_norm": 2.355626344680786, "learning_rate": 6.034207856211044e-07, "loss": 0.2846, "step": 8208 }, { "epoch": 0.39662753055998456, "grad_norm": 2.7361013889312744, "learning_rate": 6.033724694400154e-07, "loss": 0.3378, "step": 8209 }, { "epoch": 0.3966758467410736, "grad_norm": 2.605639696121216, "learning_rate": 6.033241532589264e-07, "loss": 0.3584, "step": 8210 }, { "epoch": 0.3967241629221626, "grad_norm": 3.7163071632385254, "learning_rate": 6.032758370778374e-07, "loss": 0.333, "step": 8211 }, { "epoch": 0.3967724791032517, "grad_norm": 2.2425923347473145, "learning_rate": 6.032275208967484e-07, "loss": 0.2654, "step": 8212 }, { "epoch": 0.3968207952843407, "grad_norm": 2.3229758739471436, "learning_rate": 6.031792047156591e-07, "loss": 0.1315, "step": 8213 }, { "epoch": 0.39686911146542975, "grad_norm": 3.5826094150543213, "learning_rate": 6.031308885345701e-07, "loss": 0.376, "step": 8214 }, { "epoch": 0.39691742764651883, "grad_norm": 2.290785312652588, "learning_rate": 6.030825723534811e-07, "loss": 0.2911, "step": 8215 }, { "epoch": 0.39696574382760785, "grad_norm": 4.621437072753906, "learning_rate": 6.030342561723921e-07, "loss": 0.4159, "step": 8216 }, { "epoch": 0.39701406000869693, "grad_norm": 3.296788215637207, "learning_rate": 6.029859399913031e-07, "loss": 0.3093, "step": 8217 }, { "epoch": 0.39706237618978596, "grad_norm": 1.910408854484558, "learning_rate": 6.029376238102141e-07, "loss": 0.2062, "step": 8218 }, { "epoch": 0.397110692370875, "grad_norm": 2.7290706634521484, "learning_rate": 6.02889307629125e-07, "loss": 0.3055, "step": 8219 }, { "epoch": 0.39715900855196407, "grad_norm": 1.8558571338653564, "learning_rate": 6.028409914480359e-07, "loss": 0.2005, "step": 8220 }, { "epoch": 0.3972073247330531, "grad_norm": 3.378310203552246, "learning_rate": 6.027926752669469e-07, "loss": 0.3165, "step": 8221 }, { "epoch": 0.39725564091414217, "grad_norm": 3.9996793270111084, "learning_rate": 6.027443590858578e-07, "loss": 0.3885, "step": 8222 }, { "epoch": 0.3973039570952312, "grad_norm": 6.477331161499023, "learning_rate": 6.026960429047688e-07, "loss": 0.3097, "step": 8223 }, { "epoch": 0.3973522732763202, "grad_norm": 3.687952756881714, "learning_rate": 6.026477267236797e-07, "loss": 0.2425, "step": 8224 }, { "epoch": 0.3974005894574093, "grad_norm": 6.153383255004883, "learning_rate": 6.025994105425907e-07, "loss": 0.3596, "step": 8225 }, { "epoch": 0.39744890563849833, "grad_norm": 2.9324839115142822, "learning_rate": 6.025510943615016e-07, "loss": 0.2777, "step": 8226 }, { "epoch": 0.39749722181958735, "grad_norm": 5.493271350860596, "learning_rate": 6.025027781804126e-07, "loss": 0.3199, "step": 8227 }, { "epoch": 0.39754553800067643, "grad_norm": 2.6780169010162354, "learning_rate": 6.024544619993236e-07, "loss": 0.2796, "step": 8228 }, { "epoch": 0.39759385418176546, "grad_norm": 2.642962694168091, "learning_rate": 6.024061458182345e-07, "loss": 0.2483, "step": 8229 }, { "epoch": 0.39764217036285454, "grad_norm": 2.681110143661499, "learning_rate": 6.023578296371455e-07, "loss": 0.3194, "step": 8230 }, { "epoch": 0.39769048654394357, "grad_norm": 3.017827033996582, "learning_rate": 6.023095134560564e-07, "loss": 0.3296, "step": 8231 }, { "epoch": 0.3977388027250326, "grad_norm": 3.539940357208252, "learning_rate": 6.022611972749674e-07, "loss": 0.4436, "step": 8232 }, { "epoch": 0.3977871189061217, "grad_norm": 14.078707695007324, "learning_rate": 6.022128810938783e-07, "loss": 0.3572, "step": 8233 }, { "epoch": 0.3978354350872107, "grad_norm": 2.279545545578003, "learning_rate": 6.021645649127892e-07, "loss": 0.2721, "step": 8234 }, { "epoch": 0.3978837512682998, "grad_norm": 2.6701955795288086, "learning_rate": 6.021162487317002e-07, "loss": 0.3119, "step": 8235 }, { "epoch": 0.3979320674493888, "grad_norm": 2.140887498855591, "learning_rate": 6.020679325506112e-07, "loss": 0.1918, "step": 8236 }, { "epoch": 0.39798038363047783, "grad_norm": 2.7583823204040527, "learning_rate": 6.020196163695222e-07, "loss": 0.3866, "step": 8237 }, { "epoch": 0.3980286998115669, "grad_norm": 3.2275938987731934, "learning_rate": 6.019713001884332e-07, "loss": 0.261, "step": 8238 }, { "epoch": 0.39807701599265594, "grad_norm": 7.887640953063965, "learning_rate": 6.019229840073439e-07, "loss": 0.3237, "step": 8239 }, { "epoch": 0.39812533217374496, "grad_norm": 2.1603586673736572, "learning_rate": 6.018746678262549e-07, "loss": 0.2566, "step": 8240 }, { "epoch": 0.39817364835483404, "grad_norm": 2.1982085704803467, "learning_rate": 6.018263516451659e-07, "loss": 0.3213, "step": 8241 }, { "epoch": 0.39822196453592307, "grad_norm": 2.7323145866394043, "learning_rate": 6.017780354640769e-07, "loss": 0.3346, "step": 8242 }, { "epoch": 0.39827028071701215, "grad_norm": 12.520485877990723, "learning_rate": 6.017297192829879e-07, "loss": 0.3168, "step": 8243 }, { "epoch": 0.3983185968981012, "grad_norm": 3.73064923286438, "learning_rate": 6.016814031018988e-07, "loss": 0.2422, "step": 8244 }, { "epoch": 0.3983669130791902, "grad_norm": 5.114871025085449, "learning_rate": 6.016330869208097e-07, "loss": 0.376, "step": 8245 }, { "epoch": 0.3984152292602793, "grad_norm": 2.6654155254364014, "learning_rate": 6.015847707397207e-07, "loss": 0.3661, "step": 8246 }, { "epoch": 0.3984635454413683, "grad_norm": 2.8255367279052734, "learning_rate": 6.015364545586316e-07, "loss": 0.3143, "step": 8247 }, { "epoch": 0.3985118616224574, "grad_norm": 2.780416965484619, "learning_rate": 6.014881383775426e-07, "loss": 0.2774, "step": 8248 }, { "epoch": 0.3985601778035464, "grad_norm": 2.226884603500366, "learning_rate": 6.014398221964536e-07, "loss": 0.1704, "step": 8249 }, { "epoch": 0.39860849398463544, "grad_norm": 2.2840709686279297, "learning_rate": 6.013915060153645e-07, "loss": 0.3178, "step": 8250 }, { "epoch": 0.3986568101657245, "grad_norm": 2.809800863265991, "learning_rate": 6.013431898342755e-07, "loss": 0.3578, "step": 8251 }, { "epoch": 0.39870512634681354, "grad_norm": 1.873731017112732, "learning_rate": 6.012948736531864e-07, "loss": 0.2158, "step": 8252 }, { "epoch": 0.39875344252790257, "grad_norm": 3.9346935749053955, "learning_rate": 6.012465574720974e-07, "loss": 0.2883, "step": 8253 }, { "epoch": 0.39880175870899165, "grad_norm": 3.2051007747650146, "learning_rate": 6.011982412910084e-07, "loss": 0.3033, "step": 8254 }, { "epoch": 0.3988500748900807, "grad_norm": 2.5210676193237305, "learning_rate": 6.011499251099193e-07, "loss": 0.2957, "step": 8255 }, { "epoch": 0.39889839107116976, "grad_norm": 2.54516863822937, "learning_rate": 6.011016089288302e-07, "loss": 0.3006, "step": 8256 }, { "epoch": 0.3989467072522588, "grad_norm": 6.954412937164307, "learning_rate": 6.010532927477412e-07, "loss": 0.3086, "step": 8257 }, { "epoch": 0.3989950234333478, "grad_norm": 3.8263561725616455, "learning_rate": 6.010049765666521e-07, "loss": 0.2402, "step": 8258 }, { "epoch": 0.3990433396144369, "grad_norm": 3.5386464595794678, "learning_rate": 6.009566603855631e-07, "loss": 0.4705, "step": 8259 }, { "epoch": 0.3990916557955259, "grad_norm": 4.141263961791992, "learning_rate": 6.00908344204474e-07, "loss": 0.4537, "step": 8260 }, { "epoch": 0.399139971976615, "grad_norm": 11.318078994750977, "learning_rate": 6.00860028023385e-07, "loss": 0.3879, "step": 8261 }, { "epoch": 0.399188288157704, "grad_norm": 3.596194267272949, "learning_rate": 6.00811711842296e-07, "loss": 0.4052, "step": 8262 }, { "epoch": 0.39923660433879304, "grad_norm": 1.9935818910598755, "learning_rate": 6.00763395661207e-07, "loss": 0.2432, "step": 8263 }, { "epoch": 0.3992849205198821, "grad_norm": 4.471746921539307, "learning_rate": 6.00715079480118e-07, "loss": 0.3055, "step": 8264 }, { "epoch": 0.39933323670097115, "grad_norm": 2.7229197025299072, "learning_rate": 6.006667632990287e-07, "loss": 0.2301, "step": 8265 }, { "epoch": 0.3993815528820602, "grad_norm": 3.014435291290283, "learning_rate": 6.006184471179397e-07, "loss": 0.3866, "step": 8266 }, { "epoch": 0.39942986906314926, "grad_norm": 5.258193492889404, "learning_rate": 6.005701309368507e-07, "loss": 0.3001, "step": 8267 }, { "epoch": 0.3994781852442383, "grad_norm": 2.245407819747925, "learning_rate": 6.005218147557617e-07, "loss": 0.2815, "step": 8268 }, { "epoch": 0.39952650142532736, "grad_norm": 2.2864158153533936, "learning_rate": 6.004734985746727e-07, "loss": 0.2458, "step": 8269 }, { "epoch": 0.3995748176064164, "grad_norm": 2.648228645324707, "learning_rate": 6.004251823935836e-07, "loss": 0.2699, "step": 8270 }, { "epoch": 0.3996231337875054, "grad_norm": 1.9242441654205322, "learning_rate": 6.003768662124945e-07, "loss": 0.2439, "step": 8271 }, { "epoch": 0.3996714499685945, "grad_norm": 2.475886106491089, "learning_rate": 6.003285500314055e-07, "loss": 0.2429, "step": 8272 }, { "epoch": 0.3997197661496835, "grad_norm": 2.7147717475891113, "learning_rate": 6.002802338503164e-07, "loss": 0.3578, "step": 8273 }, { "epoch": 0.3997680823307726, "grad_norm": 4.134414196014404, "learning_rate": 6.002319176692274e-07, "loss": 0.3459, "step": 8274 }, { "epoch": 0.3998163985118616, "grad_norm": 2.351433753967285, "learning_rate": 6.001836014881384e-07, "loss": 0.2603, "step": 8275 }, { "epoch": 0.39986471469295065, "grad_norm": 2.908543109893799, "learning_rate": 6.001352853070493e-07, "loss": 0.2628, "step": 8276 }, { "epoch": 0.39991303087403973, "grad_norm": 3.517993211746216, "learning_rate": 6.000869691259602e-07, "loss": 0.3906, "step": 8277 }, { "epoch": 0.39996134705512876, "grad_norm": 5.4491400718688965, "learning_rate": 6.000386529448712e-07, "loss": 0.3989, "step": 8278 }, { "epoch": 0.4000096632362178, "grad_norm": 2.2811217308044434, "learning_rate": 5.999903367637822e-07, "loss": 0.2799, "step": 8279 }, { "epoch": 0.40005797941730686, "grad_norm": 2.657952070236206, "learning_rate": 5.999420205826932e-07, "loss": 0.3114, "step": 8280 }, { "epoch": 0.4001062955983959, "grad_norm": 3.040738105773926, "learning_rate": 5.99893704401604e-07, "loss": 0.4067, "step": 8281 }, { "epoch": 0.40015461177948497, "grad_norm": 32.041690826416016, "learning_rate": 5.99845388220515e-07, "loss": 0.3314, "step": 8282 }, { "epoch": 0.400202927960574, "grad_norm": 3.152869701385498, "learning_rate": 5.99797072039426e-07, "loss": 0.3119, "step": 8283 }, { "epoch": 0.400251244141663, "grad_norm": 1.6983096599578857, "learning_rate": 5.997487558583369e-07, "loss": 0.1974, "step": 8284 }, { "epoch": 0.4002995603227521, "grad_norm": 10.843599319458008, "learning_rate": 5.997004396772479e-07, "loss": 0.26, "step": 8285 }, { "epoch": 0.4003478765038411, "grad_norm": 5.398367881774902, "learning_rate": 5.996521234961588e-07, "loss": 0.3746, "step": 8286 }, { "epoch": 0.4003961926849302, "grad_norm": 2.5242319107055664, "learning_rate": 5.996038073150698e-07, "loss": 0.243, "step": 8287 }, { "epoch": 0.40044450886601923, "grad_norm": 2.0475077629089355, "learning_rate": 5.995554911339808e-07, "loss": 0.2768, "step": 8288 }, { "epoch": 0.40049282504710826, "grad_norm": 3.8694422245025635, "learning_rate": 5.995071749528918e-07, "loss": 0.2336, "step": 8289 }, { "epoch": 0.40054114122819734, "grad_norm": 2.4878971576690674, "learning_rate": 5.994588587718026e-07, "loss": 0.3209, "step": 8290 }, { "epoch": 0.40058945740928636, "grad_norm": 2.503058671951294, "learning_rate": 5.994105425907135e-07, "loss": 0.2496, "step": 8291 }, { "epoch": 0.4006377735903754, "grad_norm": 2.92258358001709, "learning_rate": 5.993622264096245e-07, "loss": 0.3294, "step": 8292 }, { "epoch": 0.40068608977146447, "grad_norm": 1.8905549049377441, "learning_rate": 5.993139102285355e-07, "loss": 0.2071, "step": 8293 }, { "epoch": 0.4007344059525535, "grad_norm": 4.67781400680542, "learning_rate": 5.992655940474465e-07, "loss": 0.3, "step": 8294 }, { "epoch": 0.4007827221336426, "grad_norm": 12.260802268981934, "learning_rate": 5.992172778663575e-07, "loss": 0.2179, "step": 8295 }, { "epoch": 0.4008310383147316, "grad_norm": 3.555056571960449, "learning_rate": 5.991689616852683e-07, "loss": 0.2812, "step": 8296 }, { "epoch": 0.40087935449582063, "grad_norm": 3.1591334342956543, "learning_rate": 5.991206455041793e-07, "loss": 0.3846, "step": 8297 }, { "epoch": 0.4009276706769097, "grad_norm": 2.5326240062713623, "learning_rate": 5.990723293230902e-07, "loss": 0.2699, "step": 8298 }, { "epoch": 0.40097598685799873, "grad_norm": 10.282071113586426, "learning_rate": 5.990240131420012e-07, "loss": 0.3638, "step": 8299 }, { "epoch": 0.4010243030390878, "grad_norm": 2.2208974361419678, "learning_rate": 5.989756969609122e-07, "loss": 0.2873, "step": 8300 }, { "epoch": 0.40107261922017684, "grad_norm": 2.1437184810638428, "learning_rate": 5.989273807798232e-07, "loss": 0.293, "step": 8301 }, { "epoch": 0.40112093540126587, "grad_norm": 1.5380380153656006, "learning_rate": 5.988790645987341e-07, "loss": 0.1456, "step": 8302 }, { "epoch": 0.40116925158235495, "grad_norm": 4.926388740539551, "learning_rate": 5.98830748417645e-07, "loss": 0.3519, "step": 8303 }, { "epoch": 0.40121756776344397, "grad_norm": 2.432556629180908, "learning_rate": 5.98782432236556e-07, "loss": 0.2975, "step": 8304 }, { "epoch": 0.401265883944533, "grad_norm": 2.5959973335266113, "learning_rate": 5.98734116055467e-07, "loss": 0.2864, "step": 8305 }, { "epoch": 0.4013142001256221, "grad_norm": 2.8772311210632324, "learning_rate": 5.98685799874378e-07, "loss": 0.2799, "step": 8306 }, { "epoch": 0.4013625163067111, "grad_norm": 3.164299726486206, "learning_rate": 5.986374836932888e-07, "loss": 0.4398, "step": 8307 }, { "epoch": 0.4014108324878002, "grad_norm": 2.3496108055114746, "learning_rate": 5.985891675121998e-07, "loss": 0.3537, "step": 8308 }, { "epoch": 0.4014591486688892, "grad_norm": 2.9619858264923096, "learning_rate": 5.985408513311107e-07, "loss": 0.4425, "step": 8309 }, { "epoch": 0.40150746484997824, "grad_norm": 3.097564697265625, "learning_rate": 5.984925351500217e-07, "loss": 0.2059, "step": 8310 }, { "epoch": 0.4015557810310673, "grad_norm": 3.292041301727295, "learning_rate": 5.984442189689327e-07, "loss": 0.3098, "step": 8311 }, { "epoch": 0.40160409721215634, "grad_norm": 2.6819796562194824, "learning_rate": 5.983959027878436e-07, "loss": 0.3678, "step": 8312 }, { "epoch": 0.4016524133932454, "grad_norm": 2.139005661010742, "learning_rate": 5.983475866067546e-07, "loss": 0.2436, "step": 8313 }, { "epoch": 0.40170072957433445, "grad_norm": 11.506152153015137, "learning_rate": 5.982992704256656e-07, "loss": 0.2761, "step": 8314 }, { "epoch": 0.4017490457554235, "grad_norm": 2.725525379180908, "learning_rate": 5.982509542445765e-07, "loss": 0.3872, "step": 8315 }, { "epoch": 0.40179736193651255, "grad_norm": 3.1340906620025635, "learning_rate": 5.982026380634874e-07, "loss": 0.2621, "step": 8316 }, { "epoch": 0.4018456781176016, "grad_norm": 2.2438464164733887, "learning_rate": 5.981543218823983e-07, "loss": 0.2142, "step": 8317 }, { "epoch": 0.4018939942986906, "grad_norm": 2.3859076499938965, "learning_rate": 5.981060057013093e-07, "loss": 0.221, "step": 8318 }, { "epoch": 0.4019423104797797, "grad_norm": 12.44140338897705, "learning_rate": 5.980576895202203e-07, "loss": 0.4241, "step": 8319 }, { "epoch": 0.4019906266608687, "grad_norm": 2.845823287963867, "learning_rate": 5.980093733391313e-07, "loss": 0.2909, "step": 8320 }, { "epoch": 0.4020389428419578, "grad_norm": 2.5141091346740723, "learning_rate": 5.979610571580423e-07, "loss": 0.255, "step": 8321 }, { "epoch": 0.4020872590230468, "grad_norm": 2.8022565841674805, "learning_rate": 5.979127409769531e-07, "loss": 0.246, "step": 8322 }, { "epoch": 0.40213557520413584, "grad_norm": 3.3346450328826904, "learning_rate": 5.97864424795864e-07, "loss": 0.2232, "step": 8323 }, { "epoch": 0.4021838913852249, "grad_norm": 3.118786334991455, "learning_rate": 5.97816108614775e-07, "loss": 0.4568, "step": 8324 }, { "epoch": 0.40223220756631395, "grad_norm": 8.027729988098145, "learning_rate": 5.97767792433686e-07, "loss": 0.2539, "step": 8325 }, { "epoch": 0.40228052374740303, "grad_norm": 3.694676637649536, "learning_rate": 5.97719476252597e-07, "loss": 0.3791, "step": 8326 }, { "epoch": 0.40232883992849205, "grad_norm": 3.322072982788086, "learning_rate": 5.97671160071508e-07, "loss": 0.3198, "step": 8327 }, { "epoch": 0.4023771561095811, "grad_norm": 2.6983916759490967, "learning_rate": 5.976228438904188e-07, "loss": 0.2158, "step": 8328 }, { "epoch": 0.40242547229067016, "grad_norm": 4.774192810058594, "learning_rate": 5.975745277093298e-07, "loss": 0.3268, "step": 8329 }, { "epoch": 0.4024737884717592, "grad_norm": 3.8254270553588867, "learning_rate": 5.975262115282408e-07, "loss": 0.4919, "step": 8330 }, { "epoch": 0.4025221046528482, "grad_norm": 11.138399124145508, "learning_rate": 5.974778953471518e-07, "loss": 0.3905, "step": 8331 }, { "epoch": 0.4025704208339373, "grad_norm": 2.825878858566284, "learning_rate": 5.974295791660627e-07, "loss": 0.3903, "step": 8332 }, { "epoch": 0.4026187370150263, "grad_norm": 3.2474803924560547, "learning_rate": 5.973812629849736e-07, "loss": 0.3359, "step": 8333 }, { "epoch": 0.4026670531961154, "grad_norm": 2.1760103702545166, "learning_rate": 5.973329468038846e-07, "loss": 0.2359, "step": 8334 }, { "epoch": 0.4027153693772044, "grad_norm": 3.7802276611328125, "learning_rate": 5.972846306227955e-07, "loss": 0.4369, "step": 8335 }, { "epoch": 0.40276368555829345, "grad_norm": 2.297692060470581, "learning_rate": 5.972363144417065e-07, "loss": 0.2906, "step": 8336 }, { "epoch": 0.40281200173938253, "grad_norm": 1.6290538311004639, "learning_rate": 5.971879982606175e-07, "loss": 0.175, "step": 8337 }, { "epoch": 0.40286031792047156, "grad_norm": 2.5555665493011475, "learning_rate": 5.971396820795284e-07, "loss": 0.2448, "step": 8338 }, { "epoch": 0.40290863410156064, "grad_norm": 2.5052449703216553, "learning_rate": 5.970913658984394e-07, "loss": 0.3197, "step": 8339 }, { "epoch": 0.40295695028264966, "grad_norm": 2.125882148742676, "learning_rate": 5.970430497173504e-07, "loss": 0.2367, "step": 8340 }, { "epoch": 0.4030052664637387, "grad_norm": 2.117016077041626, "learning_rate": 5.969947335362612e-07, "loss": 0.2299, "step": 8341 }, { "epoch": 0.40305358264482777, "grad_norm": 5.018622398376465, "learning_rate": 5.969464173551722e-07, "loss": 0.3514, "step": 8342 }, { "epoch": 0.4031018988259168, "grad_norm": 2.3960728645324707, "learning_rate": 5.968981011740831e-07, "loss": 0.2899, "step": 8343 }, { "epoch": 0.4031502150070058, "grad_norm": 2.918241024017334, "learning_rate": 5.968497849929941e-07, "loss": 0.3998, "step": 8344 }, { "epoch": 0.4031985311880949, "grad_norm": 2.836047649383545, "learning_rate": 5.968014688119051e-07, "loss": 0.2508, "step": 8345 }, { "epoch": 0.4032468473691839, "grad_norm": 5.270936965942383, "learning_rate": 5.967531526308161e-07, "loss": 0.2179, "step": 8346 }, { "epoch": 0.403295163550273, "grad_norm": 3.42348051071167, "learning_rate": 5.967048364497271e-07, "loss": 0.4127, "step": 8347 }, { "epoch": 0.40334347973136203, "grad_norm": 2.277194023132324, "learning_rate": 5.966565202686378e-07, "loss": 0.2314, "step": 8348 }, { "epoch": 0.40339179591245106, "grad_norm": 2.3208563327789307, "learning_rate": 5.966082040875488e-07, "loss": 0.2428, "step": 8349 }, { "epoch": 0.40344011209354014, "grad_norm": 3.025359630584717, "learning_rate": 5.965598879064598e-07, "loss": 0.4032, "step": 8350 }, { "epoch": 0.40348842827462916, "grad_norm": 3.0838499069213867, "learning_rate": 5.965115717253708e-07, "loss": 0.3143, "step": 8351 }, { "epoch": 0.40353674445571824, "grad_norm": 2.654468536376953, "learning_rate": 5.964632555442818e-07, "loss": 0.3238, "step": 8352 }, { "epoch": 0.40358506063680727, "grad_norm": 2.209501266479492, "learning_rate": 5.964149393631928e-07, "loss": 0.2467, "step": 8353 }, { "epoch": 0.4036333768178963, "grad_norm": 4.478390216827393, "learning_rate": 5.963666231821036e-07, "loss": 0.3275, "step": 8354 }, { "epoch": 0.4036816929989854, "grad_norm": 2.495884656906128, "learning_rate": 5.963183070010146e-07, "loss": 0.3192, "step": 8355 }, { "epoch": 0.4037300091800744, "grad_norm": 2.1290693283081055, "learning_rate": 5.962699908199256e-07, "loss": 0.2166, "step": 8356 }, { "epoch": 0.4037783253611634, "grad_norm": 3.9265501499176025, "learning_rate": 5.962216746388365e-07, "loss": 0.4441, "step": 8357 }, { "epoch": 0.4038266415422525, "grad_norm": 2.8995888233184814, "learning_rate": 5.961733584577475e-07, "loss": 0.2487, "step": 8358 }, { "epoch": 0.40387495772334153, "grad_norm": 2.2769522666931152, "learning_rate": 5.961250422766584e-07, "loss": 0.2769, "step": 8359 }, { "epoch": 0.4039232739044306, "grad_norm": 2.856959581375122, "learning_rate": 5.960767260955693e-07, "loss": 0.3296, "step": 8360 }, { "epoch": 0.40397159008551964, "grad_norm": 2.682265043258667, "learning_rate": 5.960284099144803e-07, "loss": 0.2793, "step": 8361 }, { "epoch": 0.40401990626660866, "grad_norm": 5.273832321166992, "learning_rate": 5.959800937333913e-07, "loss": 0.2952, "step": 8362 }, { "epoch": 0.40406822244769774, "grad_norm": 1.7335928678512573, "learning_rate": 5.959317775523023e-07, "loss": 0.1941, "step": 8363 }, { "epoch": 0.40411653862878677, "grad_norm": 2.9471702575683594, "learning_rate": 5.958834613712132e-07, "loss": 0.341, "step": 8364 }, { "epoch": 0.40416485480987585, "grad_norm": 16.671890258789062, "learning_rate": 5.958351451901242e-07, "loss": 0.143, "step": 8365 }, { "epoch": 0.4042131709909649, "grad_norm": 5.392861843109131, "learning_rate": 5.957868290090351e-07, "loss": 0.3465, "step": 8366 }, { "epoch": 0.4042614871720539, "grad_norm": 2.62579607963562, "learning_rate": 5.95738512827946e-07, "loss": 0.3578, "step": 8367 }, { "epoch": 0.404309803353143, "grad_norm": 3.581178903579712, "learning_rate": 5.95690196646857e-07, "loss": 0.25, "step": 8368 }, { "epoch": 0.404358119534232, "grad_norm": 2.2753098011016846, "learning_rate": 5.956418804657679e-07, "loss": 0.2384, "step": 8369 }, { "epoch": 0.4044064357153211, "grad_norm": 6.318424701690674, "learning_rate": 5.955935642846789e-07, "loss": 0.2477, "step": 8370 }, { "epoch": 0.4044547518964101, "grad_norm": 2.243255853652954, "learning_rate": 5.955452481035899e-07, "loss": 0.2398, "step": 8371 }, { "epoch": 0.40450306807749914, "grad_norm": 8.66667652130127, "learning_rate": 5.954969319225009e-07, "loss": 0.3185, "step": 8372 }, { "epoch": 0.4045513842585882, "grad_norm": 2.806439161300659, "learning_rate": 5.954486157414118e-07, "loss": 0.2987, "step": 8373 }, { "epoch": 0.40459970043967725, "grad_norm": 6.877490520477295, "learning_rate": 5.954002995603226e-07, "loss": 0.4935, "step": 8374 }, { "epoch": 0.40464801662076627, "grad_norm": 2.7510037422180176, "learning_rate": 5.953519833792336e-07, "loss": 0.2711, "step": 8375 }, { "epoch": 0.40469633280185535, "grad_norm": 2.984842300415039, "learning_rate": 5.953036671981446e-07, "loss": 0.4788, "step": 8376 }, { "epoch": 0.4047446489829444, "grad_norm": 2.5369362831115723, "learning_rate": 5.952553510170556e-07, "loss": 0.3296, "step": 8377 }, { "epoch": 0.40479296516403346, "grad_norm": 2.993788003921509, "learning_rate": 5.952070348359666e-07, "loss": 0.3751, "step": 8378 }, { "epoch": 0.4048412813451225, "grad_norm": 5.5226545333862305, "learning_rate": 5.951587186548776e-07, "loss": 0.2134, "step": 8379 }, { "epoch": 0.4048895975262115, "grad_norm": 7.466714382171631, "learning_rate": 5.951104024737884e-07, "loss": 0.353, "step": 8380 }, { "epoch": 0.4049379137073006, "grad_norm": 2.1937646865844727, "learning_rate": 5.950620862926994e-07, "loss": 0.2787, "step": 8381 }, { "epoch": 0.4049862298883896, "grad_norm": 2.8028483390808105, "learning_rate": 5.950137701116104e-07, "loss": 0.3109, "step": 8382 }, { "epoch": 0.4050345460694787, "grad_norm": 3.209599494934082, "learning_rate": 5.949654539305213e-07, "loss": 0.3566, "step": 8383 }, { "epoch": 0.4050828622505677, "grad_norm": 4.857532024383545, "learning_rate": 5.949171377494323e-07, "loss": 0.2014, "step": 8384 }, { "epoch": 0.40513117843165675, "grad_norm": 2.6217610836029053, "learning_rate": 5.948688215683432e-07, "loss": 0.3899, "step": 8385 }, { "epoch": 0.4051794946127458, "grad_norm": 2.500575304031372, "learning_rate": 5.948205053872541e-07, "loss": 0.2619, "step": 8386 }, { "epoch": 0.40522781079383485, "grad_norm": 3.0110316276550293, "learning_rate": 5.947721892061651e-07, "loss": 0.2097, "step": 8387 }, { "epoch": 0.4052761269749239, "grad_norm": 4.328314781188965, "learning_rate": 5.947238730250761e-07, "loss": 0.2543, "step": 8388 }, { "epoch": 0.40532444315601296, "grad_norm": 2.0034754276275635, "learning_rate": 5.946755568439871e-07, "loss": 0.2552, "step": 8389 }, { "epoch": 0.405372759337102, "grad_norm": 3.7636594772338867, "learning_rate": 5.94627240662898e-07, "loss": 0.2975, "step": 8390 }, { "epoch": 0.40542107551819107, "grad_norm": 5.274574279785156, "learning_rate": 5.945789244818089e-07, "loss": 0.5318, "step": 8391 }, { "epoch": 0.4054693916992801, "grad_norm": 6.917333126068115, "learning_rate": 5.945306083007198e-07, "loss": 0.3361, "step": 8392 }, { "epoch": 0.4055177078803691, "grad_norm": 2.9973814487457275, "learning_rate": 5.944822921196308e-07, "loss": 0.3591, "step": 8393 }, { "epoch": 0.4055660240614582, "grad_norm": 2.3611347675323486, "learning_rate": 5.944339759385418e-07, "loss": 0.2276, "step": 8394 }, { "epoch": 0.4056143402425472, "grad_norm": 2.1117100715637207, "learning_rate": 5.943856597574527e-07, "loss": 0.2187, "step": 8395 }, { "epoch": 0.4056626564236363, "grad_norm": 74.45679473876953, "learning_rate": 5.943373435763637e-07, "loss": 0.2352, "step": 8396 }, { "epoch": 0.40571097260472533, "grad_norm": 3.788651466369629, "learning_rate": 5.942890273952747e-07, "loss": 0.3861, "step": 8397 }, { "epoch": 0.40575928878581435, "grad_norm": 2.48958683013916, "learning_rate": 5.942407112141857e-07, "loss": 0.3252, "step": 8398 }, { "epoch": 0.40580760496690343, "grad_norm": 2.5996150970458984, "learning_rate": 5.941923950330965e-07, "loss": 0.3651, "step": 8399 }, { "epoch": 0.40585592114799246, "grad_norm": 3.1979708671569824, "learning_rate": 5.941440788520074e-07, "loss": 0.3303, "step": 8400 }, { "epoch": 0.4059042373290815, "grad_norm": 3.3674967288970947, "learning_rate": 5.940957626709184e-07, "loss": 0.3857, "step": 8401 }, { "epoch": 0.40595255351017057, "grad_norm": 2.222158908843994, "learning_rate": 5.940474464898294e-07, "loss": 0.2062, "step": 8402 }, { "epoch": 0.4060008696912596, "grad_norm": 5.649694919586182, "learning_rate": 5.939991303087404e-07, "loss": 0.2849, "step": 8403 }, { "epoch": 0.4060491858723487, "grad_norm": 11.332947731018066, "learning_rate": 5.939508141276514e-07, "loss": 0.2299, "step": 8404 }, { "epoch": 0.4060975020534377, "grad_norm": 2.4702706336975098, "learning_rate": 5.939024979465623e-07, "loss": 0.3174, "step": 8405 }, { "epoch": 0.4061458182345267, "grad_norm": 2.2026207447052, "learning_rate": 5.938541817654732e-07, "loss": 0.2967, "step": 8406 }, { "epoch": 0.4061941344156158, "grad_norm": 2.0381102561950684, "learning_rate": 5.938058655843842e-07, "loss": 0.1973, "step": 8407 }, { "epoch": 0.40624245059670483, "grad_norm": 2.5060551166534424, "learning_rate": 5.937575494032951e-07, "loss": 0.2854, "step": 8408 }, { "epoch": 0.4062907667777939, "grad_norm": 3.2957003116607666, "learning_rate": 5.937092332222061e-07, "loss": 0.2909, "step": 8409 }, { "epoch": 0.40633908295888294, "grad_norm": 2.6752209663391113, "learning_rate": 5.936609170411171e-07, "loss": 0.3274, "step": 8410 }, { "epoch": 0.40638739913997196, "grad_norm": 2.365722894668579, "learning_rate": 5.93612600860028e-07, "loss": 0.3058, "step": 8411 }, { "epoch": 0.40643571532106104, "grad_norm": 2.186018943786621, "learning_rate": 5.935642846789389e-07, "loss": 0.2583, "step": 8412 }, { "epoch": 0.40648403150215007, "grad_norm": 3.330821990966797, "learning_rate": 5.935159684978499e-07, "loss": 0.2384, "step": 8413 }, { "epoch": 0.4065323476832391, "grad_norm": 2.4718871116638184, "learning_rate": 5.934676523167609e-07, "loss": 0.2814, "step": 8414 }, { "epoch": 0.4065806638643282, "grad_norm": 4.743865966796875, "learning_rate": 5.934193361356719e-07, "loss": 0.1973, "step": 8415 }, { "epoch": 0.4066289800454172, "grad_norm": 2.586601495742798, "learning_rate": 5.933710199545827e-07, "loss": 0.2012, "step": 8416 }, { "epoch": 0.4066772962265063, "grad_norm": 2.1021647453308105, "learning_rate": 5.933227037734937e-07, "loss": 0.2251, "step": 8417 }, { "epoch": 0.4067256124075953, "grad_norm": 2.2165770530700684, "learning_rate": 5.932743875924046e-07, "loss": 0.2403, "step": 8418 }, { "epoch": 0.40677392858868433, "grad_norm": 1.5841209888458252, "learning_rate": 5.932260714113156e-07, "loss": 0.1628, "step": 8419 }, { "epoch": 0.4068222447697734, "grad_norm": 3.1009838581085205, "learning_rate": 5.931777552302266e-07, "loss": 0.254, "step": 8420 }, { "epoch": 0.40687056095086244, "grad_norm": 3.2123050689697266, "learning_rate": 5.931294390491375e-07, "loss": 0.4391, "step": 8421 }, { "epoch": 0.4069188771319515, "grad_norm": 2.4992921352386475, "learning_rate": 5.930811228680485e-07, "loss": 0.2666, "step": 8422 }, { "epoch": 0.40696719331304054, "grad_norm": 5.333478927612305, "learning_rate": 5.930328066869595e-07, "loss": 0.3855, "step": 8423 }, { "epoch": 0.40701550949412957, "grad_norm": 4.30920934677124, "learning_rate": 5.929844905058704e-07, "loss": 0.5225, "step": 8424 }, { "epoch": 0.40706382567521865, "grad_norm": 2.676069736480713, "learning_rate": 5.929361743247813e-07, "loss": 0.2872, "step": 8425 }, { "epoch": 0.4071121418563077, "grad_norm": 2.966996431350708, "learning_rate": 5.928878581436922e-07, "loss": 0.3839, "step": 8426 }, { "epoch": 0.4071604580373967, "grad_norm": 2.518991708755493, "learning_rate": 5.928395419626032e-07, "loss": 0.3011, "step": 8427 }, { "epoch": 0.4072087742184858, "grad_norm": 16.294876098632812, "learning_rate": 5.927912257815142e-07, "loss": 0.2539, "step": 8428 }, { "epoch": 0.4072570903995748, "grad_norm": 2.5942020416259766, "learning_rate": 5.927429096004252e-07, "loss": 0.2764, "step": 8429 }, { "epoch": 0.4073054065806639, "grad_norm": 2.122549533843994, "learning_rate": 5.926945934193362e-07, "loss": 0.2942, "step": 8430 }, { "epoch": 0.4073537227617529, "grad_norm": 3.104250431060791, "learning_rate": 5.926462772382471e-07, "loss": 0.3024, "step": 8431 }, { "epoch": 0.40740203894284194, "grad_norm": 2.05391526222229, "learning_rate": 5.92597961057158e-07, "loss": 0.208, "step": 8432 }, { "epoch": 0.407450355123931, "grad_norm": 2.7359235286712646, "learning_rate": 5.925496448760689e-07, "loss": 0.3196, "step": 8433 }, { "epoch": 0.40749867130502004, "grad_norm": 3.3583500385284424, "learning_rate": 5.925013286949799e-07, "loss": 0.2451, "step": 8434 }, { "epoch": 0.4075469874861091, "grad_norm": 2.0496387481689453, "learning_rate": 5.924530125138909e-07, "loss": 0.1791, "step": 8435 }, { "epoch": 0.40759530366719815, "grad_norm": 3.944831132888794, "learning_rate": 5.924046963328019e-07, "loss": 0.3824, "step": 8436 }, { "epoch": 0.4076436198482872, "grad_norm": 2.7720415592193604, "learning_rate": 5.923563801517127e-07, "loss": 0.3712, "step": 8437 }, { "epoch": 0.40769193602937626, "grad_norm": 1.9705681800842285, "learning_rate": 5.923080639706237e-07, "loss": 0.2209, "step": 8438 }, { "epoch": 0.4077402522104653, "grad_norm": 2.026599168777466, "learning_rate": 5.922597477895347e-07, "loss": 0.1746, "step": 8439 }, { "epoch": 0.4077885683915543, "grad_norm": 2.7187962532043457, "learning_rate": 5.922114316084457e-07, "loss": 0.2515, "step": 8440 }, { "epoch": 0.4078368845726434, "grad_norm": 3.0613229274749756, "learning_rate": 5.921631154273567e-07, "loss": 0.2555, "step": 8441 }, { "epoch": 0.4078852007537324, "grad_norm": 2.797297239303589, "learning_rate": 5.921147992462675e-07, "loss": 0.374, "step": 8442 }, { "epoch": 0.4079335169348215, "grad_norm": 4.507987022399902, "learning_rate": 5.920664830651785e-07, "loss": 0.3762, "step": 8443 }, { "epoch": 0.4079818331159105, "grad_norm": 3.456770896911621, "learning_rate": 5.920181668840894e-07, "loss": 0.4717, "step": 8444 }, { "epoch": 0.40803014929699954, "grad_norm": 2.4619803428649902, "learning_rate": 5.919698507030004e-07, "loss": 0.2272, "step": 8445 }, { "epoch": 0.4080784654780886, "grad_norm": 3.1364517211914062, "learning_rate": 5.919215345219114e-07, "loss": 0.3344, "step": 8446 }, { "epoch": 0.40812678165917765, "grad_norm": 2.6640572547912598, "learning_rate": 5.918732183408223e-07, "loss": 0.2208, "step": 8447 }, { "epoch": 0.40817509784026673, "grad_norm": 2.0700783729553223, "learning_rate": 5.918249021597333e-07, "loss": 0.2192, "step": 8448 }, { "epoch": 0.40822341402135576, "grad_norm": 2.639799118041992, "learning_rate": 5.917765859786443e-07, "loss": 0.3637, "step": 8449 }, { "epoch": 0.4082717302024448, "grad_norm": 1.5636934041976929, "learning_rate": 5.917282697975551e-07, "loss": 0.1678, "step": 8450 }, { "epoch": 0.40832004638353386, "grad_norm": 3.3388922214508057, "learning_rate": 5.916799536164661e-07, "loss": 0.1718, "step": 8451 }, { "epoch": 0.4083683625646229, "grad_norm": 2.8730216026306152, "learning_rate": 5.91631637435377e-07, "loss": 0.1496, "step": 8452 }, { "epoch": 0.4084166787457119, "grad_norm": 2.170189619064331, "learning_rate": 5.91583321254288e-07, "loss": 0.2506, "step": 8453 }, { "epoch": 0.408464994926801, "grad_norm": 2.042255401611328, "learning_rate": 5.91535005073199e-07, "loss": 0.2874, "step": 8454 }, { "epoch": 0.40851331110789, "grad_norm": 3.4430973529815674, "learning_rate": 5.9148668889211e-07, "loss": 0.287, "step": 8455 }, { "epoch": 0.4085616272889791, "grad_norm": 2.353569269180298, "learning_rate": 5.91438372711021e-07, "loss": 0.2318, "step": 8456 }, { "epoch": 0.4086099434700681, "grad_norm": 2.28279447555542, "learning_rate": 5.913900565299319e-07, "loss": 0.2547, "step": 8457 }, { "epoch": 0.40865825965115715, "grad_norm": 2.472245454788208, "learning_rate": 5.913417403488427e-07, "loss": 0.2426, "step": 8458 }, { "epoch": 0.40870657583224623, "grad_norm": 2.1836512088775635, "learning_rate": 5.912934241677537e-07, "loss": 0.2218, "step": 8459 }, { "epoch": 0.40875489201333526, "grad_norm": 2.3745765686035156, "learning_rate": 5.912451079866647e-07, "loss": 0.3603, "step": 8460 }, { "epoch": 0.40880320819442434, "grad_norm": 2.8106565475463867, "learning_rate": 5.911967918055757e-07, "loss": 0.3702, "step": 8461 }, { "epoch": 0.40885152437551336, "grad_norm": 2.572079658508301, "learning_rate": 5.911484756244867e-07, "loss": 0.3607, "step": 8462 }, { "epoch": 0.4088998405566024, "grad_norm": 2.3053879737854004, "learning_rate": 5.911001594433975e-07, "loss": 0.1904, "step": 8463 }, { "epoch": 0.40894815673769147, "grad_norm": 2.002450704574585, "learning_rate": 5.910518432623085e-07, "loss": 0.1993, "step": 8464 }, { "epoch": 0.4089964729187805, "grad_norm": 3.2821295261383057, "learning_rate": 5.910035270812195e-07, "loss": 0.3397, "step": 8465 }, { "epoch": 0.4090447890998695, "grad_norm": 2.0608131885528564, "learning_rate": 5.909552109001305e-07, "loss": 0.2128, "step": 8466 }, { "epoch": 0.4090931052809586, "grad_norm": 2.194425344467163, "learning_rate": 5.909068947190414e-07, "loss": 0.2552, "step": 8467 }, { "epoch": 0.4091414214620476, "grad_norm": 7.369598388671875, "learning_rate": 5.908585785379523e-07, "loss": 0.3812, "step": 8468 }, { "epoch": 0.4091897376431367, "grad_norm": 2.7906174659729004, "learning_rate": 5.908102623568632e-07, "loss": 0.2692, "step": 8469 }, { "epoch": 0.40923805382422573, "grad_norm": 7.443472385406494, "learning_rate": 5.907619461757742e-07, "loss": 0.5979, "step": 8470 }, { "epoch": 0.40928637000531476, "grad_norm": 2.20099139213562, "learning_rate": 5.907136299946852e-07, "loss": 0.2858, "step": 8471 }, { "epoch": 0.40933468618640384, "grad_norm": 13.997488975524902, "learning_rate": 5.906653138135962e-07, "loss": 0.2844, "step": 8472 }, { "epoch": 0.40938300236749287, "grad_norm": 1.8851712942123413, "learning_rate": 5.906169976325071e-07, "loss": 0.1592, "step": 8473 }, { "epoch": 0.40943131854858195, "grad_norm": 2.3240115642547607, "learning_rate": 5.905686814514181e-07, "loss": 0.3063, "step": 8474 }, { "epoch": 0.40947963472967097, "grad_norm": 2.2775380611419678, "learning_rate": 5.905203652703291e-07, "loss": 0.2341, "step": 8475 }, { "epoch": 0.40952795091076, "grad_norm": 2.9262053966522217, "learning_rate": 5.904720490892399e-07, "loss": 0.4311, "step": 8476 }, { "epoch": 0.4095762670918491, "grad_norm": 2.9552505016326904, "learning_rate": 5.904237329081509e-07, "loss": 0.3106, "step": 8477 }, { "epoch": 0.4096245832729381, "grad_norm": 2.308756113052368, "learning_rate": 5.903754167270618e-07, "loss": 0.2621, "step": 8478 }, { "epoch": 0.40967289945402713, "grad_norm": 1.6170176267623901, "learning_rate": 5.903271005459728e-07, "loss": 0.1391, "step": 8479 }, { "epoch": 0.4097212156351162, "grad_norm": 2.106818437576294, "learning_rate": 5.902787843648838e-07, "loss": 0.2516, "step": 8480 }, { "epoch": 0.40976953181620523, "grad_norm": 2.6791837215423584, "learning_rate": 5.902304681837948e-07, "loss": 0.303, "step": 8481 }, { "epoch": 0.4098178479972943, "grad_norm": 2.4998703002929688, "learning_rate": 5.901821520027057e-07, "loss": 0.3414, "step": 8482 }, { "epoch": 0.40986616417838334, "grad_norm": 3.732187271118164, "learning_rate": 5.901338358216167e-07, "loss": 0.2716, "step": 8483 }, { "epoch": 0.40991448035947237, "grad_norm": 3.734856605529785, "learning_rate": 5.900855196405275e-07, "loss": 0.3483, "step": 8484 }, { "epoch": 0.40996279654056145, "grad_norm": 2.3474583625793457, "learning_rate": 5.900372034594385e-07, "loss": 0.1864, "step": 8485 }, { "epoch": 0.4100111127216505, "grad_norm": 2.977644681930542, "learning_rate": 5.899888872783495e-07, "loss": 0.3403, "step": 8486 }, { "epoch": 0.41005942890273955, "grad_norm": 2.082717180252075, "learning_rate": 5.899405710972605e-07, "loss": 0.2387, "step": 8487 }, { "epoch": 0.4101077450838286, "grad_norm": 5.405708312988281, "learning_rate": 5.898922549161715e-07, "loss": 0.3559, "step": 8488 }, { "epoch": 0.4101560612649176, "grad_norm": 1.7983931303024292, "learning_rate": 5.898439387350823e-07, "loss": 0.1542, "step": 8489 }, { "epoch": 0.4102043774460067, "grad_norm": 4.408971309661865, "learning_rate": 5.897956225539933e-07, "loss": 0.3845, "step": 8490 }, { "epoch": 0.4102526936270957, "grad_norm": 2.3097054958343506, "learning_rate": 5.897473063729043e-07, "loss": 0.2428, "step": 8491 }, { "epoch": 0.41030100980818474, "grad_norm": 2.524214506149292, "learning_rate": 5.896989901918153e-07, "loss": 0.2945, "step": 8492 }, { "epoch": 0.4103493259892738, "grad_norm": 2.1485538482666016, "learning_rate": 5.896506740107262e-07, "loss": 0.2306, "step": 8493 }, { "epoch": 0.41039764217036284, "grad_norm": 1.9030158519744873, "learning_rate": 5.896023578296371e-07, "loss": 0.1616, "step": 8494 }, { "epoch": 0.4104459583514519, "grad_norm": 2.8845574855804443, "learning_rate": 5.89554041648548e-07, "loss": 0.4297, "step": 8495 }, { "epoch": 0.41049427453254095, "grad_norm": 2.9101145267486572, "learning_rate": 5.89505725467459e-07, "loss": 0.3798, "step": 8496 }, { "epoch": 0.41054259071363, "grad_norm": 2.367535352706909, "learning_rate": 5.8945740928637e-07, "loss": 0.1976, "step": 8497 }, { "epoch": 0.41059090689471905, "grad_norm": 2.782494068145752, "learning_rate": 5.89409093105281e-07, "loss": 0.2632, "step": 8498 }, { "epoch": 0.4106392230758081, "grad_norm": 2.7773513793945312, "learning_rate": 5.893607769241919e-07, "loss": 0.2989, "step": 8499 }, { "epoch": 0.41068753925689716, "grad_norm": 2.348248243331909, "learning_rate": 5.893124607431029e-07, "loss": 0.2416, "step": 8500 }, { "epoch": 0.4107358554379862, "grad_norm": 3.285141944885254, "learning_rate": 5.892641445620137e-07, "loss": 0.4038, "step": 8501 }, { "epoch": 0.4107841716190752, "grad_norm": 2.532822608947754, "learning_rate": 5.892158283809247e-07, "loss": 0.196, "step": 8502 }, { "epoch": 0.4108324878001643, "grad_norm": 12.952044486999512, "learning_rate": 5.891675121998357e-07, "loss": 0.3834, "step": 8503 }, { "epoch": 0.4108808039812533, "grad_norm": 2.8769724369049072, "learning_rate": 5.891191960187466e-07, "loss": 0.4237, "step": 8504 }, { "epoch": 0.41092912016234234, "grad_norm": 3.4834647178649902, "learning_rate": 5.890708798376576e-07, "loss": 0.4003, "step": 8505 }, { "epoch": 0.4109774363434314, "grad_norm": 5.942283630371094, "learning_rate": 5.890225636565686e-07, "loss": 0.3443, "step": 8506 }, { "epoch": 0.41102575252452045, "grad_norm": 3.2930057048797607, "learning_rate": 5.889742474754796e-07, "loss": 0.4463, "step": 8507 }, { "epoch": 0.41107406870560953, "grad_norm": 2.0516068935394287, "learning_rate": 5.889259312943905e-07, "loss": 0.1728, "step": 8508 }, { "epoch": 0.41112238488669856, "grad_norm": 18.65728187561035, "learning_rate": 5.888776151133013e-07, "loss": 0.3051, "step": 8509 }, { "epoch": 0.4111707010677876, "grad_norm": 1.9878188371658325, "learning_rate": 5.888292989322123e-07, "loss": 0.1544, "step": 8510 }, { "epoch": 0.41121901724887666, "grad_norm": 2.683349609375, "learning_rate": 5.887809827511233e-07, "loss": 0.2446, "step": 8511 }, { "epoch": 0.4112673334299657, "grad_norm": 2.40906023979187, "learning_rate": 5.887326665700343e-07, "loss": 0.3482, "step": 8512 }, { "epoch": 0.41131564961105477, "grad_norm": 3.2695000171661377, "learning_rate": 5.886843503889453e-07, "loss": 0.2956, "step": 8513 }, { "epoch": 0.4113639657921438, "grad_norm": 2.289290428161621, "learning_rate": 5.886360342078562e-07, "loss": 0.1995, "step": 8514 }, { "epoch": 0.4114122819732328, "grad_norm": 2.805699586868286, "learning_rate": 5.885877180267671e-07, "loss": 0.4335, "step": 8515 }, { "epoch": 0.4114605981543219, "grad_norm": 2.9468202590942383, "learning_rate": 5.885394018456781e-07, "loss": 0.348, "step": 8516 }, { "epoch": 0.4115089143354109, "grad_norm": 3.036738872528076, "learning_rate": 5.884910856645891e-07, "loss": 0.3793, "step": 8517 }, { "epoch": 0.41155723051649995, "grad_norm": 3.1257355213165283, "learning_rate": 5.884427694835e-07, "loss": 0.3896, "step": 8518 }, { "epoch": 0.41160554669758903, "grad_norm": 2.6645796298980713, "learning_rate": 5.88394453302411e-07, "loss": 0.2893, "step": 8519 }, { "epoch": 0.41165386287867806, "grad_norm": 3.223832130432129, "learning_rate": 5.883461371213218e-07, "loss": 0.2928, "step": 8520 }, { "epoch": 0.41170217905976714, "grad_norm": 7.942060470581055, "learning_rate": 5.882978209402328e-07, "loss": 0.2654, "step": 8521 }, { "epoch": 0.41175049524085616, "grad_norm": 9.253190040588379, "learning_rate": 5.882495047591438e-07, "loss": 0.255, "step": 8522 }, { "epoch": 0.4117988114219452, "grad_norm": 2.1822268962860107, "learning_rate": 5.882011885780548e-07, "loss": 0.1673, "step": 8523 }, { "epoch": 0.41184712760303427, "grad_norm": 4.976955890655518, "learning_rate": 5.881528723969658e-07, "loss": 0.1852, "step": 8524 }, { "epoch": 0.4118954437841233, "grad_norm": 2.953031063079834, "learning_rate": 5.881045562158767e-07, "loss": 0.3079, "step": 8525 }, { "epoch": 0.4119437599652124, "grad_norm": 2.6730170249938965, "learning_rate": 5.880562400347876e-07, "loss": 0.2677, "step": 8526 }, { "epoch": 0.4119920761463014, "grad_norm": 2.377861499786377, "learning_rate": 5.880079238536985e-07, "loss": 0.2312, "step": 8527 }, { "epoch": 0.4120403923273904, "grad_norm": 9.17449951171875, "learning_rate": 5.879596076726095e-07, "loss": 0.4472, "step": 8528 }, { "epoch": 0.4120887085084795, "grad_norm": 4.7065887451171875, "learning_rate": 5.879112914915205e-07, "loss": 0.2447, "step": 8529 }, { "epoch": 0.41213702468956853, "grad_norm": 2.9933154582977295, "learning_rate": 5.878629753104314e-07, "loss": 0.3656, "step": 8530 }, { "epoch": 0.41218534087065756, "grad_norm": 2.5616183280944824, "learning_rate": 5.878146591293424e-07, "loss": 0.2824, "step": 8531 }, { "epoch": 0.41223365705174664, "grad_norm": 3.064303159713745, "learning_rate": 5.877663429482534e-07, "loss": 0.272, "step": 8532 }, { "epoch": 0.41228197323283566, "grad_norm": 5.386425018310547, "learning_rate": 5.877180267671643e-07, "loss": 0.3927, "step": 8533 }, { "epoch": 0.41233028941392474, "grad_norm": 6.004513740539551, "learning_rate": 5.876697105860753e-07, "loss": 0.243, "step": 8534 }, { "epoch": 0.41237860559501377, "grad_norm": 2.8713157176971436, "learning_rate": 5.876213944049861e-07, "loss": 0.2789, "step": 8535 }, { "epoch": 0.4124269217761028, "grad_norm": 2.4897098541259766, "learning_rate": 5.875730782238971e-07, "loss": 0.2708, "step": 8536 }, { "epoch": 0.4124752379571919, "grad_norm": 1.8126777410507202, "learning_rate": 5.875247620428081e-07, "loss": 0.2471, "step": 8537 }, { "epoch": 0.4125235541382809, "grad_norm": 6.440415859222412, "learning_rate": 5.874764458617191e-07, "loss": 0.2102, "step": 8538 }, { "epoch": 0.41257187031937, "grad_norm": 3.2735819816589355, "learning_rate": 5.874281296806301e-07, "loss": 0.2853, "step": 8539 }, { "epoch": 0.412620186500459, "grad_norm": 2.781360149383545, "learning_rate": 5.87379813499541e-07, "loss": 0.2412, "step": 8540 }, { "epoch": 0.41266850268154803, "grad_norm": 2.603549003601074, "learning_rate": 5.873314973184519e-07, "loss": 0.264, "step": 8541 }, { "epoch": 0.4127168188626371, "grad_norm": 2.040539503097534, "learning_rate": 5.872831811373629e-07, "loss": 0.2512, "step": 8542 }, { "epoch": 0.41276513504372614, "grad_norm": 3.140519857406616, "learning_rate": 5.872348649562738e-07, "loss": 0.3364, "step": 8543 }, { "epoch": 0.41281345122481516, "grad_norm": 2.8246774673461914, "learning_rate": 5.871865487751848e-07, "loss": 0.3525, "step": 8544 }, { "epoch": 0.41286176740590425, "grad_norm": 1.583126187324524, "learning_rate": 5.871382325940958e-07, "loss": 0.1871, "step": 8545 }, { "epoch": 0.41291008358699327, "grad_norm": 3.9928364753723145, "learning_rate": 5.870899164130066e-07, "loss": 0.4891, "step": 8546 }, { "epoch": 0.41295839976808235, "grad_norm": 2.0919034481048584, "learning_rate": 5.870416002319176e-07, "loss": 0.2055, "step": 8547 }, { "epoch": 0.4130067159491714, "grad_norm": 2.585965871810913, "learning_rate": 5.869932840508286e-07, "loss": 0.2417, "step": 8548 }, { "epoch": 0.4130550321302604, "grad_norm": 2.8019800186157227, "learning_rate": 5.869449678697396e-07, "loss": 0.2979, "step": 8549 }, { "epoch": 0.4131033483113495, "grad_norm": 2.0930066108703613, "learning_rate": 5.868966516886506e-07, "loss": 0.2121, "step": 8550 }, { "epoch": 0.4131516644924385, "grad_norm": 3.31068754196167, "learning_rate": 5.868483355075615e-07, "loss": 0.3733, "step": 8551 }, { "epoch": 0.4131999806735276, "grad_norm": 2.4755353927612305, "learning_rate": 5.868000193264723e-07, "loss": 0.2804, "step": 8552 }, { "epoch": 0.4132482968546166, "grad_norm": 2.6004467010498047, "learning_rate": 5.867517031453833e-07, "loss": 0.2695, "step": 8553 }, { "epoch": 0.41329661303570564, "grad_norm": 2.306029796600342, "learning_rate": 5.867033869642943e-07, "loss": 0.2835, "step": 8554 }, { "epoch": 0.4133449292167947, "grad_norm": 3.5721545219421387, "learning_rate": 5.866550707832053e-07, "loss": 0.3422, "step": 8555 }, { "epoch": 0.41339324539788375, "grad_norm": 7.470469951629639, "learning_rate": 5.866067546021162e-07, "loss": 0.2819, "step": 8556 }, { "epoch": 0.41344156157897277, "grad_norm": 2.416638135910034, "learning_rate": 5.865584384210272e-07, "loss": 0.3448, "step": 8557 }, { "epoch": 0.41348987776006185, "grad_norm": 2.51664400100708, "learning_rate": 5.865101222399382e-07, "loss": 0.2958, "step": 8558 }, { "epoch": 0.4135381939411509, "grad_norm": 4.612226486206055, "learning_rate": 5.864618060588491e-07, "loss": 0.2214, "step": 8559 }, { "epoch": 0.41358651012223996, "grad_norm": 2.108234405517578, "learning_rate": 5.8641348987776e-07, "loss": 0.2403, "step": 8560 }, { "epoch": 0.413634826303329, "grad_norm": 2.784313201904297, "learning_rate": 5.863651736966709e-07, "loss": 0.3631, "step": 8561 }, { "epoch": 0.413683142484418, "grad_norm": 2.6146979331970215, "learning_rate": 5.863168575155819e-07, "loss": 0.3314, "step": 8562 }, { "epoch": 0.4137314586655071, "grad_norm": 2.5419600009918213, "learning_rate": 5.862685413344929e-07, "loss": 0.2641, "step": 8563 }, { "epoch": 0.4137797748465961, "grad_norm": 2.6197524070739746, "learning_rate": 5.862202251534039e-07, "loss": 0.2507, "step": 8564 }, { "epoch": 0.4138280910276852, "grad_norm": 2.7356185913085938, "learning_rate": 5.861719089723148e-07, "loss": 0.4312, "step": 8565 }, { "epoch": 0.4138764072087742, "grad_norm": 3.8864188194274902, "learning_rate": 5.861235927912258e-07, "loss": 0.3031, "step": 8566 }, { "epoch": 0.41392472338986325, "grad_norm": 10.57024097442627, "learning_rate": 5.860752766101367e-07, "loss": 0.2963, "step": 8567 }, { "epoch": 0.41397303957095233, "grad_norm": 2.157421350479126, "learning_rate": 5.860269604290476e-07, "loss": 0.2791, "step": 8568 }, { "epoch": 0.41402135575204135, "grad_norm": 2.0906097888946533, "learning_rate": 5.859786442479586e-07, "loss": 0.2288, "step": 8569 }, { "epoch": 0.4140696719331304, "grad_norm": 2.5352611541748047, "learning_rate": 5.859303280668696e-07, "loss": 0.3018, "step": 8570 }, { "epoch": 0.41411798811421946, "grad_norm": 2.930142641067505, "learning_rate": 5.858820118857806e-07, "loss": 0.466, "step": 8571 }, { "epoch": 0.4141663042953085, "grad_norm": 2.1066091060638428, "learning_rate": 5.858336957046914e-07, "loss": 0.242, "step": 8572 }, { "epoch": 0.41421462047639757, "grad_norm": 2.469085693359375, "learning_rate": 5.857853795236024e-07, "loss": 0.2855, "step": 8573 }, { "epoch": 0.4142629366574866, "grad_norm": 3.7012901306152344, "learning_rate": 5.857370633425134e-07, "loss": 0.2036, "step": 8574 }, { "epoch": 0.4143112528385756, "grad_norm": 3.11470103263855, "learning_rate": 5.856887471614244e-07, "loss": 0.2309, "step": 8575 }, { "epoch": 0.4143595690196647, "grad_norm": 2.015420436859131, "learning_rate": 5.856404309803354e-07, "loss": 0.2372, "step": 8576 }, { "epoch": 0.4144078852007537, "grad_norm": 3.505208969116211, "learning_rate": 5.855921147992462e-07, "loss": 0.2396, "step": 8577 }, { "epoch": 0.4144562013818428, "grad_norm": 3.444547414779663, "learning_rate": 5.855437986181571e-07, "loss": 0.3672, "step": 8578 }, { "epoch": 0.41450451756293183, "grad_norm": 1.899103045463562, "learning_rate": 5.854954824370681e-07, "loss": 0.2063, "step": 8579 }, { "epoch": 0.41455283374402085, "grad_norm": 3.0949559211730957, "learning_rate": 5.854471662559791e-07, "loss": 0.3535, "step": 8580 }, { "epoch": 0.41460114992510994, "grad_norm": 5.331776142120361, "learning_rate": 5.853988500748901e-07, "loss": 0.3505, "step": 8581 }, { "epoch": 0.41464946610619896, "grad_norm": 3.2239997386932373, "learning_rate": 5.85350533893801e-07, "loss": 0.3912, "step": 8582 }, { "epoch": 0.414697782287288, "grad_norm": 2.361539363861084, "learning_rate": 5.85302217712712e-07, "loss": 0.267, "step": 8583 }, { "epoch": 0.41474609846837707, "grad_norm": 2.6430251598358154, "learning_rate": 5.852539015316229e-07, "loss": 0.3192, "step": 8584 }, { "epoch": 0.4147944146494661, "grad_norm": 3.208345890045166, "learning_rate": 5.852055853505338e-07, "loss": 0.4586, "step": 8585 }, { "epoch": 0.4148427308305552, "grad_norm": 3.185150384902954, "learning_rate": 5.851572691694448e-07, "loss": 0.3326, "step": 8586 }, { "epoch": 0.4148910470116442, "grad_norm": 2.698965311050415, "learning_rate": 5.851089529883557e-07, "loss": 0.2856, "step": 8587 }, { "epoch": 0.4149393631927332, "grad_norm": 2.7750120162963867, "learning_rate": 5.850606368072667e-07, "loss": 0.2604, "step": 8588 }, { "epoch": 0.4149876793738223, "grad_norm": 3.9207355976104736, "learning_rate": 5.850123206261777e-07, "loss": 0.2439, "step": 8589 }, { "epoch": 0.41503599555491133, "grad_norm": 2.208967924118042, "learning_rate": 5.849640044450887e-07, "loss": 0.2126, "step": 8590 }, { "epoch": 0.4150843117360004, "grad_norm": 2.123002529144287, "learning_rate": 5.849156882639996e-07, "loss": 0.2777, "step": 8591 }, { "epoch": 0.41513262791708944, "grad_norm": 3.2447330951690674, "learning_rate": 5.848673720829106e-07, "loss": 0.4676, "step": 8592 }, { "epoch": 0.41518094409817846, "grad_norm": 5.231003761291504, "learning_rate": 5.848190559018215e-07, "loss": 0.3027, "step": 8593 }, { "epoch": 0.41522926027926754, "grad_norm": 2.172429084777832, "learning_rate": 5.847707397207324e-07, "loss": 0.3231, "step": 8594 }, { "epoch": 0.41527757646035657, "grad_norm": 2.679793119430542, "learning_rate": 5.847224235396434e-07, "loss": 0.3132, "step": 8595 }, { "epoch": 0.4153258926414456, "grad_norm": 3.4530792236328125, "learning_rate": 5.846741073585544e-07, "loss": 0.3941, "step": 8596 }, { "epoch": 0.4153742088225347, "grad_norm": 3.0653903484344482, "learning_rate": 5.846257911774653e-07, "loss": 0.3528, "step": 8597 }, { "epoch": 0.4154225250036237, "grad_norm": 2.7258989810943604, "learning_rate": 5.845774749963762e-07, "loss": 0.2624, "step": 8598 }, { "epoch": 0.4154708411847128, "grad_norm": 1.7213801145553589, "learning_rate": 5.845291588152872e-07, "loss": 0.1791, "step": 8599 }, { "epoch": 0.4155191573658018, "grad_norm": 3.021097183227539, "learning_rate": 5.844808426341982e-07, "loss": 0.3351, "step": 8600 }, { "epoch": 0.41556747354689083, "grad_norm": 2.737351179122925, "learning_rate": 5.844325264531092e-07, "loss": 0.3538, "step": 8601 }, { "epoch": 0.4156157897279799, "grad_norm": 1.5530935525894165, "learning_rate": 5.843842102720202e-07, "loss": 0.1451, "step": 8602 }, { "epoch": 0.41566410590906894, "grad_norm": 3.049389123916626, "learning_rate": 5.843358940909309e-07, "loss": 0.3302, "step": 8603 }, { "epoch": 0.415712422090158, "grad_norm": 2.825317621231079, "learning_rate": 5.842875779098419e-07, "loss": 0.3054, "step": 8604 }, { "epoch": 0.41576073827124704, "grad_norm": 2.2678911685943604, "learning_rate": 5.842392617287529e-07, "loss": 0.2672, "step": 8605 }, { "epoch": 0.41580905445233607, "grad_norm": 3.8479537963867188, "learning_rate": 5.841909455476639e-07, "loss": 0.2397, "step": 8606 }, { "epoch": 0.41585737063342515, "grad_norm": 2.5022470951080322, "learning_rate": 5.841426293665749e-07, "loss": 0.3284, "step": 8607 }, { "epoch": 0.4159056868145142, "grad_norm": 3.7154083251953125, "learning_rate": 5.840943131854858e-07, "loss": 0.2564, "step": 8608 }, { "epoch": 0.4159540029956032, "grad_norm": 2.310688018798828, "learning_rate": 5.840459970043968e-07, "loss": 0.1862, "step": 8609 }, { "epoch": 0.4160023191766923, "grad_norm": 2.879823684692383, "learning_rate": 5.839976808233077e-07, "loss": 0.3309, "step": 8610 }, { "epoch": 0.4160506353577813, "grad_norm": 1.7953397035598755, "learning_rate": 5.839493646422186e-07, "loss": 0.2188, "step": 8611 }, { "epoch": 0.4160989515388704, "grad_norm": 2.1261274814605713, "learning_rate": 5.839010484611296e-07, "loss": 0.188, "step": 8612 }, { "epoch": 0.4161472677199594, "grad_norm": 2.3272035121917725, "learning_rate": 5.838527322800405e-07, "loss": 0.2243, "step": 8613 }, { "epoch": 0.41619558390104844, "grad_norm": 3.001232385635376, "learning_rate": 5.838044160989515e-07, "loss": 0.2773, "step": 8614 }, { "epoch": 0.4162439000821375, "grad_norm": 6.109416961669922, "learning_rate": 5.837560999178625e-07, "loss": 0.3056, "step": 8615 }, { "epoch": 0.41629221626322654, "grad_norm": 1.921182632446289, "learning_rate": 5.837077837367734e-07, "loss": 0.2165, "step": 8616 }, { "epoch": 0.4163405324443156, "grad_norm": 2.392219305038452, "learning_rate": 5.836594675556844e-07, "loss": 0.2726, "step": 8617 }, { "epoch": 0.41638884862540465, "grad_norm": 2.7867279052734375, "learning_rate": 5.836111513745954e-07, "loss": 0.4189, "step": 8618 }, { "epoch": 0.4164371648064937, "grad_norm": 14.995315551757812, "learning_rate": 5.835628351935062e-07, "loss": 0.3059, "step": 8619 }, { "epoch": 0.41648548098758276, "grad_norm": 2.978541374206543, "learning_rate": 5.835145190124172e-07, "loss": 0.2336, "step": 8620 }, { "epoch": 0.4165337971686718, "grad_norm": 6.627829074859619, "learning_rate": 5.834662028313282e-07, "loss": 0.2413, "step": 8621 }, { "epoch": 0.4165821133497608, "grad_norm": 4.225154876708984, "learning_rate": 5.834178866502392e-07, "loss": 0.2719, "step": 8622 }, { "epoch": 0.4166304295308499, "grad_norm": 3.570829391479492, "learning_rate": 5.833695704691501e-07, "loss": 0.2167, "step": 8623 }, { "epoch": 0.4166787457119389, "grad_norm": 3.8343398571014404, "learning_rate": 5.83321254288061e-07, "loss": 0.3193, "step": 8624 }, { "epoch": 0.416727061893028, "grad_norm": 2.27270245552063, "learning_rate": 5.83272938106972e-07, "loss": 0.2313, "step": 8625 }, { "epoch": 0.416775378074117, "grad_norm": 4.647250175476074, "learning_rate": 5.83224621925883e-07, "loss": 0.2552, "step": 8626 }, { "epoch": 0.41682369425520605, "grad_norm": 2.9050776958465576, "learning_rate": 5.83176305744794e-07, "loss": 0.173, "step": 8627 }, { "epoch": 0.4168720104362951, "grad_norm": 2.3086767196655273, "learning_rate": 5.831279895637049e-07, "loss": 0.2406, "step": 8628 }, { "epoch": 0.41692032661738415, "grad_norm": 2.5725300312042236, "learning_rate": 5.830796733826157e-07, "loss": 0.2898, "step": 8629 }, { "epoch": 0.41696864279847323, "grad_norm": 2.631216287612915, "learning_rate": 5.830313572015267e-07, "loss": 0.1843, "step": 8630 }, { "epoch": 0.41701695897956226, "grad_norm": 2.982107639312744, "learning_rate": 5.829830410204377e-07, "loss": 0.3985, "step": 8631 }, { "epoch": 0.4170652751606513, "grad_norm": 3.0246567726135254, "learning_rate": 5.829347248393487e-07, "loss": 0.3875, "step": 8632 }, { "epoch": 0.41711359134174036, "grad_norm": 1.4275380373001099, "learning_rate": 5.828864086582597e-07, "loss": 0.1381, "step": 8633 }, { "epoch": 0.4171619075228294, "grad_norm": 2.795305013656616, "learning_rate": 5.828380924771706e-07, "loss": 0.1518, "step": 8634 }, { "epoch": 0.4172102237039184, "grad_norm": 2.753552198410034, "learning_rate": 5.827897762960815e-07, "loss": 0.3134, "step": 8635 }, { "epoch": 0.4172585398850075, "grad_norm": 4.141808032989502, "learning_rate": 5.827414601149924e-07, "loss": 0.3926, "step": 8636 }, { "epoch": 0.4173068560660965, "grad_norm": 5.695283889770508, "learning_rate": 5.826931439339034e-07, "loss": 0.2794, "step": 8637 }, { "epoch": 0.4173551722471856, "grad_norm": 2.0012495517730713, "learning_rate": 5.826448277528144e-07, "loss": 0.1793, "step": 8638 }, { "epoch": 0.4174034884282746, "grad_norm": 2.8217175006866455, "learning_rate": 5.825965115717253e-07, "loss": 0.3953, "step": 8639 }, { "epoch": 0.41745180460936365, "grad_norm": 3.4130475521087646, "learning_rate": 5.825481953906363e-07, "loss": 0.355, "step": 8640 }, { "epoch": 0.41750012079045273, "grad_norm": 3.4876887798309326, "learning_rate": 5.824998792095473e-07, "loss": 0.4577, "step": 8641 }, { "epoch": 0.41754843697154176, "grad_norm": 2.8308231830596924, "learning_rate": 5.824515630284582e-07, "loss": 0.2496, "step": 8642 }, { "epoch": 0.41759675315263084, "grad_norm": 3.6694042682647705, "learning_rate": 5.824032468473692e-07, "loss": 0.227, "step": 8643 }, { "epoch": 0.41764506933371986, "grad_norm": 3.3578720092773438, "learning_rate": 5.823549306662802e-07, "loss": 0.3303, "step": 8644 }, { "epoch": 0.4176933855148089, "grad_norm": 2.052189826965332, "learning_rate": 5.82306614485191e-07, "loss": 0.1899, "step": 8645 }, { "epoch": 0.41774170169589797, "grad_norm": 26.991262435913086, "learning_rate": 5.82258298304102e-07, "loss": 0.5144, "step": 8646 }, { "epoch": 0.417790017876987, "grad_norm": 3.2567336559295654, "learning_rate": 5.82209982123013e-07, "loss": 0.3045, "step": 8647 }, { "epoch": 0.4178383340580761, "grad_norm": 3.0613858699798584, "learning_rate": 5.821616659419239e-07, "loss": 0.3473, "step": 8648 }, { "epoch": 0.4178866502391651, "grad_norm": 2.9637813568115234, "learning_rate": 5.821133497608349e-07, "loss": 0.4649, "step": 8649 }, { "epoch": 0.41793496642025413, "grad_norm": 22.388771057128906, "learning_rate": 5.820650335797458e-07, "loss": 0.3321, "step": 8650 }, { "epoch": 0.4179832826013432, "grad_norm": 2.3116259574890137, "learning_rate": 5.820167173986568e-07, "loss": 0.2615, "step": 8651 }, { "epoch": 0.41803159878243223, "grad_norm": 2.997529983520508, "learning_rate": 5.819684012175678e-07, "loss": 0.4373, "step": 8652 }, { "epoch": 0.41807991496352126, "grad_norm": 2.025761365890503, "learning_rate": 5.819200850364787e-07, "loss": 0.1912, "step": 8653 }, { "epoch": 0.41812823114461034, "grad_norm": 4.20930290222168, "learning_rate": 5.818717688553897e-07, "loss": 0.4227, "step": 8654 }, { "epoch": 0.41817654732569937, "grad_norm": 2.925114393234253, "learning_rate": 5.818234526743005e-07, "loss": 0.3402, "step": 8655 }, { "epoch": 0.41822486350678845, "grad_norm": 19.5037899017334, "learning_rate": 5.817751364932115e-07, "loss": 0.2535, "step": 8656 }, { "epoch": 0.41827317968787747, "grad_norm": 2.5448429584503174, "learning_rate": 5.817268203121225e-07, "loss": 0.3269, "step": 8657 }, { "epoch": 0.4183214958689665, "grad_norm": 3.6876914501190186, "learning_rate": 5.816785041310335e-07, "loss": 0.4446, "step": 8658 }, { "epoch": 0.4183698120500556, "grad_norm": 2.3029494285583496, "learning_rate": 5.816301879499445e-07, "loss": 0.2337, "step": 8659 }, { "epoch": 0.4184181282311446, "grad_norm": 2.3545889854431152, "learning_rate": 5.815818717688554e-07, "loss": 0.2421, "step": 8660 }, { "epoch": 0.4184664444122337, "grad_norm": 2.0919992923736572, "learning_rate": 5.815335555877662e-07, "loss": 0.2361, "step": 8661 }, { "epoch": 0.4185147605933227, "grad_norm": 2.823822259902954, "learning_rate": 5.814852394066772e-07, "loss": 0.2578, "step": 8662 }, { "epoch": 0.41856307677441174, "grad_norm": 2.4280781745910645, "learning_rate": 5.814369232255882e-07, "loss": 0.2674, "step": 8663 }, { "epoch": 0.4186113929555008, "grad_norm": 4.576448440551758, "learning_rate": 5.813886070444992e-07, "loss": 0.4065, "step": 8664 }, { "epoch": 0.41865970913658984, "grad_norm": 3.3481709957122803, "learning_rate": 5.813402908634101e-07, "loss": 0.3521, "step": 8665 }, { "epoch": 0.41870802531767887, "grad_norm": 3.6256110668182373, "learning_rate": 5.812919746823211e-07, "loss": 0.3232, "step": 8666 }, { "epoch": 0.41875634149876795, "grad_norm": 3.138221025466919, "learning_rate": 5.81243658501232e-07, "loss": 0.451, "step": 8667 }, { "epoch": 0.418804657679857, "grad_norm": 2.118429183959961, "learning_rate": 5.81195342320143e-07, "loss": 0.2645, "step": 8668 }, { "epoch": 0.41885297386094605, "grad_norm": 3.4356939792633057, "learning_rate": 5.81147026139054e-07, "loss": 0.3435, "step": 8669 }, { "epoch": 0.4189012900420351, "grad_norm": 4.701597690582275, "learning_rate": 5.810987099579649e-07, "loss": 0.3319, "step": 8670 }, { "epoch": 0.4189496062231241, "grad_norm": 3.0540201663970947, "learning_rate": 5.810503937768758e-07, "loss": 0.3822, "step": 8671 }, { "epoch": 0.4189979224042132, "grad_norm": 1.57706618309021, "learning_rate": 5.810020775957868e-07, "loss": 0.1649, "step": 8672 }, { "epoch": 0.4190462385853022, "grad_norm": 3.856379508972168, "learning_rate": 5.809537614146978e-07, "loss": 0.2926, "step": 8673 }, { "epoch": 0.4190945547663913, "grad_norm": 2.2389190196990967, "learning_rate": 5.809054452336087e-07, "loss": 0.2282, "step": 8674 }, { "epoch": 0.4191428709474803, "grad_norm": 2.7347910404205322, "learning_rate": 5.808571290525197e-07, "loss": 0.3075, "step": 8675 }, { "epoch": 0.41919118712856934, "grad_norm": 1.698242425918579, "learning_rate": 5.808088128714306e-07, "loss": 0.1917, "step": 8676 }, { "epoch": 0.4192395033096584, "grad_norm": 2.4996821880340576, "learning_rate": 5.807604966903416e-07, "loss": 0.3162, "step": 8677 }, { "epoch": 0.41928781949074745, "grad_norm": 2.27909779548645, "learning_rate": 5.807121805092525e-07, "loss": 0.3329, "step": 8678 }, { "epoch": 0.4193361356718365, "grad_norm": 2.564690589904785, "learning_rate": 5.806638643281635e-07, "loss": 0.2339, "step": 8679 }, { "epoch": 0.41938445185292555, "grad_norm": 2.780026912689209, "learning_rate": 5.806155481470744e-07, "loss": 0.2427, "step": 8680 }, { "epoch": 0.4194327680340146, "grad_norm": 11.986323356628418, "learning_rate": 5.805672319659853e-07, "loss": 0.2546, "step": 8681 }, { "epoch": 0.41948108421510366, "grad_norm": 3.232297658920288, "learning_rate": 5.805189157848963e-07, "loss": 0.3532, "step": 8682 }, { "epoch": 0.4195294003961927, "grad_norm": 3.069014072418213, "learning_rate": 5.804705996038073e-07, "loss": 0.2584, "step": 8683 }, { "epoch": 0.4195777165772817, "grad_norm": 2.7059333324432373, "learning_rate": 5.804222834227183e-07, "loss": 0.2768, "step": 8684 }, { "epoch": 0.4196260327583708, "grad_norm": 2.7668654918670654, "learning_rate": 5.803739672416293e-07, "loss": 0.2763, "step": 8685 }, { "epoch": 0.4196743489394598, "grad_norm": 2.364316463470459, "learning_rate": 5.8032565106054e-07, "loss": 0.3144, "step": 8686 }, { "epoch": 0.4197226651205489, "grad_norm": 1.9144655466079712, "learning_rate": 5.80277334879451e-07, "loss": 0.2511, "step": 8687 }, { "epoch": 0.4197709813016379, "grad_norm": 2.5458219051361084, "learning_rate": 5.80229018698362e-07, "loss": 0.2507, "step": 8688 }, { "epoch": 0.41981929748272695, "grad_norm": 2.261432647705078, "learning_rate": 5.80180702517273e-07, "loss": 0.2296, "step": 8689 }, { "epoch": 0.41986761366381603, "grad_norm": 2.866469383239746, "learning_rate": 5.80132386336184e-07, "loss": 0.3527, "step": 8690 }, { "epoch": 0.41991592984490506, "grad_norm": 4.8297576904296875, "learning_rate": 5.800840701550949e-07, "loss": 0.394, "step": 8691 }, { "epoch": 0.4199642460259941, "grad_norm": 6.587594509124756, "learning_rate": 5.800357539740059e-07, "loss": 0.4093, "step": 8692 }, { "epoch": 0.42001256220708316, "grad_norm": 8.735758781433105, "learning_rate": 5.799874377929168e-07, "loss": 0.3201, "step": 8693 }, { "epoch": 0.4200608783881722, "grad_norm": 3.7965946197509766, "learning_rate": 5.799391216118278e-07, "loss": 0.3958, "step": 8694 }, { "epoch": 0.42010919456926127, "grad_norm": 2.6242194175720215, "learning_rate": 5.798908054307387e-07, "loss": 0.2426, "step": 8695 }, { "epoch": 0.4201575107503503, "grad_norm": 4.37711763381958, "learning_rate": 5.798424892496497e-07, "loss": 0.4227, "step": 8696 }, { "epoch": 0.4202058269314393, "grad_norm": 2.0767438411712646, "learning_rate": 5.797941730685606e-07, "loss": 0.2093, "step": 8697 }, { "epoch": 0.4202541431125284, "grad_norm": 3.743302822113037, "learning_rate": 5.797458568874716e-07, "loss": 0.3908, "step": 8698 }, { "epoch": 0.4203024592936174, "grad_norm": 3.582005500793457, "learning_rate": 5.796975407063825e-07, "loss": 0.2103, "step": 8699 }, { "epoch": 0.4203507754747065, "grad_norm": 3.635014533996582, "learning_rate": 5.796492245252935e-07, "loss": 0.3876, "step": 8700 }, { "epoch": 0.42039909165579553, "grad_norm": 4.050076007843018, "learning_rate": 5.796009083442045e-07, "loss": 0.2703, "step": 8701 }, { "epoch": 0.42044740783688456, "grad_norm": 3.4294230937957764, "learning_rate": 5.795525921631154e-07, "loss": 0.2139, "step": 8702 }, { "epoch": 0.42049572401797364, "grad_norm": 2.981367349624634, "learning_rate": 5.795042759820264e-07, "loss": 0.3233, "step": 8703 }, { "epoch": 0.42054404019906266, "grad_norm": 2.9125044345855713, "learning_rate": 5.794559598009373e-07, "loss": 0.3921, "step": 8704 }, { "epoch": 0.4205923563801517, "grad_norm": 2.534240484237671, "learning_rate": 5.794076436198483e-07, "loss": 0.2489, "step": 8705 }, { "epoch": 0.42064067256124077, "grad_norm": 2.8279993534088135, "learning_rate": 5.793593274387592e-07, "loss": 0.3395, "step": 8706 }, { "epoch": 0.4206889887423298, "grad_norm": 2.1499738693237305, "learning_rate": 5.793110112576701e-07, "loss": 0.2291, "step": 8707 }, { "epoch": 0.4207373049234189, "grad_norm": 3.640073537826538, "learning_rate": 5.792626950765811e-07, "loss": 0.3024, "step": 8708 }, { "epoch": 0.4207856211045079, "grad_norm": 3.2589542865753174, "learning_rate": 5.792143788954921e-07, "loss": 0.3262, "step": 8709 }, { "epoch": 0.4208339372855969, "grad_norm": 2.2563652992248535, "learning_rate": 5.791660627144031e-07, "loss": 0.1772, "step": 8710 }, { "epoch": 0.420882253466686, "grad_norm": 2.4704079627990723, "learning_rate": 5.791177465333141e-07, "loss": 0.3076, "step": 8711 }, { "epoch": 0.42093056964777503, "grad_norm": 2.554481267929077, "learning_rate": 5.790694303522248e-07, "loss": 0.2566, "step": 8712 }, { "epoch": 0.4209788858288641, "grad_norm": 4.924106597900391, "learning_rate": 5.790211141711358e-07, "loss": 0.3183, "step": 8713 }, { "epoch": 0.42102720200995314, "grad_norm": 2.204158067703247, "learning_rate": 5.789727979900468e-07, "loss": 0.3028, "step": 8714 }, { "epoch": 0.42107551819104216, "grad_norm": 2.365658760070801, "learning_rate": 5.789244818089578e-07, "loss": 0.1901, "step": 8715 }, { "epoch": 0.42112383437213124, "grad_norm": 2.5313990116119385, "learning_rate": 5.788761656278688e-07, "loss": 0.2738, "step": 8716 }, { "epoch": 0.42117215055322027, "grad_norm": 2.528400182723999, "learning_rate": 5.788278494467797e-07, "loss": 0.3228, "step": 8717 }, { "epoch": 0.4212204667343093, "grad_norm": 3.0005762577056885, "learning_rate": 5.787795332656906e-07, "loss": 0.2929, "step": 8718 }, { "epoch": 0.4212687829153984, "grad_norm": 3.4157567024230957, "learning_rate": 5.787312170846016e-07, "loss": 0.3764, "step": 8719 }, { "epoch": 0.4213170990964874, "grad_norm": 2.043694257736206, "learning_rate": 5.786829009035126e-07, "loss": 0.1604, "step": 8720 }, { "epoch": 0.4213654152775765, "grad_norm": 3.794942855834961, "learning_rate": 5.786345847224235e-07, "loss": 0.3479, "step": 8721 }, { "epoch": 0.4214137314586655, "grad_norm": 2.985285997390747, "learning_rate": 5.785862685413345e-07, "loss": 0.2698, "step": 8722 }, { "epoch": 0.42146204763975453, "grad_norm": 4.398609638214111, "learning_rate": 5.785379523602454e-07, "loss": 0.3469, "step": 8723 }, { "epoch": 0.4215103638208436, "grad_norm": 1.878570556640625, "learning_rate": 5.784896361791564e-07, "loss": 0.1883, "step": 8724 }, { "epoch": 0.42155868000193264, "grad_norm": 1.822693109512329, "learning_rate": 5.784413199980673e-07, "loss": 0.2353, "step": 8725 }, { "epoch": 0.4216069961830217, "grad_norm": 4.069453239440918, "learning_rate": 5.783930038169783e-07, "loss": 0.3774, "step": 8726 }, { "epoch": 0.42165531236411075, "grad_norm": 3.0280230045318604, "learning_rate": 5.783446876358893e-07, "loss": 0.3896, "step": 8727 }, { "epoch": 0.42170362854519977, "grad_norm": 2.5484747886657715, "learning_rate": 5.782963714548002e-07, "loss": 0.3075, "step": 8728 }, { "epoch": 0.42175194472628885, "grad_norm": 2.8503408432006836, "learning_rate": 5.782480552737111e-07, "loss": 0.3693, "step": 8729 }, { "epoch": 0.4218002609073779, "grad_norm": 4.641184329986572, "learning_rate": 5.781997390926221e-07, "loss": 0.3476, "step": 8730 }, { "epoch": 0.4218485770884669, "grad_norm": 2.616250991821289, "learning_rate": 5.78151422911533e-07, "loss": 0.2974, "step": 8731 }, { "epoch": 0.421896893269556, "grad_norm": 3.0025599002838135, "learning_rate": 5.78103106730444e-07, "loss": 0.2818, "step": 8732 }, { "epoch": 0.421945209450645, "grad_norm": 2.536576509475708, "learning_rate": 5.780547905493549e-07, "loss": 0.2989, "step": 8733 }, { "epoch": 0.4219935256317341, "grad_norm": 2.7444496154785156, "learning_rate": 5.780064743682659e-07, "loss": 0.3079, "step": 8734 }, { "epoch": 0.4220418418128231, "grad_norm": 2.7578015327453613, "learning_rate": 5.779581581871769e-07, "loss": 0.3545, "step": 8735 }, { "epoch": 0.42209015799391214, "grad_norm": 3.2578558921813965, "learning_rate": 5.779098420060879e-07, "loss": 0.2537, "step": 8736 }, { "epoch": 0.4221384741750012, "grad_norm": 4.473811626434326, "learning_rate": 5.778615258249989e-07, "loss": 0.2276, "step": 8737 }, { "epoch": 0.42218679035609025, "grad_norm": 2.554719924926758, "learning_rate": 5.778132096439096e-07, "loss": 0.344, "step": 8738 }, { "epoch": 0.4222351065371793, "grad_norm": 2.908651113510132, "learning_rate": 5.777648934628206e-07, "loss": 0.3664, "step": 8739 }, { "epoch": 0.42228342271826835, "grad_norm": 2.092660665512085, "learning_rate": 5.777165772817316e-07, "loss": 0.2663, "step": 8740 }, { "epoch": 0.4223317388993574, "grad_norm": 4.835507392883301, "learning_rate": 5.776682611006426e-07, "loss": 0.2375, "step": 8741 }, { "epoch": 0.42238005508044646, "grad_norm": 3.323843240737915, "learning_rate": 5.776199449195536e-07, "loss": 0.2856, "step": 8742 }, { "epoch": 0.4224283712615355, "grad_norm": 2.3100056648254395, "learning_rate": 5.775716287384645e-07, "loss": 0.2168, "step": 8743 }, { "epoch": 0.4224766874426245, "grad_norm": 2.7413642406463623, "learning_rate": 5.775233125573754e-07, "loss": 0.3715, "step": 8744 }, { "epoch": 0.4225250036237136, "grad_norm": 3.5447804927825928, "learning_rate": 5.774749963762864e-07, "loss": 0.3004, "step": 8745 }, { "epoch": 0.4225733198048026, "grad_norm": 2.3950600624084473, "learning_rate": 5.774266801951973e-07, "loss": 0.3294, "step": 8746 }, { "epoch": 0.4226216359858917, "grad_norm": 2.7364461421966553, "learning_rate": 5.773783640141083e-07, "loss": 0.3473, "step": 8747 }, { "epoch": 0.4226699521669807, "grad_norm": 3.240246534347534, "learning_rate": 5.773300478330193e-07, "loss": 0.2884, "step": 8748 }, { "epoch": 0.42271826834806975, "grad_norm": 2.3371002674102783, "learning_rate": 5.772817316519302e-07, "loss": 0.1794, "step": 8749 }, { "epoch": 0.42276658452915883, "grad_norm": 12.276106834411621, "learning_rate": 5.772334154708411e-07, "loss": 0.4732, "step": 8750 }, { "epoch": 0.42281490071024785, "grad_norm": 11.848686218261719, "learning_rate": 5.771850992897521e-07, "loss": 0.2166, "step": 8751 }, { "epoch": 0.42286321689133693, "grad_norm": 2.2420034408569336, "learning_rate": 5.771367831086631e-07, "loss": 0.3231, "step": 8752 }, { "epoch": 0.42291153307242596, "grad_norm": 2.9947893619537354, "learning_rate": 5.770884669275741e-07, "loss": 0.2045, "step": 8753 }, { "epoch": 0.422959849253515, "grad_norm": 2.2413995265960693, "learning_rate": 5.77040150746485e-07, "loss": 0.1605, "step": 8754 }, { "epoch": 0.42300816543460407, "grad_norm": 2.08577823638916, "learning_rate": 5.769918345653959e-07, "loss": 0.2548, "step": 8755 }, { "epoch": 0.4230564816156931, "grad_norm": 2.7061877250671387, "learning_rate": 5.769435183843069e-07, "loss": 0.3243, "step": 8756 }, { "epoch": 0.4231047977967821, "grad_norm": 2.587599039077759, "learning_rate": 5.768952022032178e-07, "loss": 0.3512, "step": 8757 }, { "epoch": 0.4231531139778712, "grad_norm": 3.844569683074951, "learning_rate": 5.768468860221288e-07, "loss": 0.4263, "step": 8758 }, { "epoch": 0.4232014301589602, "grad_norm": 2.8777599334716797, "learning_rate": 5.767985698410397e-07, "loss": 0.2254, "step": 8759 }, { "epoch": 0.4232497463400493, "grad_norm": 2.810960531234741, "learning_rate": 5.767502536599507e-07, "loss": 0.3676, "step": 8760 }, { "epoch": 0.42329806252113833, "grad_norm": 2.190622329711914, "learning_rate": 5.767019374788617e-07, "loss": 0.2556, "step": 8761 }, { "epoch": 0.42334637870222736, "grad_norm": 2.5519678592681885, "learning_rate": 5.766536212977727e-07, "loss": 0.3355, "step": 8762 }, { "epoch": 0.42339469488331644, "grad_norm": 3.5864531993865967, "learning_rate": 5.766053051166835e-07, "loss": 0.3741, "step": 8763 }, { "epoch": 0.42344301106440546, "grad_norm": 2.7185468673706055, "learning_rate": 5.765569889355944e-07, "loss": 0.2586, "step": 8764 }, { "epoch": 0.42349132724549454, "grad_norm": 7.173250198364258, "learning_rate": 5.765086727545054e-07, "loss": 0.3262, "step": 8765 }, { "epoch": 0.42353964342658357, "grad_norm": 4.295485019683838, "learning_rate": 5.764603565734164e-07, "loss": 0.2757, "step": 8766 }, { "epoch": 0.4235879596076726, "grad_norm": 3.0313780307769775, "learning_rate": 5.764120403923274e-07, "loss": 0.3307, "step": 8767 }, { "epoch": 0.4236362757887617, "grad_norm": 3.024446725845337, "learning_rate": 5.763637242112384e-07, "loss": 0.278, "step": 8768 }, { "epoch": 0.4236845919698507, "grad_norm": 5.7260003089904785, "learning_rate": 5.763154080301493e-07, "loss": 0.2838, "step": 8769 }, { "epoch": 0.4237329081509397, "grad_norm": 9.647469520568848, "learning_rate": 5.762670918490602e-07, "loss": 0.4438, "step": 8770 }, { "epoch": 0.4237812243320288, "grad_norm": 4.133322238922119, "learning_rate": 5.762187756679711e-07, "loss": 0.2602, "step": 8771 }, { "epoch": 0.42382954051311783, "grad_norm": 2.2718398571014404, "learning_rate": 5.761704594868821e-07, "loss": 0.2676, "step": 8772 }, { "epoch": 0.4238778566942069, "grad_norm": 2.4702727794647217, "learning_rate": 5.761221433057931e-07, "loss": 0.2397, "step": 8773 }, { "epoch": 0.42392617287529594, "grad_norm": 3.558915376663208, "learning_rate": 5.760738271247041e-07, "loss": 0.3794, "step": 8774 }, { "epoch": 0.42397448905638496, "grad_norm": 6.536707401275635, "learning_rate": 5.76025510943615e-07, "loss": 0.4062, "step": 8775 }, { "epoch": 0.42402280523747404, "grad_norm": 4.333147048950195, "learning_rate": 5.759771947625259e-07, "loss": 0.445, "step": 8776 }, { "epoch": 0.42407112141856307, "grad_norm": 2.226886034011841, "learning_rate": 5.759288785814369e-07, "loss": 0.205, "step": 8777 }, { "epoch": 0.42411943759965215, "grad_norm": 2.2103147506713867, "learning_rate": 5.758805624003479e-07, "loss": 0.292, "step": 8778 }, { "epoch": 0.4241677537807412, "grad_norm": 2.5154688358306885, "learning_rate": 5.758322462192589e-07, "loss": 0.3375, "step": 8779 }, { "epoch": 0.4242160699618302, "grad_norm": 3.1378440856933594, "learning_rate": 5.757839300381697e-07, "loss": 0.2346, "step": 8780 }, { "epoch": 0.4242643861429193, "grad_norm": 2.2785439491271973, "learning_rate": 5.757356138570807e-07, "loss": 0.2236, "step": 8781 }, { "epoch": 0.4243127023240083, "grad_norm": 3.4590115547180176, "learning_rate": 5.756872976759917e-07, "loss": 0.4727, "step": 8782 }, { "epoch": 0.42436101850509733, "grad_norm": 6.129220008850098, "learning_rate": 5.756389814949026e-07, "loss": 0.3488, "step": 8783 }, { "epoch": 0.4244093346861864, "grad_norm": 3.324810028076172, "learning_rate": 5.755906653138136e-07, "loss": 0.2808, "step": 8784 }, { "epoch": 0.42445765086727544, "grad_norm": 1.8183071613311768, "learning_rate": 5.755423491327245e-07, "loss": 0.2359, "step": 8785 }, { "epoch": 0.4245059670483645, "grad_norm": 2.3628923892974854, "learning_rate": 5.754940329516355e-07, "loss": 0.2594, "step": 8786 }, { "epoch": 0.42455428322945354, "grad_norm": 2.3905513286590576, "learning_rate": 5.754457167705465e-07, "loss": 0.2177, "step": 8787 }, { "epoch": 0.42460259941054257, "grad_norm": 5.665628910064697, "learning_rate": 5.753974005894575e-07, "loss": 0.3284, "step": 8788 }, { "epoch": 0.42465091559163165, "grad_norm": 2.9639644622802734, "learning_rate": 5.753490844083683e-07, "loss": 0.2996, "step": 8789 }, { "epoch": 0.4246992317727207, "grad_norm": 2.7297439575195312, "learning_rate": 5.753007682272792e-07, "loss": 0.3044, "step": 8790 }, { "epoch": 0.42474754795380976, "grad_norm": 2.8007960319519043, "learning_rate": 5.752524520461902e-07, "loss": 0.4327, "step": 8791 }, { "epoch": 0.4247958641348988, "grad_norm": 2.715186357498169, "learning_rate": 5.752041358651012e-07, "loss": 0.2501, "step": 8792 }, { "epoch": 0.4248441803159878, "grad_norm": 12.700644493103027, "learning_rate": 5.751558196840122e-07, "loss": 0.26, "step": 8793 }, { "epoch": 0.4248924964970769, "grad_norm": 2.450131893157959, "learning_rate": 5.751075035029232e-07, "loss": 0.1831, "step": 8794 }, { "epoch": 0.4249408126781659, "grad_norm": 4.012856960296631, "learning_rate": 5.75059187321834e-07, "loss": 0.305, "step": 8795 }, { "epoch": 0.42498912885925494, "grad_norm": 5.947461128234863, "learning_rate": 5.75010871140745e-07, "loss": 0.3088, "step": 8796 }, { "epoch": 0.425037445040344, "grad_norm": 2.585784435272217, "learning_rate": 5.749625549596559e-07, "loss": 0.3228, "step": 8797 }, { "epoch": 0.42508576122143304, "grad_norm": 10.777337074279785, "learning_rate": 5.749142387785669e-07, "loss": 0.3176, "step": 8798 }, { "epoch": 0.4251340774025221, "grad_norm": 2.592618942260742, "learning_rate": 5.748659225974779e-07, "loss": 0.2892, "step": 8799 }, { "epoch": 0.42518239358361115, "grad_norm": 3.4321365356445312, "learning_rate": 5.748176064163888e-07, "loss": 0.4448, "step": 8800 }, { "epoch": 0.4252307097647002, "grad_norm": 3.182152271270752, "learning_rate": 5.747692902352998e-07, "loss": 0.3803, "step": 8801 }, { "epoch": 0.42527902594578926, "grad_norm": 2.6594862937927246, "learning_rate": 5.747209740542107e-07, "loss": 0.3252, "step": 8802 }, { "epoch": 0.4253273421268783, "grad_norm": 2.541818380355835, "learning_rate": 5.746726578731217e-07, "loss": 0.2481, "step": 8803 }, { "epoch": 0.42537565830796736, "grad_norm": 3.335963249206543, "learning_rate": 5.746243416920327e-07, "loss": 0.3201, "step": 8804 }, { "epoch": 0.4254239744890564, "grad_norm": 2.1820991039276123, "learning_rate": 5.745760255109436e-07, "loss": 0.2451, "step": 8805 }, { "epoch": 0.4254722906701454, "grad_norm": 3.5149080753326416, "learning_rate": 5.745277093298545e-07, "loss": 0.231, "step": 8806 }, { "epoch": 0.4255206068512345, "grad_norm": 2.006471633911133, "learning_rate": 5.744793931487655e-07, "loss": 0.2308, "step": 8807 }, { "epoch": 0.4255689230323235, "grad_norm": 8.001314163208008, "learning_rate": 5.744310769676764e-07, "loss": 0.2934, "step": 8808 }, { "epoch": 0.42561723921341255, "grad_norm": 3.335613965988159, "learning_rate": 5.743827607865874e-07, "loss": 0.3669, "step": 8809 }, { "epoch": 0.4256655553945016, "grad_norm": 2.0537784099578857, "learning_rate": 5.743344446054984e-07, "loss": 0.1375, "step": 8810 }, { "epoch": 0.42571387157559065, "grad_norm": 2.2438321113586426, "learning_rate": 5.742861284244093e-07, "loss": 0.2268, "step": 8811 }, { "epoch": 0.42576218775667973, "grad_norm": 2.517277479171753, "learning_rate": 5.742378122433203e-07, "loss": 0.2668, "step": 8812 }, { "epoch": 0.42581050393776876, "grad_norm": 4.320779323577881, "learning_rate": 5.741894960622313e-07, "loss": 0.3723, "step": 8813 }, { "epoch": 0.4258588201188578, "grad_norm": 2.34370493888855, "learning_rate": 5.741411798811422e-07, "loss": 0.2112, "step": 8814 }, { "epoch": 0.42590713629994686, "grad_norm": 2.837462902069092, "learning_rate": 5.740928637000531e-07, "loss": 0.3552, "step": 8815 }, { "epoch": 0.4259554524810359, "grad_norm": 1.7045148611068726, "learning_rate": 5.74044547518964e-07, "loss": 0.1596, "step": 8816 }, { "epoch": 0.42600376866212497, "grad_norm": 3.8096096515655518, "learning_rate": 5.73996231337875e-07, "loss": 0.272, "step": 8817 }, { "epoch": 0.426052084843214, "grad_norm": 2.8380794525146484, "learning_rate": 5.73947915156786e-07, "loss": 0.2461, "step": 8818 }, { "epoch": 0.426100401024303, "grad_norm": 36.67606735229492, "learning_rate": 5.73899598975697e-07, "loss": 0.3134, "step": 8819 }, { "epoch": 0.4261487172053921, "grad_norm": 2.8531172275543213, "learning_rate": 5.73851282794608e-07, "loss": 0.2373, "step": 8820 }, { "epoch": 0.42619703338648113, "grad_norm": 1.718673586845398, "learning_rate": 5.738029666135188e-07, "loss": 0.208, "step": 8821 }, { "epoch": 0.42624534956757015, "grad_norm": 2.497462034225464, "learning_rate": 5.737546504324297e-07, "loss": 0.2857, "step": 8822 }, { "epoch": 0.42629366574865923, "grad_norm": 2.656355381011963, "learning_rate": 5.737063342513407e-07, "loss": 0.221, "step": 8823 }, { "epoch": 0.42634198192974826, "grad_norm": 1.9297077655792236, "learning_rate": 5.736580180702517e-07, "loss": 0.2373, "step": 8824 }, { "epoch": 0.42639029811083734, "grad_norm": 2.3880183696746826, "learning_rate": 5.736097018891627e-07, "loss": 0.2643, "step": 8825 }, { "epoch": 0.42643861429192637, "grad_norm": 1.9798238277435303, "learning_rate": 5.735613857080736e-07, "loss": 0.1786, "step": 8826 }, { "epoch": 0.4264869304730154, "grad_norm": 4.295930862426758, "learning_rate": 5.735130695269845e-07, "loss": 0.4714, "step": 8827 }, { "epoch": 0.42653524665410447, "grad_norm": 2.270120620727539, "learning_rate": 5.734647533458955e-07, "loss": 0.2814, "step": 8828 }, { "epoch": 0.4265835628351935, "grad_norm": 2.0287928581237793, "learning_rate": 5.734164371648065e-07, "loss": 0.2342, "step": 8829 }, { "epoch": 0.4266318790162826, "grad_norm": 3.0027847290039062, "learning_rate": 5.733681209837175e-07, "loss": 0.4624, "step": 8830 }, { "epoch": 0.4266801951973716, "grad_norm": 3.3967127799987793, "learning_rate": 5.733198048026284e-07, "loss": 0.3589, "step": 8831 }, { "epoch": 0.42672851137846063, "grad_norm": 2.9546830654144287, "learning_rate": 5.732714886215393e-07, "loss": 0.2876, "step": 8832 }, { "epoch": 0.4267768275595497, "grad_norm": 2.8485476970672607, "learning_rate": 5.732231724404503e-07, "loss": 0.3572, "step": 8833 }, { "epoch": 0.42682514374063873, "grad_norm": 5.730047225952148, "learning_rate": 5.731748562593612e-07, "loss": 0.2864, "step": 8834 }, { "epoch": 0.42687345992172776, "grad_norm": 3.8035995960235596, "learning_rate": 5.731265400782722e-07, "loss": 0.4143, "step": 8835 }, { "epoch": 0.42692177610281684, "grad_norm": 20.179609298706055, "learning_rate": 5.730782238971832e-07, "loss": 0.3153, "step": 8836 }, { "epoch": 0.42697009228390587, "grad_norm": 2.360832691192627, "learning_rate": 5.730299077160941e-07, "loss": 0.2311, "step": 8837 }, { "epoch": 0.42701840846499495, "grad_norm": 2.179656505584717, "learning_rate": 5.729815915350051e-07, "loss": 0.1673, "step": 8838 }, { "epoch": 0.427066724646084, "grad_norm": 2.2107701301574707, "learning_rate": 5.72933275353916e-07, "loss": 0.2174, "step": 8839 }, { "epoch": 0.427115040827173, "grad_norm": 2.6284499168395996, "learning_rate": 5.728849591728269e-07, "loss": 0.3457, "step": 8840 }, { "epoch": 0.4271633570082621, "grad_norm": 2.218177080154419, "learning_rate": 5.728366429917379e-07, "loss": 0.2592, "step": 8841 }, { "epoch": 0.4272116731893511, "grad_norm": 1.9227288961410522, "learning_rate": 5.727883268106488e-07, "loss": 0.1921, "step": 8842 }, { "epoch": 0.4272599893704402, "grad_norm": 2.762657880783081, "learning_rate": 5.727400106295598e-07, "loss": 0.2319, "step": 8843 }, { "epoch": 0.4273083055515292, "grad_norm": 8.019600868225098, "learning_rate": 5.726916944484708e-07, "loss": 0.2832, "step": 8844 }, { "epoch": 0.42735662173261824, "grad_norm": 3.155632257461548, "learning_rate": 5.726433782673818e-07, "loss": 0.3957, "step": 8845 }, { "epoch": 0.4274049379137073, "grad_norm": 3.0142486095428467, "learning_rate": 5.725950620862928e-07, "loss": 0.3267, "step": 8846 }, { "epoch": 0.42745325409479634, "grad_norm": 4.092146873474121, "learning_rate": 5.725467459052035e-07, "loss": 0.3211, "step": 8847 }, { "epoch": 0.42750157027588537, "grad_norm": 1.8166542053222656, "learning_rate": 5.724984297241145e-07, "loss": 0.2029, "step": 8848 }, { "epoch": 0.42754988645697445, "grad_norm": 2.4614858627319336, "learning_rate": 5.724501135430255e-07, "loss": 0.2283, "step": 8849 }, { "epoch": 0.4275982026380635, "grad_norm": 3.322711706161499, "learning_rate": 5.724017973619365e-07, "loss": 0.3642, "step": 8850 }, { "epoch": 0.42764651881915255, "grad_norm": 7.782492637634277, "learning_rate": 5.723534811808475e-07, "loss": 0.2909, "step": 8851 }, { "epoch": 0.4276948350002416, "grad_norm": 2.518953800201416, "learning_rate": 5.723051649997584e-07, "loss": 0.3059, "step": 8852 }, { "epoch": 0.4277431511813306, "grad_norm": 3.1354501247406006, "learning_rate": 5.722568488186693e-07, "loss": 0.3203, "step": 8853 }, { "epoch": 0.4277914673624197, "grad_norm": 3.1522622108459473, "learning_rate": 5.722085326375803e-07, "loss": 0.4202, "step": 8854 }, { "epoch": 0.4278397835435087, "grad_norm": 2.129420757293701, "learning_rate": 5.721602164564913e-07, "loss": 0.1965, "step": 8855 }, { "epoch": 0.4278880997245978, "grad_norm": 2.7286951541900635, "learning_rate": 5.721119002754022e-07, "loss": 0.2571, "step": 8856 }, { "epoch": 0.4279364159056868, "grad_norm": 3.160757303237915, "learning_rate": 5.720635840943132e-07, "loss": 0.3517, "step": 8857 }, { "epoch": 0.42798473208677584, "grad_norm": 8.47627067565918, "learning_rate": 5.720152679132241e-07, "loss": 0.244, "step": 8858 }, { "epoch": 0.4280330482678649, "grad_norm": 1.9592124223709106, "learning_rate": 5.71966951732135e-07, "loss": 0.1782, "step": 8859 }, { "epoch": 0.42808136444895395, "grad_norm": 2.068565845489502, "learning_rate": 5.71918635551046e-07, "loss": 0.2213, "step": 8860 }, { "epoch": 0.428129680630043, "grad_norm": 2.632474184036255, "learning_rate": 5.71870319369957e-07, "loss": 0.2291, "step": 8861 }, { "epoch": 0.42817799681113206, "grad_norm": 4.3863205909729, "learning_rate": 5.71822003188868e-07, "loss": 0.2227, "step": 8862 }, { "epoch": 0.4282263129922211, "grad_norm": 4.141754150390625, "learning_rate": 5.717736870077789e-07, "loss": 0.2001, "step": 8863 }, { "epoch": 0.42827462917331016, "grad_norm": 3.6100361347198486, "learning_rate": 5.717253708266898e-07, "loss": 0.2686, "step": 8864 }, { "epoch": 0.4283229453543992, "grad_norm": 1.5825563669204712, "learning_rate": 5.716770546456008e-07, "loss": 0.1759, "step": 8865 }, { "epoch": 0.4283712615354882, "grad_norm": 2.540959596633911, "learning_rate": 5.716287384645117e-07, "loss": 0.25, "step": 8866 }, { "epoch": 0.4284195777165773, "grad_norm": 2.4013752937316895, "learning_rate": 5.715804222834227e-07, "loss": 0.2859, "step": 8867 }, { "epoch": 0.4284678938976663, "grad_norm": 2.6876399517059326, "learning_rate": 5.715321061023336e-07, "loss": 0.3359, "step": 8868 }, { "epoch": 0.4285162100787554, "grad_norm": 3.708451747894287, "learning_rate": 5.714837899212446e-07, "loss": 0.3932, "step": 8869 }, { "epoch": 0.4285645262598444, "grad_norm": 3.548128604888916, "learning_rate": 5.714354737401556e-07, "loss": 0.384, "step": 8870 }, { "epoch": 0.42861284244093345, "grad_norm": 4.883907794952393, "learning_rate": 5.713871575590666e-07, "loss": 0.3447, "step": 8871 }, { "epoch": 0.42866115862202253, "grad_norm": 5.251977443695068, "learning_rate": 5.713388413779775e-07, "loss": 0.3015, "step": 8872 }, { "epoch": 0.42870947480311156, "grad_norm": 1.9542787075042725, "learning_rate": 5.712905251968883e-07, "loss": 0.2077, "step": 8873 }, { "epoch": 0.4287577909842006, "grad_norm": 2.0558223724365234, "learning_rate": 5.712422090157993e-07, "loss": 0.2061, "step": 8874 }, { "epoch": 0.42880610716528966, "grad_norm": 2.969958543777466, "learning_rate": 5.711938928347103e-07, "loss": 0.3301, "step": 8875 }, { "epoch": 0.4288544233463787, "grad_norm": 6.457556247711182, "learning_rate": 5.711455766536213e-07, "loss": 0.3278, "step": 8876 }, { "epoch": 0.42890273952746777, "grad_norm": 3.0119664669036865, "learning_rate": 5.710972604725323e-07, "loss": 0.2895, "step": 8877 }, { "epoch": 0.4289510557085568, "grad_norm": 2.3945770263671875, "learning_rate": 5.710489442914431e-07, "loss": 0.3351, "step": 8878 }, { "epoch": 0.4289993718896458, "grad_norm": 3.8442015647888184, "learning_rate": 5.710006281103541e-07, "loss": 0.3636, "step": 8879 }, { "epoch": 0.4290476880707349, "grad_norm": 3.5857138633728027, "learning_rate": 5.709523119292651e-07, "loss": 0.3367, "step": 8880 }, { "epoch": 0.4290960042518239, "grad_norm": 2.7330753803253174, "learning_rate": 5.70903995748176e-07, "loss": 0.3198, "step": 8881 }, { "epoch": 0.429144320432913, "grad_norm": 3.0274558067321777, "learning_rate": 5.70855679567087e-07, "loss": 0.3179, "step": 8882 }, { "epoch": 0.42919263661400203, "grad_norm": 3.9143199920654297, "learning_rate": 5.70807363385998e-07, "loss": 0.3025, "step": 8883 }, { "epoch": 0.42924095279509106, "grad_norm": 4.4718756675720215, "learning_rate": 5.707590472049089e-07, "loss": 0.392, "step": 8884 }, { "epoch": 0.42928926897618014, "grad_norm": 7.527639865875244, "learning_rate": 5.707107310238198e-07, "loss": 0.2807, "step": 8885 }, { "epoch": 0.42933758515726916, "grad_norm": 2.658898115158081, "learning_rate": 5.706624148427308e-07, "loss": 0.3129, "step": 8886 }, { "epoch": 0.4293859013383582, "grad_norm": 4.34940242767334, "learning_rate": 5.706140986616418e-07, "loss": 0.2889, "step": 8887 }, { "epoch": 0.42943421751944727, "grad_norm": 1.9931567907333374, "learning_rate": 5.705657824805528e-07, "loss": 0.2097, "step": 8888 }, { "epoch": 0.4294825337005363, "grad_norm": 6.048797130584717, "learning_rate": 5.705174662994637e-07, "loss": 0.4638, "step": 8889 }, { "epoch": 0.4295308498816254, "grad_norm": 2.344106912612915, "learning_rate": 5.704691501183746e-07, "loss": 0.2463, "step": 8890 }, { "epoch": 0.4295791660627144, "grad_norm": 3.240511894226074, "learning_rate": 5.704208339372855e-07, "loss": 0.3265, "step": 8891 }, { "epoch": 0.4296274822438034, "grad_norm": 3.651822805404663, "learning_rate": 5.703725177561965e-07, "loss": 0.4488, "step": 8892 }, { "epoch": 0.4296757984248925, "grad_norm": 3.6621975898742676, "learning_rate": 5.703242015751075e-07, "loss": 0.446, "step": 8893 }, { "epoch": 0.42972411460598153, "grad_norm": 1.8147790431976318, "learning_rate": 5.702758853940184e-07, "loss": 0.2029, "step": 8894 }, { "epoch": 0.4297724307870706, "grad_norm": 2.845104217529297, "learning_rate": 5.702275692129294e-07, "loss": 0.2107, "step": 8895 }, { "epoch": 0.42982074696815964, "grad_norm": 2.661310911178589, "learning_rate": 5.701792530318404e-07, "loss": 0.2969, "step": 8896 }, { "epoch": 0.42986906314924866, "grad_norm": 2.139256000518799, "learning_rate": 5.701309368507514e-07, "loss": 0.209, "step": 8897 }, { "epoch": 0.42991737933033775, "grad_norm": 2.70351243019104, "learning_rate": 5.700826206696622e-07, "loss": 0.263, "step": 8898 }, { "epoch": 0.42996569551142677, "grad_norm": 2.5482187271118164, "learning_rate": 5.700343044885731e-07, "loss": 0.3206, "step": 8899 }, { "epoch": 0.4300140116925158, "grad_norm": 2.6818172931671143, "learning_rate": 5.699859883074841e-07, "loss": 0.3549, "step": 8900 }, { "epoch": 0.4300623278736049, "grad_norm": 4.285479545593262, "learning_rate": 5.699376721263951e-07, "loss": 0.3635, "step": 8901 }, { "epoch": 0.4301106440546939, "grad_norm": 1.9991892576217651, "learning_rate": 5.698893559453061e-07, "loss": 0.2027, "step": 8902 }, { "epoch": 0.430158960235783, "grad_norm": 4.422778606414795, "learning_rate": 5.698410397642171e-07, "loss": 0.5853, "step": 8903 }, { "epoch": 0.430207276416872, "grad_norm": 2.0669491291046143, "learning_rate": 5.697927235831279e-07, "loss": 0.1753, "step": 8904 }, { "epoch": 0.43025559259796103, "grad_norm": 2.881178140640259, "learning_rate": 5.697444074020389e-07, "loss": 0.3949, "step": 8905 }, { "epoch": 0.4303039087790501, "grad_norm": 8.579713821411133, "learning_rate": 5.696960912209498e-07, "loss": 0.2133, "step": 8906 }, { "epoch": 0.43035222496013914, "grad_norm": 4.018505573272705, "learning_rate": 5.696477750398608e-07, "loss": 0.3422, "step": 8907 }, { "epoch": 0.4304005411412282, "grad_norm": 2.6446456909179688, "learning_rate": 5.695994588587718e-07, "loss": 0.3492, "step": 8908 }, { "epoch": 0.43044885732231725, "grad_norm": 3.3207948207855225, "learning_rate": 5.695511426776828e-07, "loss": 0.281, "step": 8909 }, { "epoch": 0.43049717350340627, "grad_norm": 3.7105748653411865, "learning_rate": 5.695028264965936e-07, "loss": 0.2733, "step": 8910 }, { "epoch": 0.43054548968449535, "grad_norm": 3.286677122116089, "learning_rate": 5.694545103155046e-07, "loss": 0.2974, "step": 8911 }, { "epoch": 0.4305938058655844, "grad_norm": 2.0386998653411865, "learning_rate": 5.694061941344156e-07, "loss": 0.2514, "step": 8912 }, { "epoch": 0.4306421220466734, "grad_norm": 5.192869186401367, "learning_rate": 5.693578779533266e-07, "loss": 0.3461, "step": 8913 }, { "epoch": 0.4306904382277625, "grad_norm": 3.397195816040039, "learning_rate": 5.693095617722376e-07, "loss": 0.2439, "step": 8914 }, { "epoch": 0.4307387544088515, "grad_norm": 3.058009147644043, "learning_rate": 5.692612455911484e-07, "loss": 0.3727, "step": 8915 }, { "epoch": 0.4307870705899406, "grad_norm": 1.772820234298706, "learning_rate": 5.692129294100594e-07, "loss": 0.1873, "step": 8916 }, { "epoch": 0.4308353867710296, "grad_norm": 18.76137924194336, "learning_rate": 5.691646132289703e-07, "loss": 0.3562, "step": 8917 }, { "epoch": 0.43088370295211864, "grad_norm": 2.6839122772216797, "learning_rate": 5.691162970478813e-07, "loss": 0.3655, "step": 8918 }, { "epoch": 0.4309320191332077, "grad_norm": 2.6911685466766357, "learning_rate": 5.690679808667923e-07, "loss": 0.2942, "step": 8919 }, { "epoch": 0.43098033531429675, "grad_norm": 3.27824068069458, "learning_rate": 5.690196646857032e-07, "loss": 0.2967, "step": 8920 }, { "epoch": 0.43102865149538583, "grad_norm": 2.220722198486328, "learning_rate": 5.689713485046142e-07, "loss": 0.3573, "step": 8921 }, { "epoch": 0.43107696767647485, "grad_norm": 2.7449982166290283, "learning_rate": 5.689230323235252e-07, "loss": 0.3011, "step": 8922 }, { "epoch": 0.4311252838575639, "grad_norm": 3.716247081756592, "learning_rate": 5.68874716142436e-07, "loss": 0.2107, "step": 8923 }, { "epoch": 0.43117360003865296, "grad_norm": 3.191978693008423, "learning_rate": 5.68826399961347e-07, "loss": 0.3365, "step": 8924 }, { "epoch": 0.431221916219742, "grad_norm": 2.5822536945343018, "learning_rate": 5.687780837802579e-07, "loss": 0.2499, "step": 8925 }, { "epoch": 0.431270232400831, "grad_norm": 2.7512881755828857, "learning_rate": 5.687297675991689e-07, "loss": 0.2718, "step": 8926 }, { "epoch": 0.4313185485819201, "grad_norm": 2.2879021167755127, "learning_rate": 5.686814514180799e-07, "loss": 0.2631, "step": 8927 }, { "epoch": 0.4313668647630091, "grad_norm": 2.60207462310791, "learning_rate": 5.686331352369909e-07, "loss": 0.2583, "step": 8928 }, { "epoch": 0.4314151809440982, "grad_norm": 2.9572770595550537, "learning_rate": 5.685848190559019e-07, "loss": 0.2919, "step": 8929 }, { "epoch": 0.4314634971251872, "grad_norm": 2.2341666221618652, "learning_rate": 5.685365028748127e-07, "loss": 0.2323, "step": 8930 }, { "epoch": 0.43151181330627625, "grad_norm": 2.6257741451263428, "learning_rate": 5.684881866937237e-07, "loss": 0.2887, "step": 8931 }, { "epoch": 0.43156012948736533, "grad_norm": 2.1135332584381104, "learning_rate": 5.684398705126346e-07, "loss": 0.1962, "step": 8932 }, { "epoch": 0.43160844566845435, "grad_norm": 2.2668278217315674, "learning_rate": 5.683915543315456e-07, "loss": 0.2598, "step": 8933 }, { "epoch": 0.43165676184954344, "grad_norm": 3.8213539123535156, "learning_rate": 5.683432381504566e-07, "loss": 0.2339, "step": 8934 }, { "epoch": 0.43170507803063246, "grad_norm": 2.568700075149536, "learning_rate": 5.682949219693676e-07, "loss": 0.2669, "step": 8935 }, { "epoch": 0.4317533942117215, "grad_norm": 2.104548215866089, "learning_rate": 5.682466057882784e-07, "loss": 0.1779, "step": 8936 }, { "epoch": 0.43180171039281057, "grad_norm": 4.253304958343506, "learning_rate": 5.681982896071894e-07, "loss": 0.2665, "step": 8937 }, { "epoch": 0.4318500265738996, "grad_norm": 6.810206890106201, "learning_rate": 5.681499734261004e-07, "loss": 0.174, "step": 8938 }, { "epoch": 0.4318983427549887, "grad_norm": 2.1090495586395264, "learning_rate": 5.681016572450114e-07, "loss": 0.2231, "step": 8939 }, { "epoch": 0.4319466589360777, "grad_norm": 2.992746591567993, "learning_rate": 5.680533410639224e-07, "loss": 0.3157, "step": 8940 }, { "epoch": 0.4319949751171667, "grad_norm": 1.9523078203201294, "learning_rate": 5.680050248828332e-07, "loss": 0.181, "step": 8941 }, { "epoch": 0.4320432912982558, "grad_norm": 2.087961435317993, "learning_rate": 5.679567087017441e-07, "loss": 0.193, "step": 8942 }, { "epoch": 0.43209160747934483, "grad_norm": 2.2153115272521973, "learning_rate": 5.679083925206551e-07, "loss": 0.238, "step": 8943 }, { "epoch": 0.43213992366043386, "grad_norm": 7.050143718719482, "learning_rate": 5.678600763395661e-07, "loss": 0.3732, "step": 8944 }, { "epoch": 0.43218823984152294, "grad_norm": 2.5108702182769775, "learning_rate": 5.678117601584771e-07, "loss": 0.26, "step": 8945 }, { "epoch": 0.43223655602261196, "grad_norm": 3.1418166160583496, "learning_rate": 5.67763443977388e-07, "loss": 0.3749, "step": 8946 }, { "epoch": 0.43228487220370104, "grad_norm": 3.2208807468414307, "learning_rate": 5.67715127796299e-07, "loss": 0.4001, "step": 8947 }, { "epoch": 0.43233318838479007, "grad_norm": 2.4118824005126953, "learning_rate": 5.6766681161521e-07, "loss": 0.1978, "step": 8948 }, { "epoch": 0.4323815045658791, "grad_norm": 2.086742877960205, "learning_rate": 5.676184954341208e-07, "loss": 0.218, "step": 8949 }, { "epoch": 0.4324298207469682, "grad_norm": 2.321045398712158, "learning_rate": 5.675701792530318e-07, "loss": 0.2781, "step": 8950 }, { "epoch": 0.4324781369280572, "grad_norm": 2.5607333183288574, "learning_rate": 5.675218630719427e-07, "loss": 0.2161, "step": 8951 }, { "epoch": 0.4325264531091463, "grad_norm": 4.962024688720703, "learning_rate": 5.674735468908537e-07, "loss": 0.47, "step": 8952 }, { "epoch": 0.4325747692902353, "grad_norm": 2.3096976280212402, "learning_rate": 5.674252307097647e-07, "loss": 0.2587, "step": 8953 }, { "epoch": 0.43262308547132433, "grad_norm": 2.2469241619110107, "learning_rate": 5.673769145286757e-07, "loss": 0.277, "step": 8954 }, { "epoch": 0.4326714016524134, "grad_norm": 2.9778685569763184, "learning_rate": 5.673285983475866e-07, "loss": 0.3346, "step": 8955 }, { "epoch": 0.43271971783350244, "grad_norm": 5.390810966491699, "learning_rate": 5.672802821664975e-07, "loss": 0.3962, "step": 8956 }, { "epoch": 0.43276803401459146, "grad_norm": 8.514925003051758, "learning_rate": 5.672319659854084e-07, "loss": 0.2709, "step": 8957 }, { "epoch": 0.43281635019568054, "grad_norm": 3.532992124557495, "learning_rate": 5.671836498043194e-07, "loss": 0.3511, "step": 8958 }, { "epoch": 0.43286466637676957, "grad_norm": 2.1527254581451416, "learning_rate": 5.671353336232304e-07, "loss": 0.2715, "step": 8959 }, { "epoch": 0.43291298255785865, "grad_norm": 2.2788426876068115, "learning_rate": 5.670870174421414e-07, "loss": 0.2673, "step": 8960 }, { "epoch": 0.4329612987389477, "grad_norm": 3.6653363704681396, "learning_rate": 5.670387012610524e-07, "loss": 0.3527, "step": 8961 }, { "epoch": 0.4330096149200367, "grad_norm": 3.092395782470703, "learning_rate": 5.669903850799632e-07, "loss": 0.4033, "step": 8962 }, { "epoch": 0.4330579311011258, "grad_norm": 1.8303718566894531, "learning_rate": 5.669420688988742e-07, "loss": 0.2716, "step": 8963 }, { "epoch": 0.4331062472822148, "grad_norm": 2.3978524208068848, "learning_rate": 5.668937527177852e-07, "loss": 0.2894, "step": 8964 }, { "epoch": 0.4331545634633039, "grad_norm": 1.9404845237731934, "learning_rate": 5.668454365366962e-07, "loss": 0.2427, "step": 8965 }, { "epoch": 0.4332028796443929, "grad_norm": 3.1580634117126465, "learning_rate": 5.667971203556071e-07, "loss": 0.412, "step": 8966 }, { "epoch": 0.43325119582548194, "grad_norm": 3.27276349067688, "learning_rate": 5.66748804174518e-07, "loss": 0.4164, "step": 8967 }, { "epoch": 0.433299512006571, "grad_norm": 1.8959362506866455, "learning_rate": 5.667004879934289e-07, "loss": 0.2038, "step": 8968 }, { "epoch": 0.43334782818766004, "grad_norm": 3.1673290729522705, "learning_rate": 5.666521718123399e-07, "loss": 0.196, "step": 8969 }, { "epoch": 0.43339614436874907, "grad_norm": 3.199089765548706, "learning_rate": 5.666038556312509e-07, "loss": 0.341, "step": 8970 }, { "epoch": 0.43344446054983815, "grad_norm": 2.0653324127197266, "learning_rate": 5.665555394501619e-07, "loss": 0.2787, "step": 8971 }, { "epoch": 0.4334927767309272, "grad_norm": 2.212825298309326, "learning_rate": 5.665072232690728e-07, "loss": 0.2394, "step": 8972 }, { "epoch": 0.43354109291201626, "grad_norm": 2.5167696475982666, "learning_rate": 5.664589070879838e-07, "loss": 0.3115, "step": 8973 }, { "epoch": 0.4335894090931053, "grad_norm": 97.12926483154297, "learning_rate": 5.664105909068946e-07, "loss": 0.2375, "step": 8974 }, { "epoch": 0.4336377252741943, "grad_norm": 3.288865804672241, "learning_rate": 5.663622747258056e-07, "loss": 0.3006, "step": 8975 }, { "epoch": 0.4336860414552834, "grad_norm": 3.787230968475342, "learning_rate": 5.663139585447166e-07, "loss": 0.3047, "step": 8976 }, { "epoch": 0.4337343576363724, "grad_norm": 2.9911367893218994, "learning_rate": 5.662656423636275e-07, "loss": 0.2893, "step": 8977 }, { "epoch": 0.4337826738174615, "grad_norm": 3.026488780975342, "learning_rate": 5.662173261825385e-07, "loss": 0.3815, "step": 8978 }, { "epoch": 0.4338309899985505, "grad_norm": 8.041365623474121, "learning_rate": 5.661690100014495e-07, "loss": 0.3407, "step": 8979 }, { "epoch": 0.43387930617963955, "grad_norm": 3.536186933517456, "learning_rate": 5.661206938203605e-07, "loss": 0.3054, "step": 8980 }, { "epoch": 0.4339276223607286, "grad_norm": 2.4837417602539062, "learning_rate": 5.660723776392714e-07, "loss": 0.297, "step": 8981 }, { "epoch": 0.43397593854181765, "grad_norm": 3.1930832862854004, "learning_rate": 5.660240614581822e-07, "loss": 0.3767, "step": 8982 }, { "epoch": 0.4340242547229067, "grad_norm": 47.767059326171875, "learning_rate": 5.659757452770932e-07, "loss": 0.4206, "step": 8983 }, { "epoch": 0.43407257090399576, "grad_norm": 2.573641777038574, "learning_rate": 5.659274290960042e-07, "loss": 0.3152, "step": 8984 }, { "epoch": 0.4341208870850848, "grad_norm": 3.529386281967163, "learning_rate": 5.658791129149152e-07, "loss": 0.2984, "step": 8985 }, { "epoch": 0.43416920326617386, "grad_norm": 2.8619866371154785, "learning_rate": 5.658307967338262e-07, "loss": 0.3059, "step": 8986 }, { "epoch": 0.4342175194472629, "grad_norm": 1.9279158115386963, "learning_rate": 5.657824805527371e-07, "loss": 0.2167, "step": 8987 }, { "epoch": 0.4342658356283519, "grad_norm": 4.026230335235596, "learning_rate": 5.65734164371648e-07, "loss": 0.3883, "step": 8988 }, { "epoch": 0.434314151809441, "grad_norm": 13.2996244430542, "learning_rate": 5.65685848190559e-07, "loss": 0.407, "step": 8989 }, { "epoch": 0.43436246799053, "grad_norm": 2.004364490509033, "learning_rate": 5.6563753200947e-07, "loss": 0.2357, "step": 8990 }, { "epoch": 0.4344107841716191, "grad_norm": 2.188873767852783, "learning_rate": 5.655892158283809e-07, "loss": 0.1857, "step": 8991 }, { "epoch": 0.4344591003527081, "grad_norm": 2.3841817378997803, "learning_rate": 5.655408996472919e-07, "loss": 0.2332, "step": 8992 }, { "epoch": 0.43450741653379715, "grad_norm": 2.7343218326568604, "learning_rate": 5.654925834662027e-07, "loss": 0.3279, "step": 8993 }, { "epoch": 0.43455573271488623, "grad_norm": 1.9555292129516602, "learning_rate": 5.654442672851137e-07, "loss": 0.2939, "step": 8994 }, { "epoch": 0.43460404889597526, "grad_norm": 74.33466339111328, "learning_rate": 5.653959511040247e-07, "loss": 0.2682, "step": 8995 }, { "epoch": 0.4346523650770643, "grad_norm": 3.7599661350250244, "learning_rate": 5.653476349229357e-07, "loss": 0.5532, "step": 8996 }, { "epoch": 0.43470068125815337, "grad_norm": 2.8801698684692383, "learning_rate": 5.652993187418467e-07, "loss": 0.4054, "step": 8997 }, { "epoch": 0.4347489974392424, "grad_norm": 3.093595266342163, "learning_rate": 5.652510025607576e-07, "loss": 0.3388, "step": 8998 }, { "epoch": 0.43479731362033147, "grad_norm": 1.788069248199463, "learning_rate": 5.652026863796686e-07, "loss": 0.1988, "step": 8999 }, { "epoch": 0.4348456298014205, "grad_norm": 3.594055652618408, "learning_rate": 5.651543701985794e-07, "loss": 0.3683, "step": 9000 }, { "epoch": 0.4348939459825095, "grad_norm": 1.9984078407287598, "learning_rate": 5.651060540174904e-07, "loss": 0.2689, "step": 9001 }, { "epoch": 0.4349422621635986, "grad_norm": 3.642929792404175, "learning_rate": 5.650577378364014e-07, "loss": 0.2998, "step": 9002 }, { "epoch": 0.43499057834468763, "grad_norm": 2.4657487869262695, "learning_rate": 5.650094216553123e-07, "loss": 0.3328, "step": 9003 }, { "epoch": 0.4350388945257767, "grad_norm": 2.6932461261749268, "learning_rate": 5.649611054742233e-07, "loss": 0.303, "step": 9004 }, { "epoch": 0.43508721070686573, "grad_norm": 2.06418514251709, "learning_rate": 5.649127892931343e-07, "loss": 0.182, "step": 9005 }, { "epoch": 0.43513552688795476, "grad_norm": 4.872869968414307, "learning_rate": 5.648644731120452e-07, "loss": 0.2531, "step": 9006 }, { "epoch": 0.43518384306904384, "grad_norm": 2.75408935546875, "learning_rate": 5.648161569309562e-07, "loss": 0.3437, "step": 9007 }, { "epoch": 0.43523215925013287, "grad_norm": 2.418513059616089, "learning_rate": 5.64767840749867e-07, "loss": 0.2533, "step": 9008 }, { "epoch": 0.4352804754312219, "grad_norm": 1.9232237339019775, "learning_rate": 5.64719524568778e-07, "loss": 0.2111, "step": 9009 }, { "epoch": 0.435328791612311, "grad_norm": 3.3000664710998535, "learning_rate": 5.64671208387689e-07, "loss": 0.395, "step": 9010 }, { "epoch": 0.4353771077934, "grad_norm": 3.7847087383270264, "learning_rate": 5.646228922066e-07, "loss": 0.3641, "step": 9011 }, { "epoch": 0.4354254239744891, "grad_norm": 2.161480188369751, "learning_rate": 5.64574576025511e-07, "loss": 0.2918, "step": 9012 }, { "epoch": 0.4354737401555781, "grad_norm": 4.616022109985352, "learning_rate": 5.645262598444219e-07, "loss": 0.2517, "step": 9013 }, { "epoch": 0.43552205633666713, "grad_norm": 2.0918595790863037, "learning_rate": 5.644779436633328e-07, "loss": 0.205, "step": 9014 }, { "epoch": 0.4355703725177562, "grad_norm": 12.452552795410156, "learning_rate": 5.644296274822438e-07, "loss": 0.2812, "step": 9015 }, { "epoch": 0.43561868869884524, "grad_norm": 2.7219581604003906, "learning_rate": 5.643813113011547e-07, "loss": 0.3486, "step": 9016 }, { "epoch": 0.4356670048799343, "grad_norm": 2.6100821495056152, "learning_rate": 5.643329951200657e-07, "loss": 0.2872, "step": 9017 }, { "epoch": 0.43571532106102334, "grad_norm": 1.5514967441558838, "learning_rate": 5.642846789389767e-07, "loss": 0.1495, "step": 9018 }, { "epoch": 0.43576363724211237, "grad_norm": 2.3863494396209717, "learning_rate": 5.642363627578875e-07, "loss": 0.1527, "step": 9019 }, { "epoch": 0.43581195342320145, "grad_norm": 2.3212432861328125, "learning_rate": 5.641880465767985e-07, "loss": 0.2587, "step": 9020 }, { "epoch": 0.4358602696042905, "grad_norm": 2.7904279232025146, "learning_rate": 5.641397303957095e-07, "loss": 0.3809, "step": 9021 }, { "epoch": 0.4359085857853795, "grad_norm": 6.646315574645996, "learning_rate": 5.640914142146205e-07, "loss": 0.3145, "step": 9022 }, { "epoch": 0.4359569019664686, "grad_norm": 2.740692138671875, "learning_rate": 5.640430980335315e-07, "loss": 0.2969, "step": 9023 }, { "epoch": 0.4360052181475576, "grad_norm": 5.011521816253662, "learning_rate": 5.639947818524424e-07, "loss": 0.496, "step": 9024 }, { "epoch": 0.4360535343286467, "grad_norm": 2.8140878677368164, "learning_rate": 5.639464656713532e-07, "loss": 0.3667, "step": 9025 }, { "epoch": 0.4361018505097357, "grad_norm": 2.7879586219787598, "learning_rate": 5.638981494902642e-07, "loss": 0.1888, "step": 9026 }, { "epoch": 0.43615016669082474, "grad_norm": 2.554525375366211, "learning_rate": 5.638498333091752e-07, "loss": 0.2417, "step": 9027 }, { "epoch": 0.4361984828719138, "grad_norm": 6.851510047912598, "learning_rate": 5.638015171280862e-07, "loss": 0.2587, "step": 9028 }, { "epoch": 0.43624679905300284, "grad_norm": 3.058878183364868, "learning_rate": 5.637532009469971e-07, "loss": 0.3114, "step": 9029 }, { "epoch": 0.4362951152340919, "grad_norm": 3.0454890727996826, "learning_rate": 5.637048847659081e-07, "loss": 0.4004, "step": 9030 }, { "epoch": 0.43634343141518095, "grad_norm": 17.52300262451172, "learning_rate": 5.636565685848191e-07, "loss": 0.3313, "step": 9031 }, { "epoch": 0.43639174759627, "grad_norm": 2.019599199295044, "learning_rate": 5.6360825240373e-07, "loss": 0.2093, "step": 9032 }, { "epoch": 0.43644006377735906, "grad_norm": 2.405832052230835, "learning_rate": 5.63559936222641e-07, "loss": 0.3083, "step": 9033 }, { "epoch": 0.4364883799584481, "grad_norm": 2.660266876220703, "learning_rate": 5.635116200415518e-07, "loss": 0.193, "step": 9034 }, { "epoch": 0.4365366961395371, "grad_norm": 3.3952126502990723, "learning_rate": 5.634633038604628e-07, "loss": 0.3804, "step": 9035 }, { "epoch": 0.4365850123206262, "grad_norm": 5.084710597991943, "learning_rate": 5.634149876793738e-07, "loss": 0.4194, "step": 9036 }, { "epoch": 0.4366333285017152, "grad_norm": 2.569904327392578, "learning_rate": 5.633666714982848e-07, "loss": 0.2362, "step": 9037 }, { "epoch": 0.4366816446828043, "grad_norm": 4.703449249267578, "learning_rate": 5.633183553171957e-07, "loss": 0.4938, "step": 9038 }, { "epoch": 0.4367299608638933, "grad_norm": 2.9931790828704834, "learning_rate": 5.632700391361067e-07, "loss": 0.2719, "step": 9039 }, { "epoch": 0.43677827704498234, "grad_norm": 2.566681146621704, "learning_rate": 5.632217229550176e-07, "loss": 0.2726, "step": 9040 }, { "epoch": 0.4368265932260714, "grad_norm": 3.0834710597991943, "learning_rate": 5.631734067739286e-07, "loss": 0.3402, "step": 9041 }, { "epoch": 0.43687490940716045, "grad_norm": 3.055149555206299, "learning_rate": 5.631250905928395e-07, "loss": 0.3698, "step": 9042 }, { "epoch": 0.43692322558824953, "grad_norm": 3.5132267475128174, "learning_rate": 5.630767744117505e-07, "loss": 0.3022, "step": 9043 }, { "epoch": 0.43697154176933856, "grad_norm": 2.733999490737915, "learning_rate": 5.630284582306615e-07, "loss": 0.3956, "step": 9044 }, { "epoch": 0.4370198579504276, "grad_norm": 2.469066858291626, "learning_rate": 5.629801420495723e-07, "loss": 0.2703, "step": 9045 }, { "epoch": 0.43706817413151666, "grad_norm": 2.0839922428131104, "learning_rate": 5.629318258684833e-07, "loss": 0.2599, "step": 9046 }, { "epoch": 0.4371164903126057, "grad_norm": 2.3898468017578125, "learning_rate": 5.628835096873943e-07, "loss": 0.3158, "step": 9047 }, { "epoch": 0.4371648064936947, "grad_norm": 5.252500534057617, "learning_rate": 5.628351935063053e-07, "loss": 0.3425, "step": 9048 }, { "epoch": 0.4372131226747838, "grad_norm": 2.6599667072296143, "learning_rate": 5.627868773252163e-07, "loss": 0.2775, "step": 9049 }, { "epoch": 0.4372614388558728, "grad_norm": 2.7042267322540283, "learning_rate": 5.627385611441271e-07, "loss": 0.3022, "step": 9050 }, { "epoch": 0.4373097550369619, "grad_norm": 2.7353127002716064, "learning_rate": 5.62690244963038e-07, "loss": 0.2277, "step": 9051 }, { "epoch": 0.4373580712180509, "grad_norm": 2.9576056003570557, "learning_rate": 5.62641928781949e-07, "loss": 0.2903, "step": 9052 }, { "epoch": 0.43740638739913995, "grad_norm": 5.1045331954956055, "learning_rate": 5.6259361260086e-07, "loss": 0.3911, "step": 9053 }, { "epoch": 0.43745470358022903, "grad_norm": 2.2929718494415283, "learning_rate": 5.62545296419771e-07, "loss": 0.2442, "step": 9054 }, { "epoch": 0.43750301976131806, "grad_norm": 2.3417763710021973, "learning_rate": 5.624969802386819e-07, "loss": 0.2251, "step": 9055 }, { "epoch": 0.43755133594240714, "grad_norm": 2.92815899848938, "learning_rate": 5.624486640575929e-07, "loss": 0.2443, "step": 9056 }, { "epoch": 0.43759965212349616, "grad_norm": 2.5406298637390137, "learning_rate": 5.624003478765038e-07, "loss": 0.2968, "step": 9057 }, { "epoch": 0.4376479683045852, "grad_norm": 2.8734166622161865, "learning_rate": 5.623520316954147e-07, "loss": 0.3323, "step": 9058 }, { "epoch": 0.43769628448567427, "grad_norm": 1.9286409616470337, "learning_rate": 5.623037155143257e-07, "loss": 0.2283, "step": 9059 }, { "epoch": 0.4377446006667633, "grad_norm": 2.5997676849365234, "learning_rate": 5.622553993332366e-07, "loss": 0.3317, "step": 9060 }, { "epoch": 0.4377929168478523, "grad_norm": 3.0602827072143555, "learning_rate": 5.622070831521476e-07, "loss": 0.4272, "step": 9061 }, { "epoch": 0.4378412330289414, "grad_norm": 3.0976803302764893, "learning_rate": 5.621587669710586e-07, "loss": 0.1895, "step": 9062 }, { "epoch": 0.4378895492100304, "grad_norm": 3.151909589767456, "learning_rate": 5.621104507899696e-07, "loss": 0.4235, "step": 9063 }, { "epoch": 0.4379378653911195, "grad_norm": 3.8631205558776855, "learning_rate": 5.620621346088805e-07, "loss": 0.3592, "step": 9064 }, { "epoch": 0.43798618157220853, "grad_norm": 3.4341089725494385, "learning_rate": 5.620138184277915e-07, "loss": 0.3209, "step": 9065 }, { "epoch": 0.43803449775329756, "grad_norm": 6.959120750427246, "learning_rate": 5.619655022467024e-07, "loss": 0.237, "step": 9066 }, { "epoch": 0.43808281393438664, "grad_norm": 3.995671272277832, "learning_rate": 5.619171860656133e-07, "loss": 0.477, "step": 9067 }, { "epoch": 0.43813113011547566, "grad_norm": 2.0363118648529053, "learning_rate": 5.618688698845243e-07, "loss": 0.1928, "step": 9068 }, { "epoch": 0.43817944629656475, "grad_norm": 3.0139129161834717, "learning_rate": 5.618205537034353e-07, "loss": 0.2285, "step": 9069 }, { "epoch": 0.43822776247765377, "grad_norm": 4.1646928787231445, "learning_rate": 5.617722375223462e-07, "loss": 0.3863, "step": 9070 }, { "epoch": 0.4382760786587428, "grad_norm": 2.3568146228790283, "learning_rate": 5.617239213412571e-07, "loss": 0.2636, "step": 9071 }, { "epoch": 0.4383243948398319, "grad_norm": 12.537161827087402, "learning_rate": 5.616756051601681e-07, "loss": 0.2822, "step": 9072 }, { "epoch": 0.4383727110209209, "grad_norm": 2.5868308544158936, "learning_rate": 5.616272889790791e-07, "loss": 0.3184, "step": 9073 }, { "epoch": 0.4384210272020099, "grad_norm": 4.14051628112793, "learning_rate": 5.615789727979901e-07, "loss": 0.2128, "step": 9074 }, { "epoch": 0.438469343383099, "grad_norm": 2.3847899436950684, "learning_rate": 5.615306566169011e-07, "loss": 0.2763, "step": 9075 }, { "epoch": 0.43851765956418803, "grad_norm": 3.173490285873413, "learning_rate": 5.614823404358118e-07, "loss": 0.3013, "step": 9076 }, { "epoch": 0.4385659757452771, "grad_norm": 2.898374319076538, "learning_rate": 5.614340242547228e-07, "loss": 0.3579, "step": 9077 }, { "epoch": 0.43861429192636614, "grad_norm": 4.235819339752197, "learning_rate": 5.613857080736338e-07, "loss": 0.2521, "step": 9078 }, { "epoch": 0.43866260810745517, "grad_norm": 3.0077176094055176, "learning_rate": 5.613373918925448e-07, "loss": 0.3895, "step": 9079 }, { "epoch": 0.43871092428854425, "grad_norm": 2.5056731700897217, "learning_rate": 5.612890757114558e-07, "loss": 0.2751, "step": 9080 }, { "epoch": 0.43875924046963327, "grad_norm": 3.1294333934783936, "learning_rate": 5.612407595303667e-07, "loss": 0.4047, "step": 9081 }, { "epoch": 0.43880755665072235, "grad_norm": 2.267883539199829, "learning_rate": 5.611924433492777e-07, "loss": 0.3649, "step": 9082 }, { "epoch": 0.4388558728318114, "grad_norm": 2.0076282024383545, "learning_rate": 5.611441271681886e-07, "loss": 0.1331, "step": 9083 }, { "epoch": 0.4389041890129004, "grad_norm": 3.3429763317108154, "learning_rate": 5.610958109870995e-07, "loss": 0.3912, "step": 9084 }, { "epoch": 0.4389525051939895, "grad_norm": 3.2724859714508057, "learning_rate": 5.610474948060105e-07, "loss": 0.4733, "step": 9085 }, { "epoch": 0.4390008213750785, "grad_norm": 3.193458318710327, "learning_rate": 5.609991786249214e-07, "loss": 0.29, "step": 9086 }, { "epoch": 0.43904913755616753, "grad_norm": 3.3967676162719727, "learning_rate": 5.609508624438324e-07, "loss": 0.4211, "step": 9087 }, { "epoch": 0.4390974537372566, "grad_norm": 2.3477885723114014, "learning_rate": 5.609025462627434e-07, "loss": 0.2656, "step": 9088 }, { "epoch": 0.43914576991834564, "grad_norm": 2.8751227855682373, "learning_rate": 5.608542300816543e-07, "loss": 0.4227, "step": 9089 }, { "epoch": 0.4391940860994347, "grad_norm": 2.381967067718506, "learning_rate": 5.608059139005653e-07, "loss": 0.2132, "step": 9090 }, { "epoch": 0.43924240228052375, "grad_norm": 3.0378496646881104, "learning_rate": 5.607575977194762e-07, "loss": 0.2992, "step": 9091 }, { "epoch": 0.4392907184616128, "grad_norm": 2.0071139335632324, "learning_rate": 5.607092815383871e-07, "loss": 0.2498, "step": 9092 }, { "epoch": 0.43933903464270185, "grad_norm": 1.644278645515442, "learning_rate": 5.606609653572981e-07, "loss": 0.1994, "step": 9093 }, { "epoch": 0.4393873508237909, "grad_norm": 17.140705108642578, "learning_rate": 5.606126491762091e-07, "loss": 0.3902, "step": 9094 }, { "epoch": 0.43943566700487996, "grad_norm": 2.0106945037841797, "learning_rate": 5.605643329951201e-07, "loss": 0.2225, "step": 9095 }, { "epoch": 0.439483983185969, "grad_norm": 2.646045684814453, "learning_rate": 5.60516016814031e-07, "loss": 0.2394, "step": 9096 }, { "epoch": 0.439532299367058, "grad_norm": 2.5835254192352295, "learning_rate": 5.604677006329419e-07, "loss": 0.237, "step": 9097 }, { "epoch": 0.4395806155481471, "grad_norm": 4.273063659667969, "learning_rate": 5.604193844518529e-07, "loss": 0.3575, "step": 9098 }, { "epoch": 0.4396289317292361, "grad_norm": 2.450124740600586, "learning_rate": 5.603710682707639e-07, "loss": 0.2651, "step": 9099 }, { "epoch": 0.43967724791032514, "grad_norm": 5.103613376617432, "learning_rate": 5.603227520896749e-07, "loss": 0.2458, "step": 9100 }, { "epoch": 0.4397255640914142, "grad_norm": 12.265079498291016, "learning_rate": 5.602744359085858e-07, "loss": 0.2646, "step": 9101 }, { "epoch": 0.43977388027250325, "grad_norm": 2.644570827484131, "learning_rate": 5.602261197274966e-07, "loss": 0.2165, "step": 9102 }, { "epoch": 0.43982219645359233, "grad_norm": 2.539604663848877, "learning_rate": 5.601778035464076e-07, "loss": 0.3585, "step": 9103 }, { "epoch": 0.43987051263468135, "grad_norm": 2.0693793296813965, "learning_rate": 5.601294873653186e-07, "loss": 0.2386, "step": 9104 }, { "epoch": 0.4399188288157704, "grad_norm": 3.963883876800537, "learning_rate": 5.600811711842296e-07, "loss": 0.2937, "step": 9105 }, { "epoch": 0.43996714499685946, "grad_norm": 2.1279945373535156, "learning_rate": 5.600328550031406e-07, "loss": 0.24, "step": 9106 }, { "epoch": 0.4400154611779485, "grad_norm": 2.0719377994537354, "learning_rate": 5.599845388220515e-07, "loss": 0.2235, "step": 9107 }, { "epoch": 0.44006377735903757, "grad_norm": 1.9258930683135986, "learning_rate": 5.599362226409625e-07, "loss": 0.2165, "step": 9108 }, { "epoch": 0.4401120935401266, "grad_norm": 3.5094754695892334, "learning_rate": 5.598879064598733e-07, "loss": 0.2583, "step": 9109 }, { "epoch": 0.4401604097212156, "grad_norm": 2.0554966926574707, "learning_rate": 5.598395902787843e-07, "loss": 0.2848, "step": 9110 }, { "epoch": 0.4402087259023047, "grad_norm": 2.236231565475464, "learning_rate": 5.597912740976953e-07, "loss": 0.2696, "step": 9111 }, { "epoch": 0.4402570420833937, "grad_norm": 2.8079233169555664, "learning_rate": 5.597429579166062e-07, "loss": 0.3193, "step": 9112 }, { "epoch": 0.44030535826448275, "grad_norm": 2.928283929824829, "learning_rate": 5.596946417355172e-07, "loss": 0.3879, "step": 9113 }, { "epoch": 0.44035367444557183, "grad_norm": 3.245382308959961, "learning_rate": 5.596463255544282e-07, "loss": 0.4094, "step": 9114 }, { "epoch": 0.44040199062666086, "grad_norm": 8.623200416564941, "learning_rate": 5.595980093733391e-07, "loss": 0.2211, "step": 9115 }, { "epoch": 0.44045030680774994, "grad_norm": 2.6088593006134033, "learning_rate": 5.595496931922501e-07, "loss": 0.3881, "step": 9116 }, { "epoch": 0.44049862298883896, "grad_norm": 2.285020589828491, "learning_rate": 5.59501377011161e-07, "loss": 0.3067, "step": 9117 }, { "epoch": 0.440546939169928, "grad_norm": 2.182853937149048, "learning_rate": 5.594530608300719e-07, "loss": 0.2366, "step": 9118 }, { "epoch": 0.44059525535101707, "grad_norm": 2.683096408843994, "learning_rate": 5.594047446489829e-07, "loss": 0.29, "step": 9119 }, { "epoch": 0.4406435715321061, "grad_norm": 2.489151954650879, "learning_rate": 5.593564284678939e-07, "loss": 0.3059, "step": 9120 }, { "epoch": 0.4406918877131952, "grad_norm": 3.2709062099456787, "learning_rate": 5.593081122868048e-07, "loss": 0.4273, "step": 9121 }, { "epoch": 0.4407402038942842, "grad_norm": 2.3219127655029297, "learning_rate": 5.592597961057158e-07, "loss": 0.3275, "step": 9122 }, { "epoch": 0.4407885200753732, "grad_norm": 2.2187137603759766, "learning_rate": 5.592114799246267e-07, "loss": 0.2868, "step": 9123 }, { "epoch": 0.4408368362564623, "grad_norm": 2.5104153156280518, "learning_rate": 5.591631637435377e-07, "loss": 0.2887, "step": 9124 }, { "epoch": 0.44088515243755133, "grad_norm": 2.376084804534912, "learning_rate": 5.591148475624487e-07, "loss": 0.2491, "step": 9125 }, { "epoch": 0.44093346861864036, "grad_norm": 3.208397388458252, "learning_rate": 5.590665313813596e-07, "loss": 0.3865, "step": 9126 }, { "epoch": 0.44098178479972944, "grad_norm": 2.4133620262145996, "learning_rate": 5.590182152002706e-07, "loss": 0.2587, "step": 9127 }, { "epoch": 0.44103010098081846, "grad_norm": 2.5973572731018066, "learning_rate": 5.589698990191814e-07, "loss": 0.2638, "step": 9128 }, { "epoch": 0.44107841716190754, "grad_norm": 4.151932239532471, "learning_rate": 5.589215828380924e-07, "loss": 0.4398, "step": 9129 }, { "epoch": 0.44112673334299657, "grad_norm": 2.4548709392547607, "learning_rate": 5.588732666570034e-07, "loss": 0.2498, "step": 9130 }, { "epoch": 0.4411750495240856, "grad_norm": 2.5508463382720947, "learning_rate": 5.588249504759144e-07, "loss": 0.3232, "step": 9131 }, { "epoch": 0.4412233657051747, "grad_norm": 2.7219691276550293, "learning_rate": 5.587766342948254e-07, "loss": 0.3135, "step": 9132 }, { "epoch": 0.4412716818862637, "grad_norm": 2.958444595336914, "learning_rate": 5.587283181137363e-07, "loss": 0.2699, "step": 9133 }, { "epoch": 0.4413199980673528, "grad_norm": 1.8276640176773071, "learning_rate": 5.586800019326471e-07, "loss": 0.2201, "step": 9134 }, { "epoch": 0.4413683142484418, "grad_norm": 2.147365093231201, "learning_rate": 5.586316857515581e-07, "loss": 0.2285, "step": 9135 }, { "epoch": 0.44141663042953083, "grad_norm": 2.9208760261535645, "learning_rate": 5.585833695704691e-07, "loss": 0.203, "step": 9136 }, { "epoch": 0.4414649466106199, "grad_norm": 13.342145919799805, "learning_rate": 5.585350533893801e-07, "loss": 0.4107, "step": 9137 }, { "epoch": 0.44151326279170894, "grad_norm": 1.7836337089538574, "learning_rate": 5.58486737208291e-07, "loss": 0.1772, "step": 9138 }, { "epoch": 0.44156157897279796, "grad_norm": 2.2972586154937744, "learning_rate": 5.58438421027202e-07, "loss": 0.2212, "step": 9139 }, { "epoch": 0.44160989515388704, "grad_norm": 2.003716230392456, "learning_rate": 5.58390104846113e-07, "loss": 0.2276, "step": 9140 }, { "epoch": 0.44165821133497607, "grad_norm": 5.081894397735596, "learning_rate": 5.583417886650239e-07, "loss": 0.443, "step": 9141 }, { "epoch": 0.44170652751606515, "grad_norm": 5.292409896850586, "learning_rate": 5.582934724839349e-07, "loss": 0.4032, "step": 9142 }, { "epoch": 0.4417548436971542, "grad_norm": 2.664361000061035, "learning_rate": 5.582451563028457e-07, "loss": 0.2267, "step": 9143 }, { "epoch": 0.4418031598782432, "grad_norm": 2.4614083766937256, "learning_rate": 5.581968401217567e-07, "loss": 0.272, "step": 9144 }, { "epoch": 0.4418514760593323, "grad_norm": 2.7433841228485107, "learning_rate": 5.581485239406677e-07, "loss": 0.2555, "step": 9145 }, { "epoch": 0.4418997922404213, "grad_norm": 5.0827202796936035, "learning_rate": 5.581002077595787e-07, "loss": 0.3893, "step": 9146 }, { "epoch": 0.4419481084215104, "grad_norm": 4.201113700866699, "learning_rate": 5.580518915784896e-07, "loss": 0.2768, "step": 9147 }, { "epoch": 0.4419964246025994, "grad_norm": 3.116635322570801, "learning_rate": 5.580035753974006e-07, "loss": 0.2495, "step": 9148 }, { "epoch": 0.44204474078368844, "grad_norm": 2.438218832015991, "learning_rate": 5.579552592163115e-07, "loss": 0.2587, "step": 9149 }, { "epoch": 0.4420930569647775, "grad_norm": 2.106898784637451, "learning_rate": 5.579069430352225e-07, "loss": 0.2085, "step": 9150 }, { "epoch": 0.44214137314586655, "grad_norm": 2.4089295864105225, "learning_rate": 5.578586268541335e-07, "loss": 0.3093, "step": 9151 }, { "epoch": 0.44218968932695557, "grad_norm": 1.8045156002044678, "learning_rate": 5.578103106730444e-07, "loss": 0.1441, "step": 9152 }, { "epoch": 0.44223800550804465, "grad_norm": 3.364063024520874, "learning_rate": 5.577619944919553e-07, "loss": 0.4107, "step": 9153 }, { "epoch": 0.4422863216891337, "grad_norm": 3.492459774017334, "learning_rate": 5.577136783108662e-07, "loss": 0.2744, "step": 9154 }, { "epoch": 0.44233463787022276, "grad_norm": 2.1893317699432373, "learning_rate": 5.576653621297772e-07, "loss": 0.2223, "step": 9155 }, { "epoch": 0.4423829540513118, "grad_norm": 2.795954942703247, "learning_rate": 5.576170459486882e-07, "loss": 0.3243, "step": 9156 }, { "epoch": 0.4424312702324008, "grad_norm": 3.233651876449585, "learning_rate": 5.575687297675992e-07, "loss": 0.4565, "step": 9157 }, { "epoch": 0.4424795864134899, "grad_norm": 2.3347246646881104, "learning_rate": 5.575204135865102e-07, "loss": 0.2912, "step": 9158 }, { "epoch": 0.4425279025945789, "grad_norm": 4.721216201782227, "learning_rate": 5.574720974054211e-07, "loss": 0.3378, "step": 9159 }, { "epoch": 0.442576218775668, "grad_norm": 2.6465563774108887, "learning_rate": 5.574237812243319e-07, "loss": 0.2964, "step": 9160 }, { "epoch": 0.442624534956757, "grad_norm": 2.0367345809936523, "learning_rate": 5.573754650432429e-07, "loss": 0.1983, "step": 9161 }, { "epoch": 0.44267285113784605, "grad_norm": 3.584050178527832, "learning_rate": 5.573271488621539e-07, "loss": 0.2727, "step": 9162 }, { "epoch": 0.4427211673189351, "grad_norm": 2.990427017211914, "learning_rate": 5.572788326810649e-07, "loss": 0.2976, "step": 9163 }, { "epoch": 0.44276948350002415, "grad_norm": 2.9370570182800293, "learning_rate": 5.572305164999758e-07, "loss": 0.357, "step": 9164 }, { "epoch": 0.4428177996811132, "grad_norm": 2.8722684383392334, "learning_rate": 5.571822003188868e-07, "loss": 0.3974, "step": 9165 }, { "epoch": 0.44286611586220226, "grad_norm": 2.766335964202881, "learning_rate": 5.571338841377977e-07, "loss": 0.2868, "step": 9166 }, { "epoch": 0.4429144320432913, "grad_norm": 4.916207790374756, "learning_rate": 5.570855679567087e-07, "loss": 0.3227, "step": 9167 }, { "epoch": 0.44296274822438036, "grad_norm": 3.0191309452056885, "learning_rate": 5.570372517756197e-07, "loss": 0.364, "step": 9168 }, { "epoch": 0.4430110644054694, "grad_norm": 2.2589757442474365, "learning_rate": 5.569889355945305e-07, "loss": 0.2397, "step": 9169 }, { "epoch": 0.4430593805865584, "grad_norm": 14.710773468017578, "learning_rate": 5.569406194134415e-07, "loss": 0.2612, "step": 9170 }, { "epoch": 0.4431076967676475, "grad_norm": 2.0445873737335205, "learning_rate": 5.568923032323525e-07, "loss": 0.2727, "step": 9171 }, { "epoch": 0.4431560129487365, "grad_norm": 2.1587884426116943, "learning_rate": 5.568439870512635e-07, "loss": 0.3238, "step": 9172 }, { "epoch": 0.4432043291298256, "grad_norm": 3.177949905395508, "learning_rate": 5.567956708701744e-07, "loss": 0.3269, "step": 9173 }, { "epoch": 0.44325264531091463, "grad_norm": 3.360050916671753, "learning_rate": 5.567473546890854e-07, "loss": 0.4126, "step": 9174 }, { "epoch": 0.44330096149200365, "grad_norm": 2.2615764141082764, "learning_rate": 5.566990385079963e-07, "loss": 0.2306, "step": 9175 }, { "epoch": 0.44334927767309273, "grad_norm": 2.8586015701293945, "learning_rate": 5.566507223269073e-07, "loss": 0.4241, "step": 9176 }, { "epoch": 0.44339759385418176, "grad_norm": 2.7686493396759033, "learning_rate": 5.566024061458182e-07, "loss": 0.3597, "step": 9177 }, { "epoch": 0.4434459100352708, "grad_norm": 4.888660907745361, "learning_rate": 5.565540899647292e-07, "loss": 0.2531, "step": 9178 }, { "epoch": 0.44349422621635987, "grad_norm": 4.860530376434326, "learning_rate": 5.565057737836401e-07, "loss": 0.4014, "step": 9179 }, { "epoch": 0.4435425423974489, "grad_norm": 2.87296724319458, "learning_rate": 5.56457457602551e-07, "loss": 0.4327, "step": 9180 }, { "epoch": 0.44359085857853797, "grad_norm": 2.8092784881591797, "learning_rate": 5.56409141421462e-07, "loss": 0.3373, "step": 9181 }, { "epoch": 0.443639174759627, "grad_norm": 2.3668720722198486, "learning_rate": 5.56360825240373e-07, "loss": 0.2979, "step": 9182 }, { "epoch": 0.443687490940716, "grad_norm": 2.0188910961151123, "learning_rate": 5.56312509059284e-07, "loss": 0.2004, "step": 9183 }, { "epoch": 0.4437358071218051, "grad_norm": 3.2693729400634766, "learning_rate": 5.56264192878195e-07, "loss": 0.3471, "step": 9184 }, { "epoch": 0.44378412330289413, "grad_norm": 3.2617528438568115, "learning_rate": 5.562158766971057e-07, "loss": 0.4392, "step": 9185 }, { "epoch": 0.4438324394839832, "grad_norm": 1.8879848718643188, "learning_rate": 5.561675605160167e-07, "loss": 0.1785, "step": 9186 }, { "epoch": 0.44388075566507224, "grad_norm": 2.218050003051758, "learning_rate": 5.561192443349277e-07, "loss": 0.1987, "step": 9187 }, { "epoch": 0.44392907184616126, "grad_norm": 2.1202657222747803, "learning_rate": 5.560709281538387e-07, "loss": 0.2334, "step": 9188 }, { "epoch": 0.44397738802725034, "grad_norm": 2.334567070007324, "learning_rate": 5.560226119727497e-07, "loss": 0.1774, "step": 9189 }, { "epoch": 0.44402570420833937, "grad_norm": 2.244288444519043, "learning_rate": 5.559742957916606e-07, "loss": 0.21, "step": 9190 }, { "epoch": 0.4440740203894284, "grad_norm": 2.090920925140381, "learning_rate": 5.559259796105716e-07, "loss": 0.2281, "step": 9191 }, { "epoch": 0.4441223365705175, "grad_norm": 2.5176758766174316, "learning_rate": 5.558776634294825e-07, "loss": 0.2806, "step": 9192 }, { "epoch": 0.4441706527516065, "grad_norm": 3.611023187637329, "learning_rate": 5.558293472483935e-07, "loss": 0.3578, "step": 9193 }, { "epoch": 0.4442189689326956, "grad_norm": 3.5124335289001465, "learning_rate": 5.557810310673044e-07, "loss": 0.4859, "step": 9194 }, { "epoch": 0.4442672851137846, "grad_norm": 2.5859944820404053, "learning_rate": 5.557327148862153e-07, "loss": 0.3624, "step": 9195 }, { "epoch": 0.44431560129487363, "grad_norm": 3.843271255493164, "learning_rate": 5.556843987051263e-07, "loss": 0.3869, "step": 9196 }, { "epoch": 0.4443639174759627, "grad_norm": 3.6512610912323, "learning_rate": 5.556360825240373e-07, "loss": 0.3525, "step": 9197 }, { "epoch": 0.44441223365705174, "grad_norm": 2.4926304817199707, "learning_rate": 5.555877663429482e-07, "loss": 0.3639, "step": 9198 }, { "epoch": 0.4444605498381408, "grad_norm": 10.85474681854248, "learning_rate": 5.555394501618592e-07, "loss": 0.3088, "step": 9199 }, { "epoch": 0.44450886601922984, "grad_norm": 4.253768444061279, "learning_rate": 5.554911339807702e-07, "loss": 0.2613, "step": 9200 }, { "epoch": 0.44455718220031887, "grad_norm": 2.488797426223755, "learning_rate": 5.554428177996811e-07, "loss": 0.2848, "step": 9201 }, { "epoch": 0.44460549838140795, "grad_norm": 1.8721359968185425, "learning_rate": 5.55394501618592e-07, "loss": 0.2187, "step": 9202 }, { "epoch": 0.444653814562497, "grad_norm": 2.928612232208252, "learning_rate": 5.55346185437503e-07, "loss": 0.3944, "step": 9203 }, { "epoch": 0.444702130743586, "grad_norm": 3.2344539165496826, "learning_rate": 5.55297869256414e-07, "loss": 0.3566, "step": 9204 }, { "epoch": 0.4447504469246751, "grad_norm": 3.3486785888671875, "learning_rate": 5.552495530753249e-07, "loss": 0.2632, "step": 9205 }, { "epoch": 0.4447987631057641, "grad_norm": 2.4031903743743896, "learning_rate": 5.552012368942358e-07, "loss": 0.2387, "step": 9206 }, { "epoch": 0.4448470792868532, "grad_norm": 3.543121576309204, "learning_rate": 5.551529207131468e-07, "loss": 0.3476, "step": 9207 }, { "epoch": 0.4448953954679422, "grad_norm": 3.3871335983276367, "learning_rate": 5.551046045320578e-07, "loss": 0.4661, "step": 9208 }, { "epoch": 0.44494371164903124, "grad_norm": 2.272303342819214, "learning_rate": 5.550562883509688e-07, "loss": 0.2576, "step": 9209 }, { "epoch": 0.4449920278301203, "grad_norm": 6.049503326416016, "learning_rate": 5.550079721698798e-07, "loss": 0.3306, "step": 9210 }, { "epoch": 0.44504034401120934, "grad_norm": 2.065798759460449, "learning_rate": 5.549596559887905e-07, "loss": 0.3155, "step": 9211 }, { "epoch": 0.4450886601922984, "grad_norm": 3.03690505027771, "learning_rate": 5.549113398077015e-07, "loss": 0.4094, "step": 9212 }, { "epoch": 0.44513697637338745, "grad_norm": 4.7545270919799805, "learning_rate": 5.548630236266125e-07, "loss": 0.4546, "step": 9213 }, { "epoch": 0.4451852925544765, "grad_norm": 6.677389144897461, "learning_rate": 5.548147074455235e-07, "loss": 0.3155, "step": 9214 }, { "epoch": 0.44523360873556556, "grad_norm": 3.71382737159729, "learning_rate": 5.547663912644345e-07, "loss": 0.2411, "step": 9215 }, { "epoch": 0.4452819249166546, "grad_norm": 2.7684874534606934, "learning_rate": 5.547180750833454e-07, "loss": 0.2732, "step": 9216 }, { "epoch": 0.44533024109774366, "grad_norm": 2.5240390300750732, "learning_rate": 5.546697589022563e-07, "loss": 0.3493, "step": 9217 }, { "epoch": 0.4453785572788327, "grad_norm": 2.0516104698181152, "learning_rate": 5.546214427211673e-07, "loss": 0.2744, "step": 9218 }, { "epoch": 0.4454268734599217, "grad_norm": 7.641232490539551, "learning_rate": 5.545731265400782e-07, "loss": 0.3607, "step": 9219 }, { "epoch": 0.4454751896410108, "grad_norm": 2.468050718307495, "learning_rate": 5.545248103589892e-07, "loss": 0.2949, "step": 9220 }, { "epoch": 0.4455235058220998, "grad_norm": 29.35525894165039, "learning_rate": 5.544764941779001e-07, "loss": 0.2509, "step": 9221 }, { "epoch": 0.44557182200318884, "grad_norm": 2.715174913406372, "learning_rate": 5.544281779968111e-07, "loss": 0.3348, "step": 9222 }, { "epoch": 0.4456201381842779, "grad_norm": 2.5275344848632812, "learning_rate": 5.543798618157221e-07, "loss": 0.286, "step": 9223 }, { "epoch": 0.44566845436536695, "grad_norm": 2.425704002380371, "learning_rate": 5.54331545634633e-07, "loss": 0.2441, "step": 9224 }, { "epoch": 0.44571677054645603, "grad_norm": 2.6952741146087646, "learning_rate": 5.54283229453544e-07, "loss": 0.2736, "step": 9225 }, { "epoch": 0.44576508672754506, "grad_norm": 2.0705981254577637, "learning_rate": 5.54234913272455e-07, "loss": 0.2478, "step": 9226 }, { "epoch": 0.4458134029086341, "grad_norm": 2.6821343898773193, "learning_rate": 5.541865970913658e-07, "loss": 0.2603, "step": 9227 }, { "epoch": 0.44586171908972316, "grad_norm": 4.3234734535217285, "learning_rate": 5.541382809102768e-07, "loss": 0.4369, "step": 9228 }, { "epoch": 0.4459100352708122, "grad_norm": 3.655545949935913, "learning_rate": 5.540899647291878e-07, "loss": 0.278, "step": 9229 }, { "epoch": 0.44595835145190127, "grad_norm": 2.563601016998291, "learning_rate": 5.540416485480987e-07, "loss": 0.2856, "step": 9230 }, { "epoch": 0.4460066676329903, "grad_norm": 1.9409573078155518, "learning_rate": 5.539933323670097e-07, "loss": 0.2286, "step": 9231 }, { "epoch": 0.4460549838140793, "grad_norm": 4.14080286026001, "learning_rate": 5.539450161859206e-07, "loss": 0.2214, "step": 9232 }, { "epoch": 0.4461032999951684, "grad_norm": 3.76513671875, "learning_rate": 5.538967000048316e-07, "loss": 0.3422, "step": 9233 }, { "epoch": 0.4461516161762574, "grad_norm": 4.680770397186279, "learning_rate": 5.538483838237426e-07, "loss": 0.2516, "step": 9234 }, { "epoch": 0.44619993235734645, "grad_norm": 2.7154393196105957, "learning_rate": 5.538000676426536e-07, "loss": 0.3239, "step": 9235 }, { "epoch": 0.44624824853843553, "grad_norm": 2.4315900802612305, "learning_rate": 5.537517514615645e-07, "loss": 0.2468, "step": 9236 }, { "epoch": 0.44629656471952456, "grad_norm": 4.558167934417725, "learning_rate": 5.537034352804753e-07, "loss": 0.2835, "step": 9237 }, { "epoch": 0.44634488090061364, "grad_norm": 3.440197229385376, "learning_rate": 5.536551190993863e-07, "loss": 0.3874, "step": 9238 }, { "epoch": 0.44639319708170266, "grad_norm": 3.0954275131225586, "learning_rate": 5.536068029182973e-07, "loss": 0.243, "step": 9239 }, { "epoch": 0.4464415132627917, "grad_norm": 2.049405813217163, "learning_rate": 5.535584867372083e-07, "loss": 0.2537, "step": 9240 }, { "epoch": 0.44648982944388077, "grad_norm": 1.7177492380142212, "learning_rate": 5.535101705561193e-07, "loss": 0.1822, "step": 9241 }, { "epoch": 0.4465381456249698, "grad_norm": 3.9468047618865967, "learning_rate": 5.534618543750302e-07, "loss": 0.277, "step": 9242 }, { "epoch": 0.4465864618060589, "grad_norm": 3.4372761249542236, "learning_rate": 5.534135381939411e-07, "loss": 0.3862, "step": 9243 }, { "epoch": 0.4466347779871479, "grad_norm": 3.2192444801330566, "learning_rate": 5.53365222012852e-07, "loss": 0.1441, "step": 9244 }, { "epoch": 0.4466830941682369, "grad_norm": 2.9455416202545166, "learning_rate": 5.53316905831763e-07, "loss": 0.3811, "step": 9245 }, { "epoch": 0.446731410349326, "grad_norm": 4.515115737915039, "learning_rate": 5.53268589650674e-07, "loss": 0.2201, "step": 9246 }, { "epoch": 0.44677972653041503, "grad_norm": 3.4363558292388916, "learning_rate": 5.532202734695849e-07, "loss": 0.2816, "step": 9247 }, { "epoch": 0.44682804271150406, "grad_norm": 2.2219958305358887, "learning_rate": 5.531719572884959e-07, "loss": 0.2732, "step": 9248 }, { "epoch": 0.44687635889259314, "grad_norm": 3.2146859169006348, "learning_rate": 5.531236411074068e-07, "loss": 0.4276, "step": 9249 }, { "epoch": 0.44692467507368216, "grad_norm": 3.5455093383789062, "learning_rate": 5.530753249263178e-07, "loss": 0.3242, "step": 9250 }, { "epoch": 0.44697299125477125, "grad_norm": 2.2575175762176514, "learning_rate": 5.530270087452288e-07, "loss": 0.2233, "step": 9251 }, { "epoch": 0.44702130743586027, "grad_norm": 2.517331600189209, "learning_rate": 5.529786925641398e-07, "loss": 0.2899, "step": 9252 }, { "epoch": 0.4470696236169493, "grad_norm": 2.4901161193847656, "learning_rate": 5.529303763830506e-07, "loss": 0.3727, "step": 9253 }, { "epoch": 0.4471179397980384, "grad_norm": 2.983191728591919, "learning_rate": 5.528820602019616e-07, "loss": 0.2351, "step": 9254 }, { "epoch": 0.4471662559791274, "grad_norm": 2.3542375564575195, "learning_rate": 5.528337440208726e-07, "loss": 0.3027, "step": 9255 }, { "epoch": 0.4472145721602165, "grad_norm": 1.7982800006866455, "learning_rate": 5.527854278397835e-07, "loss": 0.2047, "step": 9256 }, { "epoch": 0.4472628883413055, "grad_norm": 2.114170789718628, "learning_rate": 5.527371116586945e-07, "loss": 0.2323, "step": 9257 }, { "epoch": 0.44731120452239453, "grad_norm": 2.439993381500244, "learning_rate": 5.526887954776054e-07, "loss": 0.2542, "step": 9258 }, { "epoch": 0.4473595207034836, "grad_norm": 5.9957733154296875, "learning_rate": 5.526404792965164e-07, "loss": 0.3247, "step": 9259 }, { "epoch": 0.44740783688457264, "grad_norm": 2.136061668395996, "learning_rate": 5.525921631154274e-07, "loss": 0.209, "step": 9260 }, { "epoch": 0.44745615306566167, "grad_norm": 2.5739660263061523, "learning_rate": 5.525438469343384e-07, "loss": 0.3346, "step": 9261 }, { "epoch": 0.44750446924675075, "grad_norm": 3.657723903656006, "learning_rate": 5.524955307532492e-07, "loss": 0.3823, "step": 9262 }, { "epoch": 0.44755278542783977, "grad_norm": 2.3855650424957275, "learning_rate": 5.524472145721601e-07, "loss": 0.274, "step": 9263 }, { "epoch": 0.44760110160892885, "grad_norm": 3.7560575008392334, "learning_rate": 5.523988983910711e-07, "loss": 0.225, "step": 9264 }, { "epoch": 0.4476494177900179, "grad_norm": 7.474566459655762, "learning_rate": 5.523505822099821e-07, "loss": 0.2492, "step": 9265 }, { "epoch": 0.4476977339711069, "grad_norm": 3.3780019283294678, "learning_rate": 5.523022660288931e-07, "loss": 0.2401, "step": 9266 }, { "epoch": 0.447746050152196, "grad_norm": 2.6375722885131836, "learning_rate": 5.522539498478041e-07, "loss": 0.2874, "step": 9267 }, { "epoch": 0.447794366333285, "grad_norm": 2.563217878341675, "learning_rate": 5.522056336667149e-07, "loss": 0.2479, "step": 9268 }, { "epoch": 0.4478426825143741, "grad_norm": 2.1601455211639404, "learning_rate": 5.521573174856259e-07, "loss": 0.2592, "step": 9269 }, { "epoch": 0.4478909986954631, "grad_norm": 2.812546730041504, "learning_rate": 5.521090013045368e-07, "loss": 0.1972, "step": 9270 }, { "epoch": 0.44793931487655214, "grad_norm": 3.468820333480835, "learning_rate": 5.520606851234478e-07, "loss": 0.2237, "step": 9271 }, { "epoch": 0.4479876310576412, "grad_norm": 12.864017486572266, "learning_rate": 5.520123689423588e-07, "loss": 0.1931, "step": 9272 }, { "epoch": 0.44803594723873025, "grad_norm": 3.3682658672332764, "learning_rate": 5.519640527612697e-07, "loss": 0.3379, "step": 9273 }, { "epoch": 0.4480842634198193, "grad_norm": 2.874633550643921, "learning_rate": 5.519157365801807e-07, "loss": 0.3372, "step": 9274 }, { "epoch": 0.44813257960090835, "grad_norm": 2.154275894165039, "learning_rate": 5.518674203990916e-07, "loss": 0.241, "step": 9275 }, { "epoch": 0.4481808957819974, "grad_norm": 2.7178707122802734, "learning_rate": 5.518191042180026e-07, "loss": 0.3546, "step": 9276 }, { "epoch": 0.44822921196308646, "grad_norm": 2.7259681224823, "learning_rate": 5.517707880369136e-07, "loss": 0.3707, "step": 9277 }, { "epoch": 0.4482775281441755, "grad_norm": 2.362778663635254, "learning_rate": 5.517224718558246e-07, "loss": 0.3193, "step": 9278 }, { "epoch": 0.4483258443252645, "grad_norm": 2.689185857772827, "learning_rate": 5.516741556747354e-07, "loss": 0.3925, "step": 9279 }, { "epoch": 0.4483741605063536, "grad_norm": 5.350617408752441, "learning_rate": 5.516258394936464e-07, "loss": 0.331, "step": 9280 }, { "epoch": 0.4484224766874426, "grad_norm": 2.5570170879364014, "learning_rate": 5.515775233125573e-07, "loss": 0.2639, "step": 9281 }, { "epoch": 0.4484707928685317, "grad_norm": 3.0506768226623535, "learning_rate": 5.515292071314683e-07, "loss": 0.3668, "step": 9282 }, { "epoch": 0.4485191090496207, "grad_norm": 3.3814098834991455, "learning_rate": 5.514808909503793e-07, "loss": 0.534, "step": 9283 }, { "epoch": 0.44856742523070975, "grad_norm": 3.7275094985961914, "learning_rate": 5.514325747692902e-07, "loss": 0.2456, "step": 9284 }, { "epoch": 0.44861574141179883, "grad_norm": 2.501274347305298, "learning_rate": 5.513842585882012e-07, "loss": 0.2705, "step": 9285 }, { "epoch": 0.44866405759288785, "grad_norm": 3.182126998901367, "learning_rate": 5.513359424071122e-07, "loss": 0.3323, "step": 9286 }, { "epoch": 0.4487123737739769, "grad_norm": 2.309238910675049, "learning_rate": 5.512876262260231e-07, "loss": 0.2633, "step": 9287 }, { "epoch": 0.44876068995506596, "grad_norm": 2.5547473430633545, "learning_rate": 5.51239310044934e-07, "loss": 0.2433, "step": 9288 }, { "epoch": 0.448809006136155, "grad_norm": 2.931873083114624, "learning_rate": 5.511909938638449e-07, "loss": 0.3359, "step": 9289 }, { "epoch": 0.44885732231724407, "grad_norm": 2.2822775840759277, "learning_rate": 5.511426776827559e-07, "loss": 0.2975, "step": 9290 }, { "epoch": 0.4489056384983331, "grad_norm": 2.380964517593384, "learning_rate": 5.510943615016669e-07, "loss": 0.2121, "step": 9291 }, { "epoch": 0.4489539546794221, "grad_norm": 1.864259123802185, "learning_rate": 5.510460453205779e-07, "loss": 0.1863, "step": 9292 }, { "epoch": 0.4490022708605112, "grad_norm": 2.2106285095214844, "learning_rate": 5.509977291394889e-07, "loss": 0.2701, "step": 9293 }, { "epoch": 0.4490505870416002, "grad_norm": 1.9611690044403076, "learning_rate": 5.509494129583997e-07, "loss": 0.2429, "step": 9294 }, { "epoch": 0.4490989032226893, "grad_norm": 2.725877046585083, "learning_rate": 5.509010967773106e-07, "loss": 0.3154, "step": 9295 }, { "epoch": 0.44914721940377833, "grad_norm": 2.874361991882324, "learning_rate": 5.508527805962216e-07, "loss": 0.3358, "step": 9296 }, { "epoch": 0.44919553558486736, "grad_norm": 1.5537035465240479, "learning_rate": 5.508044644151326e-07, "loss": 0.145, "step": 9297 }, { "epoch": 0.44924385176595644, "grad_norm": 2.5420596599578857, "learning_rate": 5.507561482340436e-07, "loss": 0.2403, "step": 9298 }, { "epoch": 0.44929216794704546, "grad_norm": 2.276519298553467, "learning_rate": 5.507078320529545e-07, "loss": 0.1802, "step": 9299 }, { "epoch": 0.4493404841281345, "grad_norm": 1.5782109498977661, "learning_rate": 5.506595158718654e-07, "loss": 0.1831, "step": 9300 }, { "epoch": 0.44938880030922357, "grad_norm": 4.279139518737793, "learning_rate": 5.506111996907764e-07, "loss": 0.3927, "step": 9301 }, { "epoch": 0.4494371164903126, "grad_norm": 2.597933769226074, "learning_rate": 5.505628835096874e-07, "loss": 0.3171, "step": 9302 }, { "epoch": 0.4494854326714017, "grad_norm": 2.394540548324585, "learning_rate": 5.505145673285984e-07, "loss": 0.3152, "step": 9303 }, { "epoch": 0.4495337488524907, "grad_norm": 2.334165096282959, "learning_rate": 5.504662511475093e-07, "loss": 0.284, "step": 9304 }, { "epoch": 0.4495820650335797, "grad_norm": 5.1860857009887695, "learning_rate": 5.504179349664202e-07, "loss": 0.3561, "step": 9305 }, { "epoch": 0.4496303812146688, "grad_norm": 8.770179748535156, "learning_rate": 5.503696187853312e-07, "loss": 0.3541, "step": 9306 }, { "epoch": 0.44967869739575783, "grad_norm": 2.6038668155670166, "learning_rate": 5.503213026042421e-07, "loss": 0.2548, "step": 9307 }, { "epoch": 0.4497270135768469, "grad_norm": 2.8829433917999268, "learning_rate": 5.502729864231531e-07, "loss": 0.3293, "step": 9308 }, { "epoch": 0.44977532975793594, "grad_norm": 2.4239556789398193, "learning_rate": 5.502246702420641e-07, "loss": 0.3245, "step": 9309 }, { "epoch": 0.44982364593902496, "grad_norm": 2.8567466735839844, "learning_rate": 5.50176354060975e-07, "loss": 0.3636, "step": 9310 }, { "epoch": 0.44987196212011404, "grad_norm": 2.9321398735046387, "learning_rate": 5.50128037879886e-07, "loss": 0.3939, "step": 9311 }, { "epoch": 0.44992027830120307, "grad_norm": 4.62959098815918, "learning_rate": 5.50079721698797e-07, "loss": 0.6356, "step": 9312 }, { "epoch": 0.4499685944822921, "grad_norm": 3.8652749061584473, "learning_rate": 5.500314055177078e-07, "loss": 0.2762, "step": 9313 }, { "epoch": 0.4500169106633812, "grad_norm": 2.9581732749938965, "learning_rate": 5.499830893366188e-07, "loss": 0.2376, "step": 9314 }, { "epoch": 0.4500652268444702, "grad_norm": 2.4204466342926025, "learning_rate": 5.499347731555297e-07, "loss": 0.343, "step": 9315 }, { "epoch": 0.4501135430255593, "grad_norm": 3.1987226009368896, "learning_rate": 5.498864569744407e-07, "loss": 0.4665, "step": 9316 }, { "epoch": 0.4501618592066483, "grad_norm": 2.4599881172180176, "learning_rate": 5.498381407933517e-07, "loss": 0.2006, "step": 9317 }, { "epoch": 0.45021017538773733, "grad_norm": 3.5587832927703857, "learning_rate": 5.497898246122627e-07, "loss": 0.2832, "step": 9318 }, { "epoch": 0.4502584915688264, "grad_norm": 2.8515658378601074, "learning_rate": 5.497415084311737e-07, "loss": 0.3565, "step": 9319 }, { "epoch": 0.45030680774991544, "grad_norm": 1.7835147380828857, "learning_rate": 5.496931922500844e-07, "loss": 0.2298, "step": 9320 }, { "epoch": 0.4503551239310045, "grad_norm": 6.937049865722656, "learning_rate": 5.496448760689954e-07, "loss": 0.3092, "step": 9321 }, { "epoch": 0.45040344011209354, "grad_norm": 4.198602199554443, "learning_rate": 5.495965598879064e-07, "loss": 0.4752, "step": 9322 }, { "epoch": 0.45045175629318257, "grad_norm": 3.3059821128845215, "learning_rate": 5.495482437068174e-07, "loss": 0.171, "step": 9323 }, { "epoch": 0.45050007247427165, "grad_norm": 2.3804702758789062, "learning_rate": 5.494999275257284e-07, "loss": 0.2178, "step": 9324 }, { "epoch": 0.4505483886553607, "grad_norm": 3.3884241580963135, "learning_rate": 5.494516113446393e-07, "loss": 0.3975, "step": 9325 }, { "epoch": 0.4505967048364497, "grad_norm": 4.181329727172852, "learning_rate": 5.494032951635502e-07, "loss": 0.3094, "step": 9326 }, { "epoch": 0.4506450210175388, "grad_norm": 10.953638076782227, "learning_rate": 5.493549789824612e-07, "loss": 0.4314, "step": 9327 }, { "epoch": 0.4506933371986278, "grad_norm": 6.024538040161133, "learning_rate": 5.493066628013722e-07, "loss": 0.2841, "step": 9328 }, { "epoch": 0.4507416533797169, "grad_norm": 1.8391574621200562, "learning_rate": 5.492583466202831e-07, "loss": 0.2061, "step": 9329 }, { "epoch": 0.4507899695608059, "grad_norm": 2.664294719696045, "learning_rate": 5.492100304391941e-07, "loss": 0.3371, "step": 9330 }, { "epoch": 0.45083828574189494, "grad_norm": 1.4768121242523193, "learning_rate": 5.49161714258105e-07, "loss": 0.1459, "step": 9331 }, { "epoch": 0.450886601922984, "grad_norm": 3.8836889266967773, "learning_rate": 5.491133980770159e-07, "loss": 0.3739, "step": 9332 }, { "epoch": 0.45093491810407305, "grad_norm": 2.8201675415039062, "learning_rate": 5.490650818959269e-07, "loss": 0.3097, "step": 9333 }, { "epoch": 0.4509832342851621, "grad_norm": 2.6469197273254395, "learning_rate": 5.490167657148379e-07, "loss": 0.1885, "step": 9334 }, { "epoch": 0.45103155046625115, "grad_norm": 2.936401128768921, "learning_rate": 5.489684495337489e-07, "loss": 0.3417, "step": 9335 }, { "epoch": 0.4510798666473402, "grad_norm": 2.521235227584839, "learning_rate": 5.489201333526598e-07, "loss": 0.3056, "step": 9336 }, { "epoch": 0.45112818282842926, "grad_norm": 2.9198591709136963, "learning_rate": 5.488718171715707e-07, "loss": 0.4241, "step": 9337 }, { "epoch": 0.4511764990095183, "grad_norm": 2.648557424545288, "learning_rate": 5.488235009904817e-07, "loss": 0.3476, "step": 9338 }, { "epoch": 0.4512248151906073, "grad_norm": 3.4811513423919678, "learning_rate": 5.487751848093926e-07, "loss": 0.3262, "step": 9339 }, { "epoch": 0.4512731313716964, "grad_norm": 3.6048355102539062, "learning_rate": 5.487268686283036e-07, "loss": 0.3527, "step": 9340 }, { "epoch": 0.4513214475527854, "grad_norm": 2.563401699066162, "learning_rate": 5.486785524472145e-07, "loss": 0.3444, "step": 9341 }, { "epoch": 0.4513697637338745, "grad_norm": 2.902021884918213, "learning_rate": 5.486302362661255e-07, "loss": 0.4068, "step": 9342 }, { "epoch": 0.4514180799149635, "grad_norm": 11.658839225769043, "learning_rate": 5.485819200850365e-07, "loss": 0.2769, "step": 9343 }, { "epoch": 0.45146639609605255, "grad_norm": 1.167376160621643, "learning_rate": 5.485336039039475e-07, "loss": 0.125, "step": 9344 }, { "epoch": 0.4515147122771416, "grad_norm": 3.0876424312591553, "learning_rate": 5.484852877228584e-07, "loss": 0.414, "step": 9345 }, { "epoch": 0.45156302845823065, "grad_norm": 3.1477599143981934, "learning_rate": 5.484369715417692e-07, "loss": 0.3857, "step": 9346 }, { "epoch": 0.45161134463931973, "grad_norm": 1.7103804349899292, "learning_rate": 5.483886553606802e-07, "loss": 0.2106, "step": 9347 }, { "epoch": 0.45165966082040876, "grad_norm": 1.6154898405075073, "learning_rate": 5.483403391795912e-07, "loss": 0.1912, "step": 9348 }, { "epoch": 0.4517079770014978, "grad_norm": 2.759418487548828, "learning_rate": 5.482920229985022e-07, "loss": 0.3255, "step": 9349 }, { "epoch": 0.45175629318258687, "grad_norm": 2.4670398235321045, "learning_rate": 5.482437068174132e-07, "loss": 0.2897, "step": 9350 }, { "epoch": 0.4518046093636759, "grad_norm": 2.825770854949951, "learning_rate": 5.48195390636324e-07, "loss": 0.4271, "step": 9351 }, { "epoch": 0.4518529255447649, "grad_norm": 2.6538267135620117, "learning_rate": 5.48147074455235e-07, "loss": 0.3349, "step": 9352 }, { "epoch": 0.451901241725854, "grad_norm": 2.8590633869171143, "learning_rate": 5.48098758274146e-07, "loss": 0.1819, "step": 9353 }, { "epoch": 0.451949557906943, "grad_norm": 7.809261322021484, "learning_rate": 5.48050442093057e-07, "loss": 0.3407, "step": 9354 }, { "epoch": 0.4519978740880321, "grad_norm": 2.6227781772613525, "learning_rate": 5.480021259119679e-07, "loss": 0.3673, "step": 9355 }, { "epoch": 0.45204619026912113, "grad_norm": 3.8826959133148193, "learning_rate": 5.479538097308789e-07, "loss": 0.3161, "step": 9356 }, { "epoch": 0.45209450645021015, "grad_norm": 11.913209915161133, "learning_rate": 5.479054935497898e-07, "loss": 0.3496, "step": 9357 }, { "epoch": 0.45214282263129923, "grad_norm": 6.408324718475342, "learning_rate": 5.478571773687007e-07, "loss": 0.2999, "step": 9358 }, { "epoch": 0.45219113881238826, "grad_norm": 1.9299441576004028, "learning_rate": 5.478088611876117e-07, "loss": 0.2485, "step": 9359 }, { "epoch": 0.45223945499347734, "grad_norm": 2.3976101875305176, "learning_rate": 5.477605450065227e-07, "loss": 0.3247, "step": 9360 }, { "epoch": 0.45228777117456637, "grad_norm": 3.9313907623291016, "learning_rate": 5.477122288254337e-07, "loss": 0.2715, "step": 9361 }, { "epoch": 0.4523360873556554, "grad_norm": 2.7067179679870605, "learning_rate": 5.476639126443446e-07, "loss": 0.1868, "step": 9362 }, { "epoch": 0.4523844035367445, "grad_norm": 1.9427125453948975, "learning_rate": 5.476155964632555e-07, "loss": 0.2096, "step": 9363 }, { "epoch": 0.4524327197178335, "grad_norm": 2.9581735134124756, "learning_rate": 5.475672802821664e-07, "loss": 0.2782, "step": 9364 }, { "epoch": 0.4524810358989225, "grad_norm": 3.7347545623779297, "learning_rate": 5.475189641010774e-07, "loss": 0.4129, "step": 9365 }, { "epoch": 0.4525293520800116, "grad_norm": 3.1406452655792236, "learning_rate": 5.474706479199884e-07, "loss": 0.442, "step": 9366 }, { "epoch": 0.45257766826110063, "grad_norm": 3.556593656539917, "learning_rate": 5.474223317388993e-07, "loss": 0.3234, "step": 9367 }, { "epoch": 0.4526259844421897, "grad_norm": 2.327667474746704, "learning_rate": 5.473740155578103e-07, "loss": 0.2515, "step": 9368 }, { "epoch": 0.45267430062327874, "grad_norm": 2.0818536281585693, "learning_rate": 5.473256993767213e-07, "loss": 0.2053, "step": 9369 }, { "epoch": 0.45272261680436776, "grad_norm": 2.943061590194702, "learning_rate": 5.472773831956323e-07, "loss": 0.3438, "step": 9370 }, { "epoch": 0.45277093298545684, "grad_norm": 2.506699800491333, "learning_rate": 5.472290670145431e-07, "loss": 0.3347, "step": 9371 }, { "epoch": 0.45281924916654587, "grad_norm": 3.2474279403686523, "learning_rate": 5.47180750833454e-07, "loss": 0.2488, "step": 9372 }, { "epoch": 0.45286756534763495, "grad_norm": 4.566775798797607, "learning_rate": 5.47132434652365e-07, "loss": 0.4622, "step": 9373 }, { "epoch": 0.452915881528724, "grad_norm": 2.5074069499969482, "learning_rate": 5.47084118471276e-07, "loss": 0.2126, "step": 9374 }, { "epoch": 0.452964197709813, "grad_norm": 4.6934733390808105, "learning_rate": 5.47035802290187e-07, "loss": 0.2954, "step": 9375 }, { "epoch": 0.4530125138909021, "grad_norm": 4.187752723693848, "learning_rate": 5.46987486109098e-07, "loss": 0.338, "step": 9376 }, { "epoch": 0.4530608300719911, "grad_norm": 3.6472556591033936, "learning_rate": 5.469391699280088e-07, "loss": 0.3921, "step": 9377 }, { "epoch": 0.45310914625308013, "grad_norm": 2.8804612159729004, "learning_rate": 5.468908537469198e-07, "loss": 0.2807, "step": 9378 }, { "epoch": 0.4531574624341692, "grad_norm": 3.4821572303771973, "learning_rate": 5.468425375658308e-07, "loss": 0.2227, "step": 9379 }, { "epoch": 0.45320577861525824, "grad_norm": 3.201536178588867, "learning_rate": 5.467942213847417e-07, "loss": 0.3819, "step": 9380 }, { "epoch": 0.4532540947963473, "grad_norm": 5.008812427520752, "learning_rate": 5.467459052036527e-07, "loss": 0.3731, "step": 9381 }, { "epoch": 0.45330241097743634, "grad_norm": 7.890887260437012, "learning_rate": 5.466975890225636e-07, "loss": 0.3548, "step": 9382 }, { "epoch": 0.45335072715852537, "grad_norm": 2.8948214054107666, "learning_rate": 5.466492728414745e-07, "loss": 0.3666, "step": 9383 }, { "epoch": 0.45339904333961445, "grad_norm": 4.931232929229736, "learning_rate": 5.466009566603855e-07, "loss": 0.2506, "step": 9384 }, { "epoch": 0.4534473595207035, "grad_norm": 1.6295092105865479, "learning_rate": 5.465526404792965e-07, "loss": 0.1657, "step": 9385 }, { "epoch": 0.45349567570179256, "grad_norm": 2.5709564685821533, "learning_rate": 5.465043242982075e-07, "loss": 0.2444, "step": 9386 }, { "epoch": 0.4535439918828816, "grad_norm": 2.2876288890838623, "learning_rate": 5.464560081171185e-07, "loss": 0.2893, "step": 9387 }, { "epoch": 0.4535923080639706, "grad_norm": 4.65160608291626, "learning_rate": 5.464076919360293e-07, "loss": 0.3394, "step": 9388 }, { "epoch": 0.4536406242450597, "grad_norm": 2.561147451400757, "learning_rate": 5.463593757549403e-07, "loss": 0.2775, "step": 9389 }, { "epoch": 0.4536889404261487, "grad_norm": 7.9590911865234375, "learning_rate": 5.463110595738512e-07, "loss": 0.3483, "step": 9390 }, { "epoch": 0.45373725660723774, "grad_norm": 1.6311770677566528, "learning_rate": 5.462627433927622e-07, "loss": 0.1539, "step": 9391 }, { "epoch": 0.4537855727883268, "grad_norm": 2.0543200969696045, "learning_rate": 5.462144272116732e-07, "loss": 0.232, "step": 9392 }, { "epoch": 0.45383388896941584, "grad_norm": 8.372345924377441, "learning_rate": 5.461661110305841e-07, "loss": 0.3141, "step": 9393 }, { "epoch": 0.4538822051505049, "grad_norm": 4.096055030822754, "learning_rate": 5.461177948494951e-07, "loss": 0.2746, "step": 9394 }, { "epoch": 0.45393052133159395, "grad_norm": 1.9205065965652466, "learning_rate": 5.460694786684061e-07, "loss": 0.2762, "step": 9395 }, { "epoch": 0.453978837512683, "grad_norm": 5.1724348068237305, "learning_rate": 5.46021162487317e-07, "loss": 0.2975, "step": 9396 }, { "epoch": 0.45402715369377206, "grad_norm": 2.009376049041748, "learning_rate": 5.459728463062279e-07, "loss": 0.233, "step": 9397 }, { "epoch": 0.4540754698748611, "grad_norm": 4.04351806640625, "learning_rate": 5.459245301251388e-07, "loss": 0.2677, "step": 9398 }, { "epoch": 0.45412378605595016, "grad_norm": 2.9713752269744873, "learning_rate": 5.458762139440498e-07, "loss": 0.2436, "step": 9399 }, { "epoch": 0.4541721022370392, "grad_norm": 4.775551795959473, "learning_rate": 5.458278977629608e-07, "loss": 0.207, "step": 9400 }, { "epoch": 0.4542204184181282, "grad_norm": 2.104357957839966, "learning_rate": 5.457795815818718e-07, "loss": 0.2587, "step": 9401 }, { "epoch": 0.4542687345992173, "grad_norm": 2.3149759769439697, "learning_rate": 5.457312654007828e-07, "loss": 0.279, "step": 9402 }, { "epoch": 0.4543170507803063, "grad_norm": 2.6536879539489746, "learning_rate": 5.456829492196936e-07, "loss": 0.1413, "step": 9403 }, { "epoch": 0.45436536696139534, "grad_norm": 1.974143147468567, "learning_rate": 5.456346330386046e-07, "loss": 0.2158, "step": 9404 }, { "epoch": 0.4544136831424844, "grad_norm": 2.667985439300537, "learning_rate": 5.455863168575155e-07, "loss": 0.259, "step": 9405 }, { "epoch": 0.45446199932357345, "grad_norm": 2.2781245708465576, "learning_rate": 5.455380006764265e-07, "loss": 0.2228, "step": 9406 }, { "epoch": 0.45451031550466253, "grad_norm": 2.0640108585357666, "learning_rate": 5.454896844953375e-07, "loss": 0.2398, "step": 9407 }, { "epoch": 0.45455863168575156, "grad_norm": 2.3630242347717285, "learning_rate": 5.454413683142484e-07, "loss": 0.2381, "step": 9408 }, { "epoch": 0.4546069478668406, "grad_norm": 2.912729501724243, "learning_rate": 5.453930521331593e-07, "loss": 0.3072, "step": 9409 }, { "epoch": 0.45465526404792966, "grad_norm": 2.3347995281219482, "learning_rate": 5.453447359520703e-07, "loss": 0.2966, "step": 9410 }, { "epoch": 0.4547035802290187, "grad_norm": 2.6712448596954346, "learning_rate": 5.452964197709813e-07, "loss": 0.1808, "step": 9411 }, { "epoch": 0.45475189641010777, "grad_norm": 1.833724021911621, "learning_rate": 5.452481035898923e-07, "loss": 0.244, "step": 9412 }, { "epoch": 0.4548002125911968, "grad_norm": 2.783473014831543, "learning_rate": 5.451997874088033e-07, "loss": 0.3892, "step": 9413 }, { "epoch": 0.4548485287722858, "grad_norm": 20.425189971923828, "learning_rate": 5.451514712277141e-07, "loss": 0.2813, "step": 9414 }, { "epoch": 0.4548968449533749, "grad_norm": 3.626471757888794, "learning_rate": 5.45103155046625e-07, "loss": 0.3832, "step": 9415 }, { "epoch": 0.4549451611344639, "grad_norm": 1.7957584857940674, "learning_rate": 5.45054838865536e-07, "loss": 0.2082, "step": 9416 }, { "epoch": 0.45499347731555295, "grad_norm": 2.7114665508270264, "learning_rate": 5.45006522684447e-07, "loss": 0.3665, "step": 9417 }, { "epoch": 0.45504179349664203, "grad_norm": 4.781137466430664, "learning_rate": 5.44958206503358e-07, "loss": 0.3252, "step": 9418 }, { "epoch": 0.45509010967773106, "grad_norm": 2.399843215942383, "learning_rate": 5.449098903222689e-07, "loss": 0.2567, "step": 9419 }, { "epoch": 0.45513842585882014, "grad_norm": 22.973522186279297, "learning_rate": 5.448615741411799e-07, "loss": 0.2479, "step": 9420 }, { "epoch": 0.45518674203990916, "grad_norm": 8.045620918273926, "learning_rate": 5.448132579600909e-07, "loss": 0.2654, "step": 9421 }, { "epoch": 0.4552350582209982, "grad_norm": 4.238069534301758, "learning_rate": 5.447649417790017e-07, "loss": 0.2424, "step": 9422 }, { "epoch": 0.45528337440208727, "grad_norm": 2.170494556427002, "learning_rate": 5.447166255979127e-07, "loss": 0.3262, "step": 9423 }, { "epoch": 0.4553316905831763, "grad_norm": 3.1383509635925293, "learning_rate": 5.446683094168236e-07, "loss": 0.3202, "step": 9424 }, { "epoch": 0.4553800067642654, "grad_norm": 3.159533739089966, "learning_rate": 5.446199932357346e-07, "loss": 0.3107, "step": 9425 }, { "epoch": 0.4554283229453544, "grad_norm": 3.014681816101074, "learning_rate": 5.445716770546456e-07, "loss": 0.3477, "step": 9426 }, { "epoch": 0.45547663912644343, "grad_norm": 3.71301007270813, "learning_rate": 5.445233608735566e-07, "loss": 0.3003, "step": 9427 }, { "epoch": 0.4555249553075325, "grad_norm": 2.6234354972839355, "learning_rate": 5.444750446924675e-07, "loss": 0.3004, "step": 9428 }, { "epoch": 0.45557327148862153, "grad_norm": 4.95676851272583, "learning_rate": 5.444267285113784e-07, "loss": 0.3786, "step": 9429 }, { "epoch": 0.45562158766971056, "grad_norm": 2.6966564655303955, "learning_rate": 5.443784123302893e-07, "loss": 0.2993, "step": 9430 }, { "epoch": 0.45566990385079964, "grad_norm": 2.2557485103607178, "learning_rate": 5.443300961492003e-07, "loss": 0.3393, "step": 9431 }, { "epoch": 0.45571822003188867, "grad_norm": 2.4907314777374268, "learning_rate": 5.442817799681113e-07, "loss": 0.2895, "step": 9432 }, { "epoch": 0.45576653621297775, "grad_norm": 2.0453081130981445, "learning_rate": 5.442334637870223e-07, "loss": 0.1909, "step": 9433 }, { "epoch": 0.45581485239406677, "grad_norm": 1.656507968902588, "learning_rate": 5.441851476059332e-07, "loss": 0.156, "step": 9434 }, { "epoch": 0.4558631685751558, "grad_norm": 3.7084877490997314, "learning_rate": 5.441368314248441e-07, "loss": 0.422, "step": 9435 }, { "epoch": 0.4559114847562449, "grad_norm": 2.604797124862671, "learning_rate": 5.440885152437551e-07, "loss": 0.3093, "step": 9436 }, { "epoch": 0.4559598009373339, "grad_norm": 2.4447035789489746, "learning_rate": 5.440401990626661e-07, "loss": 0.2713, "step": 9437 }, { "epoch": 0.456008117118423, "grad_norm": 3.1431052684783936, "learning_rate": 5.439918828815771e-07, "loss": 0.2113, "step": 9438 }, { "epoch": 0.456056433299512, "grad_norm": 5.791109561920166, "learning_rate": 5.43943566700488e-07, "loss": 0.4575, "step": 9439 }, { "epoch": 0.45610474948060103, "grad_norm": 4.013237476348877, "learning_rate": 5.438952505193989e-07, "loss": 0.2346, "step": 9440 }, { "epoch": 0.4561530656616901, "grad_norm": 2.284518003463745, "learning_rate": 5.438469343383098e-07, "loss": 0.2398, "step": 9441 }, { "epoch": 0.45620138184277914, "grad_norm": 3.047419786453247, "learning_rate": 5.437986181572208e-07, "loss": 0.3764, "step": 9442 }, { "epoch": 0.45624969802386817, "grad_norm": 2.502312660217285, "learning_rate": 5.437503019761318e-07, "loss": 0.272, "step": 9443 }, { "epoch": 0.45629801420495725, "grad_norm": 3.429311513900757, "learning_rate": 5.437019857950428e-07, "loss": 0.3498, "step": 9444 }, { "epoch": 0.4563463303860463, "grad_norm": 2.369053602218628, "learning_rate": 5.436536696139537e-07, "loss": 0.3316, "step": 9445 }, { "epoch": 0.45639464656713535, "grad_norm": 2.782226324081421, "learning_rate": 5.436053534328647e-07, "loss": 0.2426, "step": 9446 }, { "epoch": 0.4564429627482244, "grad_norm": 4.765141487121582, "learning_rate": 5.435570372517755e-07, "loss": 0.4182, "step": 9447 }, { "epoch": 0.4564912789293134, "grad_norm": 2.629181146621704, "learning_rate": 5.435087210706865e-07, "loss": 0.3302, "step": 9448 }, { "epoch": 0.4565395951104025, "grad_norm": 2.8596365451812744, "learning_rate": 5.434604048895975e-07, "loss": 0.303, "step": 9449 }, { "epoch": 0.4565879112914915, "grad_norm": 3.0200977325439453, "learning_rate": 5.434120887085084e-07, "loss": 0.3418, "step": 9450 }, { "epoch": 0.4566362274725806, "grad_norm": 2.062931537628174, "learning_rate": 5.433637725274194e-07, "loss": 0.2205, "step": 9451 }, { "epoch": 0.4566845436536696, "grad_norm": 2.5204849243164062, "learning_rate": 5.433154563463304e-07, "loss": 0.2728, "step": 9452 }, { "epoch": 0.45673285983475864, "grad_norm": 5.336310863494873, "learning_rate": 5.432671401652414e-07, "loss": 0.2971, "step": 9453 }, { "epoch": 0.4567811760158477, "grad_norm": 3.182648181915283, "learning_rate": 5.432188239841523e-07, "loss": 0.3299, "step": 9454 }, { "epoch": 0.45682949219693675, "grad_norm": 2.709824323654175, "learning_rate": 5.431705078030631e-07, "loss": 0.3525, "step": 9455 }, { "epoch": 0.4568778083780258, "grad_norm": 2.2983882427215576, "learning_rate": 5.431221916219741e-07, "loss": 0.2579, "step": 9456 }, { "epoch": 0.45692612455911485, "grad_norm": 6.94779634475708, "learning_rate": 5.430738754408851e-07, "loss": 0.4366, "step": 9457 }, { "epoch": 0.4569744407402039, "grad_norm": 2.9610180854797363, "learning_rate": 5.430255592597961e-07, "loss": 0.3463, "step": 9458 }, { "epoch": 0.45702275692129296, "grad_norm": 2.671098232269287, "learning_rate": 5.429772430787071e-07, "loss": 0.2831, "step": 9459 }, { "epoch": 0.457071073102382, "grad_norm": 2.555485248565674, "learning_rate": 5.429289268976179e-07, "loss": 0.2971, "step": 9460 }, { "epoch": 0.457119389283471, "grad_norm": 2.87697696685791, "learning_rate": 5.428806107165289e-07, "loss": 0.3985, "step": 9461 }, { "epoch": 0.4571677054645601, "grad_norm": 2.452171802520752, "learning_rate": 5.428322945354399e-07, "loss": 0.2556, "step": 9462 }, { "epoch": 0.4572160216456491, "grad_norm": 2.523897886276245, "learning_rate": 5.427839783543509e-07, "loss": 0.2922, "step": 9463 }, { "epoch": 0.4572643378267382, "grad_norm": 3.336045265197754, "learning_rate": 5.427356621732618e-07, "loss": 0.4036, "step": 9464 }, { "epoch": 0.4573126540078272, "grad_norm": 16.102602005004883, "learning_rate": 5.426873459921728e-07, "loss": 0.3017, "step": 9465 }, { "epoch": 0.45736097018891625, "grad_norm": 2.3647310733795166, "learning_rate": 5.426390298110837e-07, "loss": 0.2375, "step": 9466 }, { "epoch": 0.45740928637000533, "grad_norm": 4.266229629516602, "learning_rate": 5.425907136299946e-07, "loss": 0.4203, "step": 9467 }, { "epoch": 0.45745760255109436, "grad_norm": 2.4588780403137207, "learning_rate": 5.425423974489056e-07, "loss": 0.3362, "step": 9468 }, { "epoch": 0.4575059187321834, "grad_norm": 5.210412979125977, "learning_rate": 5.424940812678166e-07, "loss": 0.2521, "step": 9469 }, { "epoch": 0.45755423491327246, "grad_norm": 1.9822102785110474, "learning_rate": 5.424457650867276e-07, "loss": 0.2162, "step": 9470 }, { "epoch": 0.4576025510943615, "grad_norm": 5.2418437004089355, "learning_rate": 5.423974489056385e-07, "loss": 0.3889, "step": 9471 }, { "epoch": 0.45765086727545057, "grad_norm": 2.778139114379883, "learning_rate": 5.423491327245495e-07, "loss": 0.2725, "step": 9472 }, { "epoch": 0.4576991834565396, "grad_norm": 4.403974533081055, "learning_rate": 5.423008165434603e-07, "loss": 0.1942, "step": 9473 }, { "epoch": 0.4577474996376286, "grad_norm": 2.0684895515441895, "learning_rate": 5.422525003623713e-07, "loss": 0.2596, "step": 9474 }, { "epoch": 0.4577958158187177, "grad_norm": 1.850003719329834, "learning_rate": 5.422041841812823e-07, "loss": 0.2053, "step": 9475 }, { "epoch": 0.4578441319998067, "grad_norm": 2.3858771324157715, "learning_rate": 5.421558680001932e-07, "loss": 0.3354, "step": 9476 }, { "epoch": 0.4578924481808958, "grad_norm": 1.6414870023727417, "learning_rate": 5.421075518191042e-07, "loss": 0.1709, "step": 9477 }, { "epoch": 0.45794076436198483, "grad_norm": 6.498431205749512, "learning_rate": 5.420592356380152e-07, "loss": 0.3186, "step": 9478 }, { "epoch": 0.45798908054307386, "grad_norm": 1.992722988128662, "learning_rate": 5.420109194569261e-07, "loss": 0.1782, "step": 9479 }, { "epoch": 0.45803739672416294, "grad_norm": 2.295173168182373, "learning_rate": 5.419626032758371e-07, "loss": 0.2339, "step": 9480 }, { "epoch": 0.45808571290525196, "grad_norm": 3.0119612216949463, "learning_rate": 5.419142870947479e-07, "loss": 0.3786, "step": 9481 }, { "epoch": 0.458134029086341, "grad_norm": 2.9244275093078613, "learning_rate": 5.418659709136589e-07, "loss": 0.3701, "step": 9482 }, { "epoch": 0.45818234526743007, "grad_norm": 2.183224678039551, "learning_rate": 5.418176547325699e-07, "loss": 0.2487, "step": 9483 }, { "epoch": 0.4582306614485191, "grad_norm": 3.033630609512329, "learning_rate": 5.417693385514809e-07, "loss": 0.2825, "step": 9484 }, { "epoch": 0.4582789776296082, "grad_norm": 1.7661967277526855, "learning_rate": 5.417210223703919e-07, "loss": 0.1642, "step": 9485 }, { "epoch": 0.4583272938106972, "grad_norm": 2.719972848892212, "learning_rate": 5.416727061893027e-07, "loss": 0.1755, "step": 9486 }, { "epoch": 0.4583756099917862, "grad_norm": 2.9076061248779297, "learning_rate": 5.416243900082137e-07, "loss": 0.233, "step": 9487 }, { "epoch": 0.4584239261728753, "grad_norm": 4.898863315582275, "learning_rate": 5.415760738271247e-07, "loss": 0.2019, "step": 9488 }, { "epoch": 0.45847224235396433, "grad_norm": 3.3431622982025146, "learning_rate": 5.415277576460357e-07, "loss": 0.2914, "step": 9489 }, { "epoch": 0.4585205585350534, "grad_norm": 3.2590205669403076, "learning_rate": 5.414794414649466e-07, "loss": 0.3672, "step": 9490 }, { "epoch": 0.45856887471614244, "grad_norm": 3.8999264240264893, "learning_rate": 5.414311252838576e-07, "loss": 0.345, "step": 9491 }, { "epoch": 0.45861719089723146, "grad_norm": 2.3053414821624756, "learning_rate": 5.413828091027684e-07, "loss": 0.2822, "step": 9492 }, { "epoch": 0.45866550707832054, "grad_norm": 3.034499168395996, "learning_rate": 5.413344929216794e-07, "loss": 0.2745, "step": 9493 }, { "epoch": 0.45871382325940957, "grad_norm": 3.063062906265259, "learning_rate": 5.412861767405904e-07, "loss": 0.3916, "step": 9494 }, { "epoch": 0.4587621394404986, "grad_norm": 3.7991034984588623, "learning_rate": 5.412378605595014e-07, "loss": 0.2677, "step": 9495 }, { "epoch": 0.4588104556215877, "grad_norm": 2.420090675354004, "learning_rate": 5.411895443784124e-07, "loss": 0.2535, "step": 9496 }, { "epoch": 0.4588587718026767, "grad_norm": 2.2777249813079834, "learning_rate": 5.411412281973233e-07, "loss": 0.2585, "step": 9497 }, { "epoch": 0.4589070879837658, "grad_norm": 1.9871577024459839, "learning_rate": 5.410929120162342e-07, "loss": 0.1966, "step": 9498 }, { "epoch": 0.4589554041648548, "grad_norm": 2.050851583480835, "learning_rate": 5.410445958351451e-07, "loss": 0.2071, "step": 9499 }, { "epoch": 0.45900372034594383, "grad_norm": 3.014166831970215, "learning_rate": 5.409962796540561e-07, "loss": 0.3665, "step": 9500 }, { "epoch": 0.4590520365270329, "grad_norm": 2.6167492866516113, "learning_rate": 5.409479634729671e-07, "loss": 0.2478, "step": 9501 }, { "epoch": 0.45910035270812194, "grad_norm": 2.1096925735473633, "learning_rate": 5.40899647291878e-07, "loss": 0.1917, "step": 9502 }, { "epoch": 0.459148668889211, "grad_norm": 2.7821805477142334, "learning_rate": 5.40851331110789e-07, "loss": 0.2587, "step": 9503 }, { "epoch": 0.45919698507030005, "grad_norm": 3.3606159687042236, "learning_rate": 5.408030149297e-07, "loss": 0.1983, "step": 9504 }, { "epoch": 0.45924530125138907, "grad_norm": 2.8542943000793457, "learning_rate": 5.407546987486109e-07, "loss": 0.3413, "step": 9505 }, { "epoch": 0.45929361743247815, "grad_norm": 1.9902379512786865, "learning_rate": 5.407063825675218e-07, "loss": 0.223, "step": 9506 }, { "epoch": 0.4593419336135672, "grad_norm": 7.8377251625061035, "learning_rate": 5.406580663864327e-07, "loss": 0.4869, "step": 9507 }, { "epoch": 0.45939024979465626, "grad_norm": 2.0829243659973145, "learning_rate": 5.406097502053437e-07, "loss": 0.227, "step": 9508 }, { "epoch": 0.4594385659757453, "grad_norm": 2.4245333671569824, "learning_rate": 5.405614340242547e-07, "loss": 0.2342, "step": 9509 }, { "epoch": 0.4594868821568343, "grad_norm": 5.295287609100342, "learning_rate": 5.405131178431657e-07, "loss": 0.3418, "step": 9510 }, { "epoch": 0.4595351983379234, "grad_norm": 2.744076728820801, "learning_rate": 5.404648016620767e-07, "loss": 0.3184, "step": 9511 }, { "epoch": 0.4595835145190124, "grad_norm": 5.682882308959961, "learning_rate": 5.404164854809875e-07, "loss": 0.2814, "step": 9512 }, { "epoch": 0.45963183070010144, "grad_norm": 3.6627349853515625, "learning_rate": 5.403681692998985e-07, "loss": 0.2743, "step": 9513 }, { "epoch": 0.4596801468811905, "grad_norm": 3.028613328933716, "learning_rate": 5.403198531188095e-07, "loss": 0.3275, "step": 9514 }, { "epoch": 0.45972846306227955, "grad_norm": 4.868985652923584, "learning_rate": 5.402715369377204e-07, "loss": 0.4414, "step": 9515 }, { "epoch": 0.4597767792433686, "grad_norm": 4.142775535583496, "learning_rate": 5.402232207566314e-07, "loss": 0.2289, "step": 9516 }, { "epoch": 0.45982509542445765, "grad_norm": 3.520561933517456, "learning_rate": 5.401749045755424e-07, "loss": 0.2337, "step": 9517 }, { "epoch": 0.4598734116055467, "grad_norm": 2.47373104095459, "learning_rate": 5.401265883944532e-07, "loss": 0.318, "step": 9518 }, { "epoch": 0.45992172778663576, "grad_norm": 2.011308193206787, "learning_rate": 5.400782722133642e-07, "loss": 0.2495, "step": 9519 }, { "epoch": 0.4599700439677248, "grad_norm": 2.431621551513672, "learning_rate": 5.400299560322752e-07, "loss": 0.2471, "step": 9520 }, { "epoch": 0.46001836014881387, "grad_norm": 4.709614276885986, "learning_rate": 5.399816398511862e-07, "loss": 0.2973, "step": 9521 }, { "epoch": 0.4600666763299029, "grad_norm": 2.828984022140503, "learning_rate": 5.399333236700972e-07, "loss": 0.3462, "step": 9522 }, { "epoch": 0.4601149925109919, "grad_norm": 2.975724458694458, "learning_rate": 5.39885007489008e-07, "loss": 0.3491, "step": 9523 }, { "epoch": 0.460163308692081, "grad_norm": 2.0225741863250732, "learning_rate": 5.398366913079189e-07, "loss": 0.2249, "step": 9524 }, { "epoch": 0.46021162487317, "grad_norm": 3.7145614624023438, "learning_rate": 5.397883751268299e-07, "loss": 0.2586, "step": 9525 }, { "epoch": 0.46025994105425905, "grad_norm": 2.4814677238464355, "learning_rate": 5.397400589457409e-07, "loss": 0.1739, "step": 9526 }, { "epoch": 0.46030825723534813, "grad_norm": 2.703550338745117, "learning_rate": 5.396917427646519e-07, "loss": 0.3569, "step": 9527 }, { "epoch": 0.46035657341643715, "grad_norm": 7.091141700744629, "learning_rate": 5.396434265835628e-07, "loss": 0.2374, "step": 9528 }, { "epoch": 0.46040488959752623, "grad_norm": 3.165423631668091, "learning_rate": 5.395951104024738e-07, "loss": 0.3346, "step": 9529 }, { "epoch": 0.46045320577861526, "grad_norm": 2.565263032913208, "learning_rate": 5.395467942213848e-07, "loss": 0.2581, "step": 9530 }, { "epoch": 0.4605015219597043, "grad_norm": 3.347851276397705, "learning_rate": 5.394984780402957e-07, "loss": 0.3745, "step": 9531 }, { "epoch": 0.46054983814079337, "grad_norm": 1.9564415216445923, "learning_rate": 5.394501618592066e-07, "loss": 0.1662, "step": 9532 }, { "epoch": 0.4605981543218824, "grad_norm": 4.313721656799316, "learning_rate": 5.394018456781175e-07, "loss": 0.2274, "step": 9533 }, { "epoch": 0.4606464705029715, "grad_norm": 3.091630697250366, "learning_rate": 5.393535294970285e-07, "loss": 0.2625, "step": 9534 }, { "epoch": 0.4606947866840605, "grad_norm": 1.9586923122406006, "learning_rate": 5.393052133159395e-07, "loss": 0.2498, "step": 9535 }, { "epoch": 0.4607431028651495, "grad_norm": 3.695852756500244, "learning_rate": 5.392568971348505e-07, "loss": 0.267, "step": 9536 }, { "epoch": 0.4607914190462386, "grad_norm": 5.607787132263184, "learning_rate": 5.392085809537614e-07, "loss": 0.2184, "step": 9537 }, { "epoch": 0.46083973522732763, "grad_norm": 3.087134838104248, "learning_rate": 5.391602647726723e-07, "loss": 0.4986, "step": 9538 }, { "epoch": 0.46088805140841665, "grad_norm": 2.732372760772705, "learning_rate": 5.391119485915833e-07, "loss": 0.2897, "step": 9539 }, { "epoch": 0.46093636758950574, "grad_norm": 4.5262370109558105, "learning_rate": 5.390636324104942e-07, "loss": 0.2851, "step": 9540 }, { "epoch": 0.46098468377059476, "grad_norm": 3.4740922451019287, "learning_rate": 5.390153162294052e-07, "loss": 0.3122, "step": 9541 }, { "epoch": 0.46103299995168384, "grad_norm": 3.949676513671875, "learning_rate": 5.389670000483162e-07, "loss": 0.2311, "step": 9542 }, { "epoch": 0.46108131613277287, "grad_norm": 3.4701805114746094, "learning_rate": 5.389186838672272e-07, "loss": 0.3614, "step": 9543 }, { "epoch": 0.4611296323138619, "grad_norm": 2.2421071529388428, "learning_rate": 5.38870367686138e-07, "loss": 0.2595, "step": 9544 }, { "epoch": 0.461177948494951, "grad_norm": 2.618764638900757, "learning_rate": 5.38822051505049e-07, "loss": 0.2709, "step": 9545 }, { "epoch": 0.46122626467604, "grad_norm": 9.158141136169434, "learning_rate": 5.3877373532396e-07, "loss": 0.3513, "step": 9546 }, { "epoch": 0.4612745808571291, "grad_norm": 1.9755139350891113, "learning_rate": 5.38725419142871e-07, "loss": 0.1571, "step": 9547 }, { "epoch": 0.4613228970382181, "grad_norm": 3.084880828857422, "learning_rate": 5.38677102961782e-07, "loss": 0.4064, "step": 9548 }, { "epoch": 0.46137121321930713, "grad_norm": 2.534787893295288, "learning_rate": 5.386287867806928e-07, "loss": 0.2679, "step": 9549 }, { "epoch": 0.4614195294003962, "grad_norm": 3.1170034408569336, "learning_rate": 5.385804705996037e-07, "loss": 0.3365, "step": 9550 }, { "epoch": 0.46146784558148524, "grad_norm": 2.362269163131714, "learning_rate": 5.385321544185147e-07, "loss": 0.2869, "step": 9551 }, { "epoch": 0.46151616176257426, "grad_norm": 4.021111965179443, "learning_rate": 5.384838382374257e-07, "loss": 0.2437, "step": 9552 }, { "epoch": 0.46156447794366334, "grad_norm": 1.9351341724395752, "learning_rate": 5.384355220563367e-07, "loss": 0.232, "step": 9553 }, { "epoch": 0.46161279412475237, "grad_norm": 3.555060386657715, "learning_rate": 5.383872058752476e-07, "loss": 0.3061, "step": 9554 }, { "epoch": 0.46166111030584145, "grad_norm": 2.7690351009368896, "learning_rate": 5.383388896941586e-07, "loss": 0.2999, "step": 9555 }, { "epoch": 0.4617094264869305, "grad_norm": 2.479774236679077, "learning_rate": 5.382905735130695e-07, "loss": 0.2761, "step": 9556 }, { "epoch": 0.4617577426680195, "grad_norm": 3.194551706314087, "learning_rate": 5.382422573319804e-07, "loss": 0.4379, "step": 9557 }, { "epoch": 0.4618060588491086, "grad_norm": 2.6035022735595703, "learning_rate": 5.381939411508914e-07, "loss": 0.3883, "step": 9558 }, { "epoch": 0.4618543750301976, "grad_norm": 3.464738368988037, "learning_rate": 5.381456249698023e-07, "loss": 0.3288, "step": 9559 }, { "epoch": 0.4619026912112867, "grad_norm": 3.232168197631836, "learning_rate": 5.380973087887133e-07, "loss": 0.2415, "step": 9560 }, { "epoch": 0.4619510073923757, "grad_norm": 3.061816930770874, "learning_rate": 5.380489926076243e-07, "loss": 0.3933, "step": 9561 }, { "epoch": 0.46199932357346474, "grad_norm": 3.446826457977295, "learning_rate": 5.380006764265353e-07, "loss": 0.2534, "step": 9562 }, { "epoch": 0.4620476397545538, "grad_norm": 4.132601737976074, "learning_rate": 5.379523602454462e-07, "loss": 0.294, "step": 9563 }, { "epoch": 0.46209595593564284, "grad_norm": 4.935443878173828, "learning_rate": 5.379040440643571e-07, "loss": 0.2912, "step": 9564 }, { "epoch": 0.46214427211673187, "grad_norm": 3.8231887817382812, "learning_rate": 5.37855727883268e-07, "loss": 0.4303, "step": 9565 }, { "epoch": 0.46219258829782095, "grad_norm": 3.481806993484497, "learning_rate": 5.37807411702179e-07, "loss": 0.3641, "step": 9566 }, { "epoch": 0.46224090447891, "grad_norm": 2.9469592571258545, "learning_rate": 5.3775909552109e-07, "loss": 0.2636, "step": 9567 }, { "epoch": 0.46228922065999906, "grad_norm": 2.188758373260498, "learning_rate": 5.37710779340001e-07, "loss": 0.2578, "step": 9568 }, { "epoch": 0.4623375368410881, "grad_norm": 2.794524669647217, "learning_rate": 5.376624631589119e-07, "loss": 0.3422, "step": 9569 }, { "epoch": 0.4623858530221771, "grad_norm": 3.197094440460205, "learning_rate": 5.376141469778228e-07, "loss": 0.3819, "step": 9570 }, { "epoch": 0.4624341692032662, "grad_norm": 3.3018152713775635, "learning_rate": 5.375658307967338e-07, "loss": 0.4201, "step": 9571 }, { "epoch": 0.4624824853843552, "grad_norm": 2.6036810874938965, "learning_rate": 5.375175146156448e-07, "loss": 0.3276, "step": 9572 }, { "epoch": 0.4625308015654443, "grad_norm": 3.68178653717041, "learning_rate": 5.374691984345558e-07, "loss": 0.3232, "step": 9573 }, { "epoch": 0.4625791177465333, "grad_norm": 2.5962164402008057, "learning_rate": 5.374208822534667e-07, "loss": 0.346, "step": 9574 }, { "epoch": 0.46262743392762234, "grad_norm": 3.494417905807495, "learning_rate": 5.373725660723775e-07, "loss": 0.4322, "step": 9575 }, { "epoch": 0.4626757501087114, "grad_norm": 3.4574735164642334, "learning_rate": 5.373242498912885e-07, "loss": 0.2585, "step": 9576 }, { "epoch": 0.46272406628980045, "grad_norm": 2.621410369873047, "learning_rate": 5.372759337101995e-07, "loss": 0.298, "step": 9577 }, { "epoch": 0.4627723824708895, "grad_norm": 1.9979360103607178, "learning_rate": 5.372276175291105e-07, "loss": 0.2328, "step": 9578 }, { "epoch": 0.46282069865197856, "grad_norm": 4.097740173339844, "learning_rate": 5.371793013480215e-07, "loss": 0.2814, "step": 9579 }, { "epoch": 0.4628690148330676, "grad_norm": 1.6115484237670898, "learning_rate": 5.371309851669324e-07, "loss": 0.1669, "step": 9580 }, { "epoch": 0.46291733101415666, "grad_norm": 4.155736446380615, "learning_rate": 5.370826689858434e-07, "loss": 0.4689, "step": 9581 }, { "epoch": 0.4629656471952457, "grad_norm": 2.3797359466552734, "learning_rate": 5.370343528047542e-07, "loss": 0.1993, "step": 9582 }, { "epoch": 0.4630139633763347, "grad_norm": 3.0590708255767822, "learning_rate": 5.369860366236652e-07, "loss": 0.3749, "step": 9583 }, { "epoch": 0.4630622795574238, "grad_norm": 2.3442652225494385, "learning_rate": 5.369377204425762e-07, "loss": 0.2767, "step": 9584 }, { "epoch": 0.4631105957385128, "grad_norm": 2.7161529064178467, "learning_rate": 5.368894042614871e-07, "loss": 0.2591, "step": 9585 }, { "epoch": 0.4631589119196019, "grad_norm": 4.012360095977783, "learning_rate": 5.368410880803981e-07, "loss": 0.4644, "step": 9586 }, { "epoch": 0.4632072281006909, "grad_norm": 3.1049485206604004, "learning_rate": 5.367927718993091e-07, "loss": 0.2797, "step": 9587 }, { "epoch": 0.46325554428177995, "grad_norm": 4.411696910858154, "learning_rate": 5.3674445571822e-07, "loss": 0.1501, "step": 9588 }, { "epoch": 0.46330386046286903, "grad_norm": 2.54687237739563, "learning_rate": 5.36696139537131e-07, "loss": 0.2685, "step": 9589 }, { "epoch": 0.46335217664395806, "grad_norm": 2.714468479156494, "learning_rate": 5.366478233560419e-07, "loss": 0.3059, "step": 9590 }, { "epoch": 0.4634004928250471, "grad_norm": 3.4333393573760986, "learning_rate": 5.365995071749528e-07, "loss": 0.2455, "step": 9591 }, { "epoch": 0.46344880900613616, "grad_norm": 3.0772762298583984, "learning_rate": 5.365511909938638e-07, "loss": 0.3407, "step": 9592 }, { "epoch": 0.4634971251872252, "grad_norm": 2.2611961364746094, "learning_rate": 5.365028748127748e-07, "loss": 0.2399, "step": 9593 }, { "epoch": 0.46354544136831427, "grad_norm": 9.834680557250977, "learning_rate": 5.364545586316858e-07, "loss": 0.368, "step": 9594 }, { "epoch": 0.4635937575494033, "grad_norm": 3.123061418533325, "learning_rate": 5.364062424505967e-07, "loss": 0.3704, "step": 9595 }, { "epoch": 0.4636420737304923, "grad_norm": 2.055025100708008, "learning_rate": 5.363579262695076e-07, "loss": 0.2416, "step": 9596 }, { "epoch": 0.4636903899115814, "grad_norm": 2.130934238433838, "learning_rate": 5.363096100884186e-07, "loss": 0.2341, "step": 9597 }, { "epoch": 0.4637387060926704, "grad_norm": 2.599738597869873, "learning_rate": 5.362612939073296e-07, "loss": 0.2355, "step": 9598 }, { "epoch": 0.4637870222737595, "grad_norm": 2.60591459274292, "learning_rate": 5.362129777262406e-07, "loss": 0.2754, "step": 9599 }, { "epoch": 0.46383533845484853, "grad_norm": 2.9667365550994873, "learning_rate": 5.361646615451515e-07, "loss": 0.3161, "step": 9600 }, { "epoch": 0.46388365463593756, "grad_norm": 4.198695659637451, "learning_rate": 5.361163453640623e-07, "loss": 0.2406, "step": 9601 }, { "epoch": 0.46393197081702664, "grad_norm": 3.0527093410491943, "learning_rate": 5.360680291829733e-07, "loss": 0.3448, "step": 9602 }, { "epoch": 0.46398028699811567, "grad_norm": 1.9696887731552124, "learning_rate": 5.360197130018843e-07, "loss": 0.2499, "step": 9603 }, { "epoch": 0.4640286031792047, "grad_norm": 2.660996198654175, "learning_rate": 5.359713968207953e-07, "loss": 0.2008, "step": 9604 }, { "epoch": 0.46407691936029377, "grad_norm": 2.130284070968628, "learning_rate": 5.359230806397063e-07, "loss": 0.2184, "step": 9605 }, { "epoch": 0.4641252355413828, "grad_norm": 1.9334394931793213, "learning_rate": 5.358747644586172e-07, "loss": 0.1864, "step": 9606 }, { "epoch": 0.4641735517224719, "grad_norm": 2.9102020263671875, "learning_rate": 5.35826448277528e-07, "loss": 0.3157, "step": 9607 }, { "epoch": 0.4642218679035609, "grad_norm": 12.986628532409668, "learning_rate": 5.35778132096439e-07, "loss": 0.2588, "step": 9608 }, { "epoch": 0.46427018408464993, "grad_norm": 2.4304442405700684, "learning_rate": 5.3572981591535e-07, "loss": 0.2313, "step": 9609 }, { "epoch": 0.464318500265739, "grad_norm": 8.986401557922363, "learning_rate": 5.35681499734261e-07, "loss": 0.3571, "step": 9610 }, { "epoch": 0.46436681644682803, "grad_norm": 3.3929288387298584, "learning_rate": 5.356331835531719e-07, "loss": 0.1836, "step": 9611 }, { "epoch": 0.4644151326279171, "grad_norm": 3.20521879196167, "learning_rate": 5.355848673720829e-07, "loss": 0.3699, "step": 9612 }, { "epoch": 0.46446344880900614, "grad_norm": 2.3877832889556885, "learning_rate": 5.355365511909939e-07, "loss": 0.2343, "step": 9613 }, { "epoch": 0.46451176499009517, "grad_norm": 3.1696877479553223, "learning_rate": 5.354882350099048e-07, "loss": 0.4175, "step": 9614 }, { "epoch": 0.46456008117118425, "grad_norm": 2.4880945682525635, "learning_rate": 5.354399188288158e-07, "loss": 0.3179, "step": 9615 }, { "epoch": 0.4646083973522733, "grad_norm": 2.8036491870880127, "learning_rate": 5.353916026477266e-07, "loss": 0.3042, "step": 9616 }, { "epoch": 0.4646567135333623, "grad_norm": 2.5199410915374756, "learning_rate": 5.353432864666376e-07, "loss": 0.2634, "step": 9617 }, { "epoch": 0.4647050297144514, "grad_norm": 3.2800235748291016, "learning_rate": 5.352949702855486e-07, "loss": 0.3623, "step": 9618 }, { "epoch": 0.4647533458955404, "grad_norm": 1.8799104690551758, "learning_rate": 5.352466541044596e-07, "loss": 0.1993, "step": 9619 }, { "epoch": 0.4648016620766295, "grad_norm": 2.5206456184387207, "learning_rate": 5.351983379233705e-07, "loss": 0.3782, "step": 9620 }, { "epoch": 0.4648499782577185, "grad_norm": 3.083625078201294, "learning_rate": 5.351500217422815e-07, "loss": 0.2376, "step": 9621 }, { "epoch": 0.46489829443880754, "grad_norm": 2.514413595199585, "learning_rate": 5.351017055611924e-07, "loss": 0.2904, "step": 9622 }, { "epoch": 0.4649466106198966, "grad_norm": 2.6457626819610596, "learning_rate": 5.350533893801034e-07, "loss": 0.3089, "step": 9623 }, { "epoch": 0.46499492680098564, "grad_norm": 1.9015651941299438, "learning_rate": 5.350050731990144e-07, "loss": 0.2288, "step": 9624 }, { "epoch": 0.4650432429820747, "grad_norm": 2.7274789810180664, "learning_rate": 5.349567570179253e-07, "loss": 0.26, "step": 9625 }, { "epoch": 0.46509155916316375, "grad_norm": 2.0837132930755615, "learning_rate": 5.349084408368363e-07, "loss": 0.1912, "step": 9626 }, { "epoch": 0.4651398753442528, "grad_norm": 3.33566951751709, "learning_rate": 5.348601246557471e-07, "loss": 0.3717, "step": 9627 }, { "epoch": 0.46518819152534185, "grad_norm": 3.2919418811798096, "learning_rate": 5.348118084746581e-07, "loss": 0.4883, "step": 9628 }, { "epoch": 0.4652365077064309, "grad_norm": 2.7439486980438232, "learning_rate": 5.347634922935691e-07, "loss": 0.2888, "step": 9629 }, { "epoch": 0.4652848238875199, "grad_norm": 2.967517852783203, "learning_rate": 5.347151761124801e-07, "loss": 0.3321, "step": 9630 }, { "epoch": 0.465333140068609, "grad_norm": 2.8287360668182373, "learning_rate": 5.346668599313911e-07, "loss": 0.3333, "step": 9631 }, { "epoch": 0.465381456249698, "grad_norm": 2.76839542388916, "learning_rate": 5.34618543750302e-07, "loss": 0.2072, "step": 9632 }, { "epoch": 0.4654297724307871, "grad_norm": 2.142778158187866, "learning_rate": 5.345702275692128e-07, "loss": 0.2103, "step": 9633 }, { "epoch": 0.4654780886118761, "grad_norm": 3.1030473709106445, "learning_rate": 5.345219113881238e-07, "loss": 0.4351, "step": 9634 }, { "epoch": 0.46552640479296514, "grad_norm": 3.221294641494751, "learning_rate": 5.344735952070348e-07, "loss": 0.3466, "step": 9635 }, { "epoch": 0.4655747209740542, "grad_norm": 5.185123443603516, "learning_rate": 5.344252790259458e-07, "loss": 0.393, "step": 9636 }, { "epoch": 0.46562303715514325, "grad_norm": 2.9354031085968018, "learning_rate": 5.343769628448567e-07, "loss": 0.2299, "step": 9637 }, { "epoch": 0.46567135333623233, "grad_norm": 2.997206687927246, "learning_rate": 5.343286466637677e-07, "loss": 0.3929, "step": 9638 }, { "epoch": 0.46571966951732136, "grad_norm": 2.3130321502685547, "learning_rate": 5.342803304826786e-07, "loss": 0.3042, "step": 9639 }, { "epoch": 0.4657679856984104, "grad_norm": 3.2362329959869385, "learning_rate": 5.342320143015896e-07, "loss": 0.3342, "step": 9640 }, { "epoch": 0.46581630187949946, "grad_norm": 3.872004270553589, "learning_rate": 5.341836981205006e-07, "loss": 0.2769, "step": 9641 }, { "epoch": 0.4658646180605885, "grad_norm": 2.0038819313049316, "learning_rate": 5.341353819394114e-07, "loss": 0.1899, "step": 9642 }, { "epoch": 0.4659129342416775, "grad_norm": 2.5114338397979736, "learning_rate": 5.340870657583224e-07, "loss": 0.1649, "step": 9643 }, { "epoch": 0.4659612504227666, "grad_norm": 2.589992046356201, "learning_rate": 5.340387495772334e-07, "loss": 0.3334, "step": 9644 }, { "epoch": 0.4660095666038556, "grad_norm": 3.0719597339630127, "learning_rate": 5.339904333961444e-07, "loss": 0.4254, "step": 9645 }, { "epoch": 0.4660578827849447, "grad_norm": 7.170373439788818, "learning_rate": 5.339421172150553e-07, "loss": 0.3192, "step": 9646 }, { "epoch": 0.4661061989660337, "grad_norm": 3.4447262287139893, "learning_rate": 5.338938010339662e-07, "loss": 0.5351, "step": 9647 }, { "epoch": 0.46615451514712275, "grad_norm": 3.4950759410858154, "learning_rate": 5.338454848528772e-07, "loss": 0.2431, "step": 9648 }, { "epoch": 0.46620283132821183, "grad_norm": 2.928277015686035, "learning_rate": 5.337971686717882e-07, "loss": 0.402, "step": 9649 }, { "epoch": 0.46625114750930086, "grad_norm": 2.2131478786468506, "learning_rate": 5.337488524906991e-07, "loss": 0.2295, "step": 9650 }, { "epoch": 0.46629946369038994, "grad_norm": 2.208604335784912, "learning_rate": 5.337005363096101e-07, "loss": 0.2065, "step": 9651 }, { "epoch": 0.46634777987147896, "grad_norm": 2.821690082550049, "learning_rate": 5.33652220128521e-07, "loss": 0.354, "step": 9652 }, { "epoch": 0.466396096052568, "grad_norm": 4.102231025695801, "learning_rate": 5.336039039474319e-07, "loss": 0.2834, "step": 9653 }, { "epoch": 0.46644441223365707, "grad_norm": 2.498227834701538, "learning_rate": 5.335555877663429e-07, "loss": 0.1766, "step": 9654 }, { "epoch": 0.4664927284147461, "grad_norm": 3.2882838249206543, "learning_rate": 5.335072715852539e-07, "loss": 0.3743, "step": 9655 }, { "epoch": 0.4665410445958351, "grad_norm": 5.452943801879883, "learning_rate": 5.334589554041649e-07, "loss": 0.2644, "step": 9656 }, { "epoch": 0.4665893607769242, "grad_norm": 2.1562907695770264, "learning_rate": 5.334106392230759e-07, "loss": 0.2092, "step": 9657 }, { "epoch": 0.4666376769580132, "grad_norm": 2.1212759017944336, "learning_rate": 5.333623230419866e-07, "loss": 0.2312, "step": 9658 }, { "epoch": 0.4666859931391023, "grad_norm": 2.3097686767578125, "learning_rate": 5.333140068608976e-07, "loss": 0.28, "step": 9659 }, { "epoch": 0.46673430932019133, "grad_norm": 1.9546364545822144, "learning_rate": 5.332656906798086e-07, "loss": 0.1606, "step": 9660 }, { "epoch": 0.46678262550128036, "grad_norm": 1.8564585447311401, "learning_rate": 5.332173744987196e-07, "loss": 0.1376, "step": 9661 }, { "epoch": 0.46683094168236944, "grad_norm": 1.1742914915084839, "learning_rate": 5.331690583176306e-07, "loss": 0.1179, "step": 9662 }, { "epoch": 0.46687925786345846, "grad_norm": 2.221682071685791, "learning_rate": 5.331207421365415e-07, "loss": 0.2691, "step": 9663 }, { "epoch": 0.46692757404454754, "grad_norm": 4.365329265594482, "learning_rate": 5.330724259554525e-07, "loss": 0.2211, "step": 9664 }, { "epoch": 0.46697589022563657, "grad_norm": 2.5999250411987305, "learning_rate": 5.330241097743634e-07, "loss": 0.2258, "step": 9665 }, { "epoch": 0.4670242064067256, "grad_norm": 2.2563581466674805, "learning_rate": 5.329757935932744e-07, "loss": 0.2656, "step": 9666 }, { "epoch": 0.4670725225878147, "grad_norm": 1.8868811130523682, "learning_rate": 5.329274774121853e-07, "loss": 0.1984, "step": 9667 }, { "epoch": 0.4671208387689037, "grad_norm": 2.2997820377349854, "learning_rate": 5.328791612310962e-07, "loss": 0.2885, "step": 9668 }, { "epoch": 0.4671691549499927, "grad_norm": 2.373612642288208, "learning_rate": 5.328308450500072e-07, "loss": 0.2852, "step": 9669 }, { "epoch": 0.4672174711310818, "grad_norm": 3.2973108291625977, "learning_rate": 5.327825288689182e-07, "loss": 0.3691, "step": 9670 }, { "epoch": 0.46726578731217083, "grad_norm": 4.266437530517578, "learning_rate": 5.327342126878291e-07, "loss": 0.3081, "step": 9671 }, { "epoch": 0.4673141034932599, "grad_norm": 2.783385992050171, "learning_rate": 5.326858965067401e-07, "loss": 0.3793, "step": 9672 }, { "epoch": 0.46736241967434894, "grad_norm": 9.17857551574707, "learning_rate": 5.32637580325651e-07, "loss": 0.3449, "step": 9673 }, { "epoch": 0.46741073585543796, "grad_norm": 2.396899700164795, "learning_rate": 5.32589264144562e-07, "loss": 0.3133, "step": 9674 }, { "epoch": 0.46745905203652705, "grad_norm": 2.9357128143310547, "learning_rate": 5.32540947963473e-07, "loss": 0.4092, "step": 9675 }, { "epoch": 0.46750736821761607, "grad_norm": 2.510847568511963, "learning_rate": 5.324926317823839e-07, "loss": 0.2767, "step": 9676 }, { "epoch": 0.46755568439870515, "grad_norm": 6.3445143699646, "learning_rate": 5.324443156012949e-07, "loss": 0.2422, "step": 9677 }, { "epoch": 0.4676040005797942, "grad_norm": 2.4241607189178467, "learning_rate": 5.323959994202058e-07, "loss": 0.2534, "step": 9678 }, { "epoch": 0.4676523167608832, "grad_norm": 2.5586986541748047, "learning_rate": 5.323476832391167e-07, "loss": 0.3341, "step": 9679 }, { "epoch": 0.4677006329419723, "grad_norm": 3.026386260986328, "learning_rate": 5.322993670580277e-07, "loss": 0.2562, "step": 9680 }, { "epoch": 0.4677489491230613, "grad_norm": 2.4675819873809814, "learning_rate": 5.322510508769387e-07, "loss": 0.2543, "step": 9681 }, { "epoch": 0.46779726530415033, "grad_norm": 2.3633387088775635, "learning_rate": 5.322027346958497e-07, "loss": 0.269, "step": 9682 }, { "epoch": 0.4678455814852394, "grad_norm": 2.2029993534088135, "learning_rate": 5.321544185147607e-07, "loss": 0.1738, "step": 9683 }, { "epoch": 0.46789389766632844, "grad_norm": 2.333677291870117, "learning_rate": 5.321061023336714e-07, "loss": 0.3047, "step": 9684 }, { "epoch": 0.4679422138474175, "grad_norm": 2.826958179473877, "learning_rate": 5.320577861525824e-07, "loss": 0.3109, "step": 9685 }, { "epoch": 0.46799053002850655, "grad_norm": 2.776901960372925, "learning_rate": 5.320094699714934e-07, "loss": 0.2543, "step": 9686 }, { "epoch": 0.46803884620959557, "grad_norm": 2.556694984436035, "learning_rate": 5.319611537904044e-07, "loss": 0.3916, "step": 9687 }, { "epoch": 0.46808716239068465, "grad_norm": 2.6670620441436768, "learning_rate": 5.319128376093154e-07, "loss": 0.3192, "step": 9688 }, { "epoch": 0.4681354785717737, "grad_norm": 3.12017822265625, "learning_rate": 5.318645214282263e-07, "loss": 0.3114, "step": 9689 }, { "epoch": 0.46818379475286276, "grad_norm": 3.020855665206909, "learning_rate": 5.318162052471372e-07, "loss": 0.2899, "step": 9690 }, { "epoch": 0.4682321109339518, "grad_norm": 4.289438724517822, "learning_rate": 5.317678890660482e-07, "loss": 0.3576, "step": 9691 }, { "epoch": 0.4682804271150408, "grad_norm": 4.991974830627441, "learning_rate": 5.317195728849591e-07, "loss": 0.2835, "step": 9692 }, { "epoch": 0.4683287432961299, "grad_norm": 2.6859467029571533, "learning_rate": 5.316712567038701e-07, "loss": 0.3666, "step": 9693 }, { "epoch": 0.4683770594772189, "grad_norm": 2.961483955383301, "learning_rate": 5.31622940522781e-07, "loss": 0.3085, "step": 9694 }, { "epoch": 0.46842537565830794, "grad_norm": 1.8455933332443237, "learning_rate": 5.31574624341692e-07, "loss": 0.1817, "step": 9695 }, { "epoch": 0.468473691839397, "grad_norm": 3.432079792022705, "learning_rate": 5.31526308160603e-07, "loss": 0.429, "step": 9696 }, { "epoch": 0.46852200802048605, "grad_norm": 2.825193166732788, "learning_rate": 5.314779919795139e-07, "loss": 0.2674, "step": 9697 }, { "epoch": 0.46857032420157513, "grad_norm": 10.113808631896973, "learning_rate": 5.314296757984249e-07, "loss": 0.2546, "step": 9698 }, { "epoch": 0.46861864038266415, "grad_norm": 2.761587381362915, "learning_rate": 5.313813596173358e-07, "loss": 0.2393, "step": 9699 }, { "epoch": 0.4686669565637532, "grad_norm": 2.3161911964416504, "learning_rate": 5.313330434362468e-07, "loss": 0.2451, "step": 9700 }, { "epoch": 0.46871527274484226, "grad_norm": 3.0332789421081543, "learning_rate": 5.312847272551577e-07, "loss": 0.3059, "step": 9701 }, { "epoch": 0.4687635889259313, "grad_norm": 3.6231281757354736, "learning_rate": 5.312364110740687e-07, "loss": 0.1645, "step": 9702 }, { "epoch": 0.46881190510702037, "grad_norm": 4.125720977783203, "learning_rate": 5.311880948929796e-07, "loss": 0.3291, "step": 9703 }, { "epoch": 0.4688602212881094, "grad_norm": 2.0065977573394775, "learning_rate": 5.311397787118906e-07, "loss": 0.1985, "step": 9704 }, { "epoch": 0.4689085374691984, "grad_norm": 2.4859447479248047, "learning_rate": 5.310914625308015e-07, "loss": 0.2739, "step": 9705 }, { "epoch": 0.4689568536502875, "grad_norm": 1.9273606538772583, "learning_rate": 5.310431463497125e-07, "loss": 0.1826, "step": 9706 }, { "epoch": 0.4690051698313765, "grad_norm": 4.362617492675781, "learning_rate": 5.309948301686235e-07, "loss": 0.2771, "step": 9707 }, { "epoch": 0.46905348601246555, "grad_norm": 2.59615421295166, "learning_rate": 5.309465139875345e-07, "loss": 0.203, "step": 9708 }, { "epoch": 0.46910180219355463, "grad_norm": 2.2168219089508057, "learning_rate": 5.308981978064455e-07, "loss": 0.2633, "step": 9709 }, { "epoch": 0.46915011837464365, "grad_norm": 2.0949814319610596, "learning_rate": 5.308498816253562e-07, "loss": 0.2548, "step": 9710 }, { "epoch": 0.46919843455573274, "grad_norm": 5.068928241729736, "learning_rate": 5.308015654442672e-07, "loss": 0.3993, "step": 9711 }, { "epoch": 0.46924675073682176, "grad_norm": 2.4091124534606934, "learning_rate": 5.307532492631782e-07, "loss": 0.2674, "step": 9712 }, { "epoch": 0.4692950669179108, "grad_norm": 2.3864831924438477, "learning_rate": 5.307049330820892e-07, "loss": 0.2892, "step": 9713 }, { "epoch": 0.46934338309899987, "grad_norm": 4.54428243637085, "learning_rate": 5.306566169010002e-07, "loss": 0.3218, "step": 9714 }, { "epoch": 0.4693916992800889, "grad_norm": 2.5672826766967773, "learning_rate": 5.306083007199111e-07, "loss": 0.2089, "step": 9715 }, { "epoch": 0.469440015461178, "grad_norm": 2.5516574382781982, "learning_rate": 5.30559984538822e-07, "loss": 0.3033, "step": 9716 }, { "epoch": 0.469488331642267, "grad_norm": 2.321207284927368, "learning_rate": 5.30511668357733e-07, "loss": 0.2937, "step": 9717 }, { "epoch": 0.469536647823356, "grad_norm": 3.9204256534576416, "learning_rate": 5.304633521766439e-07, "loss": 0.3735, "step": 9718 }, { "epoch": 0.4695849640044451, "grad_norm": 3.6864919662475586, "learning_rate": 5.304150359955549e-07, "loss": 0.3103, "step": 9719 }, { "epoch": 0.46963328018553413, "grad_norm": 2.3991358280181885, "learning_rate": 5.303667198144658e-07, "loss": 0.2386, "step": 9720 }, { "epoch": 0.46968159636662316, "grad_norm": 2.978111982345581, "learning_rate": 5.303184036333768e-07, "loss": 0.4079, "step": 9721 }, { "epoch": 0.46972991254771224, "grad_norm": 3.9000115394592285, "learning_rate": 5.302700874522877e-07, "loss": 0.3112, "step": 9722 }, { "epoch": 0.46977822872880126, "grad_norm": 2.0798099040985107, "learning_rate": 5.302217712711987e-07, "loss": 0.2646, "step": 9723 }, { "epoch": 0.46982654490989034, "grad_norm": 3.624202013015747, "learning_rate": 5.301734550901097e-07, "loss": 0.2964, "step": 9724 }, { "epoch": 0.46987486109097937, "grad_norm": 2.365222692489624, "learning_rate": 5.301251389090206e-07, "loss": 0.3236, "step": 9725 }, { "epoch": 0.4699231772720684, "grad_norm": 6.122688293457031, "learning_rate": 5.300768227279315e-07, "loss": 0.2213, "step": 9726 }, { "epoch": 0.4699714934531575, "grad_norm": 4.041342258453369, "learning_rate": 5.300285065468425e-07, "loss": 0.2972, "step": 9727 }, { "epoch": 0.4700198096342465, "grad_norm": 3.7363038063049316, "learning_rate": 5.299801903657535e-07, "loss": 0.3095, "step": 9728 }, { "epoch": 0.4700681258153356, "grad_norm": 2.485874891281128, "learning_rate": 5.299318741846644e-07, "loss": 0.2487, "step": 9729 }, { "epoch": 0.4701164419964246, "grad_norm": 3.461920738220215, "learning_rate": 5.298835580035754e-07, "loss": 0.2977, "step": 9730 }, { "epoch": 0.47016475817751363, "grad_norm": 2.01560115814209, "learning_rate": 5.298352418224863e-07, "loss": 0.2351, "step": 9731 }, { "epoch": 0.4702130743586027, "grad_norm": 2.7600510120391846, "learning_rate": 5.297869256413973e-07, "loss": 0.2633, "step": 9732 }, { "epoch": 0.47026139053969174, "grad_norm": 2.04573655128479, "learning_rate": 5.297386094603083e-07, "loss": 0.2057, "step": 9733 }, { "epoch": 0.47030970672078076, "grad_norm": 3.2872393131256104, "learning_rate": 5.296902932792193e-07, "loss": 0.3213, "step": 9734 }, { "epoch": 0.47035802290186984, "grad_norm": 3.6655266284942627, "learning_rate": 5.296419770981301e-07, "loss": 0.3251, "step": 9735 }, { "epoch": 0.47040633908295887, "grad_norm": 16.42387580871582, "learning_rate": 5.29593660917041e-07, "loss": 0.4472, "step": 9736 }, { "epoch": 0.47045465526404795, "grad_norm": 3.694007635116577, "learning_rate": 5.29545344735952e-07, "loss": 0.5035, "step": 9737 }, { "epoch": 0.470502971445137, "grad_norm": 2.6107540130615234, "learning_rate": 5.29497028554863e-07, "loss": 0.1539, "step": 9738 }, { "epoch": 0.470551287626226, "grad_norm": 4.711629390716553, "learning_rate": 5.29448712373774e-07, "loss": 0.2384, "step": 9739 }, { "epoch": 0.4705996038073151, "grad_norm": 2.48374342918396, "learning_rate": 5.29400396192685e-07, "loss": 0.259, "step": 9740 }, { "epoch": 0.4706479199884041, "grad_norm": 3.2508749961853027, "learning_rate": 5.293520800115958e-07, "loss": 0.4473, "step": 9741 }, { "epoch": 0.4706962361694932, "grad_norm": 5.821134090423584, "learning_rate": 5.293037638305068e-07, "loss": 0.462, "step": 9742 }, { "epoch": 0.4707445523505822, "grad_norm": 3.2663238048553467, "learning_rate": 5.292554476494177e-07, "loss": 0.382, "step": 9743 }, { "epoch": 0.47079286853167124, "grad_norm": 3.3223376274108887, "learning_rate": 5.292071314683287e-07, "loss": 0.3353, "step": 9744 }, { "epoch": 0.4708411847127603, "grad_norm": 2.957225799560547, "learning_rate": 5.291588152872397e-07, "loss": 0.2982, "step": 9745 }, { "epoch": 0.47088950089384934, "grad_norm": 3.239997386932373, "learning_rate": 5.291104991061506e-07, "loss": 0.3373, "step": 9746 }, { "epoch": 0.47093781707493837, "grad_norm": 2.4732589721679688, "learning_rate": 5.290621829250616e-07, "loss": 0.3066, "step": 9747 }, { "epoch": 0.47098613325602745, "grad_norm": 1.9306950569152832, "learning_rate": 5.290138667439725e-07, "loss": 0.1864, "step": 9748 }, { "epoch": 0.4710344494371165, "grad_norm": 9.39367389678955, "learning_rate": 5.289655505628835e-07, "loss": 0.3622, "step": 9749 }, { "epoch": 0.47108276561820556, "grad_norm": 4.634139060974121, "learning_rate": 5.289172343817945e-07, "loss": 0.252, "step": 9750 }, { "epoch": 0.4711310817992946, "grad_norm": 2.6623549461364746, "learning_rate": 5.288689182007053e-07, "loss": 0.2351, "step": 9751 }, { "epoch": 0.4711793979803836, "grad_norm": 3.123722791671753, "learning_rate": 5.288206020196163e-07, "loss": 0.3533, "step": 9752 }, { "epoch": 0.4712277141614727, "grad_norm": 2.607603073120117, "learning_rate": 5.287722858385273e-07, "loss": 0.2573, "step": 9753 }, { "epoch": 0.4712760303425617, "grad_norm": 57.749874114990234, "learning_rate": 5.287239696574382e-07, "loss": 0.3096, "step": 9754 }, { "epoch": 0.4713243465236508, "grad_norm": 2.851466417312622, "learning_rate": 5.286756534763492e-07, "loss": 0.373, "step": 9755 }, { "epoch": 0.4713726627047398, "grad_norm": 3.0808565616607666, "learning_rate": 5.286273372952602e-07, "loss": 0.3774, "step": 9756 }, { "epoch": 0.47142097888582885, "grad_norm": 2.3282108306884766, "learning_rate": 5.285790211141711e-07, "loss": 0.3006, "step": 9757 }, { "epoch": 0.4714692950669179, "grad_norm": 2.5471816062927246, "learning_rate": 5.285307049330821e-07, "loss": 0.3181, "step": 9758 }, { "epoch": 0.47151761124800695, "grad_norm": 2.7816250324249268, "learning_rate": 5.284823887519931e-07, "loss": 0.2892, "step": 9759 }, { "epoch": 0.471565927429096, "grad_norm": 8.817938804626465, "learning_rate": 5.28434072570904e-07, "loss": 0.3072, "step": 9760 }, { "epoch": 0.47161424361018506, "grad_norm": 9.839372634887695, "learning_rate": 5.283857563898149e-07, "loss": 0.3103, "step": 9761 }, { "epoch": 0.4716625597912741, "grad_norm": 2.284627914428711, "learning_rate": 5.283374402087258e-07, "loss": 0.2468, "step": 9762 }, { "epoch": 0.47171087597236316, "grad_norm": 3.0225348472595215, "learning_rate": 5.282891240276368e-07, "loss": 0.4566, "step": 9763 }, { "epoch": 0.4717591921534522, "grad_norm": 3.353902578353882, "learning_rate": 5.282408078465478e-07, "loss": 0.4011, "step": 9764 }, { "epoch": 0.4718075083345412, "grad_norm": 3.8091890811920166, "learning_rate": 5.281924916654588e-07, "loss": 0.3414, "step": 9765 }, { "epoch": 0.4718558245156303, "grad_norm": 2.444014072418213, "learning_rate": 5.281441754843698e-07, "loss": 0.2446, "step": 9766 }, { "epoch": 0.4719041406967193, "grad_norm": 4.357738494873047, "learning_rate": 5.280958593032806e-07, "loss": 0.2637, "step": 9767 }, { "epoch": 0.4719524568778084, "grad_norm": 6.445662021636963, "learning_rate": 5.280475431221915e-07, "loss": 0.245, "step": 9768 }, { "epoch": 0.4720007730588974, "grad_norm": 2.6984145641326904, "learning_rate": 5.279992269411025e-07, "loss": 0.2768, "step": 9769 }, { "epoch": 0.47204908923998645, "grad_norm": 6.336101055145264, "learning_rate": 5.279509107600135e-07, "loss": 0.3105, "step": 9770 }, { "epoch": 0.47209740542107553, "grad_norm": 41.67894744873047, "learning_rate": 5.279025945789245e-07, "loss": 0.308, "step": 9771 }, { "epoch": 0.47214572160216456, "grad_norm": 1.4023067951202393, "learning_rate": 5.278542783978354e-07, "loss": 0.1547, "step": 9772 }, { "epoch": 0.4721940377832536, "grad_norm": 2.609123468399048, "learning_rate": 5.278059622167463e-07, "loss": 0.2676, "step": 9773 }, { "epoch": 0.47224235396434266, "grad_norm": 3.061391592025757, "learning_rate": 5.277576460356573e-07, "loss": 0.2862, "step": 9774 }, { "epoch": 0.4722906701454317, "grad_norm": 3.0789926052093506, "learning_rate": 5.277093298545683e-07, "loss": 0.2319, "step": 9775 }, { "epoch": 0.47233898632652077, "grad_norm": 3.363644599914551, "learning_rate": 5.276610136734793e-07, "loss": 0.4383, "step": 9776 }, { "epoch": 0.4723873025076098, "grad_norm": 2.5228726863861084, "learning_rate": 5.276126974923901e-07, "loss": 0.3481, "step": 9777 }, { "epoch": 0.4724356186886988, "grad_norm": 2.299020528793335, "learning_rate": 5.275643813113011e-07, "loss": 0.3402, "step": 9778 }, { "epoch": 0.4724839348697879, "grad_norm": 2.4743380546569824, "learning_rate": 5.275160651302121e-07, "loss": 0.2929, "step": 9779 }, { "epoch": 0.47253225105087693, "grad_norm": 2.9066600799560547, "learning_rate": 5.27467748949123e-07, "loss": 0.3576, "step": 9780 }, { "epoch": 0.472580567231966, "grad_norm": 4.41270112991333, "learning_rate": 5.27419432768034e-07, "loss": 0.2365, "step": 9781 }, { "epoch": 0.47262888341305503, "grad_norm": 2.5317134857177734, "learning_rate": 5.27371116586945e-07, "loss": 0.2868, "step": 9782 }, { "epoch": 0.47267719959414406, "grad_norm": 3.182103395462036, "learning_rate": 5.273228004058559e-07, "loss": 0.2048, "step": 9783 }, { "epoch": 0.47272551577523314, "grad_norm": 3.216123342514038, "learning_rate": 5.272744842247669e-07, "loss": 0.3266, "step": 9784 }, { "epoch": 0.47277383195632217, "grad_norm": 1.3731380701065063, "learning_rate": 5.272261680436778e-07, "loss": 0.1511, "step": 9785 }, { "epoch": 0.47282214813741125, "grad_norm": 2.175063371658325, "learning_rate": 5.271778518625887e-07, "loss": 0.2125, "step": 9786 }, { "epoch": 0.47287046431850027, "grad_norm": 2.768188238143921, "learning_rate": 5.271295356814997e-07, "loss": 0.4404, "step": 9787 }, { "epoch": 0.4729187804995893, "grad_norm": 6.329874038696289, "learning_rate": 5.270812195004106e-07, "loss": 0.232, "step": 9788 }, { "epoch": 0.4729670966806784, "grad_norm": 1.6179370880126953, "learning_rate": 5.270329033193216e-07, "loss": 0.2065, "step": 9789 }, { "epoch": 0.4730154128617674, "grad_norm": 2.4420816898345947, "learning_rate": 5.269845871382326e-07, "loss": 0.2398, "step": 9790 }, { "epoch": 0.47306372904285643, "grad_norm": 5.074102401733398, "learning_rate": 5.269362709571436e-07, "loss": 0.3341, "step": 9791 }, { "epoch": 0.4731120452239455, "grad_norm": 3.142599582672119, "learning_rate": 5.268879547760546e-07, "loss": 0.268, "step": 9792 }, { "epoch": 0.47316036140503454, "grad_norm": 2.7714433670043945, "learning_rate": 5.268396385949653e-07, "loss": 0.3388, "step": 9793 }, { "epoch": 0.4732086775861236, "grad_norm": 3.1877715587615967, "learning_rate": 5.267913224138763e-07, "loss": 0.2016, "step": 9794 }, { "epoch": 0.47325699376721264, "grad_norm": 4.0165839195251465, "learning_rate": 5.267430062327873e-07, "loss": 0.1506, "step": 9795 }, { "epoch": 0.47330530994830167, "grad_norm": 5.394674301147461, "learning_rate": 5.266946900516983e-07, "loss": 0.2535, "step": 9796 }, { "epoch": 0.47335362612939075, "grad_norm": 4.1677565574646, "learning_rate": 5.266463738706093e-07, "loss": 0.2625, "step": 9797 }, { "epoch": 0.4734019423104798, "grad_norm": 2.329521417617798, "learning_rate": 5.265980576895202e-07, "loss": 0.2871, "step": 9798 }, { "epoch": 0.47345025849156885, "grad_norm": 3.544335126876831, "learning_rate": 5.265497415084311e-07, "loss": 0.2434, "step": 9799 }, { "epoch": 0.4734985746726579, "grad_norm": 2.1740503311157227, "learning_rate": 5.265014253273421e-07, "loss": 0.2634, "step": 9800 }, { "epoch": 0.4735468908537469, "grad_norm": 2.0212085247039795, "learning_rate": 5.264531091462531e-07, "loss": 0.2552, "step": 9801 }, { "epoch": 0.473595207034836, "grad_norm": 3.3464598655700684, "learning_rate": 5.26404792965164e-07, "loss": 0.4237, "step": 9802 }, { "epoch": 0.473643523215925, "grad_norm": 2.334867238998413, "learning_rate": 5.263564767840749e-07, "loss": 0.3014, "step": 9803 }, { "epoch": 0.47369183939701404, "grad_norm": 2.9345500469207764, "learning_rate": 5.263081606029859e-07, "loss": 0.2728, "step": 9804 }, { "epoch": 0.4737401555781031, "grad_norm": 2.8412675857543945, "learning_rate": 5.262598444218968e-07, "loss": 0.3206, "step": 9805 }, { "epoch": 0.47378847175919214, "grad_norm": 2.48103928565979, "learning_rate": 5.262115282408078e-07, "loss": 0.2576, "step": 9806 }, { "epoch": 0.4738367879402812, "grad_norm": 3.193498134613037, "learning_rate": 5.261632120597188e-07, "loss": 0.3807, "step": 9807 }, { "epoch": 0.47388510412137025, "grad_norm": 3.0181143283843994, "learning_rate": 5.261148958786298e-07, "loss": 0.3118, "step": 9808 }, { "epoch": 0.4739334203024593, "grad_norm": 18.153850555419922, "learning_rate": 5.260665796975407e-07, "loss": 0.3123, "step": 9809 }, { "epoch": 0.47398173648354835, "grad_norm": 3.3747477531433105, "learning_rate": 5.260182635164517e-07, "loss": 0.3292, "step": 9810 }, { "epoch": 0.4740300526646374, "grad_norm": 4.1238532066345215, "learning_rate": 5.259699473353626e-07, "loss": 0.2681, "step": 9811 }, { "epoch": 0.47407836884572646, "grad_norm": 2.788506507873535, "learning_rate": 5.259216311542735e-07, "loss": 0.2695, "step": 9812 }, { "epoch": 0.4741266850268155, "grad_norm": 2.917928457260132, "learning_rate": 5.258733149731845e-07, "loss": 0.2817, "step": 9813 }, { "epoch": 0.4741750012079045, "grad_norm": 2.4568800926208496, "learning_rate": 5.258249987920954e-07, "loss": 0.219, "step": 9814 }, { "epoch": 0.4742233173889936, "grad_norm": 3.1557114124298096, "learning_rate": 5.257766826110064e-07, "loss": 0.4377, "step": 9815 }, { "epoch": 0.4742716335700826, "grad_norm": 2.6862783432006836, "learning_rate": 5.257283664299174e-07, "loss": 0.2847, "step": 9816 }, { "epoch": 0.47431994975117164, "grad_norm": 3.9656620025634766, "learning_rate": 5.256800502488284e-07, "loss": 0.3931, "step": 9817 }, { "epoch": 0.4743682659322607, "grad_norm": 4.99458122253418, "learning_rate": 5.256317340677393e-07, "loss": 0.3778, "step": 9818 }, { "epoch": 0.47441658211334975, "grad_norm": 18.24800682067871, "learning_rate": 5.255834178866501e-07, "loss": 0.1767, "step": 9819 }, { "epoch": 0.47446489829443883, "grad_norm": 2.1891188621520996, "learning_rate": 5.255351017055611e-07, "loss": 0.2299, "step": 9820 }, { "epoch": 0.47451321447552786, "grad_norm": 2.922013521194458, "learning_rate": 5.254867855244721e-07, "loss": 0.3141, "step": 9821 }, { "epoch": 0.4745615306566169, "grad_norm": 2.3536717891693115, "learning_rate": 5.254384693433831e-07, "loss": 0.2872, "step": 9822 }, { "epoch": 0.47460984683770596, "grad_norm": 4.794764995574951, "learning_rate": 5.253901531622941e-07, "loss": 0.4225, "step": 9823 }, { "epoch": 0.474658163018795, "grad_norm": 4.2294745445251465, "learning_rate": 5.25341836981205e-07, "loss": 0.3335, "step": 9824 }, { "epoch": 0.47470647919988407, "grad_norm": 2.7742037773132324, "learning_rate": 5.252935208001159e-07, "loss": 0.3478, "step": 9825 }, { "epoch": 0.4747547953809731, "grad_norm": 3.1864819526672363, "learning_rate": 5.252452046190269e-07, "loss": 0.2923, "step": 9826 }, { "epoch": 0.4748031115620621, "grad_norm": 3.0412187576293945, "learning_rate": 5.251968884379378e-07, "loss": 0.2964, "step": 9827 }, { "epoch": 0.4748514277431512, "grad_norm": 2.3792710304260254, "learning_rate": 5.251485722568488e-07, "loss": 0.1785, "step": 9828 }, { "epoch": 0.4748997439242402, "grad_norm": 2.225477933883667, "learning_rate": 5.251002560757597e-07, "loss": 0.2573, "step": 9829 }, { "epoch": 0.47494806010532925, "grad_norm": 3.9303476810455322, "learning_rate": 5.250519398946707e-07, "loss": 0.3541, "step": 9830 }, { "epoch": 0.47499637628641833, "grad_norm": 1.9797089099884033, "learning_rate": 5.250036237135816e-07, "loss": 0.1992, "step": 9831 }, { "epoch": 0.47504469246750736, "grad_norm": 5.40712308883667, "learning_rate": 5.249553075324926e-07, "loss": 0.3407, "step": 9832 }, { "epoch": 0.47509300864859644, "grad_norm": 3.3484716415405273, "learning_rate": 5.249069913514036e-07, "loss": 0.3828, "step": 9833 }, { "epoch": 0.47514132482968546, "grad_norm": 2.491060972213745, "learning_rate": 5.248586751703146e-07, "loss": 0.2658, "step": 9834 }, { "epoch": 0.4751896410107745, "grad_norm": 4.449329853057861, "learning_rate": 5.248103589892255e-07, "loss": 0.3245, "step": 9835 }, { "epoch": 0.47523795719186357, "grad_norm": 2.06915020942688, "learning_rate": 5.247620428081364e-07, "loss": 0.2098, "step": 9836 }, { "epoch": 0.4752862733729526, "grad_norm": 2.179615020751953, "learning_rate": 5.247137266270474e-07, "loss": 0.2738, "step": 9837 }, { "epoch": 0.4753345895540417, "grad_norm": 2.990919589996338, "learning_rate": 5.246654104459583e-07, "loss": 0.281, "step": 9838 }, { "epoch": 0.4753829057351307, "grad_norm": 2.9557228088378906, "learning_rate": 5.246170942648693e-07, "loss": 0.4687, "step": 9839 }, { "epoch": 0.4754312219162197, "grad_norm": 2.6588149070739746, "learning_rate": 5.245687780837802e-07, "loss": 0.2694, "step": 9840 }, { "epoch": 0.4754795380973088, "grad_norm": 11.658353805541992, "learning_rate": 5.245204619026912e-07, "loss": 0.2855, "step": 9841 }, { "epoch": 0.47552785427839783, "grad_norm": 2.8983542919158936, "learning_rate": 5.244721457216022e-07, "loss": 0.2594, "step": 9842 }, { "epoch": 0.47557617045948686, "grad_norm": 2.6460466384887695, "learning_rate": 5.244238295405132e-07, "loss": 0.3505, "step": 9843 }, { "epoch": 0.47562448664057594, "grad_norm": 3.11306095123291, "learning_rate": 5.24375513359424e-07, "loss": 0.4094, "step": 9844 }, { "epoch": 0.47567280282166496, "grad_norm": 2.9561426639556885, "learning_rate": 5.243271971783349e-07, "loss": 0.2974, "step": 9845 }, { "epoch": 0.47572111900275404, "grad_norm": 3.509345769882202, "learning_rate": 5.242788809972459e-07, "loss": 0.2935, "step": 9846 }, { "epoch": 0.47576943518384307, "grad_norm": 78.19220733642578, "learning_rate": 5.242305648161569e-07, "loss": 0.3149, "step": 9847 }, { "epoch": 0.4758177513649321, "grad_norm": 2.365662097930908, "learning_rate": 5.241822486350679e-07, "loss": 0.3215, "step": 9848 }, { "epoch": 0.4758660675460212, "grad_norm": 2.8638126850128174, "learning_rate": 5.241339324539789e-07, "loss": 0.3468, "step": 9849 }, { "epoch": 0.4759143837271102, "grad_norm": 2.94657826423645, "learning_rate": 5.240856162728897e-07, "loss": 0.3399, "step": 9850 }, { "epoch": 0.4759626999081993, "grad_norm": 3.4983057975769043, "learning_rate": 5.240373000918007e-07, "loss": 0.4652, "step": 9851 }, { "epoch": 0.4760110160892883, "grad_norm": 4.064277648925781, "learning_rate": 5.239889839107117e-07, "loss": 0.3864, "step": 9852 }, { "epoch": 0.47605933227037733, "grad_norm": 3.214303731918335, "learning_rate": 5.239406677296226e-07, "loss": 0.3685, "step": 9853 }, { "epoch": 0.4761076484514664, "grad_norm": 2.9932639598846436, "learning_rate": 5.238923515485336e-07, "loss": 0.3425, "step": 9854 }, { "epoch": 0.47615596463255544, "grad_norm": 6.02510404586792, "learning_rate": 5.238440353674445e-07, "loss": 0.3271, "step": 9855 }, { "epoch": 0.47620428081364446, "grad_norm": 1.8899493217468262, "learning_rate": 5.237957191863555e-07, "loss": 0.2153, "step": 9856 }, { "epoch": 0.47625259699473355, "grad_norm": 2.805593729019165, "learning_rate": 5.237474030052664e-07, "loss": 0.4022, "step": 9857 }, { "epoch": 0.47630091317582257, "grad_norm": 3.8662517070770264, "learning_rate": 5.236990868241774e-07, "loss": 0.1949, "step": 9858 }, { "epoch": 0.47634922935691165, "grad_norm": 2.5316967964172363, "learning_rate": 5.236507706430884e-07, "loss": 0.2977, "step": 9859 }, { "epoch": 0.4763975455380007, "grad_norm": 2.1294243335723877, "learning_rate": 5.236024544619994e-07, "loss": 0.1894, "step": 9860 }, { "epoch": 0.4764458617190897, "grad_norm": 2.540015697479248, "learning_rate": 5.235541382809102e-07, "loss": 0.3542, "step": 9861 }, { "epoch": 0.4764941779001788, "grad_norm": 3.834587812423706, "learning_rate": 5.235058220998212e-07, "loss": 0.4018, "step": 9862 }, { "epoch": 0.4765424940812678, "grad_norm": 2.7290256023406982, "learning_rate": 5.234575059187321e-07, "loss": 0.2776, "step": 9863 }, { "epoch": 0.4765908102623569, "grad_norm": 2.3803093433380127, "learning_rate": 5.234091897376431e-07, "loss": 0.3073, "step": 9864 }, { "epoch": 0.4766391264434459, "grad_norm": 3.381016254425049, "learning_rate": 5.233608735565541e-07, "loss": 0.338, "step": 9865 }, { "epoch": 0.47668744262453494, "grad_norm": 4.606222629547119, "learning_rate": 5.23312557375465e-07, "loss": 0.3217, "step": 9866 }, { "epoch": 0.476735758805624, "grad_norm": 2.3433659076690674, "learning_rate": 5.23264241194376e-07, "loss": 0.2319, "step": 9867 }, { "epoch": 0.47678407498671305, "grad_norm": 2.9920003414154053, "learning_rate": 5.23215925013287e-07, "loss": 0.2331, "step": 9868 }, { "epoch": 0.47683239116780207, "grad_norm": 2.533064842224121, "learning_rate": 5.23167608832198e-07, "loss": 0.3917, "step": 9869 }, { "epoch": 0.47688070734889115, "grad_norm": 2.940279722213745, "learning_rate": 5.231192926511088e-07, "loss": 0.2484, "step": 9870 }, { "epoch": 0.4769290235299802, "grad_norm": 2.7082934379577637, "learning_rate": 5.230709764700197e-07, "loss": 0.2514, "step": 9871 }, { "epoch": 0.47697733971106926, "grad_norm": 2.2527315616607666, "learning_rate": 5.230226602889307e-07, "loss": 0.2426, "step": 9872 }, { "epoch": 0.4770256558921583, "grad_norm": 4.538723468780518, "learning_rate": 5.229743441078417e-07, "loss": 0.29, "step": 9873 }, { "epoch": 0.4770739720732473, "grad_norm": 6.1387104988098145, "learning_rate": 5.229260279267527e-07, "loss": 0.2914, "step": 9874 }, { "epoch": 0.4771222882543364, "grad_norm": 5.435134410858154, "learning_rate": 5.228777117456637e-07, "loss": 0.2511, "step": 9875 }, { "epoch": 0.4771706044354254, "grad_norm": 2.5347025394439697, "learning_rate": 5.228293955645745e-07, "loss": 0.3582, "step": 9876 }, { "epoch": 0.4772189206165145, "grad_norm": 4.631583213806152, "learning_rate": 5.227810793834855e-07, "loss": 0.3817, "step": 9877 }, { "epoch": 0.4772672367976035, "grad_norm": 6.276669979095459, "learning_rate": 5.227327632023964e-07, "loss": 0.2178, "step": 9878 }, { "epoch": 0.47731555297869255, "grad_norm": 1.9189611673355103, "learning_rate": 5.226844470213074e-07, "loss": 0.2465, "step": 9879 }, { "epoch": 0.47736386915978163, "grad_norm": 29.22127914428711, "learning_rate": 5.226361308402184e-07, "loss": 0.3747, "step": 9880 }, { "epoch": 0.47741218534087065, "grad_norm": 8.716904640197754, "learning_rate": 5.225878146591293e-07, "loss": 0.2441, "step": 9881 }, { "epoch": 0.4774605015219597, "grad_norm": 4.651823043823242, "learning_rate": 5.225394984780402e-07, "loss": 0.2192, "step": 9882 }, { "epoch": 0.47750881770304876, "grad_norm": 3.2049975395202637, "learning_rate": 5.224911822969512e-07, "loss": 0.2907, "step": 9883 }, { "epoch": 0.4775571338841378, "grad_norm": 3.8423190116882324, "learning_rate": 5.224428661158622e-07, "loss": 0.4511, "step": 9884 }, { "epoch": 0.47760545006522687, "grad_norm": 3.671696662902832, "learning_rate": 5.223945499347732e-07, "loss": 0.2923, "step": 9885 }, { "epoch": 0.4776537662463159, "grad_norm": 2.8942148685455322, "learning_rate": 5.223462337536842e-07, "loss": 0.2961, "step": 9886 }, { "epoch": 0.4777020824274049, "grad_norm": 2.8595316410064697, "learning_rate": 5.22297917572595e-07, "loss": 0.2754, "step": 9887 }, { "epoch": 0.477750398608494, "grad_norm": 2.947317361831665, "learning_rate": 5.22249601391506e-07, "loss": 0.3927, "step": 9888 }, { "epoch": 0.477798714789583, "grad_norm": 2.646561622619629, "learning_rate": 5.222012852104169e-07, "loss": 0.2571, "step": 9889 }, { "epoch": 0.4778470309706721, "grad_norm": 3.24649715423584, "learning_rate": 5.221529690293279e-07, "loss": 0.28, "step": 9890 }, { "epoch": 0.47789534715176113, "grad_norm": 2.2752339839935303, "learning_rate": 5.221046528482389e-07, "loss": 0.3149, "step": 9891 }, { "epoch": 0.47794366333285015, "grad_norm": 2.409025192260742, "learning_rate": 5.220563366671498e-07, "loss": 0.2186, "step": 9892 }, { "epoch": 0.47799197951393924, "grad_norm": 5.169772624969482, "learning_rate": 5.220080204860608e-07, "loss": 0.3771, "step": 9893 }, { "epoch": 0.47804029569502826, "grad_norm": 2.960085868835449, "learning_rate": 5.219597043049718e-07, "loss": 0.3277, "step": 9894 }, { "epoch": 0.4780886118761173, "grad_norm": 3.102717876434326, "learning_rate": 5.219113881238826e-07, "loss": 0.3366, "step": 9895 }, { "epoch": 0.47813692805720637, "grad_norm": 2.1865248680114746, "learning_rate": 5.218630719427936e-07, "loss": 0.2351, "step": 9896 }, { "epoch": 0.4781852442382954, "grad_norm": 4.115565299987793, "learning_rate": 5.218147557617045e-07, "loss": 0.3142, "step": 9897 }, { "epoch": 0.4782335604193845, "grad_norm": 2.652933120727539, "learning_rate": 5.217664395806155e-07, "loss": 0.1532, "step": 9898 }, { "epoch": 0.4782818766004735, "grad_norm": 1.9904465675354004, "learning_rate": 5.217181233995265e-07, "loss": 0.2316, "step": 9899 }, { "epoch": 0.4783301927815625, "grad_norm": 3.5872702598571777, "learning_rate": 5.216698072184375e-07, "loss": 0.3531, "step": 9900 }, { "epoch": 0.4783785089626516, "grad_norm": 2.4650423526763916, "learning_rate": 5.216214910373485e-07, "loss": 0.1974, "step": 9901 }, { "epoch": 0.47842682514374063, "grad_norm": 2.3835389614105225, "learning_rate": 5.215731748562593e-07, "loss": 0.3258, "step": 9902 }, { "epoch": 0.4784751413248297, "grad_norm": 2.562675952911377, "learning_rate": 5.215248586751702e-07, "loss": 0.3342, "step": 9903 }, { "epoch": 0.47852345750591874, "grad_norm": 3.744894027709961, "learning_rate": 5.214765424940812e-07, "loss": 0.2757, "step": 9904 }, { "epoch": 0.47857177368700776, "grad_norm": 2.6746692657470703, "learning_rate": 5.214282263129922e-07, "loss": 0.3052, "step": 9905 }, { "epoch": 0.47862008986809684, "grad_norm": 9.96274185180664, "learning_rate": 5.213799101319032e-07, "loss": 0.2718, "step": 9906 }, { "epoch": 0.47866840604918587, "grad_norm": 2.0630242824554443, "learning_rate": 5.213315939508141e-07, "loss": 0.244, "step": 9907 }, { "epoch": 0.4787167222302749, "grad_norm": 2.8560385704040527, "learning_rate": 5.21283277769725e-07, "loss": 0.2667, "step": 9908 }, { "epoch": 0.478765038411364, "grad_norm": 5.895613193511963, "learning_rate": 5.21234961588636e-07, "loss": 0.3524, "step": 9909 }, { "epoch": 0.478813354592453, "grad_norm": 3.8218069076538086, "learning_rate": 5.21186645407547e-07, "loss": 0.2386, "step": 9910 }, { "epoch": 0.4788616707735421, "grad_norm": 3.1143925189971924, "learning_rate": 5.21138329226458e-07, "loss": 0.1631, "step": 9911 }, { "epoch": 0.4789099869546311, "grad_norm": 2.6797289848327637, "learning_rate": 5.21090013045369e-07, "loss": 0.3607, "step": 9912 }, { "epoch": 0.47895830313572013, "grad_norm": 1.956377387046814, "learning_rate": 5.210416968642798e-07, "loss": 0.2266, "step": 9913 }, { "epoch": 0.4790066193168092, "grad_norm": 2.255847215652466, "learning_rate": 5.209933806831907e-07, "loss": 0.2146, "step": 9914 }, { "epoch": 0.47905493549789824, "grad_norm": 4.205045700073242, "learning_rate": 5.209450645021017e-07, "loss": 0.282, "step": 9915 }, { "epoch": 0.4791032516789873, "grad_norm": 1.7414551973342896, "learning_rate": 5.208967483210127e-07, "loss": 0.2117, "step": 9916 }, { "epoch": 0.47915156786007634, "grad_norm": 2.052302598953247, "learning_rate": 5.208484321399237e-07, "loss": 0.2273, "step": 9917 }, { "epoch": 0.47919988404116537, "grad_norm": 3.036998748779297, "learning_rate": 5.208001159588346e-07, "loss": 0.3019, "step": 9918 }, { "epoch": 0.47924820022225445, "grad_norm": 2.1018054485321045, "learning_rate": 5.207517997777456e-07, "loss": 0.2737, "step": 9919 }, { "epoch": 0.4792965164033435, "grad_norm": 2.8151330947875977, "learning_rate": 5.207034835966566e-07, "loss": 0.3723, "step": 9920 }, { "epoch": 0.4793448325844325, "grad_norm": 2.9525675773620605, "learning_rate": 5.206551674155674e-07, "loss": 0.3703, "step": 9921 }, { "epoch": 0.4793931487655216, "grad_norm": 1.8888283967971802, "learning_rate": 5.206068512344784e-07, "loss": 0.1766, "step": 9922 }, { "epoch": 0.4794414649466106, "grad_norm": 1.8436229228973389, "learning_rate": 5.205585350533893e-07, "loss": 0.2364, "step": 9923 }, { "epoch": 0.4794897811276997, "grad_norm": 5.371336936950684, "learning_rate": 5.205102188723003e-07, "loss": 0.3344, "step": 9924 }, { "epoch": 0.4795380973087887, "grad_norm": 5.419071197509766, "learning_rate": 5.204619026912113e-07, "loss": 0.2939, "step": 9925 }, { "epoch": 0.47958641348987774, "grad_norm": 3.1477723121643066, "learning_rate": 5.204135865101223e-07, "loss": 0.2743, "step": 9926 }, { "epoch": 0.4796347296709668, "grad_norm": 7.2500176429748535, "learning_rate": 5.203652703290332e-07, "loss": 0.2702, "step": 9927 }, { "epoch": 0.47968304585205584, "grad_norm": 2.1720011234283447, "learning_rate": 5.20316954147944e-07, "loss": 0.1866, "step": 9928 }, { "epoch": 0.4797313620331449, "grad_norm": 2.377504825592041, "learning_rate": 5.20268637966855e-07, "loss": 0.2472, "step": 9929 }, { "epoch": 0.47977967821423395, "grad_norm": 8.649395942687988, "learning_rate": 5.20220321785766e-07, "loss": 0.3429, "step": 9930 }, { "epoch": 0.479827994395323, "grad_norm": 2.0141382217407227, "learning_rate": 5.20172005604677e-07, "loss": 0.2279, "step": 9931 }, { "epoch": 0.47987631057641206, "grad_norm": 3.022434949874878, "learning_rate": 5.20123689423588e-07, "loss": 0.3532, "step": 9932 }, { "epoch": 0.4799246267575011, "grad_norm": 2.062077283859253, "learning_rate": 5.200753732424988e-07, "loss": 0.197, "step": 9933 }, { "epoch": 0.4799729429385901, "grad_norm": 2.6113603115081787, "learning_rate": 5.200270570614098e-07, "loss": 0.3136, "step": 9934 }, { "epoch": 0.4800212591196792, "grad_norm": 1.8231453895568848, "learning_rate": 5.199787408803208e-07, "loss": 0.2247, "step": 9935 }, { "epoch": 0.4800695753007682, "grad_norm": 1.7451889514923096, "learning_rate": 5.199304246992318e-07, "loss": 0.2086, "step": 9936 }, { "epoch": 0.4801178914818573, "grad_norm": 3.657487392425537, "learning_rate": 5.198821085181428e-07, "loss": 0.4527, "step": 9937 }, { "epoch": 0.4801662076629463, "grad_norm": 1.9128177165985107, "learning_rate": 5.198337923370536e-07, "loss": 0.2295, "step": 9938 }, { "epoch": 0.48021452384403535, "grad_norm": 5.14438009262085, "learning_rate": 5.197854761559646e-07, "loss": 0.2959, "step": 9939 }, { "epoch": 0.4802628400251244, "grad_norm": 3.3038902282714844, "learning_rate": 5.197371599748755e-07, "loss": 0.325, "step": 9940 }, { "epoch": 0.48031115620621345, "grad_norm": 2.393305540084839, "learning_rate": 5.196888437937865e-07, "loss": 0.2688, "step": 9941 }, { "epoch": 0.48035947238730253, "grad_norm": 2.5067386627197266, "learning_rate": 5.196405276126975e-07, "loss": 0.3503, "step": 9942 }, { "epoch": 0.48040778856839156, "grad_norm": 3.2640604972839355, "learning_rate": 5.195922114316085e-07, "loss": 0.3669, "step": 9943 }, { "epoch": 0.4804561047494806, "grad_norm": 2.8144710063934326, "learning_rate": 5.195438952505194e-07, "loss": 0.3097, "step": 9944 }, { "epoch": 0.48050442093056966, "grad_norm": 4.348064422607422, "learning_rate": 5.194955790694304e-07, "loss": 0.3185, "step": 9945 }, { "epoch": 0.4805527371116587, "grad_norm": 3.6158177852630615, "learning_rate": 5.194472628883412e-07, "loss": 0.3305, "step": 9946 }, { "epoch": 0.4806010532927477, "grad_norm": 1.8925551176071167, "learning_rate": 5.193989467072522e-07, "loss": 0.1686, "step": 9947 }, { "epoch": 0.4806493694738368, "grad_norm": 4.749561309814453, "learning_rate": 5.193506305261632e-07, "loss": 0.3672, "step": 9948 }, { "epoch": 0.4806976856549258, "grad_norm": 6.424739837646484, "learning_rate": 5.193023143450741e-07, "loss": 0.3381, "step": 9949 }, { "epoch": 0.4807460018360149, "grad_norm": 2.462425708770752, "learning_rate": 5.192539981639851e-07, "loss": 0.247, "step": 9950 }, { "epoch": 0.4807943180171039, "grad_norm": 3.4818613529205322, "learning_rate": 5.192056819828961e-07, "loss": 0.3134, "step": 9951 }, { "epoch": 0.48084263419819295, "grad_norm": 2.247389078140259, "learning_rate": 5.191573658018071e-07, "loss": 0.279, "step": 9952 }, { "epoch": 0.48089095037928203, "grad_norm": 2.772449493408203, "learning_rate": 5.19109049620718e-07, "loss": 0.2908, "step": 9953 }, { "epoch": 0.48093926656037106, "grad_norm": 20.990530014038086, "learning_rate": 5.190607334396288e-07, "loss": 0.4511, "step": 9954 }, { "epoch": 0.48098758274146014, "grad_norm": 2.063662528991699, "learning_rate": 5.190124172585398e-07, "loss": 0.2881, "step": 9955 }, { "epoch": 0.48103589892254917, "grad_norm": 2.224775791168213, "learning_rate": 5.189641010774508e-07, "loss": 0.2721, "step": 9956 }, { "epoch": 0.4810842151036382, "grad_norm": 3.021869659423828, "learning_rate": 5.189157848963618e-07, "loss": 0.291, "step": 9957 }, { "epoch": 0.48113253128472727, "grad_norm": 2.7367444038391113, "learning_rate": 5.188674687152728e-07, "loss": 0.2637, "step": 9958 }, { "epoch": 0.4811808474658163, "grad_norm": 2.8002734184265137, "learning_rate": 5.188191525341836e-07, "loss": 0.2763, "step": 9959 }, { "epoch": 0.4812291636469053, "grad_norm": 4.042515754699707, "learning_rate": 5.187708363530946e-07, "loss": 0.2443, "step": 9960 }, { "epoch": 0.4812774798279944, "grad_norm": 4.8007683753967285, "learning_rate": 5.187225201720056e-07, "loss": 0.3971, "step": 9961 }, { "epoch": 0.48132579600908343, "grad_norm": 2.859753131866455, "learning_rate": 5.186742039909166e-07, "loss": 0.3158, "step": 9962 }, { "epoch": 0.4813741121901725, "grad_norm": 2.9428458213806152, "learning_rate": 5.186258878098275e-07, "loss": 0.2805, "step": 9963 }, { "epoch": 0.48142242837126153, "grad_norm": 2.9351937770843506, "learning_rate": 5.185775716287384e-07, "loss": 0.4166, "step": 9964 }, { "epoch": 0.48147074455235056, "grad_norm": 2.5938267707824707, "learning_rate": 5.185292554476493e-07, "loss": 0.2929, "step": 9965 }, { "epoch": 0.48151906073343964, "grad_norm": 2.1638333797454834, "learning_rate": 5.184809392665603e-07, "loss": 0.3136, "step": 9966 }, { "epoch": 0.48156737691452867, "grad_norm": 2.2093193531036377, "learning_rate": 5.184326230854713e-07, "loss": 0.2098, "step": 9967 }, { "epoch": 0.48161569309561775, "grad_norm": 4.85109281539917, "learning_rate": 5.183843069043823e-07, "loss": 0.398, "step": 9968 }, { "epoch": 0.4816640092767068, "grad_norm": 2.7515270709991455, "learning_rate": 5.183359907232933e-07, "loss": 0.406, "step": 9969 }, { "epoch": 0.4817123254577958, "grad_norm": 2.0509941577911377, "learning_rate": 5.182876745422042e-07, "loss": 0.2777, "step": 9970 }, { "epoch": 0.4817606416388849, "grad_norm": 1.9412795305252075, "learning_rate": 5.182393583611151e-07, "loss": 0.1999, "step": 9971 }, { "epoch": 0.4818089578199739, "grad_norm": 9.580158233642578, "learning_rate": 5.18191042180026e-07, "loss": 0.3914, "step": 9972 }, { "epoch": 0.48185727400106293, "grad_norm": 2.133166551589966, "learning_rate": 5.18142725998937e-07, "loss": 0.2122, "step": 9973 }, { "epoch": 0.481905590182152, "grad_norm": 1.9816807508468628, "learning_rate": 5.18094409817848e-07, "loss": 0.212, "step": 9974 }, { "epoch": 0.48195390636324104, "grad_norm": 2.4301869869232178, "learning_rate": 5.180460936367589e-07, "loss": 0.2172, "step": 9975 }, { "epoch": 0.4820022225443301, "grad_norm": 3.1671628952026367, "learning_rate": 5.179977774556699e-07, "loss": 0.2465, "step": 9976 }, { "epoch": 0.48205053872541914, "grad_norm": 1.9450067281723022, "learning_rate": 5.179494612745809e-07, "loss": 0.2288, "step": 9977 }, { "epoch": 0.48209885490650817, "grad_norm": 2.955122232437134, "learning_rate": 5.179011450934918e-07, "loss": 0.2675, "step": 9978 }, { "epoch": 0.48214717108759725, "grad_norm": 2.409651517868042, "learning_rate": 5.178528289124028e-07, "loss": 0.2966, "step": 9979 }, { "epoch": 0.4821954872686863, "grad_norm": 2.50392484664917, "learning_rate": 5.178045127313136e-07, "loss": 0.239, "step": 9980 }, { "epoch": 0.48224380344977535, "grad_norm": 2.4696388244628906, "learning_rate": 5.177561965502246e-07, "loss": 0.2704, "step": 9981 }, { "epoch": 0.4822921196308644, "grad_norm": 3.158444404602051, "learning_rate": 5.177078803691356e-07, "loss": 0.3452, "step": 9982 }, { "epoch": 0.4823404358119534, "grad_norm": 2.9310107231140137, "learning_rate": 5.176595641880466e-07, "loss": 0.3876, "step": 9983 }, { "epoch": 0.4823887519930425, "grad_norm": 3.9227371215820312, "learning_rate": 5.176112480069576e-07, "loss": 0.3719, "step": 9984 }, { "epoch": 0.4824370681741315, "grad_norm": 2.5116524696350098, "learning_rate": 5.175629318258684e-07, "loss": 0.3279, "step": 9985 }, { "epoch": 0.48248538435522054, "grad_norm": 6.296394348144531, "learning_rate": 5.175146156447794e-07, "loss": 0.2597, "step": 9986 }, { "epoch": 0.4825337005363096, "grad_norm": 2.682345390319824, "learning_rate": 5.174662994636904e-07, "loss": 0.3448, "step": 9987 }, { "epoch": 0.48258201671739864, "grad_norm": 2.8755548000335693, "learning_rate": 5.174179832826013e-07, "loss": 0.3479, "step": 9988 }, { "epoch": 0.4826303328984877, "grad_norm": 6.5259222984313965, "learning_rate": 5.173696671015123e-07, "loss": 0.2421, "step": 9989 }, { "epoch": 0.48267864907957675, "grad_norm": 2.6460814476013184, "learning_rate": 5.173213509204232e-07, "loss": 0.309, "step": 9990 }, { "epoch": 0.4827269652606658, "grad_norm": 2.6984829902648926, "learning_rate": 5.172730347393341e-07, "loss": 0.3028, "step": 9991 }, { "epoch": 0.48277528144175486, "grad_norm": 2.6116554737091064, "learning_rate": 5.172247185582451e-07, "loss": 0.4035, "step": 9992 }, { "epoch": 0.4828235976228439, "grad_norm": 2.5290937423706055, "learning_rate": 5.171764023771561e-07, "loss": 0.2845, "step": 9993 }, { "epoch": 0.48287191380393296, "grad_norm": 2.0138447284698486, "learning_rate": 5.171280861960671e-07, "loss": 0.2222, "step": 9994 }, { "epoch": 0.482920229985022, "grad_norm": 1.4570521116256714, "learning_rate": 5.170797700149781e-07, "loss": 0.1446, "step": 9995 }, { "epoch": 0.482968546166111, "grad_norm": 3.7871992588043213, "learning_rate": 5.17031453833889e-07, "loss": 0.2127, "step": 9996 }, { "epoch": 0.4830168623472001, "grad_norm": 2.332568645477295, "learning_rate": 5.169831376527998e-07, "loss": 0.2566, "step": 9997 }, { "epoch": 0.4830651785282891, "grad_norm": 2.492900848388672, "learning_rate": 5.169348214717108e-07, "loss": 0.3364, "step": 9998 }, { "epoch": 0.48311349470937814, "grad_norm": 2.945140838623047, "learning_rate": 5.168865052906218e-07, "loss": 0.3624, "step": 9999 }, { "epoch": 0.4831618108904672, "grad_norm": 2.2464606761932373, "learning_rate": 5.168381891095328e-07, "loss": 0.2026, "step": 10000 }, { "epoch": 0.48321012707155625, "grad_norm": 2.1110646724700928, "learning_rate": 5.167898729284437e-07, "loss": 0.236, "step": 10001 }, { "epoch": 0.48325844325264533, "grad_norm": 2.704402446746826, "learning_rate": 5.167415567473547e-07, "loss": 0.2894, "step": 10002 }, { "epoch": 0.48330675943373436, "grad_norm": 5.076204776763916, "learning_rate": 5.166932405662657e-07, "loss": 0.3253, "step": 10003 }, { "epoch": 0.4833550756148234, "grad_norm": 2.225280284881592, "learning_rate": 5.166449243851766e-07, "loss": 0.2615, "step": 10004 }, { "epoch": 0.48340339179591246, "grad_norm": 2.322620391845703, "learning_rate": 5.165966082040875e-07, "loss": 0.208, "step": 10005 }, { "epoch": 0.4834517079770015, "grad_norm": 3.6844594478607178, "learning_rate": 5.165482920229984e-07, "loss": 0.3558, "step": 10006 }, { "epoch": 0.48350002415809057, "grad_norm": 3.1294448375701904, "learning_rate": 5.164999758419094e-07, "loss": 0.2625, "step": 10007 }, { "epoch": 0.4835483403391796, "grad_norm": 5.703001976013184, "learning_rate": 5.164516596608204e-07, "loss": 0.2657, "step": 10008 }, { "epoch": 0.4835966565202686, "grad_norm": 3.177264928817749, "learning_rate": 5.164033434797314e-07, "loss": 0.2895, "step": 10009 }, { "epoch": 0.4836449727013577, "grad_norm": 2.648071765899658, "learning_rate": 5.163550272986423e-07, "loss": 0.4059, "step": 10010 }, { "epoch": 0.4836932888824467, "grad_norm": 8.39117431640625, "learning_rate": 5.163067111175532e-07, "loss": 0.405, "step": 10011 }, { "epoch": 0.48374160506353575, "grad_norm": 9.335914611816406, "learning_rate": 5.162583949364642e-07, "loss": 0.2977, "step": 10012 }, { "epoch": 0.48378992124462483, "grad_norm": 3.4784634113311768, "learning_rate": 5.162100787553751e-07, "loss": 0.336, "step": 10013 }, { "epoch": 0.48383823742571386, "grad_norm": 2.9586873054504395, "learning_rate": 5.161617625742861e-07, "loss": 0.3385, "step": 10014 }, { "epoch": 0.48388655360680294, "grad_norm": 2.5301473140716553, "learning_rate": 5.161134463931971e-07, "loss": 0.3284, "step": 10015 }, { "epoch": 0.48393486978789196, "grad_norm": 7.576047420501709, "learning_rate": 5.160651302121079e-07, "loss": 0.3466, "step": 10016 }, { "epoch": 0.483983185968981, "grad_norm": 2.7675423622131348, "learning_rate": 5.160168140310189e-07, "loss": 0.2603, "step": 10017 }, { "epoch": 0.48403150215007007, "grad_norm": 2.768686294555664, "learning_rate": 5.159684978499299e-07, "loss": 0.2922, "step": 10018 }, { "epoch": 0.4840798183311591, "grad_norm": 5.017944812774658, "learning_rate": 5.159201816688409e-07, "loss": 0.3349, "step": 10019 }, { "epoch": 0.4841281345122482, "grad_norm": 2.4657089710235596, "learning_rate": 5.158718654877519e-07, "loss": 0.3595, "step": 10020 }, { "epoch": 0.4841764506933372, "grad_norm": 2.9098782539367676, "learning_rate": 5.158235493066629e-07, "loss": 0.3776, "step": 10021 }, { "epoch": 0.4842247668744262, "grad_norm": 1.7784533500671387, "learning_rate": 5.157752331255737e-07, "loss": 0.1556, "step": 10022 }, { "epoch": 0.4842730830555153, "grad_norm": 2.872091293334961, "learning_rate": 5.157269169444846e-07, "loss": 0.4098, "step": 10023 }, { "epoch": 0.48432139923660433, "grad_norm": 3.7351315021514893, "learning_rate": 5.156786007633956e-07, "loss": 0.3195, "step": 10024 }, { "epoch": 0.48436971541769336, "grad_norm": 7.3316497802734375, "learning_rate": 5.156302845823066e-07, "loss": 0.2964, "step": 10025 }, { "epoch": 0.48441803159878244, "grad_norm": 4.811832427978516, "learning_rate": 5.155819684012176e-07, "loss": 0.4267, "step": 10026 }, { "epoch": 0.48446634777987146, "grad_norm": 2.6291518211364746, "learning_rate": 5.155336522201285e-07, "loss": 0.304, "step": 10027 }, { "epoch": 0.48451466396096055, "grad_norm": 6.395092964172363, "learning_rate": 5.154853360390395e-07, "loss": 0.3585, "step": 10028 }, { "epoch": 0.48456298014204957, "grad_norm": 2.2203147411346436, "learning_rate": 5.154370198579504e-07, "loss": 0.2726, "step": 10029 }, { "epoch": 0.4846112963231386, "grad_norm": 3.2475130558013916, "learning_rate": 5.153887036768613e-07, "loss": 0.2814, "step": 10030 }, { "epoch": 0.4846596125042277, "grad_norm": 1.8497756719589233, "learning_rate": 5.153403874957723e-07, "loss": 0.1973, "step": 10031 }, { "epoch": 0.4847079286853167, "grad_norm": 2.5123300552368164, "learning_rate": 5.152920713146832e-07, "loss": 0.331, "step": 10032 }, { "epoch": 0.4847562448664058, "grad_norm": 2.336228370666504, "learning_rate": 5.152437551335942e-07, "loss": 0.2099, "step": 10033 }, { "epoch": 0.4848045610474948, "grad_norm": 3.067502975463867, "learning_rate": 5.151954389525052e-07, "loss": 0.2063, "step": 10034 }, { "epoch": 0.48485287722858383, "grad_norm": 6.35645866394043, "learning_rate": 5.151471227714162e-07, "loss": 0.2565, "step": 10035 }, { "epoch": 0.4849011934096729, "grad_norm": 3.3403286933898926, "learning_rate": 5.150988065903271e-07, "loss": 0.4059, "step": 10036 }, { "epoch": 0.48494950959076194, "grad_norm": 2.3326973915100098, "learning_rate": 5.15050490409238e-07, "loss": 0.2686, "step": 10037 }, { "epoch": 0.48499782577185097, "grad_norm": 2.928736448287964, "learning_rate": 5.15002174228149e-07, "loss": 0.3815, "step": 10038 }, { "epoch": 0.48504614195294005, "grad_norm": 2.5292911529541016, "learning_rate": 5.149538580470599e-07, "loss": 0.2179, "step": 10039 }, { "epoch": 0.48509445813402907, "grad_norm": 2.330364227294922, "learning_rate": 5.149055418659709e-07, "loss": 0.2551, "step": 10040 }, { "epoch": 0.48514277431511815, "grad_norm": 2.5354771614074707, "learning_rate": 5.148572256848819e-07, "loss": 0.2428, "step": 10041 }, { "epoch": 0.4851910904962072, "grad_norm": 6.325314044952393, "learning_rate": 5.148089095037927e-07, "loss": 0.2387, "step": 10042 }, { "epoch": 0.4852394066772962, "grad_norm": 2.593632698059082, "learning_rate": 5.147605933227037e-07, "loss": 0.3101, "step": 10043 }, { "epoch": 0.4852877228583853, "grad_norm": 2.604325771331787, "learning_rate": 5.147122771416147e-07, "loss": 0.3173, "step": 10044 }, { "epoch": 0.4853360390394743, "grad_norm": 2.616159439086914, "learning_rate": 5.146639609605257e-07, "loss": 0.1973, "step": 10045 }, { "epoch": 0.4853843552205634, "grad_norm": 2.550250768661499, "learning_rate": 5.146156447794367e-07, "loss": 0.3504, "step": 10046 }, { "epoch": 0.4854326714016524, "grad_norm": 3.8693599700927734, "learning_rate": 5.145673285983477e-07, "loss": 0.3283, "step": 10047 }, { "epoch": 0.48548098758274144, "grad_norm": 2.942674398422241, "learning_rate": 5.145190124172584e-07, "loss": 0.2408, "step": 10048 }, { "epoch": 0.4855293037638305, "grad_norm": 3.373080015182495, "learning_rate": 5.144706962361694e-07, "loss": 0.3341, "step": 10049 }, { "epoch": 0.48557761994491955, "grad_norm": 3.160539150238037, "learning_rate": 5.144223800550804e-07, "loss": 0.3183, "step": 10050 }, { "epoch": 0.4856259361260086, "grad_norm": 2.5605881214141846, "learning_rate": 5.143740638739914e-07, "loss": 0.3134, "step": 10051 }, { "epoch": 0.48567425230709765, "grad_norm": 3.0558393001556396, "learning_rate": 5.143257476929024e-07, "loss": 0.3219, "step": 10052 }, { "epoch": 0.4857225684881867, "grad_norm": 6.154149532318115, "learning_rate": 5.142774315118133e-07, "loss": 0.3127, "step": 10053 }, { "epoch": 0.48577088466927576, "grad_norm": 4.6906232833862305, "learning_rate": 5.142291153307243e-07, "loss": 0.3922, "step": 10054 }, { "epoch": 0.4858192008503648, "grad_norm": 11.292136192321777, "learning_rate": 5.141807991496351e-07, "loss": 0.3804, "step": 10055 }, { "epoch": 0.4858675170314538, "grad_norm": 3.0421245098114014, "learning_rate": 5.141324829685461e-07, "loss": 0.3559, "step": 10056 }, { "epoch": 0.4859158332125429, "grad_norm": 2.947164297103882, "learning_rate": 5.140841667874571e-07, "loss": 0.2459, "step": 10057 }, { "epoch": 0.4859641493936319, "grad_norm": 2.769737482070923, "learning_rate": 5.14035850606368e-07, "loss": 0.3918, "step": 10058 }, { "epoch": 0.486012465574721, "grad_norm": 2.7333993911743164, "learning_rate": 5.13987534425279e-07, "loss": 0.3069, "step": 10059 }, { "epoch": 0.48606078175581, "grad_norm": 6.314774990081787, "learning_rate": 5.1393921824419e-07, "loss": 0.3323, "step": 10060 }, { "epoch": 0.48610909793689905, "grad_norm": 4.053009033203125, "learning_rate": 5.138909020631009e-07, "loss": 0.3182, "step": 10061 }, { "epoch": 0.48615741411798813, "grad_norm": 2.766570568084717, "learning_rate": 5.138425858820119e-07, "loss": 0.3443, "step": 10062 }, { "epoch": 0.48620573029907715, "grad_norm": 2.994985580444336, "learning_rate": 5.137942697009228e-07, "loss": 0.3008, "step": 10063 }, { "epoch": 0.4862540464801662, "grad_norm": 3.112013816833496, "learning_rate": 5.137459535198337e-07, "loss": 0.3782, "step": 10064 }, { "epoch": 0.48630236266125526, "grad_norm": 2.0390851497650146, "learning_rate": 5.136976373387447e-07, "loss": 0.22, "step": 10065 }, { "epoch": 0.4863506788423443, "grad_norm": 6.017432689666748, "learning_rate": 5.136493211576557e-07, "loss": 0.5019, "step": 10066 }, { "epoch": 0.48639899502343337, "grad_norm": 2.4988644123077393, "learning_rate": 5.136010049765667e-07, "loss": 0.3011, "step": 10067 }, { "epoch": 0.4864473112045224, "grad_norm": 2.9893057346343994, "learning_rate": 5.135526887954775e-07, "loss": 0.2901, "step": 10068 }, { "epoch": 0.4864956273856114, "grad_norm": 2.2305755615234375, "learning_rate": 5.135043726143885e-07, "loss": 0.2898, "step": 10069 }, { "epoch": 0.4865439435667005, "grad_norm": 2.7573564052581787, "learning_rate": 5.134560564332995e-07, "loss": 0.3171, "step": 10070 }, { "epoch": 0.4865922597477895, "grad_norm": 4.381951332092285, "learning_rate": 5.134077402522105e-07, "loss": 0.2885, "step": 10071 }, { "epoch": 0.4866405759288786, "grad_norm": 5.533623218536377, "learning_rate": 5.133594240711215e-07, "loss": 0.4009, "step": 10072 }, { "epoch": 0.48668889210996763, "grad_norm": 3.0863559246063232, "learning_rate": 5.133111078900324e-07, "loss": 0.3061, "step": 10073 }, { "epoch": 0.48673720829105666, "grad_norm": 1.9652851819992065, "learning_rate": 5.132627917089432e-07, "loss": 0.2032, "step": 10074 }, { "epoch": 0.48678552447214574, "grad_norm": 4.180521488189697, "learning_rate": 5.132144755278542e-07, "loss": 0.31, "step": 10075 }, { "epoch": 0.48683384065323476, "grad_norm": 6.107622146606445, "learning_rate": 5.131661593467652e-07, "loss": 0.1796, "step": 10076 }, { "epoch": 0.48688215683432384, "grad_norm": 2.113835573196411, "learning_rate": 5.131178431656762e-07, "loss": 0.1818, "step": 10077 }, { "epoch": 0.48693047301541287, "grad_norm": 2.0315496921539307, "learning_rate": 5.130695269845872e-07, "loss": 0.2205, "step": 10078 }, { "epoch": 0.4869787891965019, "grad_norm": 2.367262601852417, "learning_rate": 5.130212108034981e-07, "loss": 0.2685, "step": 10079 }, { "epoch": 0.487027105377591, "grad_norm": 2.1554369926452637, "learning_rate": 5.12972894622409e-07, "loss": 0.2386, "step": 10080 }, { "epoch": 0.48707542155868, "grad_norm": 3.8248982429504395, "learning_rate": 5.129245784413199e-07, "loss": 0.3699, "step": 10081 }, { "epoch": 0.487123737739769, "grad_norm": 2.636934280395508, "learning_rate": 5.128762622602309e-07, "loss": 0.2299, "step": 10082 }, { "epoch": 0.4871720539208581, "grad_norm": 5.720625877380371, "learning_rate": 5.128279460791419e-07, "loss": 0.3825, "step": 10083 }, { "epoch": 0.48722037010194713, "grad_norm": 1.799089789390564, "learning_rate": 5.127796298980528e-07, "loss": 0.1813, "step": 10084 }, { "epoch": 0.4872686862830362, "grad_norm": 2.9292986392974854, "learning_rate": 5.127313137169638e-07, "loss": 0.2785, "step": 10085 }, { "epoch": 0.48731700246412524, "grad_norm": 5.132380962371826, "learning_rate": 5.126829975358748e-07, "loss": 0.2122, "step": 10086 }, { "epoch": 0.48736531864521426, "grad_norm": 6.772622108459473, "learning_rate": 5.126346813547857e-07, "loss": 0.3144, "step": 10087 }, { "epoch": 0.48741363482630334, "grad_norm": 14.028687477111816, "learning_rate": 5.125863651736967e-07, "loss": 0.2883, "step": 10088 }, { "epoch": 0.48746195100739237, "grad_norm": 2.2564194202423096, "learning_rate": 5.125380489926075e-07, "loss": 0.1963, "step": 10089 }, { "epoch": 0.48751026718848145, "grad_norm": 2.671335220336914, "learning_rate": 5.124897328115185e-07, "loss": 0.3352, "step": 10090 }, { "epoch": 0.4875585833695705, "grad_norm": 5.093069553375244, "learning_rate": 5.124414166304295e-07, "loss": 0.2424, "step": 10091 }, { "epoch": 0.4876068995506595, "grad_norm": 3.5411031246185303, "learning_rate": 5.123931004493405e-07, "loss": 0.4284, "step": 10092 }, { "epoch": 0.4876552157317486, "grad_norm": 2.380852460861206, "learning_rate": 5.123447842682514e-07, "loss": 0.2872, "step": 10093 }, { "epoch": 0.4877035319128376, "grad_norm": 3.25569486618042, "learning_rate": 5.122964680871623e-07, "loss": 0.2686, "step": 10094 }, { "epoch": 0.48775184809392663, "grad_norm": 3.515542984008789, "learning_rate": 5.122481519060733e-07, "loss": 0.271, "step": 10095 }, { "epoch": 0.4878001642750157, "grad_norm": 1.9187963008880615, "learning_rate": 5.121998357249843e-07, "loss": 0.2272, "step": 10096 }, { "epoch": 0.48784848045610474, "grad_norm": 12.904720306396484, "learning_rate": 5.121515195438953e-07, "loss": 0.3289, "step": 10097 }, { "epoch": 0.4878967966371938, "grad_norm": 1.954906940460205, "learning_rate": 5.121032033628062e-07, "loss": 0.2319, "step": 10098 }, { "epoch": 0.48794511281828284, "grad_norm": 6.9400715827941895, "learning_rate": 5.120548871817172e-07, "loss": 0.3522, "step": 10099 }, { "epoch": 0.48799342899937187, "grad_norm": 1.611279010772705, "learning_rate": 5.12006571000628e-07, "loss": 0.1867, "step": 10100 }, { "epoch": 0.48804174518046095, "grad_norm": 4.33371114730835, "learning_rate": 5.11958254819539e-07, "loss": 0.4305, "step": 10101 }, { "epoch": 0.48809006136155, "grad_norm": 2.7049126625061035, "learning_rate": 5.1190993863845e-07, "loss": 0.2277, "step": 10102 }, { "epoch": 0.48813837754263906, "grad_norm": 1.9714170694351196, "learning_rate": 5.11861622457361e-07, "loss": 0.23, "step": 10103 }, { "epoch": 0.4881866937237281, "grad_norm": 1.7376925945281982, "learning_rate": 5.11813306276272e-07, "loss": 0.1702, "step": 10104 }, { "epoch": 0.4882350099048171, "grad_norm": 3.0500640869140625, "learning_rate": 5.117649900951829e-07, "loss": 0.3209, "step": 10105 }, { "epoch": 0.4882833260859062, "grad_norm": 2.5673987865448, "learning_rate": 5.117166739140937e-07, "loss": 0.3234, "step": 10106 }, { "epoch": 0.4883316422669952, "grad_norm": 4.28007173538208, "learning_rate": 5.116683577330047e-07, "loss": 0.4654, "step": 10107 }, { "epoch": 0.48837995844808424, "grad_norm": 2.112959384918213, "learning_rate": 5.116200415519157e-07, "loss": 0.2307, "step": 10108 }, { "epoch": 0.4884282746291733, "grad_norm": 2.499845027923584, "learning_rate": 5.115717253708267e-07, "loss": 0.2064, "step": 10109 }, { "epoch": 0.48847659081026235, "grad_norm": 1.8340359926223755, "learning_rate": 5.115234091897376e-07, "loss": 0.2294, "step": 10110 }, { "epoch": 0.4885249069913514, "grad_norm": 2.451023817062378, "learning_rate": 5.114750930086486e-07, "loss": 0.3355, "step": 10111 }, { "epoch": 0.48857322317244045, "grad_norm": 2.8935630321502686, "learning_rate": 5.114267768275595e-07, "loss": 0.2518, "step": 10112 }, { "epoch": 0.4886215393535295, "grad_norm": 2.0383963584899902, "learning_rate": 5.113784606464705e-07, "loss": 0.2131, "step": 10113 }, { "epoch": 0.48866985553461856, "grad_norm": 2.597867488861084, "learning_rate": 5.113301444653815e-07, "loss": 0.358, "step": 10114 }, { "epoch": 0.4887181717157076, "grad_norm": 2.1765706539154053, "learning_rate": 5.112818282842923e-07, "loss": 0.2679, "step": 10115 }, { "epoch": 0.48876648789679666, "grad_norm": 3.4313321113586426, "learning_rate": 5.112335121032033e-07, "loss": 0.3748, "step": 10116 }, { "epoch": 0.4888148040778857, "grad_norm": 2.35774302482605, "learning_rate": 5.111851959221143e-07, "loss": 0.2152, "step": 10117 }, { "epoch": 0.4888631202589747, "grad_norm": 2.966815710067749, "learning_rate": 5.111368797410253e-07, "loss": 0.32, "step": 10118 }, { "epoch": 0.4889114364400638, "grad_norm": 2.578619956970215, "learning_rate": 5.110885635599362e-07, "loss": 0.2286, "step": 10119 }, { "epoch": 0.4889597526211528, "grad_norm": 2.8317930698394775, "learning_rate": 5.110402473788471e-07, "loss": 0.4586, "step": 10120 }, { "epoch": 0.48900806880224185, "grad_norm": 4.361635684967041, "learning_rate": 5.109919311977581e-07, "loss": 0.3256, "step": 10121 }, { "epoch": 0.4890563849833309, "grad_norm": 2.3950133323669434, "learning_rate": 5.109436150166691e-07, "loss": 0.2802, "step": 10122 }, { "epoch": 0.48910470116441995, "grad_norm": 4.639435291290283, "learning_rate": 5.1089529883558e-07, "loss": 0.368, "step": 10123 }, { "epoch": 0.48915301734550903, "grad_norm": 2.338290214538574, "learning_rate": 5.10846982654491e-07, "loss": 0.2183, "step": 10124 }, { "epoch": 0.48920133352659806, "grad_norm": 2.154202699661255, "learning_rate": 5.107986664734019e-07, "loss": 0.2405, "step": 10125 }, { "epoch": 0.4892496497076871, "grad_norm": 2.6590142250061035, "learning_rate": 5.107503502923128e-07, "loss": 0.2961, "step": 10126 }, { "epoch": 0.48929796588877617, "grad_norm": 2.0037364959716797, "learning_rate": 5.107020341112238e-07, "loss": 0.2331, "step": 10127 }, { "epoch": 0.4893462820698652, "grad_norm": 2.874783992767334, "learning_rate": 5.106537179301348e-07, "loss": 0.3592, "step": 10128 }, { "epoch": 0.48939459825095427, "grad_norm": 2.3123934268951416, "learning_rate": 5.106054017490458e-07, "loss": 0.3263, "step": 10129 }, { "epoch": 0.4894429144320433, "grad_norm": 2.437939405441284, "learning_rate": 5.105570855679568e-07, "loss": 0.3567, "step": 10130 }, { "epoch": 0.4894912306131323, "grad_norm": 2.4256060123443604, "learning_rate": 5.105087693868675e-07, "loss": 0.2837, "step": 10131 }, { "epoch": 0.4895395467942214, "grad_norm": 1.8295321464538574, "learning_rate": 5.104604532057785e-07, "loss": 0.1698, "step": 10132 }, { "epoch": 0.48958786297531043, "grad_norm": 2.4345288276672363, "learning_rate": 5.104121370246895e-07, "loss": 0.3385, "step": 10133 }, { "epoch": 0.48963617915639945, "grad_norm": 2.787198781967163, "learning_rate": 5.103638208436005e-07, "loss": 0.2708, "step": 10134 }, { "epoch": 0.48968449533748853, "grad_norm": 5.101341724395752, "learning_rate": 5.103155046625115e-07, "loss": 0.4153, "step": 10135 }, { "epoch": 0.48973281151857756, "grad_norm": 3.2796599864959717, "learning_rate": 5.102671884814224e-07, "loss": 0.3784, "step": 10136 }, { "epoch": 0.48978112769966664, "grad_norm": 5.736405372619629, "learning_rate": 5.102188723003334e-07, "loss": 0.2959, "step": 10137 }, { "epoch": 0.48982944388075567, "grad_norm": 3.1797726154327393, "learning_rate": 5.101705561192443e-07, "loss": 0.2702, "step": 10138 }, { "epoch": 0.4898777600618447, "grad_norm": 2.6050493717193604, "learning_rate": 5.101222399381553e-07, "loss": 0.3242, "step": 10139 }, { "epoch": 0.4899260762429338, "grad_norm": 3.1618847846984863, "learning_rate": 5.100739237570662e-07, "loss": 0.3955, "step": 10140 }, { "epoch": 0.4899743924240228, "grad_norm": 2.08390212059021, "learning_rate": 5.100256075759771e-07, "loss": 0.2081, "step": 10141 }, { "epoch": 0.4900227086051119, "grad_norm": 2.7028329372406006, "learning_rate": 5.099772913948881e-07, "loss": 0.2778, "step": 10142 }, { "epoch": 0.4900710247862009, "grad_norm": 2.374577045440674, "learning_rate": 5.099289752137991e-07, "loss": 0.2167, "step": 10143 }, { "epoch": 0.49011934096728993, "grad_norm": 2.199815511703491, "learning_rate": 5.0988065903271e-07, "loss": 0.2267, "step": 10144 }, { "epoch": 0.490167657148379, "grad_norm": 3.694626808166504, "learning_rate": 5.09832342851621e-07, "loss": 0.319, "step": 10145 }, { "epoch": 0.49021597332946804, "grad_norm": 7.416436195373535, "learning_rate": 5.097840266705319e-07, "loss": 0.2359, "step": 10146 }, { "epoch": 0.49026428951055706, "grad_norm": 7.5601420402526855, "learning_rate": 5.097357104894429e-07, "loss": 0.366, "step": 10147 }, { "epoch": 0.49031260569164614, "grad_norm": 15.899343490600586, "learning_rate": 5.096873943083539e-07, "loss": 0.3361, "step": 10148 }, { "epoch": 0.49036092187273517, "grad_norm": 2.1482093334198, "learning_rate": 5.096390781272648e-07, "loss": 0.2602, "step": 10149 }, { "epoch": 0.49040923805382425, "grad_norm": 2.6847751140594482, "learning_rate": 5.095907619461758e-07, "loss": 0.3411, "step": 10150 }, { "epoch": 0.4904575542349133, "grad_norm": 2.9163694381713867, "learning_rate": 5.095424457650867e-07, "loss": 0.3888, "step": 10151 }, { "epoch": 0.4905058704160023, "grad_norm": 3.68045711517334, "learning_rate": 5.094941295839976e-07, "loss": 0.3116, "step": 10152 }, { "epoch": 0.4905541865970914, "grad_norm": 2.8642215728759766, "learning_rate": 5.094458134029086e-07, "loss": 0.2724, "step": 10153 }, { "epoch": 0.4906025027781804, "grad_norm": 2.355125904083252, "learning_rate": 5.093974972218196e-07, "loss": 0.2223, "step": 10154 }, { "epoch": 0.4906508189592695, "grad_norm": 2.502929210662842, "learning_rate": 5.093491810407306e-07, "loss": 0.2815, "step": 10155 }, { "epoch": 0.4906991351403585, "grad_norm": 2.7608118057250977, "learning_rate": 5.093008648596416e-07, "loss": 0.2798, "step": 10156 }, { "epoch": 0.49074745132144754, "grad_norm": 4.034046173095703, "learning_rate": 5.092525486785523e-07, "loss": 0.3797, "step": 10157 }, { "epoch": 0.4907957675025366, "grad_norm": 3.6617796421051025, "learning_rate": 5.092042324974633e-07, "loss": 0.4699, "step": 10158 }, { "epoch": 0.49084408368362564, "grad_norm": 11.072166442871094, "learning_rate": 5.091559163163743e-07, "loss": 0.3049, "step": 10159 }, { "epoch": 0.49089239986471467, "grad_norm": 5.065412998199463, "learning_rate": 5.091076001352853e-07, "loss": 0.3091, "step": 10160 }, { "epoch": 0.49094071604580375, "grad_norm": 1.7813236713409424, "learning_rate": 5.090592839541963e-07, "loss": 0.2123, "step": 10161 }, { "epoch": 0.4909890322268928, "grad_norm": 3.63525652885437, "learning_rate": 5.090109677731072e-07, "loss": 0.2593, "step": 10162 }, { "epoch": 0.49103734840798186, "grad_norm": 2.3866090774536133, "learning_rate": 5.089626515920182e-07, "loss": 0.2779, "step": 10163 }, { "epoch": 0.4910856645890709, "grad_norm": 4.454906940460205, "learning_rate": 5.089143354109291e-07, "loss": 0.453, "step": 10164 }, { "epoch": 0.4911339807701599, "grad_norm": 2.3196120262145996, "learning_rate": 5.0886601922984e-07, "loss": 0.2466, "step": 10165 }, { "epoch": 0.491182296951249, "grad_norm": 7.768022537231445, "learning_rate": 5.08817703048751e-07, "loss": 0.285, "step": 10166 }, { "epoch": 0.491230613132338, "grad_norm": 1.9663070440292358, "learning_rate": 5.087693868676619e-07, "loss": 0.1815, "step": 10167 }, { "epoch": 0.4912789293134271, "grad_norm": 2.414327383041382, "learning_rate": 5.087210706865729e-07, "loss": 0.1706, "step": 10168 }, { "epoch": 0.4913272454945161, "grad_norm": 3.436904191970825, "learning_rate": 5.086727545054839e-07, "loss": 0.2234, "step": 10169 }, { "epoch": 0.49137556167560514, "grad_norm": 2.962477207183838, "learning_rate": 5.086244383243948e-07, "loss": 0.3068, "step": 10170 }, { "epoch": 0.4914238778566942, "grad_norm": 3.256096839904785, "learning_rate": 5.085761221433058e-07, "loss": 0.3851, "step": 10171 }, { "epoch": 0.49147219403778325, "grad_norm": 4.345951080322266, "learning_rate": 5.085278059622167e-07, "loss": 0.4023, "step": 10172 }, { "epoch": 0.4915205102188723, "grad_norm": 3.398350238800049, "learning_rate": 5.084794897811277e-07, "loss": 0.3574, "step": 10173 }, { "epoch": 0.49156882639996136, "grad_norm": 2.9796135425567627, "learning_rate": 5.084311736000386e-07, "loss": 0.3106, "step": 10174 }, { "epoch": 0.4916171425810504, "grad_norm": 2.4291586875915527, "learning_rate": 5.083828574189496e-07, "loss": 0.2414, "step": 10175 }, { "epoch": 0.49166545876213946, "grad_norm": 2.852834463119507, "learning_rate": 5.083345412378605e-07, "loss": 0.3466, "step": 10176 }, { "epoch": 0.4917137749432285, "grad_norm": 3.9982635974884033, "learning_rate": 5.082862250567715e-07, "loss": 0.3474, "step": 10177 }, { "epoch": 0.4917620911243175, "grad_norm": 2.668414831161499, "learning_rate": 5.082379088756824e-07, "loss": 0.3106, "step": 10178 }, { "epoch": 0.4918104073054066, "grad_norm": 4.858137130737305, "learning_rate": 5.081895926945934e-07, "loss": 0.3846, "step": 10179 }, { "epoch": 0.4918587234864956, "grad_norm": 2.659175157546997, "learning_rate": 5.081412765135044e-07, "loss": 0.3427, "step": 10180 }, { "epoch": 0.4919070396675847, "grad_norm": 3.076911687850952, "learning_rate": 5.080929603324154e-07, "loss": 0.5077, "step": 10181 }, { "epoch": 0.4919553558486737, "grad_norm": 2.3337833881378174, "learning_rate": 5.080446441513264e-07, "loss": 0.1806, "step": 10182 }, { "epoch": 0.49200367202976275, "grad_norm": 3.3914568424224854, "learning_rate": 5.079963279702371e-07, "loss": 0.4288, "step": 10183 }, { "epoch": 0.49205198821085183, "grad_norm": 2.7156686782836914, "learning_rate": 5.079480117891481e-07, "loss": 0.254, "step": 10184 }, { "epoch": 0.49210030439194086, "grad_norm": 1.9345616102218628, "learning_rate": 5.078996956080591e-07, "loss": 0.1999, "step": 10185 }, { "epoch": 0.4921486205730299, "grad_norm": 2.5103282928466797, "learning_rate": 5.078513794269701e-07, "loss": 0.3286, "step": 10186 }, { "epoch": 0.49219693675411896, "grad_norm": 1.5502524375915527, "learning_rate": 5.078030632458811e-07, "loss": 0.1895, "step": 10187 }, { "epoch": 0.492245252935208, "grad_norm": 2.5335617065429688, "learning_rate": 5.07754747064792e-07, "loss": 0.2166, "step": 10188 }, { "epoch": 0.49229356911629707, "grad_norm": 2.64241623878479, "learning_rate": 5.077064308837029e-07, "loss": 0.2571, "step": 10189 }, { "epoch": 0.4923418852973861, "grad_norm": 2.7754299640655518, "learning_rate": 5.076581147026139e-07, "loss": 0.3462, "step": 10190 }, { "epoch": 0.4923902014784751, "grad_norm": 2.307363271713257, "learning_rate": 5.076097985215248e-07, "loss": 0.3103, "step": 10191 }, { "epoch": 0.4924385176595642, "grad_norm": 2.3489327430725098, "learning_rate": 5.075614823404358e-07, "loss": 0.3114, "step": 10192 }, { "epoch": 0.4924868338406532, "grad_norm": 2.5095114707946777, "learning_rate": 5.075131661593467e-07, "loss": 0.3238, "step": 10193 }, { "epoch": 0.4925351500217423, "grad_norm": 7.834468841552734, "learning_rate": 5.074648499782577e-07, "loss": 0.3031, "step": 10194 }, { "epoch": 0.49258346620283133, "grad_norm": 2.8409154415130615, "learning_rate": 5.074165337971687e-07, "loss": 0.3234, "step": 10195 }, { "epoch": 0.49263178238392036, "grad_norm": 1.775017261505127, "learning_rate": 5.073682176160796e-07, "loss": 0.1602, "step": 10196 }, { "epoch": 0.49268009856500944, "grad_norm": 2.682663679122925, "learning_rate": 5.073199014349906e-07, "loss": 0.2771, "step": 10197 }, { "epoch": 0.49272841474609846, "grad_norm": 4.03656005859375, "learning_rate": 5.072715852539015e-07, "loss": 0.3681, "step": 10198 }, { "epoch": 0.4927767309271875, "grad_norm": 5.238566875457764, "learning_rate": 5.072232690728124e-07, "loss": 0.2877, "step": 10199 }, { "epoch": 0.49282504710827657, "grad_norm": 3.7809808254241943, "learning_rate": 5.071749528917234e-07, "loss": 0.3146, "step": 10200 }, { "epoch": 0.4928733632893656, "grad_norm": 3.544296979904175, "learning_rate": 5.071266367106344e-07, "loss": 0.3608, "step": 10201 }, { "epoch": 0.4929216794704547, "grad_norm": 9.977977752685547, "learning_rate": 5.070783205295453e-07, "loss": 0.2633, "step": 10202 }, { "epoch": 0.4929699956515437, "grad_norm": 2.1897411346435547, "learning_rate": 5.070300043484563e-07, "loss": 0.2724, "step": 10203 }, { "epoch": 0.4930183118326327, "grad_norm": 2.4969658851623535, "learning_rate": 5.069816881673672e-07, "loss": 0.3162, "step": 10204 }, { "epoch": 0.4930666280137218, "grad_norm": 3.885467290878296, "learning_rate": 5.069333719862782e-07, "loss": 0.4172, "step": 10205 }, { "epoch": 0.49311494419481083, "grad_norm": 1.8895517587661743, "learning_rate": 5.068850558051892e-07, "loss": 0.1847, "step": 10206 }, { "epoch": 0.4931632603758999, "grad_norm": 2.62308931350708, "learning_rate": 5.068367396241002e-07, "loss": 0.3237, "step": 10207 }, { "epoch": 0.49321157655698894, "grad_norm": 2.2098307609558105, "learning_rate": 5.06788423443011e-07, "loss": 0.2455, "step": 10208 }, { "epoch": 0.49325989273807797, "grad_norm": 3.10567569732666, "learning_rate": 5.067401072619219e-07, "loss": 0.2847, "step": 10209 }, { "epoch": 0.49330820891916705, "grad_norm": 2.388777256011963, "learning_rate": 5.066917910808329e-07, "loss": 0.3201, "step": 10210 }, { "epoch": 0.49335652510025607, "grad_norm": 1.931577205657959, "learning_rate": 5.066434748997439e-07, "loss": 0.2424, "step": 10211 }, { "epoch": 0.4934048412813451, "grad_norm": 3.245267868041992, "learning_rate": 5.065951587186549e-07, "loss": 0.3901, "step": 10212 }, { "epoch": 0.4934531574624342, "grad_norm": 7.416263580322266, "learning_rate": 5.065468425375659e-07, "loss": 0.3239, "step": 10213 }, { "epoch": 0.4935014736435232, "grad_norm": 3.567201614379883, "learning_rate": 5.064985263564768e-07, "loss": 0.3318, "step": 10214 }, { "epoch": 0.4935497898246123, "grad_norm": 13.699742317199707, "learning_rate": 5.064502101753877e-07, "loss": 0.2478, "step": 10215 }, { "epoch": 0.4935981060057013, "grad_norm": 1.9949837923049927, "learning_rate": 5.064018939942986e-07, "loss": 0.2124, "step": 10216 }, { "epoch": 0.49364642218679033, "grad_norm": 3.3487331867218018, "learning_rate": 5.063535778132096e-07, "loss": 0.3738, "step": 10217 }, { "epoch": 0.4936947383678794, "grad_norm": 2.1326773166656494, "learning_rate": 5.063052616321206e-07, "loss": 0.2524, "step": 10218 }, { "epoch": 0.49374305454896844, "grad_norm": 7.2939772605896, "learning_rate": 5.062569454510315e-07, "loss": 0.3383, "step": 10219 }, { "epoch": 0.4937913707300575, "grad_norm": 5.56166410446167, "learning_rate": 5.062086292699425e-07, "loss": 0.2678, "step": 10220 }, { "epoch": 0.49383968691114655, "grad_norm": 3.1632742881774902, "learning_rate": 5.061603130888534e-07, "loss": 0.2787, "step": 10221 }, { "epoch": 0.4938880030922356, "grad_norm": 2.7708730697631836, "learning_rate": 5.061119969077644e-07, "loss": 0.2612, "step": 10222 }, { "epoch": 0.49393631927332465, "grad_norm": 2.2271575927734375, "learning_rate": 5.060636807266754e-07, "loss": 0.2457, "step": 10223 }, { "epoch": 0.4939846354544137, "grad_norm": 3.7467243671417236, "learning_rate": 5.060153645455862e-07, "loss": 0.2196, "step": 10224 }, { "epoch": 0.4940329516355027, "grad_norm": 2.2917258739471436, "learning_rate": 5.059670483644972e-07, "loss": 0.179, "step": 10225 }, { "epoch": 0.4940812678165918, "grad_norm": 5.675262928009033, "learning_rate": 5.059187321834082e-07, "loss": 0.3142, "step": 10226 }, { "epoch": 0.4941295839976808, "grad_norm": 3.4470908641815186, "learning_rate": 5.058704160023192e-07, "loss": 0.443, "step": 10227 }, { "epoch": 0.4941779001787699, "grad_norm": 3.2790024280548096, "learning_rate": 5.058220998212301e-07, "loss": 0.2785, "step": 10228 }, { "epoch": 0.4942262163598589, "grad_norm": 2.050750255584717, "learning_rate": 5.05773783640141e-07, "loss": 0.1928, "step": 10229 }, { "epoch": 0.49427453254094794, "grad_norm": 2.2571523189544678, "learning_rate": 5.05725467459052e-07, "loss": 0.2674, "step": 10230 }, { "epoch": 0.494322848722037, "grad_norm": 2.1861395835876465, "learning_rate": 5.05677151277963e-07, "loss": 0.2958, "step": 10231 }, { "epoch": 0.49437116490312605, "grad_norm": 2.1279399394989014, "learning_rate": 5.05628835096874e-07, "loss": 0.2123, "step": 10232 }, { "epoch": 0.49441948108421513, "grad_norm": 2.5264415740966797, "learning_rate": 5.05580518915785e-07, "loss": 0.1828, "step": 10233 }, { "epoch": 0.49446779726530415, "grad_norm": 3.1077775955200195, "learning_rate": 5.055322027346958e-07, "loss": 0.3185, "step": 10234 }, { "epoch": 0.4945161134463932, "grad_norm": 2.4002842903137207, "learning_rate": 5.054838865536067e-07, "loss": 0.1936, "step": 10235 }, { "epoch": 0.49456442962748226, "grad_norm": 3.405167579650879, "learning_rate": 5.054355703725177e-07, "loss": 0.2796, "step": 10236 }, { "epoch": 0.4946127458085713, "grad_norm": 3.2016258239746094, "learning_rate": 5.053872541914287e-07, "loss": 0.3918, "step": 10237 }, { "epoch": 0.4946610619896603, "grad_norm": 2.2802696228027344, "learning_rate": 5.053389380103397e-07, "loss": 0.2398, "step": 10238 }, { "epoch": 0.4947093781707494, "grad_norm": 2.081465482711792, "learning_rate": 5.052906218292507e-07, "loss": 0.2478, "step": 10239 }, { "epoch": 0.4947576943518384, "grad_norm": 1.997067928314209, "learning_rate": 5.052423056481615e-07, "loss": 0.2207, "step": 10240 }, { "epoch": 0.4948060105329275, "grad_norm": 2.9346415996551514, "learning_rate": 5.051939894670724e-07, "loss": 0.4184, "step": 10241 }, { "epoch": 0.4948543267140165, "grad_norm": 2.121811628341675, "learning_rate": 5.051456732859834e-07, "loss": 0.2072, "step": 10242 }, { "epoch": 0.49490264289510555, "grad_norm": 1.5631179809570312, "learning_rate": 5.050973571048944e-07, "loss": 0.1998, "step": 10243 }, { "epoch": 0.49495095907619463, "grad_norm": 2.889543056488037, "learning_rate": 5.050490409238054e-07, "loss": 0.4368, "step": 10244 }, { "epoch": 0.49499927525728366, "grad_norm": 7.859467029571533, "learning_rate": 5.050007247427163e-07, "loss": 0.4406, "step": 10245 }, { "epoch": 0.49504759143837274, "grad_norm": 4.623168468475342, "learning_rate": 5.049524085616273e-07, "loss": 0.2455, "step": 10246 }, { "epoch": 0.49509590761946176, "grad_norm": 3.3814473152160645, "learning_rate": 5.049040923805382e-07, "loss": 0.3331, "step": 10247 }, { "epoch": 0.4951442238005508, "grad_norm": 2.9931020736694336, "learning_rate": 5.048557761994492e-07, "loss": 0.3616, "step": 10248 }, { "epoch": 0.49519253998163987, "grad_norm": 3.065131425857544, "learning_rate": 5.048074600183602e-07, "loss": 0.2731, "step": 10249 }, { "epoch": 0.4952408561627289, "grad_norm": 6.151787757873535, "learning_rate": 5.04759143837271e-07, "loss": 0.4153, "step": 10250 }, { "epoch": 0.4952891723438179, "grad_norm": 2.342527151107788, "learning_rate": 5.04710827656182e-07, "loss": 0.2991, "step": 10251 }, { "epoch": 0.495337488524907, "grad_norm": 2.4115612506866455, "learning_rate": 5.04662511475093e-07, "loss": 0.2963, "step": 10252 }, { "epoch": 0.495385804705996, "grad_norm": 2.468947410583496, "learning_rate": 5.046141952940039e-07, "loss": 0.2366, "step": 10253 }, { "epoch": 0.4954341208870851, "grad_norm": 2.6331474781036377, "learning_rate": 5.045658791129149e-07, "loss": 0.2377, "step": 10254 }, { "epoch": 0.49548243706817413, "grad_norm": 3.103041887283325, "learning_rate": 5.045175629318258e-07, "loss": 0.3289, "step": 10255 }, { "epoch": 0.49553075324926316, "grad_norm": 2.4465837478637695, "learning_rate": 5.044692467507368e-07, "loss": 0.329, "step": 10256 }, { "epoch": 0.49557906943035224, "grad_norm": 5.540144443511963, "learning_rate": 5.044209305696478e-07, "loss": 0.3352, "step": 10257 }, { "epoch": 0.49562738561144126, "grad_norm": 8.451507568359375, "learning_rate": 5.043726143885588e-07, "loss": 0.2836, "step": 10258 }, { "epoch": 0.49567570179253034, "grad_norm": 1.6364725828170776, "learning_rate": 5.043242982074697e-07, "loss": 0.1782, "step": 10259 }, { "epoch": 0.49572401797361937, "grad_norm": 2.4640486240386963, "learning_rate": 5.042759820263806e-07, "loss": 0.2406, "step": 10260 }, { "epoch": 0.4957723341547084, "grad_norm": 2.8616271018981934, "learning_rate": 5.042276658452915e-07, "loss": 0.344, "step": 10261 }, { "epoch": 0.4958206503357975, "grad_norm": 2.179229736328125, "learning_rate": 5.041793496642025e-07, "loss": 0.2631, "step": 10262 }, { "epoch": 0.4958689665168865, "grad_norm": 3.542726516723633, "learning_rate": 5.041310334831135e-07, "loss": 0.3002, "step": 10263 }, { "epoch": 0.4959172826979755, "grad_norm": 4.193284511566162, "learning_rate": 5.040827173020245e-07, "loss": 0.2939, "step": 10264 }, { "epoch": 0.4959655988790646, "grad_norm": 1.6869763135910034, "learning_rate": 5.040344011209355e-07, "loss": 0.1988, "step": 10265 }, { "epoch": 0.49601391506015363, "grad_norm": 3.08286452293396, "learning_rate": 5.039860849398462e-07, "loss": 0.4596, "step": 10266 }, { "epoch": 0.4960622312412427, "grad_norm": 9.826530456542969, "learning_rate": 5.039377687587572e-07, "loss": 0.4366, "step": 10267 }, { "epoch": 0.49611054742233174, "grad_norm": 2.852386474609375, "learning_rate": 5.038894525776682e-07, "loss": 0.347, "step": 10268 }, { "epoch": 0.49615886360342076, "grad_norm": 2.220285654067993, "learning_rate": 5.038411363965792e-07, "loss": 0.1946, "step": 10269 }, { "epoch": 0.49620717978450984, "grad_norm": 2.4039487838745117, "learning_rate": 5.037928202154902e-07, "loss": 0.2451, "step": 10270 }, { "epoch": 0.49625549596559887, "grad_norm": 1.897965431213379, "learning_rate": 5.037445040344011e-07, "loss": 0.1797, "step": 10271 }, { "epoch": 0.49630381214668795, "grad_norm": 7.227712154388428, "learning_rate": 5.03696187853312e-07, "loss": 0.3125, "step": 10272 }, { "epoch": 0.496352128327777, "grad_norm": 2.7832798957824707, "learning_rate": 5.03647871672223e-07, "loss": 0.3444, "step": 10273 }, { "epoch": 0.496400444508866, "grad_norm": 3.7776622772216797, "learning_rate": 5.03599555491134e-07, "loss": 0.4924, "step": 10274 }, { "epoch": 0.4964487606899551, "grad_norm": 3.718750476837158, "learning_rate": 5.03551239310045e-07, "loss": 0.2849, "step": 10275 }, { "epoch": 0.4964970768710441, "grad_norm": 2.759587049484253, "learning_rate": 5.035029231289558e-07, "loss": 0.3727, "step": 10276 }, { "epoch": 0.49654539305213313, "grad_norm": 1.7559627294540405, "learning_rate": 5.034546069478668e-07, "loss": 0.1771, "step": 10277 }, { "epoch": 0.4965937092332222, "grad_norm": 2.8532304763793945, "learning_rate": 5.034062907667778e-07, "loss": 0.3053, "step": 10278 }, { "epoch": 0.49664202541431124, "grad_norm": 10.545439720153809, "learning_rate": 5.033579745856887e-07, "loss": 0.4121, "step": 10279 }, { "epoch": 0.4966903415954003, "grad_norm": 2.283348560333252, "learning_rate": 5.033096584045997e-07, "loss": 0.2999, "step": 10280 }, { "epoch": 0.49673865777648935, "grad_norm": 5.246526718139648, "learning_rate": 5.032613422235106e-07, "loss": 0.4299, "step": 10281 }, { "epoch": 0.49678697395757837, "grad_norm": 3.0865657329559326, "learning_rate": 5.032130260424216e-07, "loss": 0.2171, "step": 10282 }, { "epoch": 0.49683529013866745, "grad_norm": 4.37460470199585, "learning_rate": 5.031647098613326e-07, "loss": 0.3398, "step": 10283 }, { "epoch": 0.4968836063197565, "grad_norm": 3.0766332149505615, "learning_rate": 5.031163936802435e-07, "loss": 0.4339, "step": 10284 }, { "epoch": 0.49693192250084556, "grad_norm": 2.6700499057769775, "learning_rate": 5.030680774991544e-07, "loss": 0.3703, "step": 10285 }, { "epoch": 0.4969802386819346, "grad_norm": 1.53156578540802, "learning_rate": 5.030197613180654e-07, "loss": 0.1705, "step": 10286 }, { "epoch": 0.4970285548630236, "grad_norm": 10.69249439239502, "learning_rate": 5.029714451369763e-07, "loss": 0.2966, "step": 10287 }, { "epoch": 0.4970768710441127, "grad_norm": 1.8846760988235474, "learning_rate": 5.029231289558873e-07, "loss": 0.1934, "step": 10288 }, { "epoch": 0.4971251872252017, "grad_norm": 3.164067268371582, "learning_rate": 5.028748127747983e-07, "loss": 0.364, "step": 10289 }, { "epoch": 0.49717350340629074, "grad_norm": 3.1456186771392822, "learning_rate": 5.028264965937093e-07, "loss": 0.3717, "step": 10290 }, { "epoch": 0.4972218195873798, "grad_norm": 4.632706165313721, "learning_rate": 5.027781804126203e-07, "loss": 0.3378, "step": 10291 }, { "epoch": 0.49727013576846885, "grad_norm": 3.9545252323150635, "learning_rate": 5.02729864231531e-07, "loss": 0.379, "step": 10292 }, { "epoch": 0.4973184519495579, "grad_norm": 2.498757839202881, "learning_rate": 5.02681548050442e-07, "loss": 0.2668, "step": 10293 }, { "epoch": 0.49736676813064695, "grad_norm": 3.292766571044922, "learning_rate": 5.02633231869353e-07, "loss": 0.2332, "step": 10294 }, { "epoch": 0.497415084311736, "grad_norm": 2.7604873180389404, "learning_rate": 5.02584915688264e-07, "loss": 0.3277, "step": 10295 }, { "epoch": 0.49746340049282506, "grad_norm": 1.8674306869506836, "learning_rate": 5.02536599507175e-07, "loss": 0.2513, "step": 10296 }, { "epoch": 0.4975117166739141, "grad_norm": 2.57196044921875, "learning_rate": 5.024882833260859e-07, "loss": 0.2706, "step": 10297 }, { "epoch": 0.49756003285500316, "grad_norm": 2.91560959815979, "learning_rate": 5.024399671449968e-07, "loss": 0.3463, "step": 10298 }, { "epoch": 0.4976083490360922, "grad_norm": 2.6525721549987793, "learning_rate": 5.023916509639078e-07, "loss": 0.3799, "step": 10299 }, { "epoch": 0.4976566652171812, "grad_norm": 4.406444549560547, "learning_rate": 5.023433347828188e-07, "loss": 0.4457, "step": 10300 }, { "epoch": 0.4977049813982703, "grad_norm": 2.803023099899292, "learning_rate": 5.022950186017297e-07, "loss": 0.3818, "step": 10301 }, { "epoch": 0.4977532975793593, "grad_norm": 2.8061344623565674, "learning_rate": 5.022467024206406e-07, "loss": 0.3282, "step": 10302 }, { "epoch": 0.49780161376044835, "grad_norm": 2.311475992202759, "learning_rate": 5.021983862395516e-07, "loss": 0.3188, "step": 10303 }, { "epoch": 0.49784992994153743, "grad_norm": 3.6531083583831787, "learning_rate": 5.021500700584625e-07, "loss": 0.4293, "step": 10304 }, { "epoch": 0.49789824612262645, "grad_norm": 1.860534429550171, "learning_rate": 5.021017538773735e-07, "loss": 0.1656, "step": 10305 }, { "epoch": 0.49794656230371553, "grad_norm": 2.555720090866089, "learning_rate": 5.020534376962845e-07, "loss": 0.2394, "step": 10306 }, { "epoch": 0.49799487848480456, "grad_norm": 2.6076436042785645, "learning_rate": 5.020051215151954e-07, "loss": 0.2126, "step": 10307 }, { "epoch": 0.4980431946658936, "grad_norm": 2.821756362915039, "learning_rate": 5.019568053341064e-07, "loss": 0.3646, "step": 10308 }, { "epoch": 0.49809151084698267, "grad_norm": 3.7850263118743896, "learning_rate": 5.019084891530173e-07, "loss": 0.3094, "step": 10309 }, { "epoch": 0.4981398270280717, "grad_norm": 3.928314685821533, "learning_rate": 5.018601729719283e-07, "loss": 0.3319, "step": 10310 }, { "epoch": 0.49818814320916077, "grad_norm": 2.363039970397949, "learning_rate": 5.018118567908392e-07, "loss": 0.2472, "step": 10311 }, { "epoch": 0.4982364593902498, "grad_norm": 2.893519401550293, "learning_rate": 5.017635406097502e-07, "loss": 0.288, "step": 10312 }, { "epoch": 0.4982847755713388, "grad_norm": 2.5629935264587402, "learning_rate": 5.017152244286611e-07, "loss": 0.2496, "step": 10313 }, { "epoch": 0.4983330917524279, "grad_norm": 2.1080782413482666, "learning_rate": 5.016669082475721e-07, "loss": 0.1747, "step": 10314 }, { "epoch": 0.49838140793351693, "grad_norm": 2.501236915588379, "learning_rate": 5.016185920664831e-07, "loss": 0.2707, "step": 10315 }, { "epoch": 0.49842972411460595, "grad_norm": 3.7334911823272705, "learning_rate": 5.015702758853941e-07, "loss": 0.252, "step": 10316 }, { "epoch": 0.49847804029569504, "grad_norm": 2.518213987350464, "learning_rate": 5.01521959704305e-07, "loss": 0.2845, "step": 10317 }, { "epoch": 0.49852635647678406, "grad_norm": 5.157438278198242, "learning_rate": 5.014736435232158e-07, "loss": 0.297, "step": 10318 }, { "epoch": 0.49857467265787314, "grad_norm": 3.980116128921509, "learning_rate": 5.014253273421268e-07, "loss": 0.4271, "step": 10319 }, { "epoch": 0.49862298883896217, "grad_norm": 3.9762167930603027, "learning_rate": 5.013770111610378e-07, "loss": 0.3353, "step": 10320 }, { "epoch": 0.4986713050200512, "grad_norm": 2.694746732711792, "learning_rate": 5.013286949799488e-07, "loss": 0.3418, "step": 10321 }, { "epoch": 0.4987196212011403, "grad_norm": 3.569091320037842, "learning_rate": 5.012803787988598e-07, "loss": 0.3434, "step": 10322 }, { "epoch": 0.4987679373822293, "grad_norm": 3.1702733039855957, "learning_rate": 5.012320626177706e-07, "loss": 0.2697, "step": 10323 }, { "epoch": 0.4988162535633184, "grad_norm": 2.5669939517974854, "learning_rate": 5.011837464366816e-07, "loss": 0.2463, "step": 10324 }, { "epoch": 0.4988645697444074, "grad_norm": 1.587664008140564, "learning_rate": 5.011354302555926e-07, "loss": 0.1925, "step": 10325 }, { "epoch": 0.49891288592549643, "grad_norm": 2.2760956287384033, "learning_rate": 5.010871140745035e-07, "loss": 0.2331, "step": 10326 }, { "epoch": 0.4989612021065855, "grad_norm": 2.6862852573394775, "learning_rate": 5.010387978934145e-07, "loss": 0.2942, "step": 10327 }, { "epoch": 0.49900951828767454, "grad_norm": 4.45399808883667, "learning_rate": 5.009904817123254e-07, "loss": 0.1617, "step": 10328 }, { "epoch": 0.49905783446876356, "grad_norm": 5.501582145690918, "learning_rate": 5.009421655312364e-07, "loss": 0.2435, "step": 10329 }, { "epoch": 0.49910615064985264, "grad_norm": 5.13597297668457, "learning_rate": 5.008938493501473e-07, "loss": 0.2943, "step": 10330 }, { "epoch": 0.49915446683094167, "grad_norm": 3.7617669105529785, "learning_rate": 5.008455331690583e-07, "loss": 0.2883, "step": 10331 }, { "epoch": 0.49920278301203075, "grad_norm": 3.141719102859497, "learning_rate": 5.007972169879693e-07, "loss": 0.3902, "step": 10332 }, { "epoch": 0.4992510991931198, "grad_norm": 2.3576791286468506, "learning_rate": 5.007489008068802e-07, "loss": 0.2519, "step": 10333 }, { "epoch": 0.4992994153742088, "grad_norm": 3.924574375152588, "learning_rate": 5.007005846257911e-07, "loss": 0.3305, "step": 10334 }, { "epoch": 0.4993477315552979, "grad_norm": 4.624518394470215, "learning_rate": 5.006522684447021e-07, "loss": 0.4184, "step": 10335 }, { "epoch": 0.4993960477363869, "grad_norm": 1.7535791397094727, "learning_rate": 5.00603952263613e-07, "loss": 0.193, "step": 10336 }, { "epoch": 0.499444363917476, "grad_norm": 2.909714698791504, "learning_rate": 5.00555636082524e-07, "loss": 0.2559, "step": 10337 }, { "epoch": 0.499492680098565, "grad_norm": 2.010108709335327, "learning_rate": 5.00507319901435e-07, "loss": 0.2352, "step": 10338 }, { "epoch": 0.49954099627965404, "grad_norm": 2.026759386062622, "learning_rate": 5.004590037203459e-07, "loss": 0.1883, "step": 10339 }, { "epoch": 0.4995893124607431, "grad_norm": 3.7621583938598633, "learning_rate": 5.004106875392569e-07, "loss": 0.3747, "step": 10340 }, { "epoch": 0.49963762864183214, "grad_norm": 5.744492053985596, "learning_rate": 5.003623713581679e-07, "loss": 0.2491, "step": 10341 }, { "epoch": 0.49968594482292117, "grad_norm": 4.174633502960205, "learning_rate": 5.003140551770789e-07, "loss": 0.4029, "step": 10342 }, { "epoch": 0.49973426100401025, "grad_norm": 8.445691108703613, "learning_rate": 5.002657389959897e-07, "loss": 0.3156, "step": 10343 }, { "epoch": 0.4997825771850993, "grad_norm": 2.5386126041412354, "learning_rate": 5.002174228149006e-07, "loss": 0.2494, "step": 10344 }, { "epoch": 0.49983089336618836, "grad_norm": 1.9686545133590698, "learning_rate": 5.001691066338116e-07, "loss": 0.1994, "step": 10345 }, { "epoch": 0.4998792095472774, "grad_norm": 4.1252121925354, "learning_rate": 5.001207904527226e-07, "loss": 0.3827, "step": 10346 }, { "epoch": 0.4999275257283664, "grad_norm": 1.68342125415802, "learning_rate": 5.000724742716336e-07, "loss": 0.1357, "step": 10347 }, { "epoch": 0.4999758419094555, "grad_norm": 2.641724109649658, "learning_rate": 5.000241580905446e-07, "loss": 0.2602, "step": 10348 }, { "epoch": 0.5000241580905446, "grad_norm": 2.2472641468048096, "learning_rate": 4.999758419094555e-07, "loss": 0.2203, "step": 10349 }, { "epoch": 0.5000724742716336, "grad_norm": 1.7645376920700073, "learning_rate": 4.999275257283664e-07, "loss": 0.1414, "step": 10350 }, { "epoch": 0.5001207904527226, "grad_norm": 1.9595589637756348, "learning_rate": 4.998792095472773e-07, "loss": 0.2022, "step": 10351 }, { "epoch": 0.5001691066338116, "grad_norm": 2.378479480743408, "learning_rate": 4.998308933661883e-07, "loss": 0.2836, "step": 10352 }, { "epoch": 0.5002174228149007, "grad_norm": 4.125068664550781, "learning_rate": 4.997825771850992e-07, "loss": 0.3482, "step": 10353 }, { "epoch": 0.5002657389959898, "grad_norm": 9.40890884399414, "learning_rate": 4.997342610040102e-07, "loss": 0.2875, "step": 10354 }, { "epoch": 0.5003140551770788, "grad_norm": 2.5059492588043213, "learning_rate": 4.996859448229211e-07, "loss": 0.2811, "step": 10355 }, { "epoch": 0.5003623713581679, "grad_norm": 3.2870428562164307, "learning_rate": 4.996376286418321e-07, "loss": 0.2859, "step": 10356 }, { "epoch": 0.5004106875392569, "grad_norm": 5.326707363128662, "learning_rate": 4.995893124607431e-07, "loss": 0.342, "step": 10357 }, { "epoch": 0.5004590037203459, "grad_norm": 2.772352695465088, "learning_rate": 4.995409962796541e-07, "loss": 0.2564, "step": 10358 }, { "epoch": 0.500507319901435, "grad_norm": 2.55391001701355, "learning_rate": 4.99492680098565e-07, "loss": 0.346, "step": 10359 }, { "epoch": 0.5005556360825241, "grad_norm": 5.869524955749512, "learning_rate": 4.994443639174759e-07, "loss": 0.2504, "step": 10360 }, { "epoch": 0.5006039522636131, "grad_norm": 1.8157520294189453, "learning_rate": 4.993960477363869e-07, "loss": 0.2324, "step": 10361 }, { "epoch": 0.5006522684447021, "grad_norm": 2.6835994720458984, "learning_rate": 4.993477315552978e-07, "loss": 0.2721, "step": 10362 }, { "epoch": 0.5007005846257911, "grad_norm": 3.0033626556396484, "learning_rate": 4.992994153742088e-07, "loss": 0.4435, "step": 10363 }, { "epoch": 0.5007489008068802, "grad_norm": 3.3610172271728516, "learning_rate": 4.992510991931198e-07, "loss": 0.2586, "step": 10364 }, { "epoch": 0.5007972169879693, "grad_norm": 3.7499990463256836, "learning_rate": 4.992027830120307e-07, "loss": 0.3613, "step": 10365 }, { "epoch": 0.5008455331690583, "grad_norm": 2.6113295555114746, "learning_rate": 4.991544668309417e-07, "loss": 0.3823, "step": 10366 }, { "epoch": 0.5008938493501474, "grad_norm": 2.217017650604248, "learning_rate": 4.991061506498527e-07, "loss": 0.2327, "step": 10367 }, { "epoch": 0.5009421655312364, "grad_norm": 1.9255179166793823, "learning_rate": 4.990578344687635e-07, "loss": 0.2083, "step": 10368 }, { "epoch": 0.5009904817123254, "grad_norm": 2.0857467651367188, "learning_rate": 4.990095182876745e-07, "loss": 0.3138, "step": 10369 }, { "epoch": 0.5010387978934145, "grad_norm": 1.8781641721725464, "learning_rate": 4.989612021065855e-07, "loss": 0.207, "step": 10370 }, { "epoch": 0.5010871140745036, "grad_norm": 2.6472012996673584, "learning_rate": 4.989128859254964e-07, "loss": 0.2636, "step": 10371 }, { "epoch": 0.5011354302555926, "grad_norm": 2.2657089233398438, "learning_rate": 4.988645697444074e-07, "loss": 0.2733, "step": 10372 }, { "epoch": 0.5011837464366816, "grad_norm": 2.370610237121582, "learning_rate": 4.988162535633184e-07, "loss": 0.274, "step": 10373 }, { "epoch": 0.5012320626177706, "grad_norm": 3.458293914794922, "learning_rate": 4.987679373822293e-07, "loss": 0.3532, "step": 10374 }, { "epoch": 0.5012803787988598, "grad_norm": 2.6765005588531494, "learning_rate": 4.987196212011403e-07, "loss": 0.2528, "step": 10375 }, { "epoch": 0.5013286949799488, "grad_norm": 2.6230618953704834, "learning_rate": 4.986713050200511e-07, "loss": 0.3234, "step": 10376 }, { "epoch": 0.5013770111610378, "grad_norm": 5.751240253448486, "learning_rate": 4.986229888389621e-07, "loss": 0.4437, "step": 10377 }, { "epoch": 0.5014253273421269, "grad_norm": 3.2499728202819824, "learning_rate": 4.985746726578731e-07, "loss": 0.3109, "step": 10378 }, { "epoch": 0.5014736435232159, "grad_norm": 1.8132070302963257, "learning_rate": 4.98526356476784e-07, "loss": 0.1583, "step": 10379 }, { "epoch": 0.501521959704305, "grad_norm": 5.012701988220215, "learning_rate": 4.98478040295695e-07, "loss": 0.2596, "step": 10380 }, { "epoch": 0.501570275885394, "grad_norm": 2.9873604774475098, "learning_rate": 4.984297241146059e-07, "loss": 0.3156, "step": 10381 }, { "epoch": 0.5016185920664831, "grad_norm": 3.2652266025543213, "learning_rate": 4.983814079335169e-07, "loss": 0.4386, "step": 10382 }, { "epoch": 0.5016669082475721, "grad_norm": 3.0264065265655518, "learning_rate": 4.983330917524279e-07, "loss": 0.386, "step": 10383 }, { "epoch": 0.5017152244286611, "grad_norm": 3.017007827758789, "learning_rate": 4.982847755713388e-07, "loss": 0.1961, "step": 10384 }, { "epoch": 0.5017635406097503, "grad_norm": 4.900841236114502, "learning_rate": 4.982364593902497e-07, "loss": 0.321, "step": 10385 }, { "epoch": 0.5018118567908393, "grad_norm": 1.5988272428512573, "learning_rate": 4.981881432091607e-07, "loss": 0.1973, "step": 10386 }, { "epoch": 0.5018601729719283, "grad_norm": 4.030196666717529, "learning_rate": 4.981398270280716e-07, "loss": 0.3349, "step": 10387 }, { "epoch": 0.5019084891530173, "grad_norm": 2.6292026042938232, "learning_rate": 4.980915108469826e-07, "loss": 0.2433, "step": 10388 }, { "epoch": 0.5019568053341064, "grad_norm": 3.4330010414123535, "learning_rate": 4.980431946658936e-07, "loss": 0.3465, "step": 10389 }, { "epoch": 0.5020051215151954, "grad_norm": 2.4029765129089355, "learning_rate": 4.979948784848046e-07, "loss": 0.2952, "step": 10390 }, { "epoch": 0.5020534376962845, "grad_norm": 2.2231428623199463, "learning_rate": 4.979465623037155e-07, "loss": 0.2939, "step": 10391 }, { "epoch": 0.5021017538773735, "grad_norm": 2.292602777481079, "learning_rate": 4.978982461226265e-07, "loss": 0.2462, "step": 10392 }, { "epoch": 0.5021500700584626, "grad_norm": 6.274514675140381, "learning_rate": 4.978499299415375e-07, "loss": 0.1873, "step": 10393 }, { "epoch": 0.5021983862395516, "grad_norm": 2.657900333404541, "learning_rate": 4.978016137604483e-07, "loss": 0.2852, "step": 10394 }, { "epoch": 0.5022467024206406, "grad_norm": 2.518714427947998, "learning_rate": 4.977532975793593e-07, "loss": 0.3119, "step": 10395 }, { "epoch": 0.5022950186017298, "grad_norm": 1.983974814414978, "learning_rate": 4.977049813982703e-07, "loss": 0.1732, "step": 10396 }, { "epoch": 0.5023433347828188, "grad_norm": 2.3224947452545166, "learning_rate": 4.976566652171812e-07, "loss": 0.1735, "step": 10397 }, { "epoch": 0.5023916509639078, "grad_norm": 3.722151756286621, "learning_rate": 4.976083490360922e-07, "loss": 0.2707, "step": 10398 }, { "epoch": 0.5024399671449968, "grad_norm": 4.948612213134766, "learning_rate": 4.975600328550032e-07, "loss": 0.2253, "step": 10399 }, { "epoch": 0.5024882833260859, "grad_norm": 2.7719478607177734, "learning_rate": 4.975117166739141e-07, "loss": 0.2206, "step": 10400 }, { "epoch": 0.502536599507175, "grad_norm": 2.7790653705596924, "learning_rate": 4.974634004928251e-07, "loss": 0.3001, "step": 10401 }, { "epoch": 0.502584915688264, "grad_norm": 3.1675496101379395, "learning_rate": 4.974150843117359e-07, "loss": 0.4002, "step": 10402 }, { "epoch": 0.502633231869353, "grad_norm": 2.369938373565674, "learning_rate": 4.973667681306469e-07, "loss": 0.3103, "step": 10403 }, { "epoch": 0.5026815480504421, "grad_norm": 2.4480278491973877, "learning_rate": 4.973184519495579e-07, "loss": 0.2709, "step": 10404 }, { "epoch": 0.5027298642315311, "grad_norm": 2.9606924057006836, "learning_rate": 4.972701357684688e-07, "loss": 0.3307, "step": 10405 }, { "epoch": 0.5027781804126202, "grad_norm": 2.5905253887176514, "learning_rate": 4.972218195873798e-07, "loss": 0.2191, "step": 10406 }, { "epoch": 0.5028264965937093, "grad_norm": 4.114433288574219, "learning_rate": 4.971735034062907e-07, "loss": 0.3783, "step": 10407 }, { "epoch": 0.5028748127747983, "grad_norm": 4.965817928314209, "learning_rate": 4.971251872252017e-07, "loss": 0.5244, "step": 10408 }, { "epoch": 0.5029231289558873, "grad_norm": 1.9910088777542114, "learning_rate": 4.970768710441127e-07, "loss": 0.2848, "step": 10409 }, { "epoch": 0.5029714451369763, "grad_norm": 1.6541985273361206, "learning_rate": 4.970285548630235e-07, "loss": 0.1801, "step": 10410 }, { "epoch": 0.5030197613180655, "grad_norm": 14.738483428955078, "learning_rate": 4.969802386819345e-07, "loss": 0.3346, "step": 10411 }, { "epoch": 0.5030680774991545, "grad_norm": 2.3259217739105225, "learning_rate": 4.969319225008455e-07, "loss": 0.3121, "step": 10412 }, { "epoch": 0.5031163936802435, "grad_norm": 2.1833693981170654, "learning_rate": 4.968836063197564e-07, "loss": 0.2158, "step": 10413 }, { "epoch": 0.5031647098613325, "grad_norm": 1.792403221130371, "learning_rate": 4.968352901386674e-07, "loss": 0.1721, "step": 10414 }, { "epoch": 0.5032130260424216, "grad_norm": 2.9199185371398926, "learning_rate": 4.967869739575784e-07, "loss": 0.3023, "step": 10415 }, { "epoch": 0.5032613422235106, "grad_norm": 2.468838930130005, "learning_rate": 4.967386577764893e-07, "loss": 0.2695, "step": 10416 }, { "epoch": 0.5033096584045997, "grad_norm": 2.485600709915161, "learning_rate": 4.966903415954003e-07, "loss": 0.3706, "step": 10417 }, { "epoch": 0.5033579745856888, "grad_norm": 2.841822862625122, "learning_rate": 4.966420254143113e-07, "loss": 0.2958, "step": 10418 }, { "epoch": 0.5034062907667778, "grad_norm": 3.4372200965881348, "learning_rate": 4.965937092332221e-07, "loss": 0.3603, "step": 10419 }, { "epoch": 0.5034546069478668, "grad_norm": 2.371880054473877, "learning_rate": 4.965453930521331e-07, "loss": 0.3059, "step": 10420 }, { "epoch": 0.5035029231289558, "grad_norm": 2.3607144355773926, "learning_rate": 4.964970768710441e-07, "loss": 0.2834, "step": 10421 }, { "epoch": 0.503551239310045, "grad_norm": 4.272170543670654, "learning_rate": 4.964487606899551e-07, "loss": 0.3804, "step": 10422 }, { "epoch": 0.503599555491134, "grad_norm": 2.2550432682037354, "learning_rate": 4.96400444508866e-07, "loss": 0.2754, "step": 10423 }, { "epoch": 0.503647871672223, "grad_norm": 9.879792213439941, "learning_rate": 4.96352128327777e-07, "loss": 0.3619, "step": 10424 }, { "epoch": 0.503696187853312, "grad_norm": 7.794052600860596, "learning_rate": 4.96303812146688e-07, "loss": 0.3125, "step": 10425 }, { "epoch": 0.5037445040344011, "grad_norm": 4.9103169441223145, "learning_rate": 4.962554959655989e-07, "loss": 0.3018, "step": 10426 }, { "epoch": 0.5037928202154902, "grad_norm": 3.495387077331543, "learning_rate": 4.962071797845099e-07, "loss": 0.1807, "step": 10427 }, { "epoch": 0.5038411363965792, "grad_norm": 3.398175001144409, "learning_rate": 4.961588636034207e-07, "loss": 0.2982, "step": 10428 }, { "epoch": 0.5038894525776683, "grad_norm": 2.8232386112213135, "learning_rate": 4.961105474223317e-07, "loss": 0.3881, "step": 10429 }, { "epoch": 0.5039377687587573, "grad_norm": 2.615792989730835, "learning_rate": 4.960622312412427e-07, "loss": 0.1944, "step": 10430 }, { "epoch": 0.5039860849398463, "grad_norm": 3.110269784927368, "learning_rate": 4.960139150601536e-07, "loss": 0.2299, "step": 10431 }, { "epoch": 0.5040344011209354, "grad_norm": 9.501470565795898, "learning_rate": 4.959655988790646e-07, "loss": 0.2888, "step": 10432 }, { "epoch": 0.5040827173020245, "grad_norm": 2.1890923976898193, "learning_rate": 4.959172826979755e-07, "loss": 0.2946, "step": 10433 }, { "epoch": 0.5041310334831135, "grad_norm": 3.5899579524993896, "learning_rate": 4.958689665168865e-07, "loss": 0.3898, "step": 10434 }, { "epoch": 0.5041793496642025, "grad_norm": 5.6348443031311035, "learning_rate": 4.958206503357975e-07, "loss": 0.2015, "step": 10435 }, { "epoch": 0.5042276658452916, "grad_norm": 2.931746244430542, "learning_rate": 4.957723341547083e-07, "loss": 0.3076, "step": 10436 }, { "epoch": 0.5042759820263807, "grad_norm": 5.043954849243164, "learning_rate": 4.957240179736193e-07, "loss": 0.3385, "step": 10437 }, { "epoch": 0.5043242982074697, "grad_norm": 3.1418185234069824, "learning_rate": 4.956757017925303e-07, "loss": 0.4311, "step": 10438 }, { "epoch": 0.5043726143885587, "grad_norm": 2.252758026123047, "learning_rate": 4.956273856114412e-07, "loss": 0.2263, "step": 10439 }, { "epoch": 0.5044209305696478, "grad_norm": 1.6302696466445923, "learning_rate": 4.955790694303522e-07, "loss": 0.2035, "step": 10440 }, { "epoch": 0.5044692467507368, "grad_norm": 11.048702239990234, "learning_rate": 4.955307532492632e-07, "loss": 0.2525, "step": 10441 }, { "epoch": 0.5045175629318258, "grad_norm": 2.4053337574005127, "learning_rate": 4.954824370681741e-07, "loss": 0.2418, "step": 10442 }, { "epoch": 0.504565879112915, "grad_norm": 5.941202640533447, "learning_rate": 4.954341208870851e-07, "loss": 0.274, "step": 10443 }, { "epoch": 0.504614195294004, "grad_norm": 3.5792477130889893, "learning_rate": 4.95385804705996e-07, "loss": 0.2688, "step": 10444 }, { "epoch": 0.504662511475093, "grad_norm": 3.823500871658325, "learning_rate": 4.953374885249069e-07, "loss": 0.3525, "step": 10445 }, { "epoch": 0.504710827656182, "grad_norm": 2.6129934787750244, "learning_rate": 4.952891723438179e-07, "loss": 0.3186, "step": 10446 }, { "epoch": 0.504759143837271, "grad_norm": 2.599390983581543, "learning_rate": 4.952408561627289e-07, "loss": 0.3177, "step": 10447 }, { "epoch": 0.5048074600183602, "grad_norm": 2.125507354736328, "learning_rate": 4.951925399816398e-07, "loss": 0.2112, "step": 10448 }, { "epoch": 0.5048557761994492, "grad_norm": 2.0958328247070312, "learning_rate": 4.951442238005508e-07, "loss": 0.2996, "step": 10449 }, { "epoch": 0.5049040923805382, "grad_norm": 3.2737629413604736, "learning_rate": 4.950959076194618e-07, "loss": 0.2742, "step": 10450 }, { "epoch": 0.5049524085616273, "grad_norm": 4.045783042907715, "learning_rate": 4.950475914383727e-07, "loss": 0.2634, "step": 10451 }, { "epoch": 0.5050007247427163, "grad_norm": 3.1072332859039307, "learning_rate": 4.949992752572837e-07, "loss": 0.2526, "step": 10452 }, { "epoch": 0.5050490409238054, "grad_norm": 3.156540870666504, "learning_rate": 4.949509590761946e-07, "loss": 0.2726, "step": 10453 }, { "epoch": 0.5050973571048945, "grad_norm": 5.102315902709961, "learning_rate": 4.949026428951055e-07, "loss": 0.4363, "step": 10454 }, { "epoch": 0.5051456732859835, "grad_norm": 4.484220027923584, "learning_rate": 4.948543267140165e-07, "loss": 0.3585, "step": 10455 }, { "epoch": 0.5051939894670725, "grad_norm": 2.415926933288574, "learning_rate": 4.948060105329275e-07, "loss": 0.23, "step": 10456 }, { "epoch": 0.5052423056481615, "grad_norm": 1.924367070198059, "learning_rate": 4.947576943518384e-07, "loss": 0.2304, "step": 10457 }, { "epoch": 0.5052906218292507, "grad_norm": 2.0970029830932617, "learning_rate": 4.947093781707494e-07, "loss": 0.2787, "step": 10458 }, { "epoch": 0.5053389380103397, "grad_norm": 6.522836208343506, "learning_rate": 4.946610619896603e-07, "loss": 0.277, "step": 10459 }, { "epoch": 0.5053872541914287, "grad_norm": 1.9970979690551758, "learning_rate": 4.946127458085713e-07, "loss": 0.2462, "step": 10460 }, { "epoch": 0.5054355703725177, "grad_norm": 2.856745719909668, "learning_rate": 4.945644296274822e-07, "loss": 0.3221, "step": 10461 }, { "epoch": 0.5054838865536068, "grad_norm": 5.885802745819092, "learning_rate": 4.945161134463931e-07, "loss": 0.362, "step": 10462 }, { "epoch": 0.5055322027346959, "grad_norm": 2.524531126022339, "learning_rate": 4.944677972653041e-07, "loss": 0.2356, "step": 10463 }, { "epoch": 0.5055805189157849, "grad_norm": 2.6699202060699463, "learning_rate": 4.944194810842151e-07, "loss": 0.3268, "step": 10464 }, { "epoch": 0.505628835096874, "grad_norm": 2.822166919708252, "learning_rate": 4.94371164903126e-07, "loss": 0.3711, "step": 10465 }, { "epoch": 0.505677151277963, "grad_norm": 3.6099953651428223, "learning_rate": 4.94322848722037e-07, "loss": 0.3017, "step": 10466 }, { "epoch": 0.505725467459052, "grad_norm": 2.9945995807647705, "learning_rate": 4.942745325409479e-07, "loss": 0.3822, "step": 10467 }, { "epoch": 0.505773783640141, "grad_norm": 4.2755255699157715, "learning_rate": 4.942262163598589e-07, "loss": 0.2805, "step": 10468 }, { "epoch": 0.5058220998212302, "grad_norm": 2.9006335735321045, "learning_rate": 4.941779001787699e-07, "loss": 0.376, "step": 10469 }, { "epoch": 0.5058704160023192, "grad_norm": 4.057344436645508, "learning_rate": 4.941295839976807e-07, "loss": 0.4124, "step": 10470 }, { "epoch": 0.5059187321834082, "grad_norm": 2.737675666809082, "learning_rate": 4.940812678165917e-07, "loss": 0.2687, "step": 10471 }, { "epoch": 0.5059670483644972, "grad_norm": 2.3574304580688477, "learning_rate": 4.940329516355027e-07, "loss": 0.2888, "step": 10472 }, { "epoch": 0.5060153645455863, "grad_norm": 4.31495475769043, "learning_rate": 4.939846354544137e-07, "loss": 0.4467, "step": 10473 }, { "epoch": 0.5060636807266754, "grad_norm": 2.1693522930145264, "learning_rate": 4.939363192733246e-07, "loss": 0.2321, "step": 10474 }, { "epoch": 0.5061119969077644, "grad_norm": 3.7849369049072266, "learning_rate": 4.938880030922356e-07, "loss": 0.3441, "step": 10475 }, { "epoch": 0.5061603130888535, "grad_norm": 3.7039833068847656, "learning_rate": 4.938396869111466e-07, "loss": 0.2924, "step": 10476 }, { "epoch": 0.5062086292699425, "grad_norm": 17.41004180908203, "learning_rate": 4.937913707300575e-07, "loss": 0.273, "step": 10477 }, { "epoch": 0.5062569454510315, "grad_norm": 5.214887619018555, "learning_rate": 4.937430545489684e-07, "loss": 0.3016, "step": 10478 }, { "epoch": 0.5063052616321206, "grad_norm": 2.9997141361236572, "learning_rate": 4.936947383678794e-07, "loss": 0.2581, "step": 10479 }, { "epoch": 0.5063535778132097, "grad_norm": 3.21518611907959, "learning_rate": 4.936464221867903e-07, "loss": 0.4022, "step": 10480 }, { "epoch": 0.5064018939942987, "grad_norm": 3.0134360790252686, "learning_rate": 4.935981060057013e-07, "loss": 0.2803, "step": 10481 }, { "epoch": 0.5064502101753877, "grad_norm": 3.065598487854004, "learning_rate": 4.935497898246123e-07, "loss": 0.3792, "step": 10482 }, { "epoch": 0.5064985263564767, "grad_norm": 3.643676519393921, "learning_rate": 4.935014736435232e-07, "loss": 0.2887, "step": 10483 }, { "epoch": 0.5065468425375659, "grad_norm": 2.9082090854644775, "learning_rate": 4.934531574624342e-07, "loss": 0.3885, "step": 10484 }, { "epoch": 0.5065951587186549, "grad_norm": 2.9691011905670166, "learning_rate": 4.934048412813451e-07, "loss": 0.2649, "step": 10485 }, { "epoch": 0.5066434748997439, "grad_norm": 2.1551156044006348, "learning_rate": 4.93356525100256e-07, "loss": 0.17, "step": 10486 }, { "epoch": 0.506691791080833, "grad_norm": 1.9074293375015259, "learning_rate": 4.93308208919167e-07, "loss": 0.2106, "step": 10487 }, { "epoch": 0.506740107261922, "grad_norm": 2.4459011554718018, "learning_rate": 4.932598927380779e-07, "loss": 0.2688, "step": 10488 }, { "epoch": 0.5067884234430111, "grad_norm": 3.831627607345581, "learning_rate": 4.932115765569889e-07, "loss": 0.173, "step": 10489 }, { "epoch": 0.5068367396241001, "grad_norm": 2.3944296836853027, "learning_rate": 4.931632603758999e-07, "loss": 0.2852, "step": 10490 }, { "epoch": 0.5068850558051892, "grad_norm": 2.5729589462280273, "learning_rate": 4.931149441948108e-07, "loss": 0.2932, "step": 10491 }, { "epoch": 0.5069333719862782, "grad_norm": 2.3401684761047363, "learning_rate": 4.930666280137218e-07, "loss": 0.2568, "step": 10492 }, { "epoch": 0.5069816881673672, "grad_norm": 4.12916898727417, "learning_rate": 4.930183118326327e-07, "loss": 0.1781, "step": 10493 }, { "epoch": 0.5070300043484564, "grad_norm": 1.5946928262710571, "learning_rate": 4.929699956515437e-07, "loss": 0.2074, "step": 10494 }, { "epoch": 0.5070783205295454, "grad_norm": 2.911952495574951, "learning_rate": 4.929216794704546e-07, "loss": 0.2505, "step": 10495 }, { "epoch": 0.5071266367106344, "grad_norm": 2.2203030586242676, "learning_rate": 4.928733632893655e-07, "loss": 0.2439, "step": 10496 }, { "epoch": 0.5071749528917234, "grad_norm": 3.2568812370300293, "learning_rate": 4.928250471082765e-07, "loss": 0.3199, "step": 10497 }, { "epoch": 0.5072232690728125, "grad_norm": 2.2041189670562744, "learning_rate": 4.927767309271875e-07, "loss": 0.2175, "step": 10498 }, { "epoch": 0.5072715852539015, "grad_norm": 1.7289841175079346, "learning_rate": 4.927284147460984e-07, "loss": 0.1395, "step": 10499 }, { "epoch": 0.5073199014349906, "grad_norm": 3.2896554470062256, "learning_rate": 4.926800985650094e-07, "loss": 0.3755, "step": 10500 }, { "epoch": 0.5073682176160796, "grad_norm": 3.157702922821045, "learning_rate": 4.926317823839204e-07, "loss": 0.2213, "step": 10501 }, { "epoch": 0.5074165337971687, "grad_norm": 3.3609397411346436, "learning_rate": 4.925834662028313e-07, "loss": 0.3331, "step": 10502 }, { "epoch": 0.5074648499782577, "grad_norm": 11.333516120910645, "learning_rate": 4.925351500217422e-07, "loss": 0.3556, "step": 10503 }, { "epoch": 0.5075131661593467, "grad_norm": 2.992831230163574, "learning_rate": 4.924868338406532e-07, "loss": 0.4182, "step": 10504 }, { "epoch": 0.5075614823404359, "grad_norm": 1.8913640975952148, "learning_rate": 4.924385176595642e-07, "loss": 0.2141, "step": 10505 }, { "epoch": 0.5076097985215249, "grad_norm": 2.836210250854492, "learning_rate": 4.923902014784751e-07, "loss": 0.4225, "step": 10506 }, { "epoch": 0.5076581147026139, "grad_norm": 11.122062683105469, "learning_rate": 4.923418852973861e-07, "loss": 0.313, "step": 10507 }, { "epoch": 0.5077064308837029, "grad_norm": 3.5000290870666504, "learning_rate": 4.922935691162971e-07, "loss": 0.2496, "step": 10508 }, { "epoch": 0.507754747064792, "grad_norm": 2.355863332748413, "learning_rate": 4.92245252935208e-07, "loss": 0.3067, "step": 10509 }, { "epoch": 0.5078030632458811, "grad_norm": 2.4242398738861084, "learning_rate": 4.92196936754119e-07, "loss": 0.2389, "step": 10510 }, { "epoch": 0.5078513794269701, "grad_norm": 13.3141450881958, "learning_rate": 4.921486205730299e-07, "loss": 0.5086, "step": 10511 }, { "epoch": 0.5078996956080591, "grad_norm": 4.342800140380859, "learning_rate": 4.921003043919408e-07, "loss": 0.3083, "step": 10512 }, { "epoch": 0.5079480117891482, "grad_norm": 3.207756757736206, "learning_rate": 4.920519882108518e-07, "loss": 0.3055, "step": 10513 }, { "epoch": 0.5079963279702372, "grad_norm": 16.410999298095703, "learning_rate": 4.920036720297627e-07, "loss": 0.2741, "step": 10514 }, { "epoch": 0.5080446441513263, "grad_norm": 2.2997376918792725, "learning_rate": 4.919553558486737e-07, "loss": 0.2938, "step": 10515 }, { "epoch": 0.5080929603324154, "grad_norm": 6.970754146575928, "learning_rate": 4.919070396675847e-07, "loss": 0.3206, "step": 10516 }, { "epoch": 0.5081412765135044, "grad_norm": 1.9516805410385132, "learning_rate": 4.918587234864956e-07, "loss": 0.1922, "step": 10517 }, { "epoch": 0.5081895926945934, "grad_norm": 3.4659597873687744, "learning_rate": 4.918104073054066e-07, "loss": 0.2014, "step": 10518 }, { "epoch": 0.5082379088756824, "grad_norm": 7.947080612182617, "learning_rate": 4.917620911243175e-07, "loss": 0.3735, "step": 10519 }, { "epoch": 0.5082862250567716, "grad_norm": 4.321717739105225, "learning_rate": 4.917137749432284e-07, "loss": 0.4842, "step": 10520 }, { "epoch": 0.5083345412378606, "grad_norm": 5.634243488311768, "learning_rate": 4.916654587621394e-07, "loss": 0.2214, "step": 10521 }, { "epoch": 0.5083828574189496, "grad_norm": 5.144689559936523, "learning_rate": 4.916171425810503e-07, "loss": 0.2267, "step": 10522 }, { "epoch": 0.5084311736000386, "grad_norm": 4.504950046539307, "learning_rate": 4.915688263999613e-07, "loss": 0.2045, "step": 10523 }, { "epoch": 0.5084794897811277, "grad_norm": 2.5490713119506836, "learning_rate": 4.915205102188723e-07, "loss": 0.2535, "step": 10524 }, { "epoch": 0.5085278059622167, "grad_norm": 10.417165756225586, "learning_rate": 4.914721940377832e-07, "loss": 0.1656, "step": 10525 }, { "epoch": 0.5085761221433058, "grad_norm": 3.6450204849243164, "learning_rate": 4.914238778566942e-07, "loss": 0.2647, "step": 10526 }, { "epoch": 0.5086244383243949, "grad_norm": 2.8955681324005127, "learning_rate": 4.913755616756052e-07, "loss": 0.3112, "step": 10527 }, { "epoch": 0.5086727545054839, "grad_norm": 12.055219650268555, "learning_rate": 4.91327245494516e-07, "loss": 0.2872, "step": 10528 }, { "epoch": 0.5087210706865729, "grad_norm": 2.419865131378174, "learning_rate": 4.91278929313427e-07, "loss": 0.3247, "step": 10529 }, { "epoch": 0.5087693868676619, "grad_norm": 5.341844081878662, "learning_rate": 4.91230613132338e-07, "loss": 0.3122, "step": 10530 }, { "epoch": 0.5088177030487511, "grad_norm": 1.7200958728790283, "learning_rate": 4.911822969512489e-07, "loss": 0.2016, "step": 10531 }, { "epoch": 0.5088660192298401, "grad_norm": 2.710115909576416, "learning_rate": 4.911339807701599e-07, "loss": 0.3072, "step": 10532 }, { "epoch": 0.5089143354109291, "grad_norm": 3.9080893993377686, "learning_rate": 4.910856645890709e-07, "loss": 0.3578, "step": 10533 }, { "epoch": 0.5089626515920181, "grad_norm": 3.24989914894104, "learning_rate": 4.910373484079818e-07, "loss": 0.1808, "step": 10534 }, { "epoch": 0.5090109677731072, "grad_norm": 60.56094741821289, "learning_rate": 4.909890322268928e-07, "loss": 0.2688, "step": 10535 }, { "epoch": 0.5090592839541963, "grad_norm": 3.311102867126465, "learning_rate": 4.909407160458038e-07, "loss": 0.2898, "step": 10536 }, { "epoch": 0.5091076001352853, "grad_norm": 11.114664077758789, "learning_rate": 4.908923998647146e-07, "loss": 0.2569, "step": 10537 }, { "epoch": 0.5091559163163744, "grad_norm": 2.2398183345794678, "learning_rate": 4.908440836836256e-07, "loss": 0.2369, "step": 10538 }, { "epoch": 0.5092042324974634, "grad_norm": 7.445709228515625, "learning_rate": 4.907957675025366e-07, "loss": 0.2478, "step": 10539 }, { "epoch": 0.5092525486785524, "grad_norm": 3.905632972717285, "learning_rate": 4.907474513214475e-07, "loss": 0.3784, "step": 10540 }, { "epoch": 0.5093008648596415, "grad_norm": 3.0289528369903564, "learning_rate": 4.906991351403585e-07, "loss": 0.3518, "step": 10541 }, { "epoch": 0.5093491810407306, "grad_norm": 2.5363621711730957, "learning_rate": 4.906508189592695e-07, "loss": 0.3295, "step": 10542 }, { "epoch": 0.5093974972218196, "grad_norm": 7.143273830413818, "learning_rate": 4.906025027781804e-07, "loss": 0.4222, "step": 10543 }, { "epoch": 0.5094458134029086, "grad_norm": 2.392458915710449, "learning_rate": 4.905541865970914e-07, "loss": 0.3491, "step": 10544 }, { "epoch": 0.5094941295839976, "grad_norm": 2.428668260574341, "learning_rate": 4.905058704160022e-07, "loss": 0.2351, "step": 10545 }, { "epoch": 0.5095424457650868, "grad_norm": 2.273669958114624, "learning_rate": 4.904575542349132e-07, "loss": 0.2469, "step": 10546 }, { "epoch": 0.5095907619461758, "grad_norm": 3.76249098777771, "learning_rate": 4.904092380538242e-07, "loss": 0.2863, "step": 10547 }, { "epoch": 0.5096390781272648, "grad_norm": 9.37597942352295, "learning_rate": 4.903609218727351e-07, "loss": 0.3504, "step": 10548 }, { "epoch": 0.5096873943083539, "grad_norm": 12.308100700378418, "learning_rate": 4.903126056916461e-07, "loss": 0.2609, "step": 10549 }, { "epoch": 0.5097357104894429, "grad_norm": 139.2427978515625, "learning_rate": 4.90264289510557e-07, "loss": 0.2638, "step": 10550 }, { "epoch": 0.5097840266705319, "grad_norm": 2.810441255569458, "learning_rate": 4.90215973329468e-07, "loss": 0.313, "step": 10551 }, { "epoch": 0.509832342851621, "grad_norm": 3.1797831058502197, "learning_rate": 4.90167657148379e-07, "loss": 0.3097, "step": 10552 }, { "epoch": 0.5098806590327101, "grad_norm": 7.12611722946167, "learning_rate": 4.9011934096729e-07, "loss": 0.2044, "step": 10553 }, { "epoch": 0.5099289752137991, "grad_norm": 3.8298909664154053, "learning_rate": 4.900710247862008e-07, "loss": 0.2988, "step": 10554 }, { "epoch": 0.5099772913948881, "grad_norm": 2.2263758182525635, "learning_rate": 4.900227086051118e-07, "loss": 0.2253, "step": 10555 }, { "epoch": 0.5100256075759771, "grad_norm": 2.3937642574310303, "learning_rate": 4.899743924240228e-07, "loss": 0.3208, "step": 10556 }, { "epoch": 0.5100739237570663, "grad_norm": 2.6327595710754395, "learning_rate": 4.899260762429337e-07, "loss": 0.324, "step": 10557 }, { "epoch": 0.5101222399381553, "grad_norm": 2.533639430999756, "learning_rate": 4.898777600618447e-07, "loss": 0.2998, "step": 10558 }, { "epoch": 0.5101705561192443, "grad_norm": 3.390200614929199, "learning_rate": 4.898294438807557e-07, "loss": 0.2547, "step": 10559 }, { "epoch": 0.5102188723003334, "grad_norm": 2.4305429458618164, "learning_rate": 4.897811276996666e-07, "loss": 0.3245, "step": 10560 }, { "epoch": 0.5102671884814224, "grad_norm": 2.277151346206665, "learning_rate": 4.897328115185776e-07, "loss": 0.312, "step": 10561 }, { "epoch": 0.5103155046625115, "grad_norm": 22.895294189453125, "learning_rate": 4.896844953374886e-07, "loss": 0.2277, "step": 10562 }, { "epoch": 0.5103638208436005, "grad_norm": 2.3508856296539307, "learning_rate": 4.896361791563994e-07, "loss": 0.1821, "step": 10563 }, { "epoch": 0.5104121370246896, "grad_norm": 4.408300399780273, "learning_rate": 4.895878629753104e-07, "loss": 0.2317, "step": 10564 }, { "epoch": 0.5104604532057786, "grad_norm": 2.5343666076660156, "learning_rate": 4.895395467942214e-07, "loss": 0.2193, "step": 10565 }, { "epoch": 0.5105087693868676, "grad_norm": 2.3636679649353027, "learning_rate": 4.894912306131323e-07, "loss": 0.2978, "step": 10566 }, { "epoch": 0.5105570855679568, "grad_norm": 3.4233877658843994, "learning_rate": 4.894429144320433e-07, "loss": 0.2585, "step": 10567 }, { "epoch": 0.5106054017490458, "grad_norm": 2.145057201385498, "learning_rate": 4.893945982509543e-07, "loss": 0.2025, "step": 10568 }, { "epoch": 0.5106537179301348, "grad_norm": 3.474804162979126, "learning_rate": 4.893462820698652e-07, "loss": 0.3603, "step": 10569 }, { "epoch": 0.5107020341112238, "grad_norm": 3.4079339504241943, "learning_rate": 4.892979658887762e-07, "loss": 0.332, "step": 10570 }, { "epoch": 0.5107503502923129, "grad_norm": 2.1686513423919678, "learning_rate": 4.89249649707687e-07, "loss": 0.2037, "step": 10571 }, { "epoch": 0.510798666473402, "grad_norm": 1.7216417789459229, "learning_rate": 4.89201333526598e-07, "loss": 0.2117, "step": 10572 }, { "epoch": 0.510846982654491, "grad_norm": 4.2570085525512695, "learning_rate": 4.89153017345509e-07, "loss": 0.392, "step": 10573 }, { "epoch": 0.51089529883558, "grad_norm": 3.271740674972534, "learning_rate": 4.891047011644199e-07, "loss": 0.299, "step": 10574 }, { "epoch": 0.5109436150166691, "grad_norm": 2.6362476348876953, "learning_rate": 4.890563849833309e-07, "loss": 0.2676, "step": 10575 }, { "epoch": 0.5109919311977581, "grad_norm": 3.336864948272705, "learning_rate": 4.890080688022418e-07, "loss": 0.4063, "step": 10576 }, { "epoch": 0.5110402473788471, "grad_norm": 2.3371500968933105, "learning_rate": 4.889597526211528e-07, "loss": 0.2531, "step": 10577 }, { "epoch": 0.5110885635599363, "grad_norm": 2.4121131896972656, "learning_rate": 4.889114364400638e-07, "loss": 0.4311, "step": 10578 }, { "epoch": 0.5111368797410253, "grad_norm": 1.4784026145935059, "learning_rate": 4.888631202589746e-07, "loss": 0.1207, "step": 10579 }, { "epoch": 0.5111851959221143, "grad_norm": 2.6846072673797607, "learning_rate": 4.888148040778856e-07, "loss": 0.2869, "step": 10580 }, { "epoch": 0.5112335121032033, "grad_norm": 1.951309084892273, "learning_rate": 4.887664878967966e-07, "loss": 0.257, "step": 10581 }, { "epoch": 0.5112818282842924, "grad_norm": 10.023639678955078, "learning_rate": 4.887181717157075e-07, "loss": 0.2831, "step": 10582 }, { "epoch": 0.5113301444653815, "grad_norm": 2.9416797161102295, "learning_rate": 4.886698555346185e-07, "loss": 0.2292, "step": 10583 }, { "epoch": 0.5113784606464705, "grad_norm": 2.8772225379943848, "learning_rate": 4.886215393535295e-07, "loss": 0.2764, "step": 10584 }, { "epoch": 0.5114267768275595, "grad_norm": 2.4247355461120605, "learning_rate": 4.885732231724405e-07, "loss": 0.342, "step": 10585 }, { "epoch": 0.5114750930086486, "grad_norm": 2.3458378314971924, "learning_rate": 4.885249069913514e-07, "loss": 0.2872, "step": 10586 }, { "epoch": 0.5115234091897376, "grad_norm": 2.6942522525787354, "learning_rate": 4.884765908102624e-07, "loss": 0.3401, "step": 10587 }, { "epoch": 0.5115717253708267, "grad_norm": 3.190376043319702, "learning_rate": 4.884282746291733e-07, "loss": 0.3739, "step": 10588 }, { "epoch": 0.5116200415519158, "grad_norm": 2.597261667251587, "learning_rate": 4.883799584480842e-07, "loss": 0.3305, "step": 10589 }, { "epoch": 0.5116683577330048, "grad_norm": 3.953756093978882, "learning_rate": 4.883316422669952e-07, "loss": 0.307, "step": 10590 }, { "epoch": 0.5117166739140938, "grad_norm": 2.9728446006774902, "learning_rate": 4.882833260859062e-07, "loss": 0.1572, "step": 10591 }, { "epoch": 0.5117649900951828, "grad_norm": 1.7743085622787476, "learning_rate": 4.882350099048171e-07, "loss": 0.2228, "step": 10592 }, { "epoch": 0.511813306276272, "grad_norm": 3.767841100692749, "learning_rate": 4.881866937237281e-07, "loss": 0.368, "step": 10593 }, { "epoch": 0.511861622457361, "grad_norm": 3.250966787338257, "learning_rate": 4.881383775426391e-07, "loss": 0.4048, "step": 10594 }, { "epoch": 0.51190993863845, "grad_norm": 1.925167441368103, "learning_rate": 4.8809006136155e-07, "loss": 0.1374, "step": 10595 }, { "epoch": 0.511958254819539, "grad_norm": 3.0103676319122314, "learning_rate": 4.88041745180461e-07, "loss": 0.4106, "step": 10596 }, { "epoch": 0.5120065710006281, "grad_norm": 1.7729003429412842, "learning_rate": 4.879934289993718e-07, "loss": 0.1713, "step": 10597 }, { "epoch": 0.5120548871817172, "grad_norm": 2.388117790222168, "learning_rate": 4.879451128182828e-07, "loss": 0.229, "step": 10598 }, { "epoch": 0.5121032033628062, "grad_norm": 2.7314364910125732, "learning_rate": 4.878967966371938e-07, "loss": 0.2941, "step": 10599 }, { "epoch": 0.5121515195438953, "grad_norm": 5.6153483390808105, "learning_rate": 4.878484804561047e-07, "loss": 0.379, "step": 10600 }, { "epoch": 0.5121998357249843, "grad_norm": 3.060906171798706, "learning_rate": 4.878001642750157e-07, "loss": 0.3188, "step": 10601 }, { "epoch": 0.5122481519060733, "grad_norm": 1.7432153224945068, "learning_rate": 4.877518480939266e-07, "loss": 0.1856, "step": 10602 }, { "epoch": 0.5122964680871623, "grad_norm": 1.82500422000885, "learning_rate": 4.877035319128376e-07, "loss": 0.2102, "step": 10603 }, { "epoch": 0.5123447842682515, "grad_norm": 5.139692783355713, "learning_rate": 4.876552157317486e-07, "loss": 0.478, "step": 10604 }, { "epoch": 0.5123931004493405, "grad_norm": 2.0500271320343018, "learning_rate": 4.876068995506594e-07, "loss": 0.2661, "step": 10605 }, { "epoch": 0.5124414166304295, "grad_norm": 2.2734053134918213, "learning_rate": 4.875585833695704e-07, "loss": 0.2641, "step": 10606 }, { "epoch": 0.5124897328115186, "grad_norm": 2.5225937366485596, "learning_rate": 4.875102671884814e-07, "loss": 0.2906, "step": 10607 }, { "epoch": 0.5125380489926076, "grad_norm": 3.1997318267822266, "learning_rate": 4.874619510073923e-07, "loss": 0.3717, "step": 10608 }, { "epoch": 0.5125863651736967, "grad_norm": 2.589404344558716, "learning_rate": 4.874136348263033e-07, "loss": 0.3249, "step": 10609 }, { "epoch": 0.5126346813547857, "grad_norm": 2.244781732559204, "learning_rate": 4.873653186452143e-07, "loss": 0.225, "step": 10610 }, { "epoch": 0.5126829975358748, "grad_norm": 2.30422043800354, "learning_rate": 4.873170024641252e-07, "loss": 0.2399, "step": 10611 }, { "epoch": 0.5127313137169638, "grad_norm": 2.3451926708221436, "learning_rate": 4.872686862830362e-07, "loss": 0.2903, "step": 10612 }, { "epoch": 0.5127796298980528, "grad_norm": 2.5056357383728027, "learning_rate": 4.872203701019471e-07, "loss": 0.3014, "step": 10613 }, { "epoch": 0.512827946079142, "grad_norm": 4.428685188293457, "learning_rate": 4.87172053920858e-07, "loss": 0.3294, "step": 10614 }, { "epoch": 0.512876262260231, "grad_norm": 1.9651474952697754, "learning_rate": 4.87123737739769e-07, "loss": 0.1923, "step": 10615 }, { "epoch": 0.51292457844132, "grad_norm": 3.794783115386963, "learning_rate": 4.8707542155868e-07, "loss": 0.4955, "step": 10616 }, { "epoch": 0.512972894622409, "grad_norm": 2.5306811332702637, "learning_rate": 4.87027105377591e-07, "loss": 0.3674, "step": 10617 }, { "epoch": 0.513021210803498, "grad_norm": 2.99389386177063, "learning_rate": 4.869787891965019e-07, "loss": 0.2072, "step": 10618 }, { "epoch": 0.5130695269845872, "grad_norm": 2.3212082386016846, "learning_rate": 4.869304730154129e-07, "loss": 0.3125, "step": 10619 }, { "epoch": 0.5131178431656762, "grad_norm": 2.4965553283691406, "learning_rate": 4.868821568343239e-07, "loss": 0.2828, "step": 10620 }, { "epoch": 0.5131661593467652, "grad_norm": 2.4182276725769043, "learning_rate": 4.868338406532348e-07, "loss": 0.2658, "step": 10621 }, { "epoch": 0.5132144755278543, "grad_norm": 3.4628992080688477, "learning_rate": 4.867855244721457e-07, "loss": 0.3133, "step": 10622 }, { "epoch": 0.5132627917089433, "grad_norm": 2.8414740562438965, "learning_rate": 4.867372082910566e-07, "loss": 0.2269, "step": 10623 }, { "epoch": 0.5133111078900324, "grad_norm": 2.4171361923217773, "learning_rate": 4.866888921099676e-07, "loss": 0.2433, "step": 10624 }, { "epoch": 0.5133594240711215, "grad_norm": 5.426791191101074, "learning_rate": 4.866405759288786e-07, "loss": 0.2318, "step": 10625 }, { "epoch": 0.5134077402522105, "grad_norm": 8.506331443786621, "learning_rate": 4.865922597477895e-07, "loss": 0.2963, "step": 10626 }, { "epoch": 0.5134560564332995, "grad_norm": 2.7556726932525635, "learning_rate": 4.865439435667005e-07, "loss": 0.3735, "step": 10627 }, { "epoch": 0.5135043726143885, "grad_norm": 3.2701492309570312, "learning_rate": 4.864956273856114e-07, "loss": 0.3315, "step": 10628 }, { "epoch": 0.5135526887954776, "grad_norm": 4.4981465339660645, "learning_rate": 4.864473112045224e-07, "loss": 0.2868, "step": 10629 }, { "epoch": 0.5136010049765667, "grad_norm": 2.533294677734375, "learning_rate": 4.863989950234333e-07, "loss": 0.2947, "step": 10630 }, { "epoch": 0.5136493211576557, "grad_norm": 4.104548454284668, "learning_rate": 4.863506788423442e-07, "loss": 0.2442, "step": 10631 }, { "epoch": 0.5136976373387447, "grad_norm": 4.036331653594971, "learning_rate": 4.863023626612552e-07, "loss": 0.2246, "step": 10632 }, { "epoch": 0.5137459535198338, "grad_norm": 2.3024916648864746, "learning_rate": 4.862540464801661e-07, "loss": 0.2786, "step": 10633 }, { "epoch": 0.5137942697009228, "grad_norm": 3.9063470363616943, "learning_rate": 4.862057302990771e-07, "loss": 0.538, "step": 10634 }, { "epoch": 0.5138425858820119, "grad_norm": 3.017622232437134, "learning_rate": 4.861574141179881e-07, "loss": 0.3382, "step": 10635 }, { "epoch": 0.513890902063101, "grad_norm": 2.2836341857910156, "learning_rate": 4.861090979368991e-07, "loss": 0.2267, "step": 10636 }, { "epoch": 0.51393921824419, "grad_norm": 3.046363592147827, "learning_rate": 4.8606078175581e-07, "loss": 0.292, "step": 10637 }, { "epoch": 0.513987534425279, "grad_norm": 2.8146746158599854, "learning_rate": 4.86012465574721e-07, "loss": 0.4034, "step": 10638 }, { "epoch": 0.514035850606368, "grad_norm": 1.9683842658996582, "learning_rate": 4.859641493936319e-07, "loss": 0.188, "step": 10639 }, { "epoch": 0.5140841667874572, "grad_norm": 2.3424770832061768, "learning_rate": 4.859158332125428e-07, "loss": 0.1738, "step": 10640 }, { "epoch": 0.5141324829685462, "grad_norm": 3.380471706390381, "learning_rate": 4.858675170314538e-07, "loss": 0.366, "step": 10641 }, { "epoch": 0.5141807991496352, "grad_norm": 1.9086397886276245, "learning_rate": 4.858192008503648e-07, "loss": 0.2286, "step": 10642 }, { "epoch": 0.5142291153307242, "grad_norm": 1.8683555126190186, "learning_rate": 4.857708846692757e-07, "loss": 0.1929, "step": 10643 }, { "epoch": 0.5142774315118133, "grad_norm": 3.743669033050537, "learning_rate": 4.857225684881867e-07, "loss": 0.3253, "step": 10644 }, { "epoch": 0.5143257476929024, "grad_norm": 2.9944021701812744, "learning_rate": 4.856742523070977e-07, "loss": 0.3109, "step": 10645 }, { "epoch": 0.5143740638739914, "grad_norm": 3.1932740211486816, "learning_rate": 4.856259361260086e-07, "loss": 0.3219, "step": 10646 }, { "epoch": 0.5144223800550805, "grad_norm": 3.090620279312134, "learning_rate": 4.855776199449195e-07, "loss": 0.3644, "step": 10647 }, { "epoch": 0.5144706962361695, "grad_norm": 3.8217294216156006, "learning_rate": 4.855293037638305e-07, "loss": 0.3078, "step": 10648 }, { "epoch": 0.5145190124172585, "grad_norm": 2.7088770866394043, "learning_rate": 4.854809875827414e-07, "loss": 0.2524, "step": 10649 }, { "epoch": 0.5145673285983476, "grad_norm": 3.3705031871795654, "learning_rate": 4.854326714016524e-07, "loss": 0.2918, "step": 10650 }, { "epoch": 0.5146156447794367, "grad_norm": 2.4472367763519287, "learning_rate": 4.853843552205634e-07, "loss": 0.1975, "step": 10651 }, { "epoch": 0.5146639609605257, "grad_norm": 10.170443534851074, "learning_rate": 4.853360390394743e-07, "loss": 0.3247, "step": 10652 }, { "epoch": 0.5147122771416147, "grad_norm": 2.3889312744140625, "learning_rate": 4.852877228583853e-07, "loss": 0.2975, "step": 10653 }, { "epoch": 0.5147605933227037, "grad_norm": 2.232485055923462, "learning_rate": 4.852394066772962e-07, "loss": 0.2764, "step": 10654 }, { "epoch": 0.5148089095037928, "grad_norm": 2.8535094261169434, "learning_rate": 4.851910904962071e-07, "loss": 0.3415, "step": 10655 }, { "epoch": 0.5148572256848819, "grad_norm": 2.687073230743408, "learning_rate": 4.851427743151181e-07, "loss": 0.2227, "step": 10656 }, { "epoch": 0.5149055418659709, "grad_norm": 1.569331407546997, "learning_rate": 4.85094458134029e-07, "loss": 0.179, "step": 10657 }, { "epoch": 0.51495385804706, "grad_norm": 2.1346275806427, "learning_rate": 4.8504614195294e-07, "loss": 0.224, "step": 10658 }, { "epoch": 0.515002174228149, "grad_norm": 7.731612205505371, "learning_rate": 4.849978257718509e-07, "loss": 0.2784, "step": 10659 }, { "epoch": 0.515050490409238, "grad_norm": 2.288595676422119, "learning_rate": 4.849495095907619e-07, "loss": 0.2605, "step": 10660 }, { "epoch": 0.5150988065903271, "grad_norm": 3.9012718200683594, "learning_rate": 4.849011934096729e-07, "loss": 0.2925, "step": 10661 }, { "epoch": 0.5151471227714162, "grad_norm": 2.2001307010650635, "learning_rate": 4.848528772285838e-07, "loss": 0.2983, "step": 10662 }, { "epoch": 0.5151954389525052, "grad_norm": 1.982942819595337, "learning_rate": 4.848045610474948e-07, "loss": 0.2491, "step": 10663 }, { "epoch": 0.5152437551335942, "grad_norm": 2.807565450668335, "learning_rate": 4.847562448664057e-07, "loss": 0.1594, "step": 10664 }, { "epoch": 0.5152920713146832, "grad_norm": 2.042677640914917, "learning_rate": 4.847079286853166e-07, "loss": 0.2297, "step": 10665 }, { "epoch": 0.5153403874957724, "grad_norm": 4.373459339141846, "learning_rate": 4.846596125042276e-07, "loss": 0.2706, "step": 10666 }, { "epoch": 0.5153887036768614, "grad_norm": 1.6327629089355469, "learning_rate": 4.846112963231386e-07, "loss": 0.2045, "step": 10667 }, { "epoch": 0.5154370198579504, "grad_norm": 3.0704410076141357, "learning_rate": 4.845629801420496e-07, "loss": 0.3095, "step": 10668 }, { "epoch": 0.5154853360390395, "grad_norm": 3.7711167335510254, "learning_rate": 4.845146639609605e-07, "loss": 0.3349, "step": 10669 }, { "epoch": 0.5155336522201285, "grad_norm": 2.323861837387085, "learning_rate": 4.844663477798715e-07, "loss": 0.2923, "step": 10670 }, { "epoch": 0.5155819684012176, "grad_norm": 2.270704507827759, "learning_rate": 4.844180315987825e-07, "loss": 0.2136, "step": 10671 }, { "epoch": 0.5156302845823066, "grad_norm": 5.239778518676758, "learning_rate": 4.843697154176933e-07, "loss": 0.4092, "step": 10672 }, { "epoch": 0.5156786007633957, "grad_norm": 2.7994842529296875, "learning_rate": 4.843213992366043e-07, "loss": 0.2558, "step": 10673 }, { "epoch": 0.5157269169444847, "grad_norm": 4.170881748199463, "learning_rate": 4.842730830555153e-07, "loss": 0.2997, "step": 10674 }, { "epoch": 0.5157752331255737, "grad_norm": 4.276281833648682, "learning_rate": 4.842247668744262e-07, "loss": 0.3609, "step": 10675 }, { "epoch": 0.5158235493066629, "grad_norm": 10.43010425567627, "learning_rate": 4.841764506933372e-07, "loss": 0.3943, "step": 10676 }, { "epoch": 0.5158718654877519, "grad_norm": 3.69655704498291, "learning_rate": 4.841281345122482e-07, "loss": 0.322, "step": 10677 }, { "epoch": 0.5159201816688409, "grad_norm": 2.4672064781188965, "learning_rate": 4.840798183311591e-07, "loss": 0.2751, "step": 10678 }, { "epoch": 0.5159684978499299, "grad_norm": 2.8925223350524902, "learning_rate": 4.840315021500701e-07, "loss": 0.3505, "step": 10679 }, { "epoch": 0.516016814031019, "grad_norm": 2.4386043548583984, "learning_rate": 4.83983185968981e-07, "loss": 0.3417, "step": 10680 }, { "epoch": 0.516065130212108, "grad_norm": 2.4170498847961426, "learning_rate": 4.839348697878919e-07, "loss": 0.2156, "step": 10681 }, { "epoch": 0.5161134463931971, "grad_norm": 2.5701382160186768, "learning_rate": 4.838865536068029e-07, "loss": 0.3018, "step": 10682 }, { "epoch": 0.5161617625742861, "grad_norm": 2.4255051612854004, "learning_rate": 4.838382374257138e-07, "loss": 0.3042, "step": 10683 }, { "epoch": 0.5162100787553752, "grad_norm": 2.2531604766845703, "learning_rate": 4.837899212446248e-07, "loss": 0.2233, "step": 10684 }, { "epoch": 0.5162583949364642, "grad_norm": 10.31801700592041, "learning_rate": 4.837416050635357e-07, "loss": 0.2909, "step": 10685 }, { "epoch": 0.5163067111175532, "grad_norm": 2.1937296390533447, "learning_rate": 4.836932888824467e-07, "loss": 0.2336, "step": 10686 }, { "epoch": 0.5163550272986424, "grad_norm": 3.6052024364471436, "learning_rate": 4.836449727013577e-07, "loss": 0.3729, "step": 10687 }, { "epoch": 0.5164033434797314, "grad_norm": 3.2436702251434326, "learning_rate": 4.835966565202686e-07, "loss": 0.3373, "step": 10688 }, { "epoch": 0.5164516596608204, "grad_norm": 3.017594814300537, "learning_rate": 4.835483403391795e-07, "loss": 0.3391, "step": 10689 }, { "epoch": 0.5164999758419094, "grad_norm": 2.25616192817688, "learning_rate": 4.835000241580905e-07, "loss": 0.2654, "step": 10690 }, { "epoch": 0.5165482920229985, "grad_norm": 2.9660136699676514, "learning_rate": 4.834517079770014e-07, "loss": 0.4227, "step": 10691 }, { "epoch": 0.5165966082040876, "grad_norm": 2.4317126274108887, "learning_rate": 4.834033917959124e-07, "loss": 0.2593, "step": 10692 }, { "epoch": 0.5166449243851766, "grad_norm": 2.723601818084717, "learning_rate": 4.833550756148234e-07, "loss": 0.3009, "step": 10693 }, { "epoch": 0.5166932405662656, "grad_norm": 2.828218936920166, "learning_rate": 4.833067594337343e-07, "loss": 0.2804, "step": 10694 }, { "epoch": 0.5167415567473547, "grad_norm": 2.4831528663635254, "learning_rate": 4.832584432526453e-07, "loss": 0.275, "step": 10695 }, { "epoch": 0.5167898729284437, "grad_norm": 3.724290370941162, "learning_rate": 4.832101270715563e-07, "loss": 0.3071, "step": 10696 }, { "epoch": 0.5168381891095328, "grad_norm": 3.0057437419891357, "learning_rate": 4.831618108904672e-07, "loss": 0.3296, "step": 10697 }, { "epoch": 0.5168865052906219, "grad_norm": 2.5462534427642822, "learning_rate": 4.831134947093781e-07, "loss": 0.2943, "step": 10698 }, { "epoch": 0.5169348214717109, "grad_norm": 4.205527305603027, "learning_rate": 4.830651785282891e-07, "loss": 0.2177, "step": 10699 }, { "epoch": 0.5169831376527999, "grad_norm": 2.217625856399536, "learning_rate": 4.830168623472001e-07, "loss": 0.2137, "step": 10700 }, { "epoch": 0.5170314538338889, "grad_norm": 2.412882089614868, "learning_rate": 4.82968546166111e-07, "loss": 0.3563, "step": 10701 }, { "epoch": 0.5170797700149781, "grad_norm": 2.152172327041626, "learning_rate": 4.82920229985022e-07, "loss": 0.242, "step": 10702 }, { "epoch": 0.5171280861960671, "grad_norm": 2.0965921878814697, "learning_rate": 4.82871913803933e-07, "loss": 0.2334, "step": 10703 }, { "epoch": 0.5171764023771561, "grad_norm": 2.6621017456054688, "learning_rate": 4.828235976228439e-07, "loss": 0.3271, "step": 10704 }, { "epoch": 0.5172247185582451, "grad_norm": 5.287365436553955, "learning_rate": 4.827752814417549e-07, "loss": 0.272, "step": 10705 }, { "epoch": 0.5172730347393342, "grad_norm": 10.456136703491211, "learning_rate": 4.827269652606657e-07, "loss": 0.3095, "step": 10706 }, { "epoch": 0.5173213509204232, "grad_norm": 2.219301700592041, "learning_rate": 4.826786490795767e-07, "loss": 0.2967, "step": 10707 }, { "epoch": 0.5173696671015123, "grad_norm": 2.654919385910034, "learning_rate": 4.826303328984877e-07, "loss": 0.2516, "step": 10708 }, { "epoch": 0.5174179832826014, "grad_norm": 4.311221599578857, "learning_rate": 4.825820167173986e-07, "loss": 0.3198, "step": 10709 }, { "epoch": 0.5174662994636904, "grad_norm": 10.900077819824219, "learning_rate": 4.825337005363096e-07, "loss": 0.3301, "step": 10710 }, { "epoch": 0.5175146156447794, "grad_norm": 4.849620819091797, "learning_rate": 4.824853843552205e-07, "loss": 0.2976, "step": 10711 }, { "epoch": 0.5175629318258684, "grad_norm": 1.4680049419403076, "learning_rate": 4.824370681741315e-07, "loss": 0.1452, "step": 10712 }, { "epoch": 0.5176112480069576, "grad_norm": 1.6961991786956787, "learning_rate": 4.823887519930425e-07, "loss": 0.1744, "step": 10713 }, { "epoch": 0.5176595641880466, "grad_norm": 3.049379825592041, "learning_rate": 4.823404358119533e-07, "loss": 0.2296, "step": 10714 }, { "epoch": 0.5177078803691356, "grad_norm": 2.733752489089966, "learning_rate": 4.822921196308643e-07, "loss": 0.3316, "step": 10715 }, { "epoch": 0.5177561965502246, "grad_norm": 2.295238494873047, "learning_rate": 4.822438034497753e-07, "loss": 0.2892, "step": 10716 }, { "epoch": 0.5178045127313137, "grad_norm": 2.579054355621338, "learning_rate": 4.821954872686862e-07, "loss": 0.2517, "step": 10717 }, { "epoch": 0.5178528289124028, "grad_norm": 2.54237699508667, "learning_rate": 4.821471710875972e-07, "loss": 0.2697, "step": 10718 }, { "epoch": 0.5179011450934918, "grad_norm": 2.669734239578247, "learning_rate": 4.820988549065082e-07, "loss": 0.2768, "step": 10719 }, { "epoch": 0.5179494612745809, "grad_norm": 1.468869924545288, "learning_rate": 4.820505387254191e-07, "loss": 0.173, "step": 10720 }, { "epoch": 0.5179977774556699, "grad_norm": 1.4238096475601196, "learning_rate": 4.820022225443301e-07, "loss": 0.1313, "step": 10721 }, { "epoch": 0.5180460936367589, "grad_norm": 2.1074090003967285, "learning_rate": 4.819539063632411e-07, "loss": 0.1831, "step": 10722 }, { "epoch": 0.518094409817848, "grad_norm": 2.7328381538391113, "learning_rate": 4.819055901821519e-07, "loss": 0.3789, "step": 10723 }, { "epoch": 0.5181427259989371, "grad_norm": 5.096104145050049, "learning_rate": 4.818572740010629e-07, "loss": 0.3028, "step": 10724 }, { "epoch": 0.5181910421800261, "grad_norm": 3.000915288925171, "learning_rate": 4.818089578199739e-07, "loss": 0.4352, "step": 10725 }, { "epoch": 0.5182393583611151, "grad_norm": 3.1375365257263184, "learning_rate": 4.817606416388848e-07, "loss": 0.4186, "step": 10726 }, { "epoch": 0.5182876745422041, "grad_norm": 5.806797504425049, "learning_rate": 4.817123254577958e-07, "loss": 0.3181, "step": 10727 }, { "epoch": 0.5183359907232933, "grad_norm": 3.4003446102142334, "learning_rate": 4.816640092767068e-07, "loss": 0.3923, "step": 10728 }, { "epoch": 0.5183843069043823, "grad_norm": 2.825547456741333, "learning_rate": 4.816156930956178e-07, "loss": 0.3169, "step": 10729 }, { "epoch": 0.5184326230854713, "grad_norm": 3.1289303302764893, "learning_rate": 4.815673769145287e-07, "loss": 0.212, "step": 10730 }, { "epoch": 0.5184809392665604, "grad_norm": 1.7701926231384277, "learning_rate": 4.815190607334397e-07, "loss": 0.1586, "step": 10731 }, { "epoch": 0.5185292554476494, "grad_norm": 2.46360182762146, "learning_rate": 4.814707445523505e-07, "loss": 0.278, "step": 10732 }, { "epoch": 0.5185775716287384, "grad_norm": 2.1317851543426514, "learning_rate": 4.814224283712615e-07, "loss": 0.2446, "step": 10733 }, { "epoch": 0.5186258878098275, "grad_norm": 3.0337448120117188, "learning_rate": 4.813741121901725e-07, "loss": 0.2987, "step": 10734 }, { "epoch": 0.5186742039909166, "grad_norm": 2.38763689994812, "learning_rate": 4.813257960090834e-07, "loss": 0.3457, "step": 10735 }, { "epoch": 0.5187225201720056, "grad_norm": 13.399353981018066, "learning_rate": 4.812774798279944e-07, "loss": 0.3351, "step": 10736 }, { "epoch": 0.5187708363530946, "grad_norm": 2.1914937496185303, "learning_rate": 4.812291636469053e-07, "loss": 0.1501, "step": 10737 }, { "epoch": 0.5188191525341836, "grad_norm": 2.1019201278686523, "learning_rate": 4.811808474658163e-07, "loss": 0.2148, "step": 10738 }, { "epoch": 0.5188674687152728, "grad_norm": 3.7546515464782715, "learning_rate": 4.811325312847273e-07, "loss": 0.3606, "step": 10739 }, { "epoch": 0.5189157848963618, "grad_norm": 2.5345962047576904, "learning_rate": 4.810842151036381e-07, "loss": 0.3356, "step": 10740 }, { "epoch": 0.5189641010774508, "grad_norm": 2.380218505859375, "learning_rate": 4.810358989225491e-07, "loss": 0.2095, "step": 10741 }, { "epoch": 0.5190124172585399, "grad_norm": 2.2191460132598877, "learning_rate": 4.809875827414601e-07, "loss": 0.2845, "step": 10742 }, { "epoch": 0.5190607334396289, "grad_norm": 3.3690028190612793, "learning_rate": 4.80939266560371e-07, "loss": 0.3214, "step": 10743 }, { "epoch": 0.519109049620718, "grad_norm": 2.222508668899536, "learning_rate": 4.80890950379282e-07, "loss": 0.3152, "step": 10744 }, { "epoch": 0.519157365801807, "grad_norm": 2.887862205505371, "learning_rate": 4.808426341981929e-07, "loss": 0.4174, "step": 10745 }, { "epoch": 0.5192056819828961, "grad_norm": 5.222890377044678, "learning_rate": 4.807943180171039e-07, "loss": 0.2745, "step": 10746 }, { "epoch": 0.5192539981639851, "grad_norm": 3.823538064956665, "learning_rate": 4.807460018360149e-07, "loss": 0.3438, "step": 10747 }, { "epoch": 0.5193023143450741, "grad_norm": 4.995550632476807, "learning_rate": 4.806976856549259e-07, "loss": 0.4032, "step": 10748 }, { "epoch": 0.5193506305261633, "grad_norm": 3.1352224349975586, "learning_rate": 4.806493694738367e-07, "loss": 0.3324, "step": 10749 }, { "epoch": 0.5193989467072523, "grad_norm": 1.8783107995986938, "learning_rate": 4.806010532927477e-07, "loss": 0.2244, "step": 10750 }, { "epoch": 0.5194472628883413, "grad_norm": 1.6266423463821411, "learning_rate": 4.805527371116587e-07, "loss": 0.2005, "step": 10751 }, { "epoch": 0.5194955790694303, "grad_norm": 2.494476556777954, "learning_rate": 4.805044209305696e-07, "loss": 0.3033, "step": 10752 }, { "epoch": 0.5195438952505194, "grad_norm": 2.760141372680664, "learning_rate": 4.804561047494806e-07, "loss": 0.4382, "step": 10753 }, { "epoch": 0.5195922114316085, "grad_norm": 2.2252886295318604, "learning_rate": 4.804077885683916e-07, "loss": 0.2193, "step": 10754 }, { "epoch": 0.5196405276126975, "grad_norm": 2.9320647716522217, "learning_rate": 4.803594723873025e-07, "loss": 0.3183, "step": 10755 }, { "epoch": 0.5196888437937865, "grad_norm": 2.400479555130005, "learning_rate": 4.803111562062135e-07, "loss": 0.2763, "step": 10756 }, { "epoch": 0.5197371599748756, "grad_norm": 2.090700149536133, "learning_rate": 4.802628400251244e-07, "loss": 0.2728, "step": 10757 }, { "epoch": 0.5197854761559646, "grad_norm": 1.8897309303283691, "learning_rate": 4.802145238440353e-07, "loss": 0.2307, "step": 10758 }, { "epoch": 0.5198337923370536, "grad_norm": 2.339240312576294, "learning_rate": 4.801662076629463e-07, "loss": 0.1955, "step": 10759 }, { "epoch": 0.5198821085181428, "grad_norm": 2.614535093307495, "learning_rate": 4.801178914818573e-07, "loss": 0.3103, "step": 10760 }, { "epoch": 0.5199304246992318, "grad_norm": 2.0978429317474365, "learning_rate": 4.800695753007682e-07, "loss": 0.1967, "step": 10761 }, { "epoch": 0.5199787408803208, "grad_norm": 2.9100735187530518, "learning_rate": 4.800212591196792e-07, "loss": 0.3473, "step": 10762 }, { "epoch": 0.5200270570614098, "grad_norm": 1.8623392581939697, "learning_rate": 4.799729429385901e-07, "loss": 0.2014, "step": 10763 }, { "epoch": 0.5200753732424989, "grad_norm": 9.898460388183594, "learning_rate": 4.799246267575011e-07, "loss": 0.2796, "step": 10764 }, { "epoch": 0.520123689423588, "grad_norm": 3.000783920288086, "learning_rate": 4.79876310576412e-07, "loss": 0.4372, "step": 10765 }, { "epoch": 0.520172005604677, "grad_norm": 3.177283525466919, "learning_rate": 4.798279943953229e-07, "loss": 0.3473, "step": 10766 }, { "epoch": 0.520220321785766, "grad_norm": 2.6713802814483643, "learning_rate": 4.797796782142339e-07, "loss": 0.4044, "step": 10767 }, { "epoch": 0.5202686379668551, "grad_norm": 5.157328128814697, "learning_rate": 4.797313620331449e-07, "loss": 0.4107, "step": 10768 }, { "epoch": 0.5203169541479441, "grad_norm": 2.4836225509643555, "learning_rate": 4.796830458520558e-07, "loss": 0.2654, "step": 10769 }, { "epoch": 0.5203652703290332, "grad_norm": 2.4001307487487793, "learning_rate": 4.796347296709668e-07, "loss": 0.2279, "step": 10770 }, { "epoch": 0.5204135865101223, "grad_norm": 11.904513359069824, "learning_rate": 4.795864134898777e-07, "loss": 0.2644, "step": 10771 }, { "epoch": 0.5204619026912113, "grad_norm": 3.4004650115966797, "learning_rate": 4.795380973087887e-07, "loss": 0.255, "step": 10772 }, { "epoch": 0.5205102188723003, "grad_norm": 2.4443719387054443, "learning_rate": 4.794897811276997e-07, "loss": 0.2453, "step": 10773 }, { "epoch": 0.5205585350533893, "grad_norm": 3.3381235599517822, "learning_rate": 4.794414649466105e-07, "loss": 0.306, "step": 10774 }, { "epoch": 0.5206068512344785, "grad_norm": 2.4743897914886475, "learning_rate": 4.793931487655215e-07, "loss": 0.3682, "step": 10775 }, { "epoch": 0.5206551674155675, "grad_norm": 2.9768311977386475, "learning_rate": 4.793448325844325e-07, "loss": 0.2895, "step": 10776 }, { "epoch": 0.5207034835966565, "grad_norm": 3.6553542613983154, "learning_rate": 4.792965164033434e-07, "loss": 0.3288, "step": 10777 }, { "epoch": 0.5207517997777455, "grad_norm": 2.893434762954712, "learning_rate": 4.792482002222544e-07, "loss": 0.3751, "step": 10778 }, { "epoch": 0.5208001159588346, "grad_norm": 2.2305831909179688, "learning_rate": 4.791998840411654e-07, "loss": 0.229, "step": 10779 }, { "epoch": 0.5208484321399237, "grad_norm": 2.285814046859741, "learning_rate": 4.791515678600764e-07, "loss": 0.2643, "step": 10780 }, { "epoch": 0.5208967483210127, "grad_norm": 1.3733478784561157, "learning_rate": 4.791032516789873e-07, "loss": 0.1735, "step": 10781 }, { "epoch": 0.5209450645021018, "grad_norm": 7.861397743225098, "learning_rate": 4.790549354978982e-07, "loss": 0.2524, "step": 10782 }, { "epoch": 0.5209933806831908, "grad_norm": 2.7999284267425537, "learning_rate": 4.790066193168092e-07, "loss": 0.2793, "step": 10783 }, { "epoch": 0.5210416968642798, "grad_norm": 82.8946533203125, "learning_rate": 4.789583031357201e-07, "loss": 0.4118, "step": 10784 }, { "epoch": 0.521090013045369, "grad_norm": 3.5009796619415283, "learning_rate": 4.789099869546311e-07, "loss": 0.3961, "step": 10785 }, { "epoch": 0.521138329226458, "grad_norm": 2.7317144870758057, "learning_rate": 4.788616707735421e-07, "loss": 0.3366, "step": 10786 }, { "epoch": 0.521186645407547, "grad_norm": 1.8488613367080688, "learning_rate": 4.78813354592453e-07, "loss": 0.1925, "step": 10787 }, { "epoch": 0.521234961588636, "grad_norm": 2.8631174564361572, "learning_rate": 4.78765038411364e-07, "loss": 0.3295, "step": 10788 }, { "epoch": 0.521283277769725, "grad_norm": 2.6772329807281494, "learning_rate": 4.787167222302749e-07, "loss": 0.2947, "step": 10789 }, { "epoch": 0.5213315939508141, "grad_norm": 3.4083967208862305, "learning_rate": 4.786684060491859e-07, "loss": 0.4261, "step": 10790 }, { "epoch": 0.5213799101319032, "grad_norm": 6.1459479331970215, "learning_rate": 4.786200898680968e-07, "loss": 0.4015, "step": 10791 }, { "epoch": 0.5214282263129922, "grad_norm": 1.8628545999526978, "learning_rate": 4.785717736870077e-07, "loss": 0.226, "step": 10792 }, { "epoch": 0.5214765424940813, "grad_norm": 3.4692330360412598, "learning_rate": 4.785234575059187e-07, "loss": 0.3384, "step": 10793 }, { "epoch": 0.5215248586751703, "grad_norm": 6.4114203453063965, "learning_rate": 4.784751413248297e-07, "loss": 0.2982, "step": 10794 }, { "epoch": 0.5215731748562593, "grad_norm": 2.856935501098633, "learning_rate": 4.784268251437406e-07, "loss": 0.2613, "step": 10795 }, { "epoch": 0.5216214910373485, "grad_norm": 2.059616804122925, "learning_rate": 4.783785089626516e-07, "loss": 0.2705, "step": 10796 }, { "epoch": 0.5216698072184375, "grad_norm": 2.694197177886963, "learning_rate": 4.783301927815625e-07, "loss": 0.3234, "step": 10797 }, { "epoch": 0.5217181233995265, "grad_norm": 23.240942001342773, "learning_rate": 4.782818766004735e-07, "loss": 0.2609, "step": 10798 }, { "epoch": 0.5217664395806155, "grad_norm": 4.666946887969971, "learning_rate": 4.782335604193844e-07, "loss": 0.4239, "step": 10799 }, { "epoch": 0.5218147557617046, "grad_norm": 12.285360336303711, "learning_rate": 4.781852442382953e-07, "loss": 0.3332, "step": 10800 }, { "epoch": 0.5218630719427937, "grad_norm": 3.230788469314575, "learning_rate": 4.781369280572063e-07, "loss": 0.1565, "step": 10801 }, { "epoch": 0.5219113881238827, "grad_norm": 10.490377426147461, "learning_rate": 4.780886118761173e-07, "loss": 0.3563, "step": 10802 }, { "epoch": 0.5219597043049717, "grad_norm": 2.7489771842956543, "learning_rate": 4.780402956950282e-07, "loss": 0.3237, "step": 10803 }, { "epoch": 0.5220080204860608, "grad_norm": 3.0173943042755127, "learning_rate": 4.779919795139392e-07, "loss": 0.2899, "step": 10804 }, { "epoch": 0.5220563366671498, "grad_norm": 8.666040420532227, "learning_rate": 4.779436633328502e-07, "loss": 0.2565, "step": 10805 }, { "epoch": 0.5221046528482389, "grad_norm": 3.3313238620758057, "learning_rate": 4.778953471517611e-07, "loss": 0.2718, "step": 10806 }, { "epoch": 0.522152969029328, "grad_norm": 2.560711145401001, "learning_rate": 4.77847030970672e-07, "loss": 0.2899, "step": 10807 }, { "epoch": 0.522201285210417, "grad_norm": 6.217749118804932, "learning_rate": 4.77798714789583e-07, "loss": 0.3984, "step": 10808 }, { "epoch": 0.522249601391506, "grad_norm": 2.5261082649230957, "learning_rate": 4.777503986084939e-07, "loss": 0.3003, "step": 10809 }, { "epoch": 0.522297917572595, "grad_norm": 2.74594783782959, "learning_rate": 4.777020824274049e-07, "loss": 0.2632, "step": 10810 }, { "epoch": 0.5223462337536842, "grad_norm": 2.542041063308716, "learning_rate": 4.776537662463159e-07, "loss": 0.255, "step": 10811 }, { "epoch": 0.5223945499347732, "grad_norm": 4.477573871612549, "learning_rate": 4.776054500652269e-07, "loss": 0.3181, "step": 10812 }, { "epoch": 0.5224428661158622, "grad_norm": 2.465378522872925, "learning_rate": 4.775571338841378e-07, "loss": 0.2713, "step": 10813 }, { "epoch": 0.5224911822969512, "grad_norm": 3.2341766357421875, "learning_rate": 4.775088177030488e-07, "loss": 0.1994, "step": 10814 }, { "epoch": 0.5225394984780403, "grad_norm": 2.7198526859283447, "learning_rate": 4.774605015219597e-07, "loss": 0.2853, "step": 10815 }, { "epoch": 0.5225878146591293, "grad_norm": 2.5931098461151123, "learning_rate": 4.774121853408706e-07, "loss": 0.2787, "step": 10816 }, { "epoch": 0.5226361308402184, "grad_norm": 8.08867073059082, "learning_rate": 4.773638691597816e-07, "loss": 0.2817, "step": 10817 }, { "epoch": 0.5226844470213075, "grad_norm": 2.6791117191314697, "learning_rate": 4.773155529786925e-07, "loss": 0.3217, "step": 10818 }, { "epoch": 0.5227327632023965, "grad_norm": 6.470175266265869, "learning_rate": 4.772672367976035e-07, "loss": 0.2615, "step": 10819 }, { "epoch": 0.5227810793834855, "grad_norm": 2.09201979637146, "learning_rate": 4.772189206165145e-07, "loss": 0.2326, "step": 10820 }, { "epoch": 0.5228293955645745, "grad_norm": 3.3559978008270264, "learning_rate": 4.771706044354254e-07, "loss": 0.2718, "step": 10821 }, { "epoch": 0.5228777117456637, "grad_norm": 2.7160375118255615, "learning_rate": 4.771222882543364e-07, "loss": 0.3558, "step": 10822 }, { "epoch": 0.5229260279267527, "grad_norm": 2.3153319358825684, "learning_rate": 4.770739720732473e-07, "loss": 0.2643, "step": 10823 }, { "epoch": 0.5229743441078417, "grad_norm": 2.494840621948242, "learning_rate": 4.770256558921582e-07, "loss": 0.2598, "step": 10824 }, { "epoch": 0.5230226602889307, "grad_norm": 4.262389659881592, "learning_rate": 4.769773397110692e-07, "loss": 0.3847, "step": 10825 }, { "epoch": 0.5230709764700198, "grad_norm": 3.1130497455596924, "learning_rate": 4.769290235299801e-07, "loss": 0.3321, "step": 10826 }, { "epoch": 0.5231192926511089, "grad_norm": 3.105684280395508, "learning_rate": 4.768807073488911e-07, "loss": 0.2026, "step": 10827 }, { "epoch": 0.5231676088321979, "grad_norm": 1.7840619087219238, "learning_rate": 4.7683239116780205e-07, "loss": 0.1861, "step": 10828 }, { "epoch": 0.523215925013287, "grad_norm": 2.231145143508911, "learning_rate": 4.7678407498671304e-07, "loss": 0.2642, "step": 10829 }, { "epoch": 0.523264241194376, "grad_norm": 2.658473491668701, "learning_rate": 4.76735758805624e-07, "loss": 0.3528, "step": 10830 }, { "epoch": 0.523312557375465, "grad_norm": 2.6247997283935547, "learning_rate": 4.766874426245349e-07, "loss": 0.3786, "step": 10831 }, { "epoch": 0.5233608735565541, "grad_norm": 2.251828193664551, "learning_rate": 4.766391264434459e-07, "loss": 0.263, "step": 10832 }, { "epoch": 0.5234091897376432, "grad_norm": 2.8885514736175537, "learning_rate": 4.7659081026235684e-07, "loss": 0.2747, "step": 10833 }, { "epoch": 0.5234575059187322, "grad_norm": 7.597640514373779, "learning_rate": 4.765424940812678e-07, "loss": 0.3081, "step": 10834 }, { "epoch": 0.5235058220998212, "grad_norm": 3.735470771789551, "learning_rate": 4.7649417790017877e-07, "loss": 0.2469, "step": 10835 }, { "epoch": 0.5235541382809102, "grad_norm": 2.2660837173461914, "learning_rate": 4.7644586171908966e-07, "loss": 0.2336, "step": 10836 }, { "epoch": 0.5236024544619994, "grad_norm": 3.2261736392974854, "learning_rate": 4.7639754553800065e-07, "loss": 0.2645, "step": 10837 }, { "epoch": 0.5236507706430884, "grad_norm": 3.299443483352661, "learning_rate": 4.7634922935691164e-07, "loss": 0.2589, "step": 10838 }, { "epoch": 0.5236990868241774, "grad_norm": 2.3850317001342773, "learning_rate": 4.7630091317582257e-07, "loss": 0.3329, "step": 10839 }, { "epoch": 0.5237474030052665, "grad_norm": 2.7239267826080322, "learning_rate": 4.762525969947335e-07, "loss": 0.3245, "step": 10840 }, { "epoch": 0.5237957191863555, "grad_norm": 2.5202877521514893, "learning_rate": 4.7620428081364445e-07, "loss": 0.2156, "step": 10841 }, { "epoch": 0.5238440353674445, "grad_norm": 2.087425947189331, "learning_rate": 4.7615596463255544e-07, "loss": 0.2089, "step": 10842 }, { "epoch": 0.5238923515485336, "grad_norm": 21.060453414916992, "learning_rate": 4.761076484514664e-07, "loss": 0.2837, "step": 10843 }, { "epoch": 0.5239406677296227, "grad_norm": 2.856614112854004, "learning_rate": 4.760593322703773e-07, "loss": 0.2756, "step": 10844 }, { "epoch": 0.5239889839107117, "grad_norm": 3.802933692932129, "learning_rate": 4.760110160892883e-07, "loss": 0.4696, "step": 10845 }, { "epoch": 0.5240373000918007, "grad_norm": 2.682610273361206, "learning_rate": 4.7596269990819924e-07, "loss": 0.2964, "step": 10846 }, { "epoch": 0.5240856162728897, "grad_norm": 1.9737197160720825, "learning_rate": 4.759143837271102e-07, "loss": 0.2244, "step": 10847 }, { "epoch": 0.5241339324539789, "grad_norm": 4.356914043426514, "learning_rate": 4.7586606754602117e-07, "loss": 0.33, "step": 10848 }, { "epoch": 0.5241822486350679, "grad_norm": 3.7284152507781982, "learning_rate": 4.7581775136493205e-07, "loss": 0.3542, "step": 10849 }, { "epoch": 0.5242305648161569, "grad_norm": 2.795956611633301, "learning_rate": 4.7576943518384304e-07, "loss": 0.2772, "step": 10850 }, { "epoch": 0.524278880997246, "grad_norm": 5.526771068572998, "learning_rate": 4.7572111900275403e-07, "loss": 0.2943, "step": 10851 }, { "epoch": 0.524327197178335, "grad_norm": 3.376953125, "learning_rate": 4.756728028216649e-07, "loss": 0.2721, "step": 10852 }, { "epoch": 0.5243755133594241, "grad_norm": 2.326509952545166, "learning_rate": 4.756244866405759e-07, "loss": 0.1638, "step": 10853 }, { "epoch": 0.5244238295405131, "grad_norm": 7.306582450866699, "learning_rate": 4.7557617045948684e-07, "loss": 0.4226, "step": 10854 }, { "epoch": 0.5244721457216022, "grad_norm": 2.6000568866729736, "learning_rate": 4.7552785427839783e-07, "loss": 0.3429, "step": 10855 }, { "epoch": 0.5245204619026912, "grad_norm": 3.3701155185699463, "learning_rate": 4.7547953809730877e-07, "loss": 0.2128, "step": 10856 }, { "epoch": 0.5245687780837802, "grad_norm": 2.5807034969329834, "learning_rate": 4.754312219162197e-07, "loss": 0.4009, "step": 10857 }, { "epoch": 0.5246170942648694, "grad_norm": 2.205902099609375, "learning_rate": 4.753829057351307e-07, "loss": 0.1946, "step": 10858 }, { "epoch": 0.5246654104459584, "grad_norm": 9.199138641357422, "learning_rate": 4.7533458955404164e-07, "loss": 0.2839, "step": 10859 }, { "epoch": 0.5247137266270474, "grad_norm": 3.165433168411255, "learning_rate": 4.7528627337295257e-07, "loss": 0.1876, "step": 10860 }, { "epoch": 0.5247620428081364, "grad_norm": 4.284180164337158, "learning_rate": 4.7523795719186356e-07, "loss": 0.3849, "step": 10861 }, { "epoch": 0.5248103589892255, "grad_norm": 1.7164857387542725, "learning_rate": 4.7518964101077445e-07, "loss": 0.2178, "step": 10862 }, { "epoch": 0.5248586751703146, "grad_norm": 1.8911114931106567, "learning_rate": 4.7514132482968544e-07, "loss": 0.1962, "step": 10863 }, { "epoch": 0.5249069913514036, "grad_norm": 2.8779165744781494, "learning_rate": 4.7509300864859643e-07, "loss": 0.3162, "step": 10864 }, { "epoch": 0.5249553075324926, "grad_norm": 2.9088337421417236, "learning_rate": 4.750446924675073e-07, "loss": 0.2867, "step": 10865 }, { "epoch": 0.5250036237135817, "grad_norm": 3.0929160118103027, "learning_rate": 4.749963762864183e-07, "loss": 0.2518, "step": 10866 }, { "epoch": 0.5250519398946707, "grad_norm": 2.2360944747924805, "learning_rate": 4.7494806010532924e-07, "loss": 0.2714, "step": 10867 }, { "epoch": 0.5251002560757597, "grad_norm": 2.7548534870147705, "learning_rate": 4.748997439242402e-07, "loss": 0.2418, "step": 10868 }, { "epoch": 0.5251485722568489, "grad_norm": 3.640169620513916, "learning_rate": 4.7485142774315117e-07, "loss": 0.4609, "step": 10869 }, { "epoch": 0.5251968884379379, "grad_norm": 2.2183332443237305, "learning_rate": 4.748031115620621e-07, "loss": 0.2574, "step": 10870 }, { "epoch": 0.5252452046190269, "grad_norm": 3.643599033355713, "learning_rate": 4.747547953809731e-07, "loss": 0.155, "step": 10871 }, { "epoch": 0.5252935208001159, "grad_norm": 2.913705587387085, "learning_rate": 4.7470647919988403e-07, "loss": 0.3745, "step": 10872 }, { "epoch": 0.525341836981205, "grad_norm": 1.6488876342773438, "learning_rate": 4.7465816301879497e-07, "loss": 0.1751, "step": 10873 }, { "epoch": 0.5253901531622941, "grad_norm": 3.4668362140655518, "learning_rate": 4.7460984683770596e-07, "loss": 0.3046, "step": 10874 }, { "epoch": 0.5254384693433831, "grad_norm": 2.5023229122161865, "learning_rate": 4.7456153065661684e-07, "loss": 0.2893, "step": 10875 }, { "epoch": 0.5254867855244721, "grad_norm": 4.838870048522949, "learning_rate": 4.7451321447552783e-07, "loss": 0.3829, "step": 10876 }, { "epoch": 0.5255351017055612, "grad_norm": 2.4219915866851807, "learning_rate": 4.744648982944388e-07, "loss": 0.1708, "step": 10877 }, { "epoch": 0.5255834178866502, "grad_norm": 2.2963991165161133, "learning_rate": 4.744165821133497e-07, "loss": 0.2411, "step": 10878 }, { "epoch": 0.5256317340677393, "grad_norm": 5.245243072509766, "learning_rate": 4.743682659322607e-07, "loss": 0.3729, "step": 10879 }, { "epoch": 0.5256800502488284, "grad_norm": 2.8853859901428223, "learning_rate": 4.7431994975117164e-07, "loss": 0.2891, "step": 10880 }, { "epoch": 0.5257283664299174, "grad_norm": 2.583667516708374, "learning_rate": 4.742716335700826e-07, "loss": 0.3264, "step": 10881 }, { "epoch": 0.5257766826110064, "grad_norm": 1.7988048791885376, "learning_rate": 4.7422331738899356e-07, "loss": 0.1951, "step": 10882 }, { "epoch": 0.5258249987920954, "grad_norm": 2.968693971633911, "learning_rate": 4.741750012079045e-07, "loss": 0.2368, "step": 10883 }, { "epoch": 0.5258733149731846, "grad_norm": 2.151397466659546, "learning_rate": 4.741266850268155e-07, "loss": 0.2767, "step": 10884 }, { "epoch": 0.5259216311542736, "grad_norm": 2.7862870693206787, "learning_rate": 4.7407836884572643e-07, "loss": 0.2998, "step": 10885 }, { "epoch": 0.5259699473353626, "grad_norm": 2.2540769577026367, "learning_rate": 4.7403005266463737e-07, "loss": 0.2753, "step": 10886 }, { "epoch": 0.5260182635164516, "grad_norm": 3.2219784259796143, "learning_rate": 4.7398173648354836e-07, "loss": 0.2829, "step": 10887 }, { "epoch": 0.5260665796975407, "grad_norm": 3.431147575378418, "learning_rate": 4.7393342030245924e-07, "loss": 0.3272, "step": 10888 }, { "epoch": 0.5261148958786298, "grad_norm": 3.677300214767456, "learning_rate": 4.7388510412137023e-07, "loss": 0.1955, "step": 10889 }, { "epoch": 0.5261632120597188, "grad_norm": 2.7399485111236572, "learning_rate": 4.738367879402812e-07, "loss": 0.3555, "step": 10890 }, { "epoch": 0.5262115282408079, "grad_norm": 2.3044958114624023, "learning_rate": 4.737884717591921e-07, "loss": 0.2676, "step": 10891 }, { "epoch": 0.5262598444218969, "grad_norm": 4.171197891235352, "learning_rate": 4.737401555781031e-07, "loss": 0.2851, "step": 10892 }, { "epoch": 0.5263081606029859, "grad_norm": 3.0434601306915283, "learning_rate": 4.7369183939701403e-07, "loss": 0.3983, "step": 10893 }, { "epoch": 0.5263564767840749, "grad_norm": 3.7607908248901367, "learning_rate": 4.7364352321592497e-07, "loss": 0.3244, "step": 10894 }, { "epoch": 0.5264047929651641, "grad_norm": 2.7282776832580566, "learning_rate": 4.7359520703483596e-07, "loss": 0.296, "step": 10895 }, { "epoch": 0.5264531091462531, "grad_norm": 2.4623870849609375, "learning_rate": 4.735468908537469e-07, "loss": 0.3063, "step": 10896 }, { "epoch": 0.5265014253273421, "grad_norm": 2.4174342155456543, "learning_rate": 4.7349857467265783e-07, "loss": 0.3222, "step": 10897 }, { "epoch": 0.5265497415084311, "grad_norm": 3.3516409397125244, "learning_rate": 4.734502584915688e-07, "loss": 0.382, "step": 10898 }, { "epoch": 0.5265980576895202, "grad_norm": 3.2103323936462402, "learning_rate": 4.7340194231047976e-07, "loss": 0.3147, "step": 10899 }, { "epoch": 0.5266463738706093, "grad_norm": 2.440142869949341, "learning_rate": 4.7335362612939075e-07, "loss": 0.2713, "step": 10900 }, { "epoch": 0.5266946900516983, "grad_norm": 2.27732515335083, "learning_rate": 4.7330530994830164e-07, "loss": 0.2207, "step": 10901 }, { "epoch": 0.5267430062327874, "grad_norm": 3.288964033126831, "learning_rate": 4.7325699376721263e-07, "loss": 0.4468, "step": 10902 }, { "epoch": 0.5267913224138764, "grad_norm": 2.9384567737579346, "learning_rate": 4.732086775861236e-07, "loss": 0.3223, "step": 10903 }, { "epoch": 0.5268396385949654, "grad_norm": 3.607970952987671, "learning_rate": 4.731603614050345e-07, "loss": 0.2045, "step": 10904 }, { "epoch": 0.5268879547760545, "grad_norm": 1.7281348705291748, "learning_rate": 4.731120452239455e-07, "loss": 0.1809, "step": 10905 }, { "epoch": 0.5269362709571436, "grad_norm": 2.2369165420532227, "learning_rate": 4.7306372904285643e-07, "loss": 0.211, "step": 10906 }, { "epoch": 0.5269845871382326, "grad_norm": 2.9212570190429688, "learning_rate": 4.7301541286176737e-07, "loss": 0.3275, "step": 10907 }, { "epoch": 0.5270329033193216, "grad_norm": 3.9930906295776367, "learning_rate": 4.7296709668067836e-07, "loss": 0.1685, "step": 10908 }, { "epoch": 0.5270812195004106, "grad_norm": 3.1018216609954834, "learning_rate": 4.729187804995893e-07, "loss": 0.3596, "step": 10909 }, { "epoch": 0.5271295356814998, "grad_norm": 4.663389205932617, "learning_rate": 4.7287046431850023e-07, "loss": 0.3288, "step": 10910 }, { "epoch": 0.5271778518625888, "grad_norm": 8.095419883728027, "learning_rate": 4.728221481374112e-07, "loss": 0.3041, "step": 10911 }, { "epoch": 0.5272261680436778, "grad_norm": 8.98403263092041, "learning_rate": 4.7277383195632216e-07, "loss": 0.2208, "step": 10912 }, { "epoch": 0.5272744842247669, "grad_norm": 4.987022876739502, "learning_rate": 4.727255157752331e-07, "loss": 0.2892, "step": 10913 }, { "epoch": 0.5273228004058559, "grad_norm": 2.2551543712615967, "learning_rate": 4.7267719959414403e-07, "loss": 0.2698, "step": 10914 }, { "epoch": 0.527371116586945, "grad_norm": 4.591022491455078, "learning_rate": 4.72628883413055e-07, "loss": 0.3708, "step": 10915 }, { "epoch": 0.527419432768034, "grad_norm": 2.5691375732421875, "learning_rate": 4.72580567231966e-07, "loss": 0.3345, "step": 10916 }, { "epoch": 0.5274677489491231, "grad_norm": 2.8863039016723633, "learning_rate": 4.725322510508769e-07, "loss": 0.2068, "step": 10917 }, { "epoch": 0.5275160651302121, "grad_norm": 4.459704399108887, "learning_rate": 4.724839348697879e-07, "loss": 0.396, "step": 10918 }, { "epoch": 0.5275643813113011, "grad_norm": 3.661977767944336, "learning_rate": 4.724356186886988e-07, "loss": 0.4515, "step": 10919 }, { "epoch": 0.5276126974923901, "grad_norm": 2.1232850551605225, "learning_rate": 4.7238730250760976e-07, "loss": 0.2465, "step": 10920 }, { "epoch": 0.5276610136734793, "grad_norm": 2.547133445739746, "learning_rate": 4.7233898632652075e-07, "loss": 0.3418, "step": 10921 }, { "epoch": 0.5277093298545683, "grad_norm": 2.8202531337738037, "learning_rate": 4.722906701454317e-07, "loss": 0.2033, "step": 10922 }, { "epoch": 0.5277576460356573, "grad_norm": 2.2370715141296387, "learning_rate": 4.7224235396434263e-07, "loss": 0.2907, "step": 10923 }, { "epoch": 0.5278059622167464, "grad_norm": 2.785618782043457, "learning_rate": 4.7219403778325356e-07, "loss": 0.3283, "step": 10924 }, { "epoch": 0.5278542783978354, "grad_norm": 3.507976770401001, "learning_rate": 4.7214572160216455e-07, "loss": 0.3532, "step": 10925 }, { "epoch": 0.5279025945789245, "grad_norm": 2.3366293907165527, "learning_rate": 4.720974054210755e-07, "loss": 0.2055, "step": 10926 }, { "epoch": 0.5279509107600135, "grad_norm": 2.5863394737243652, "learning_rate": 4.7204908923998643e-07, "loss": 0.2904, "step": 10927 }, { "epoch": 0.5279992269411026, "grad_norm": 2.429243564605713, "learning_rate": 4.720007730588974e-07, "loss": 0.3176, "step": 10928 }, { "epoch": 0.5280475431221916, "grad_norm": 2.879055976867676, "learning_rate": 4.7195245687780836e-07, "loss": 0.437, "step": 10929 }, { "epoch": 0.5280958593032806, "grad_norm": 2.2547097206115723, "learning_rate": 4.719041406967193e-07, "loss": 0.2157, "step": 10930 }, { "epoch": 0.5281441754843698, "grad_norm": 2.9515368938446045, "learning_rate": 4.718558245156303e-07, "loss": 0.316, "step": 10931 }, { "epoch": 0.5281924916654588, "grad_norm": 4.488186359405518, "learning_rate": 4.7180750833454117e-07, "loss": 0.4503, "step": 10932 }, { "epoch": 0.5282408078465478, "grad_norm": 2.18766713142395, "learning_rate": 4.7175919215345216e-07, "loss": 0.3239, "step": 10933 }, { "epoch": 0.5282891240276368, "grad_norm": 3.829134225845337, "learning_rate": 4.7171087597236315e-07, "loss": 0.3688, "step": 10934 }, { "epoch": 0.5283374402087259, "grad_norm": 6.811121463775635, "learning_rate": 4.716625597912741e-07, "loss": 0.2782, "step": 10935 }, { "epoch": 0.528385756389815, "grad_norm": 2.256209135055542, "learning_rate": 4.71614243610185e-07, "loss": 0.2204, "step": 10936 }, { "epoch": 0.528434072570904, "grad_norm": 4.559001445770264, "learning_rate": 4.7156592742909596e-07, "loss": 0.3224, "step": 10937 }, { "epoch": 0.528482388751993, "grad_norm": 4.255964756011963, "learning_rate": 4.7151761124800695e-07, "loss": 0.4865, "step": 10938 }, { "epoch": 0.5285307049330821, "grad_norm": 3.157975435256958, "learning_rate": 4.714692950669179e-07, "loss": 0.2533, "step": 10939 }, { "epoch": 0.5285790211141711, "grad_norm": 1.9424717426300049, "learning_rate": 4.714209788858288e-07, "loss": 0.2252, "step": 10940 }, { "epoch": 0.5286273372952602, "grad_norm": 2.435025215148926, "learning_rate": 4.713726627047398e-07, "loss": 0.2233, "step": 10941 }, { "epoch": 0.5286756534763493, "grad_norm": 2.8642735481262207, "learning_rate": 4.7132434652365075e-07, "loss": 0.2482, "step": 10942 }, { "epoch": 0.5287239696574383, "grad_norm": 2.1085875034332275, "learning_rate": 4.712760303425617e-07, "loss": 0.2195, "step": 10943 }, { "epoch": 0.5287722858385273, "grad_norm": 3.7890706062316895, "learning_rate": 4.712277141614727e-07, "loss": 0.264, "step": 10944 }, { "epoch": 0.5288206020196163, "grad_norm": 3.516874313354492, "learning_rate": 4.7117939798038357e-07, "loss": 0.2556, "step": 10945 }, { "epoch": 0.5288689182007054, "grad_norm": 2.9622321128845215, "learning_rate": 4.7113108179929456e-07, "loss": 0.3121, "step": 10946 }, { "epoch": 0.5289172343817945, "grad_norm": 1.4930634498596191, "learning_rate": 4.7108276561820555e-07, "loss": 0.1522, "step": 10947 }, { "epoch": 0.5289655505628835, "grad_norm": 2.6835362911224365, "learning_rate": 4.7103444943711643e-07, "loss": 0.3151, "step": 10948 }, { "epoch": 0.5290138667439725, "grad_norm": 2.7539374828338623, "learning_rate": 4.709861332560274e-07, "loss": 0.3741, "step": 10949 }, { "epoch": 0.5290621829250616, "grad_norm": 3.947789192199707, "learning_rate": 4.7093781707493836e-07, "loss": 0.2093, "step": 10950 }, { "epoch": 0.5291104991061506, "grad_norm": 3.8237180709838867, "learning_rate": 4.7088950089384935e-07, "loss": 0.4055, "step": 10951 }, { "epoch": 0.5291588152872397, "grad_norm": 1.8090730905532837, "learning_rate": 4.708411847127603e-07, "loss": 0.2419, "step": 10952 }, { "epoch": 0.5292071314683288, "grad_norm": 2.877765655517578, "learning_rate": 4.707928685316712e-07, "loss": 0.3369, "step": 10953 }, { "epoch": 0.5292554476494178, "grad_norm": 3.636667013168335, "learning_rate": 4.707445523505822e-07, "loss": 0.3722, "step": 10954 }, { "epoch": 0.5293037638305068, "grad_norm": 4.793670654296875, "learning_rate": 4.7069623616949315e-07, "loss": 0.3233, "step": 10955 }, { "epoch": 0.5293520800115958, "grad_norm": 4.438404083251953, "learning_rate": 4.706479199884041e-07, "loss": 0.4008, "step": 10956 }, { "epoch": 0.529400396192685, "grad_norm": 2.0302810668945312, "learning_rate": 4.705996038073151e-07, "loss": 0.2875, "step": 10957 }, { "epoch": 0.529448712373774, "grad_norm": 3.3748996257781982, "learning_rate": 4.7055128762622596e-07, "loss": 0.3412, "step": 10958 }, { "epoch": 0.529497028554863, "grad_norm": 4.387531757354736, "learning_rate": 4.7050297144513695e-07, "loss": 0.2787, "step": 10959 }, { "epoch": 0.529545344735952, "grad_norm": 5.664848804473877, "learning_rate": 4.7045465526404794e-07, "loss": 0.4513, "step": 10960 }, { "epoch": 0.5295936609170411, "grad_norm": 3.560870885848999, "learning_rate": 4.704063390829588e-07, "loss": 0.3759, "step": 10961 }, { "epoch": 0.5296419770981302, "grad_norm": 2.7449193000793457, "learning_rate": 4.703580229018698e-07, "loss": 0.35, "step": 10962 }, { "epoch": 0.5296902932792192, "grad_norm": 2.097534656524658, "learning_rate": 4.7030970672078075e-07, "loss": 0.2065, "step": 10963 }, { "epoch": 0.5297386094603083, "grad_norm": 2.3405933380126953, "learning_rate": 4.702613905396917e-07, "loss": 0.316, "step": 10964 }, { "epoch": 0.5297869256413973, "grad_norm": 2.15065336227417, "learning_rate": 4.702130743586027e-07, "loss": 0.2337, "step": 10965 }, { "epoch": 0.5298352418224863, "grad_norm": 4.128777503967285, "learning_rate": 4.701647581775136e-07, "loss": 0.3173, "step": 10966 }, { "epoch": 0.5298835580035755, "grad_norm": 2.1688458919525146, "learning_rate": 4.701164419964246e-07, "loss": 0.243, "step": 10967 }, { "epoch": 0.5299318741846645, "grad_norm": 33.30400848388672, "learning_rate": 4.7006812581533555e-07, "loss": 0.2893, "step": 10968 }, { "epoch": 0.5299801903657535, "grad_norm": 3.713750123977661, "learning_rate": 4.700198096342465e-07, "loss": 0.3183, "step": 10969 }, { "epoch": 0.5300285065468425, "grad_norm": 2.088242292404175, "learning_rate": 4.699714934531575e-07, "loss": 0.2207, "step": 10970 }, { "epoch": 0.5300768227279316, "grad_norm": 2.689030170440674, "learning_rate": 4.6992317727206836e-07, "loss": 0.1405, "step": 10971 }, { "epoch": 0.5301251389090206, "grad_norm": 2.4719603061676025, "learning_rate": 4.6987486109097935e-07, "loss": 0.2488, "step": 10972 }, { "epoch": 0.5301734550901097, "grad_norm": 3.087750196456909, "learning_rate": 4.6982654490989034e-07, "loss": 0.3715, "step": 10973 }, { "epoch": 0.5302217712711987, "grad_norm": 31.923171997070312, "learning_rate": 4.697782287288012e-07, "loss": 0.2772, "step": 10974 }, { "epoch": 0.5302700874522878, "grad_norm": 5.835654258728027, "learning_rate": 4.697299125477122e-07, "loss": 0.4034, "step": 10975 }, { "epoch": 0.5303184036333768, "grad_norm": 2.217689275741577, "learning_rate": 4.6968159636662315e-07, "loss": 0.2554, "step": 10976 }, { "epoch": 0.5303667198144658, "grad_norm": 2.3869028091430664, "learning_rate": 4.696332801855341e-07, "loss": 0.2597, "step": 10977 }, { "epoch": 0.530415035995555, "grad_norm": 3.038067579269409, "learning_rate": 4.695849640044451e-07, "loss": 0.3698, "step": 10978 }, { "epoch": 0.530463352176644, "grad_norm": 2.673386812210083, "learning_rate": 4.69536647823356e-07, "loss": 0.2366, "step": 10979 }, { "epoch": 0.530511668357733, "grad_norm": 2.009244680404663, "learning_rate": 4.6948833164226695e-07, "loss": 0.2255, "step": 10980 }, { "epoch": 0.530559984538822, "grad_norm": 3.53179931640625, "learning_rate": 4.6944001546117794e-07, "loss": 0.3548, "step": 10981 }, { "epoch": 0.530608300719911, "grad_norm": 2.3857345581054688, "learning_rate": 4.693916992800889e-07, "loss": 0.3001, "step": 10982 }, { "epoch": 0.5306566169010002, "grad_norm": 2.7237424850463867, "learning_rate": 4.6934338309899987e-07, "loss": 0.2935, "step": 10983 }, { "epoch": 0.5307049330820892, "grad_norm": 2.2550368309020996, "learning_rate": 4.6929506691791075e-07, "loss": 0.321, "step": 10984 }, { "epoch": 0.5307532492631782, "grad_norm": 44.95878219604492, "learning_rate": 4.6924675073682174e-07, "loss": 0.2196, "step": 10985 }, { "epoch": 0.5308015654442673, "grad_norm": 2.1280813217163086, "learning_rate": 4.6919843455573273e-07, "loss": 0.203, "step": 10986 }, { "epoch": 0.5308498816253563, "grad_norm": 2.162196397781372, "learning_rate": 4.691501183746436e-07, "loss": 0.2487, "step": 10987 }, { "epoch": 0.5308981978064454, "grad_norm": 8.276674270629883, "learning_rate": 4.691018021935546e-07, "loss": 0.3839, "step": 10988 }, { "epoch": 0.5309465139875345, "grad_norm": 2.113145589828491, "learning_rate": 4.6905348601246555e-07, "loss": 0.2284, "step": 10989 }, { "epoch": 0.5309948301686235, "grad_norm": 3.1640443801879883, "learning_rate": 4.690051698313765e-07, "loss": 0.2539, "step": 10990 }, { "epoch": 0.5310431463497125, "grad_norm": 2.808303117752075, "learning_rate": 4.689568536502875e-07, "loss": 0.2975, "step": 10991 }, { "epoch": 0.5310914625308015, "grad_norm": 3.10495924949646, "learning_rate": 4.689085374691984e-07, "loss": 0.2155, "step": 10992 }, { "epoch": 0.5311397787118907, "grad_norm": 3.409365653991699, "learning_rate": 4.6886022128810935e-07, "loss": 0.3433, "step": 10993 }, { "epoch": 0.5311880948929797, "grad_norm": 2.4908738136291504, "learning_rate": 4.6881190510702034e-07, "loss": 0.3014, "step": 10994 }, { "epoch": 0.5312364110740687, "grad_norm": 3.150503635406494, "learning_rate": 4.687635889259313e-07, "loss": 0.328, "step": 10995 }, { "epoch": 0.5312847272551577, "grad_norm": 1.8715794086456299, "learning_rate": 4.687152727448422e-07, "loss": 0.1967, "step": 10996 }, { "epoch": 0.5313330434362468, "grad_norm": 2.2750232219696045, "learning_rate": 4.6866695656375315e-07, "loss": 0.2549, "step": 10997 }, { "epoch": 0.5313813596173358, "grad_norm": 1.845572829246521, "learning_rate": 4.6861864038266414e-07, "loss": 0.2743, "step": 10998 }, { "epoch": 0.5314296757984249, "grad_norm": 3.499105930328369, "learning_rate": 4.6857032420157513e-07, "loss": 0.3911, "step": 10999 }, { "epoch": 0.531477991979514, "grad_norm": 3.3291656970977783, "learning_rate": 4.68522008020486e-07, "loss": 0.3375, "step": 11000 }, { "epoch": 0.531526308160603, "grad_norm": 2.8604049682617188, "learning_rate": 4.68473691839397e-07, "loss": 0.2932, "step": 11001 }, { "epoch": 0.531574624341692, "grad_norm": 11.4969482421875, "learning_rate": 4.6842537565830794e-07, "loss": 0.1646, "step": 11002 }, { "epoch": 0.531622940522781, "grad_norm": 2.484480619430542, "learning_rate": 4.683770594772189e-07, "loss": 0.3288, "step": 11003 }, { "epoch": 0.5316712567038702, "grad_norm": 2.3174827098846436, "learning_rate": 4.6832874329612987e-07, "loss": 0.2756, "step": 11004 }, { "epoch": 0.5317195728849592, "grad_norm": 1.8949265480041504, "learning_rate": 4.682804271150408e-07, "loss": 0.1811, "step": 11005 }, { "epoch": 0.5317678890660482, "grad_norm": 2.7007477283477783, "learning_rate": 4.6823211093395174e-07, "loss": 0.2969, "step": 11006 }, { "epoch": 0.5318162052471372, "grad_norm": 2.8269550800323486, "learning_rate": 4.6818379475286274e-07, "loss": 0.2924, "step": 11007 }, { "epoch": 0.5318645214282263, "grad_norm": 2.8093371391296387, "learning_rate": 4.6813547857177367e-07, "loss": 0.4388, "step": 11008 }, { "epoch": 0.5319128376093154, "grad_norm": 2.152111053466797, "learning_rate": 4.680871623906846e-07, "loss": 0.2819, "step": 11009 }, { "epoch": 0.5319611537904044, "grad_norm": 2.1265223026275635, "learning_rate": 4.6803884620959555e-07, "loss": 0.2047, "step": 11010 }, { "epoch": 0.5320094699714935, "grad_norm": 5.442714214324951, "learning_rate": 4.6799053002850654e-07, "loss": 0.2735, "step": 11011 }, { "epoch": 0.5320577861525825, "grad_norm": 3.1647543907165527, "learning_rate": 4.679422138474175e-07, "loss": 0.3289, "step": 11012 }, { "epoch": 0.5321061023336715, "grad_norm": 2.836141586303711, "learning_rate": 4.678938976663284e-07, "loss": 0.2045, "step": 11013 }, { "epoch": 0.5321544185147606, "grad_norm": 2.8023691177368164, "learning_rate": 4.678455814852394e-07, "loss": 0.3909, "step": 11014 }, { "epoch": 0.5322027346958497, "grad_norm": 2.4559340476989746, "learning_rate": 4.677972653041503e-07, "loss": 0.3709, "step": 11015 }, { "epoch": 0.5322510508769387, "grad_norm": 3.0801589488983154, "learning_rate": 4.677489491230613e-07, "loss": 0.2649, "step": 11016 }, { "epoch": 0.5322993670580277, "grad_norm": 3.9173290729522705, "learning_rate": 4.6770063294197227e-07, "loss": 0.3773, "step": 11017 }, { "epoch": 0.5323476832391167, "grad_norm": 5.220424175262451, "learning_rate": 4.676523167608832e-07, "loss": 0.2859, "step": 11018 }, { "epoch": 0.5323959994202059, "grad_norm": 3.7316577434539795, "learning_rate": 4.6760400057979414e-07, "loss": 0.2993, "step": 11019 }, { "epoch": 0.5324443156012949, "grad_norm": 3.619271755218506, "learning_rate": 4.6755568439870513e-07, "loss": 0.296, "step": 11020 }, { "epoch": 0.5324926317823839, "grad_norm": 3.1964199542999268, "learning_rate": 4.6750736821761607e-07, "loss": 0.1547, "step": 11021 }, { "epoch": 0.532540947963473, "grad_norm": 2.455157518386841, "learning_rate": 4.67459052036527e-07, "loss": 0.2859, "step": 11022 }, { "epoch": 0.532589264144562, "grad_norm": 1.8224283456802368, "learning_rate": 4.6741073585543794e-07, "loss": 0.1888, "step": 11023 }, { "epoch": 0.532637580325651, "grad_norm": 2.914193630218506, "learning_rate": 4.6736241967434893e-07, "loss": 0.2795, "step": 11024 }, { "epoch": 0.5326858965067401, "grad_norm": 1.8685386180877686, "learning_rate": 4.6731410349325987e-07, "loss": 0.2363, "step": 11025 }, { "epoch": 0.5327342126878292, "grad_norm": 4.186707973480225, "learning_rate": 4.672657873121708e-07, "loss": 0.448, "step": 11026 }, { "epoch": 0.5327825288689182, "grad_norm": 5.053283214569092, "learning_rate": 4.672174711310818e-07, "loss": 0.2901, "step": 11027 }, { "epoch": 0.5328308450500072, "grad_norm": 1.73549222946167, "learning_rate": 4.671691549499927e-07, "loss": 0.2328, "step": 11028 }, { "epoch": 0.5328791612310962, "grad_norm": 2.6446189880371094, "learning_rate": 4.6712083876890367e-07, "loss": 0.2627, "step": 11029 }, { "epoch": 0.5329274774121854, "grad_norm": 2.2585034370422363, "learning_rate": 4.6707252258781466e-07, "loss": 0.2585, "step": 11030 }, { "epoch": 0.5329757935932744, "grad_norm": 2.912062644958496, "learning_rate": 4.6702420640672555e-07, "loss": 0.4277, "step": 11031 }, { "epoch": 0.5330241097743634, "grad_norm": 3.5007622241973877, "learning_rate": 4.6697589022563654e-07, "loss": 0.4761, "step": 11032 }, { "epoch": 0.5330724259554525, "grad_norm": 6.510773181915283, "learning_rate": 4.6692757404454753e-07, "loss": 0.3217, "step": 11033 }, { "epoch": 0.5331207421365415, "grad_norm": 2.1767385005950928, "learning_rate": 4.6687925786345847e-07, "loss": 0.2085, "step": 11034 }, { "epoch": 0.5331690583176306, "grad_norm": 3.145306348800659, "learning_rate": 4.668309416823694e-07, "loss": 0.3295, "step": 11035 }, { "epoch": 0.5332173744987196, "grad_norm": 2.3102428913116455, "learning_rate": 4.6678262550128034e-07, "loss": 0.3418, "step": 11036 }, { "epoch": 0.5332656906798087, "grad_norm": 2.773658037185669, "learning_rate": 4.6673430932019133e-07, "loss": 0.2762, "step": 11037 }, { "epoch": 0.5333140068608977, "grad_norm": 4.35088586807251, "learning_rate": 4.6668599313910227e-07, "loss": 0.2716, "step": 11038 }, { "epoch": 0.5333623230419867, "grad_norm": 3.8855795860290527, "learning_rate": 4.666376769580132e-07, "loss": 0.3814, "step": 11039 }, { "epoch": 0.5334106392230759, "grad_norm": 2.1143746376037598, "learning_rate": 4.665893607769242e-07, "loss": 0.2661, "step": 11040 }, { "epoch": 0.5334589554041649, "grad_norm": 2.056258201599121, "learning_rate": 4.665410445958351e-07, "loss": 0.1966, "step": 11041 }, { "epoch": 0.5335072715852539, "grad_norm": 11.533344268798828, "learning_rate": 4.6649272841474607e-07, "loss": 0.2896, "step": 11042 }, { "epoch": 0.5335555877663429, "grad_norm": 2.5099942684173584, "learning_rate": 4.6644441223365706e-07, "loss": 0.3073, "step": 11043 }, { "epoch": 0.533603903947432, "grad_norm": 3.293534517288208, "learning_rate": 4.6639609605256794e-07, "loss": 0.1562, "step": 11044 }, { "epoch": 0.5336522201285211, "grad_norm": 2.0903069972991943, "learning_rate": 4.6634777987147893e-07, "loss": 0.2867, "step": 11045 }, { "epoch": 0.5337005363096101, "grad_norm": 2.3343710899353027, "learning_rate": 4.662994636903899e-07, "loss": 0.2403, "step": 11046 }, { "epoch": 0.5337488524906991, "grad_norm": 2.69258189201355, "learning_rate": 4.6625114750930086e-07, "loss": 0.3706, "step": 11047 }, { "epoch": 0.5337971686717882, "grad_norm": 2.2183706760406494, "learning_rate": 4.662028313282118e-07, "loss": 0.2668, "step": 11048 }, { "epoch": 0.5338454848528772, "grad_norm": 2.5476601123809814, "learning_rate": 4.6615451514712274e-07, "loss": 0.2937, "step": 11049 }, { "epoch": 0.5338938010339662, "grad_norm": 2.227130174636841, "learning_rate": 4.661061989660337e-07, "loss": 0.2326, "step": 11050 }, { "epoch": 0.5339421172150554, "grad_norm": 2.125723361968994, "learning_rate": 4.6605788278494466e-07, "loss": 0.3059, "step": 11051 }, { "epoch": 0.5339904333961444, "grad_norm": 2.6585464477539062, "learning_rate": 4.660095666038556e-07, "loss": 0.3577, "step": 11052 }, { "epoch": 0.5340387495772334, "grad_norm": 4.372353553771973, "learning_rate": 4.659612504227666e-07, "loss": 0.2747, "step": 11053 }, { "epoch": 0.5340870657583224, "grad_norm": 1.560436725616455, "learning_rate": 4.659129342416775e-07, "loss": 0.132, "step": 11054 }, { "epoch": 0.5341353819394115, "grad_norm": 2.4456400871276855, "learning_rate": 4.6586461806058847e-07, "loss": 0.2798, "step": 11055 }, { "epoch": 0.5341836981205006, "grad_norm": 1.8456206321716309, "learning_rate": 4.6581630187949946e-07, "loss": 0.2114, "step": 11056 }, { "epoch": 0.5342320143015896, "grad_norm": 3.246680498123169, "learning_rate": 4.6576798569841034e-07, "loss": 0.3421, "step": 11057 }, { "epoch": 0.5342803304826786, "grad_norm": 2.014482021331787, "learning_rate": 4.6571966951732133e-07, "loss": 0.203, "step": 11058 }, { "epoch": 0.5343286466637677, "grad_norm": 2.05517840385437, "learning_rate": 4.656713533362323e-07, "loss": 0.2444, "step": 11059 }, { "epoch": 0.5343769628448567, "grad_norm": 2.395623207092285, "learning_rate": 4.656230371551432e-07, "loss": 0.2883, "step": 11060 }, { "epoch": 0.5344252790259458, "grad_norm": 2.782453775405884, "learning_rate": 4.655747209740542e-07, "loss": 0.311, "step": 11061 }, { "epoch": 0.5344735952070349, "grad_norm": 2.2169642448425293, "learning_rate": 4.6552640479296513e-07, "loss": 0.2421, "step": 11062 }, { "epoch": 0.5345219113881239, "grad_norm": 2.737647771835327, "learning_rate": 4.654780886118761e-07, "loss": 0.339, "step": 11063 }, { "epoch": 0.5345702275692129, "grad_norm": 2.2282474040985107, "learning_rate": 4.6542977243078706e-07, "loss": 0.2864, "step": 11064 }, { "epoch": 0.5346185437503019, "grad_norm": 2.527017593383789, "learning_rate": 4.65381456249698e-07, "loss": 0.3185, "step": 11065 }, { "epoch": 0.5346668599313911, "grad_norm": 3.4162094593048096, "learning_rate": 4.65333140068609e-07, "loss": 0.4493, "step": 11066 }, { "epoch": 0.5347151761124801, "grad_norm": 4.339859962463379, "learning_rate": 4.6528482388751987e-07, "loss": 0.3856, "step": 11067 }, { "epoch": 0.5347634922935691, "grad_norm": 2.0648608207702637, "learning_rate": 4.6523650770643086e-07, "loss": 0.2495, "step": 11068 }, { "epoch": 0.5348118084746581, "grad_norm": 2.529665231704712, "learning_rate": 4.6518819152534185e-07, "loss": 0.2856, "step": 11069 }, { "epoch": 0.5348601246557472, "grad_norm": 3.5992751121520996, "learning_rate": 4.6513987534425274e-07, "loss": 0.2181, "step": 11070 }, { "epoch": 0.5349084408368363, "grad_norm": 8.708806991577148, "learning_rate": 4.6509155916316373e-07, "loss": 0.3388, "step": 11071 }, { "epoch": 0.5349567570179253, "grad_norm": 2.6153435707092285, "learning_rate": 4.650432429820747e-07, "loss": 0.3513, "step": 11072 }, { "epoch": 0.5350050731990144, "grad_norm": 3.294360876083374, "learning_rate": 4.649949268009856e-07, "loss": 0.339, "step": 11073 }, { "epoch": 0.5350533893801034, "grad_norm": 2.2286033630371094, "learning_rate": 4.649466106198966e-07, "loss": 0.267, "step": 11074 }, { "epoch": 0.5351017055611924, "grad_norm": 7.323428630828857, "learning_rate": 4.6489829443880753e-07, "loss": 0.3322, "step": 11075 }, { "epoch": 0.5351500217422815, "grad_norm": 3.1192336082458496, "learning_rate": 4.6484997825771847e-07, "loss": 0.3968, "step": 11076 }, { "epoch": 0.5351983379233706, "grad_norm": 2.569347858428955, "learning_rate": 4.6480166207662946e-07, "loss": 0.4788, "step": 11077 }, { "epoch": 0.5352466541044596, "grad_norm": 2.811202049255371, "learning_rate": 4.647533458955404e-07, "loss": 0.3516, "step": 11078 }, { "epoch": 0.5352949702855486, "grad_norm": 12.149805068969727, "learning_rate": 4.647050297144514e-07, "loss": 0.2959, "step": 11079 }, { "epoch": 0.5353432864666376, "grad_norm": 2.0161309242248535, "learning_rate": 4.6465671353336227e-07, "loss": 0.1955, "step": 11080 }, { "epoch": 0.5353916026477267, "grad_norm": 3.854372501373291, "learning_rate": 4.6460839735227326e-07, "loss": 0.2489, "step": 11081 }, { "epoch": 0.5354399188288158, "grad_norm": 4.218692779541016, "learning_rate": 4.6456008117118425e-07, "loss": 0.4471, "step": 11082 }, { "epoch": 0.5354882350099048, "grad_norm": 4.200211048126221, "learning_rate": 4.6451176499009513e-07, "loss": 0.3082, "step": 11083 }, { "epoch": 0.5355365511909939, "grad_norm": 1.5558574199676514, "learning_rate": 4.644634488090061e-07, "loss": 0.1595, "step": 11084 }, { "epoch": 0.5355848673720829, "grad_norm": 2.5068562030792236, "learning_rate": 4.644151326279171e-07, "loss": 0.268, "step": 11085 }, { "epoch": 0.5356331835531719, "grad_norm": 6.072683811187744, "learning_rate": 4.64366816446828e-07, "loss": 0.4053, "step": 11086 }, { "epoch": 0.535681499734261, "grad_norm": 2.8293957710266113, "learning_rate": 4.64318500265739e-07, "loss": 0.3471, "step": 11087 }, { "epoch": 0.5357298159153501, "grad_norm": 5.577359199523926, "learning_rate": 4.642701840846499e-07, "loss": 0.4006, "step": 11088 }, { "epoch": 0.5357781320964391, "grad_norm": 2.551694631576538, "learning_rate": 4.6422186790356086e-07, "loss": 0.2999, "step": 11089 }, { "epoch": 0.5358264482775281, "grad_norm": 2.5071053504943848, "learning_rate": 4.6417355172247185e-07, "loss": 0.292, "step": 11090 }, { "epoch": 0.5358747644586171, "grad_norm": 3.832549571990967, "learning_rate": 4.641252355413828e-07, "loss": 0.4053, "step": 11091 }, { "epoch": 0.5359230806397063, "grad_norm": 3.0757763385772705, "learning_rate": 4.6407691936029373e-07, "loss": 0.2701, "step": 11092 }, { "epoch": 0.5359713968207953, "grad_norm": 5.929993629455566, "learning_rate": 4.6402860317920466e-07, "loss": 0.3913, "step": 11093 }, { "epoch": 0.5360197130018843, "grad_norm": 3.170189619064331, "learning_rate": 4.6398028699811565e-07, "loss": 0.3499, "step": 11094 }, { "epoch": 0.5360680291829734, "grad_norm": 3.194232225418091, "learning_rate": 4.6393197081702664e-07, "loss": 0.4366, "step": 11095 }, { "epoch": 0.5361163453640624, "grad_norm": 2.646019220352173, "learning_rate": 4.6388365463593753e-07, "loss": 0.3196, "step": 11096 }, { "epoch": 0.5361646615451515, "grad_norm": 2.383213996887207, "learning_rate": 4.638353384548485e-07, "loss": 0.2591, "step": 11097 }, { "epoch": 0.5362129777262405, "grad_norm": 6.710709571838379, "learning_rate": 4.637870222737595e-07, "loss": 0.2693, "step": 11098 }, { "epoch": 0.5362612939073296, "grad_norm": 1.9576772451400757, "learning_rate": 4.637387060926704e-07, "loss": 0.2104, "step": 11099 }, { "epoch": 0.5363096100884186, "grad_norm": 2.8847758769989014, "learning_rate": 4.636903899115814e-07, "loss": 0.2913, "step": 11100 }, { "epoch": 0.5363579262695076, "grad_norm": 2.7049131393432617, "learning_rate": 4.636420737304923e-07, "loss": 0.2906, "step": 11101 }, { "epoch": 0.5364062424505968, "grad_norm": 1.9042775630950928, "learning_rate": 4.6359375754940326e-07, "loss": 0.2153, "step": 11102 }, { "epoch": 0.5364545586316858, "grad_norm": 1.8584399223327637, "learning_rate": 4.6354544136831425e-07, "loss": 0.2093, "step": 11103 }, { "epoch": 0.5365028748127748, "grad_norm": 34.53533935546875, "learning_rate": 4.634971251872252e-07, "loss": 0.2723, "step": 11104 }, { "epoch": 0.5365511909938638, "grad_norm": 2.2276182174682617, "learning_rate": 4.634488090061361e-07, "loss": 0.2401, "step": 11105 }, { "epoch": 0.5365995071749529, "grad_norm": 4.292393684387207, "learning_rate": 4.6340049282504706e-07, "loss": 0.3125, "step": 11106 }, { "epoch": 0.5366478233560419, "grad_norm": 2.0801310539245605, "learning_rate": 4.6335217664395805e-07, "loss": 0.1948, "step": 11107 }, { "epoch": 0.536696139537131, "grad_norm": 2.6656546592712402, "learning_rate": 4.63303860462869e-07, "loss": 0.3368, "step": 11108 }, { "epoch": 0.53674445571822, "grad_norm": 6.5653228759765625, "learning_rate": 4.632555442817799e-07, "loss": 0.2291, "step": 11109 }, { "epoch": 0.5367927718993091, "grad_norm": 2.257913589477539, "learning_rate": 4.632072281006909e-07, "loss": 0.2101, "step": 11110 }, { "epoch": 0.5368410880803981, "grad_norm": 3.3212571144104004, "learning_rate": 4.631589119196019e-07, "loss": 0.3388, "step": 11111 }, { "epoch": 0.5368894042614871, "grad_norm": 2.267813205718994, "learning_rate": 4.631105957385128e-07, "loss": 0.2362, "step": 11112 }, { "epoch": 0.5369377204425763, "grad_norm": 2.869568109512329, "learning_rate": 4.630622795574238e-07, "loss": 0.3072, "step": 11113 }, { "epoch": 0.5369860366236653, "grad_norm": 2.6242897510528564, "learning_rate": 4.630139633763347e-07, "loss": 0.3589, "step": 11114 }, { "epoch": 0.5370343528047543, "grad_norm": 3.1841607093811035, "learning_rate": 4.6296564719524565e-07, "loss": 0.1854, "step": 11115 }, { "epoch": 0.5370826689858433, "grad_norm": 3.8935928344726562, "learning_rate": 4.6291733101415665e-07, "loss": 0.3038, "step": 11116 }, { "epoch": 0.5371309851669324, "grad_norm": 3.7584056854248047, "learning_rate": 4.628690148330676e-07, "loss": 0.2838, "step": 11117 }, { "epoch": 0.5371793013480215, "grad_norm": 2.642688751220703, "learning_rate": 4.628206986519785e-07, "loss": 0.2852, "step": 11118 }, { "epoch": 0.5372276175291105, "grad_norm": 2.821681022644043, "learning_rate": 4.6277238247088946e-07, "loss": 0.3652, "step": 11119 }, { "epoch": 0.5372759337101995, "grad_norm": 2.3902368545532227, "learning_rate": 4.6272406628980045e-07, "loss": 0.2531, "step": 11120 }, { "epoch": 0.5373242498912886, "grad_norm": 3.1884849071502686, "learning_rate": 4.626757501087114e-07, "loss": 0.2567, "step": 11121 }, { "epoch": 0.5373725660723776, "grad_norm": 3.2347476482391357, "learning_rate": 4.626274339276223e-07, "loss": 0.2227, "step": 11122 }, { "epoch": 0.5374208822534667, "grad_norm": 3.372605800628662, "learning_rate": 4.625791177465333e-07, "loss": 0.3712, "step": 11123 }, { "epoch": 0.5374691984345558, "grad_norm": 2.887145519256592, "learning_rate": 4.6253080156544425e-07, "loss": 0.3088, "step": 11124 }, { "epoch": 0.5375175146156448, "grad_norm": 2.8357200622558594, "learning_rate": 4.624824853843552e-07, "loss": 0.1376, "step": 11125 }, { "epoch": 0.5375658307967338, "grad_norm": 3.367570161819458, "learning_rate": 4.624341692032662e-07, "loss": 0.2586, "step": 11126 }, { "epoch": 0.5376141469778228, "grad_norm": 2.423180103302002, "learning_rate": 4.6238585302217706e-07, "loss": 0.2927, "step": 11127 }, { "epoch": 0.537662463158912, "grad_norm": 2.724979877471924, "learning_rate": 4.6233753684108805e-07, "loss": 0.3792, "step": 11128 }, { "epoch": 0.537710779340001, "grad_norm": 2.8383967876434326, "learning_rate": 4.6228922065999904e-07, "loss": 0.3126, "step": 11129 }, { "epoch": 0.53775909552109, "grad_norm": 2.895209789276123, "learning_rate": 4.6224090447891e-07, "loss": 0.2074, "step": 11130 }, { "epoch": 0.537807411702179, "grad_norm": 2.7051877975463867, "learning_rate": 4.621925882978209e-07, "loss": 0.2872, "step": 11131 }, { "epoch": 0.5378557278832681, "grad_norm": 1.8539224863052368, "learning_rate": 4.6214427211673185e-07, "loss": 0.2074, "step": 11132 }, { "epoch": 0.5379040440643571, "grad_norm": 2.550959587097168, "learning_rate": 4.6209595593564284e-07, "loss": 0.2161, "step": 11133 }, { "epoch": 0.5379523602454462, "grad_norm": 1.7386530637741089, "learning_rate": 4.620476397545538e-07, "loss": 0.1541, "step": 11134 }, { "epoch": 0.5380006764265353, "grad_norm": 2.015486240386963, "learning_rate": 4.619993235734647e-07, "loss": 0.1957, "step": 11135 }, { "epoch": 0.5380489926076243, "grad_norm": 3.2369282245635986, "learning_rate": 4.619510073923757e-07, "loss": 0.3515, "step": 11136 }, { "epoch": 0.5380973087887133, "grad_norm": 2.4030065536499023, "learning_rate": 4.6190269121128665e-07, "loss": 0.252, "step": 11137 }, { "epoch": 0.5381456249698023, "grad_norm": 2.06085467338562, "learning_rate": 4.618543750301976e-07, "loss": 0.2058, "step": 11138 }, { "epoch": 0.5381939411508915, "grad_norm": 31.90814208984375, "learning_rate": 4.6180605884910857e-07, "loss": 0.2087, "step": 11139 }, { "epoch": 0.5382422573319805, "grad_norm": 2.472550868988037, "learning_rate": 4.6175774266801946e-07, "loss": 0.2839, "step": 11140 }, { "epoch": 0.5382905735130695, "grad_norm": 2.100458860397339, "learning_rate": 4.6170942648693045e-07, "loss": 0.2311, "step": 11141 }, { "epoch": 0.5383388896941586, "grad_norm": 5.02862548828125, "learning_rate": 4.6166111030584144e-07, "loss": 0.4707, "step": 11142 }, { "epoch": 0.5383872058752476, "grad_norm": 3.0381650924682617, "learning_rate": 4.616127941247523e-07, "loss": 0.3513, "step": 11143 }, { "epoch": 0.5384355220563367, "grad_norm": 3.6465742588043213, "learning_rate": 4.615644779436633e-07, "loss": 0.336, "step": 11144 }, { "epoch": 0.5384838382374257, "grad_norm": 3.013145923614502, "learning_rate": 4.6151616176257425e-07, "loss": 0.3559, "step": 11145 }, { "epoch": 0.5385321544185148, "grad_norm": 2.4062306880950928, "learning_rate": 4.6146784558148524e-07, "loss": 0.2477, "step": 11146 }, { "epoch": 0.5385804705996038, "grad_norm": 4.4712114334106445, "learning_rate": 4.614195294003962e-07, "loss": 0.3879, "step": 11147 }, { "epoch": 0.5386287867806928, "grad_norm": 2.0443122386932373, "learning_rate": 4.613712132193071e-07, "loss": 0.2422, "step": 11148 }, { "epoch": 0.538677102961782, "grad_norm": 1.4529434442520142, "learning_rate": 4.613228970382181e-07, "loss": 0.1387, "step": 11149 }, { "epoch": 0.538725419142871, "grad_norm": 1.6988580226898193, "learning_rate": 4.6127458085712904e-07, "loss": 0.1687, "step": 11150 }, { "epoch": 0.53877373532396, "grad_norm": 2.763199806213379, "learning_rate": 4.6122626467604e-07, "loss": 0.2796, "step": 11151 }, { "epoch": 0.538822051505049, "grad_norm": 5.052678108215332, "learning_rate": 4.6117794849495097e-07, "loss": 0.3798, "step": 11152 }, { "epoch": 0.538870367686138, "grad_norm": 3.1361632347106934, "learning_rate": 4.6112963231386185e-07, "loss": 0.2242, "step": 11153 }, { "epoch": 0.5389186838672272, "grad_norm": 3.262573480606079, "learning_rate": 4.6108131613277284e-07, "loss": 0.2537, "step": 11154 }, { "epoch": 0.5389670000483162, "grad_norm": 3.1164066791534424, "learning_rate": 4.6103299995168383e-07, "loss": 0.4394, "step": 11155 }, { "epoch": 0.5390153162294052, "grad_norm": 2.2874984741210938, "learning_rate": 4.609846837705947e-07, "loss": 0.2544, "step": 11156 }, { "epoch": 0.5390636324104943, "grad_norm": 1.908730149269104, "learning_rate": 4.609363675895057e-07, "loss": 0.203, "step": 11157 }, { "epoch": 0.5391119485915833, "grad_norm": 3.050083875656128, "learning_rate": 4.6088805140841665e-07, "loss": 0.3702, "step": 11158 }, { "epoch": 0.5391602647726723, "grad_norm": 2.735029935836792, "learning_rate": 4.608397352273276e-07, "loss": 0.2349, "step": 11159 }, { "epoch": 0.5392085809537615, "grad_norm": 2.6381313800811768, "learning_rate": 4.607914190462386e-07, "loss": 0.3281, "step": 11160 }, { "epoch": 0.5392568971348505, "grad_norm": 2.2160801887512207, "learning_rate": 4.607431028651495e-07, "loss": 0.1854, "step": 11161 }, { "epoch": 0.5393052133159395, "grad_norm": 10.285550117492676, "learning_rate": 4.606947866840605e-07, "loss": 0.4415, "step": 11162 }, { "epoch": 0.5393535294970285, "grad_norm": 3.0303573608398438, "learning_rate": 4.6064647050297144e-07, "loss": 0.426, "step": 11163 }, { "epoch": 0.5394018456781176, "grad_norm": 2.9831912517547607, "learning_rate": 4.605981543218824e-07, "loss": 0.3569, "step": 11164 }, { "epoch": 0.5394501618592067, "grad_norm": 25.42593002319336, "learning_rate": 4.6054983814079337e-07, "loss": 0.3141, "step": 11165 }, { "epoch": 0.5394984780402957, "grad_norm": 2.4543092250823975, "learning_rate": 4.6050152195970425e-07, "loss": 0.1792, "step": 11166 }, { "epoch": 0.5395467942213847, "grad_norm": 2.2755508422851562, "learning_rate": 4.6045320577861524e-07, "loss": 0.2493, "step": 11167 }, { "epoch": 0.5395951104024738, "grad_norm": 3.170011281967163, "learning_rate": 4.6040488959752623e-07, "loss": 0.3765, "step": 11168 }, { "epoch": 0.5396434265835628, "grad_norm": 2.088484048843384, "learning_rate": 4.603565734164371e-07, "loss": 0.1732, "step": 11169 }, { "epoch": 0.5396917427646519, "grad_norm": 2.2675955295562744, "learning_rate": 4.603082572353481e-07, "loss": 0.3337, "step": 11170 }, { "epoch": 0.539740058945741, "grad_norm": 1.9627476930618286, "learning_rate": 4.6025994105425904e-07, "loss": 0.2707, "step": 11171 }, { "epoch": 0.53978837512683, "grad_norm": 5.864731788635254, "learning_rate": 4.6021162487317e-07, "loss": 0.4187, "step": 11172 }, { "epoch": 0.539836691307919, "grad_norm": 2.040276288986206, "learning_rate": 4.6016330869208097e-07, "loss": 0.2589, "step": 11173 }, { "epoch": 0.539885007489008, "grad_norm": 3.8489646911621094, "learning_rate": 4.601149925109919e-07, "loss": 0.4352, "step": 11174 }, { "epoch": 0.5399333236700972, "grad_norm": 3.7302606105804443, "learning_rate": 4.6006667632990284e-07, "loss": 0.3147, "step": 11175 }, { "epoch": 0.5399816398511862, "grad_norm": 2.65645694732666, "learning_rate": 4.6001836014881383e-07, "loss": 0.2795, "step": 11176 }, { "epoch": 0.5400299560322752, "grad_norm": 2.1470563411712646, "learning_rate": 4.5997004396772477e-07, "loss": 0.2566, "step": 11177 }, { "epoch": 0.5400782722133642, "grad_norm": 6.056983470916748, "learning_rate": 4.5992172778663576e-07, "loss": 0.2235, "step": 11178 }, { "epoch": 0.5401265883944533, "grad_norm": 1.5824058055877686, "learning_rate": 4.5987341160554665e-07, "loss": 0.1583, "step": 11179 }, { "epoch": 0.5401749045755424, "grad_norm": 5.10676908493042, "learning_rate": 4.5982509542445764e-07, "loss": 0.1724, "step": 11180 }, { "epoch": 0.5402232207566314, "grad_norm": 5.979903697967529, "learning_rate": 4.5977677924336863e-07, "loss": 0.2106, "step": 11181 }, { "epoch": 0.5402715369377205, "grad_norm": 3.772294044494629, "learning_rate": 4.597284630622795e-07, "loss": 0.2887, "step": 11182 }, { "epoch": 0.5403198531188095, "grad_norm": 2.99544358253479, "learning_rate": 4.596801468811905e-07, "loss": 0.2171, "step": 11183 }, { "epoch": 0.5403681692998985, "grad_norm": 2.7977190017700195, "learning_rate": 4.5963183070010144e-07, "loss": 0.1829, "step": 11184 }, { "epoch": 0.5404164854809875, "grad_norm": 1.7028566598892212, "learning_rate": 4.595835145190124e-07, "loss": 0.2209, "step": 11185 }, { "epoch": 0.5404648016620767, "grad_norm": 4.39546012878418, "learning_rate": 4.5953519833792337e-07, "loss": 0.2495, "step": 11186 }, { "epoch": 0.5405131178431657, "grad_norm": 2.60086989402771, "learning_rate": 4.594868821568343e-07, "loss": 0.2521, "step": 11187 }, { "epoch": 0.5405614340242547, "grad_norm": 3.0626401901245117, "learning_rate": 4.5943856597574524e-07, "loss": 0.2955, "step": 11188 }, { "epoch": 0.5406097502053437, "grad_norm": 2.4082417488098145, "learning_rate": 4.5939024979465623e-07, "loss": 0.2594, "step": 11189 }, { "epoch": 0.5406580663864328, "grad_norm": 2.840761184692383, "learning_rate": 4.5934193361356717e-07, "loss": 0.3675, "step": 11190 }, { "epoch": 0.5407063825675219, "grad_norm": 3.2980687618255615, "learning_rate": 4.592936174324781e-07, "loss": 0.3125, "step": 11191 }, { "epoch": 0.5407546987486109, "grad_norm": 2.4805643558502197, "learning_rate": 4.5924530125138904e-07, "loss": 0.3186, "step": 11192 }, { "epoch": 0.5408030149297, "grad_norm": 2.4651808738708496, "learning_rate": 4.5919698507030003e-07, "loss": 0.2915, "step": 11193 }, { "epoch": 0.540851331110789, "grad_norm": 3.66782546043396, "learning_rate": 4.59148668889211e-07, "loss": 0.4539, "step": 11194 }, { "epoch": 0.540899647291878, "grad_norm": 3.242892265319824, "learning_rate": 4.591003527081219e-07, "loss": 0.3156, "step": 11195 }, { "epoch": 0.5409479634729671, "grad_norm": 2.7978737354278564, "learning_rate": 4.590520365270329e-07, "loss": 0.2967, "step": 11196 }, { "epoch": 0.5409962796540562, "grad_norm": 2.4580416679382324, "learning_rate": 4.5900372034594384e-07, "loss": 0.2304, "step": 11197 }, { "epoch": 0.5410445958351452, "grad_norm": 2.789876937866211, "learning_rate": 4.5895540416485477e-07, "loss": 0.3297, "step": 11198 }, { "epoch": 0.5410929120162342, "grad_norm": 3.6772940158843994, "learning_rate": 4.5890708798376576e-07, "loss": 0.3566, "step": 11199 }, { "epoch": 0.5411412281973232, "grad_norm": 2.5292415618896484, "learning_rate": 4.588587718026767e-07, "loss": 0.2906, "step": 11200 }, { "epoch": 0.5411895443784124, "grad_norm": 2.961496353149414, "learning_rate": 4.5881045562158764e-07, "loss": 0.3286, "step": 11201 }, { "epoch": 0.5412378605595014, "grad_norm": 3.2859132289886475, "learning_rate": 4.587621394404986e-07, "loss": 0.3876, "step": 11202 }, { "epoch": 0.5412861767405904, "grad_norm": 5.210460186004639, "learning_rate": 4.5871382325940956e-07, "loss": 0.3944, "step": 11203 }, { "epoch": 0.5413344929216795, "grad_norm": 2.4080655574798584, "learning_rate": 4.586655070783205e-07, "loss": 0.2875, "step": 11204 }, { "epoch": 0.5413828091027685, "grad_norm": 4.506015777587891, "learning_rate": 4.5861719089723144e-07, "loss": 0.4414, "step": 11205 }, { "epoch": 0.5414311252838576, "grad_norm": 4.3718037605285645, "learning_rate": 4.5856887471614243e-07, "loss": 0.1972, "step": 11206 }, { "epoch": 0.5414794414649466, "grad_norm": 3.2962982654571533, "learning_rate": 4.5852055853505337e-07, "loss": 0.2718, "step": 11207 }, { "epoch": 0.5415277576460357, "grad_norm": 10.612861633300781, "learning_rate": 4.584722423539643e-07, "loss": 0.2981, "step": 11208 }, { "epoch": 0.5415760738271247, "grad_norm": 2.5577375888824463, "learning_rate": 4.584239261728753e-07, "loss": 0.2657, "step": 11209 }, { "epoch": 0.5416243900082137, "grad_norm": 6.431613922119141, "learning_rate": 4.5837560999178623e-07, "loss": 0.3296, "step": 11210 }, { "epoch": 0.5416727061893027, "grad_norm": 5.9837727546691895, "learning_rate": 4.5832729381069717e-07, "loss": 0.1997, "step": 11211 }, { "epoch": 0.5417210223703919, "grad_norm": 2.323718786239624, "learning_rate": 4.5827897762960816e-07, "loss": 0.2756, "step": 11212 }, { "epoch": 0.5417693385514809, "grad_norm": 4.160667419433594, "learning_rate": 4.582306614485191e-07, "loss": 0.36, "step": 11213 }, { "epoch": 0.5418176547325699, "grad_norm": 2.9262795448303223, "learning_rate": 4.5818234526743003e-07, "loss": 0.3215, "step": 11214 }, { "epoch": 0.541865970913659, "grad_norm": 2.158074140548706, "learning_rate": 4.5813402908634097e-07, "loss": 0.2188, "step": 11215 }, { "epoch": 0.541914287094748, "grad_norm": 4.34427547454834, "learning_rate": 4.5808571290525196e-07, "loss": 0.3769, "step": 11216 }, { "epoch": 0.5419626032758371, "grad_norm": 4.571682929992676, "learning_rate": 4.580373967241629e-07, "loss": 0.3149, "step": 11217 }, { "epoch": 0.5420109194569261, "grad_norm": 3.21094012260437, "learning_rate": 4.5798908054307384e-07, "loss": 0.4702, "step": 11218 }, { "epoch": 0.5420592356380152, "grad_norm": 5.097911834716797, "learning_rate": 4.579407643619848e-07, "loss": 0.3128, "step": 11219 }, { "epoch": 0.5421075518191042, "grad_norm": 2.8956315517425537, "learning_rate": 4.5789244818089576e-07, "loss": 0.3263, "step": 11220 }, { "epoch": 0.5421558680001932, "grad_norm": 3.7057907581329346, "learning_rate": 4.578441319998067e-07, "loss": 0.3705, "step": 11221 }, { "epoch": 0.5422041841812824, "grad_norm": 6.8550190925598145, "learning_rate": 4.577958158187177e-07, "loss": 0.323, "step": 11222 }, { "epoch": 0.5422525003623714, "grad_norm": 27.676101684570312, "learning_rate": 4.577474996376286e-07, "loss": 0.306, "step": 11223 }, { "epoch": 0.5423008165434604, "grad_norm": 2.5447425842285156, "learning_rate": 4.5769918345653957e-07, "loss": 0.3656, "step": 11224 }, { "epoch": 0.5423491327245494, "grad_norm": 3.5729286670684814, "learning_rate": 4.5765086727545056e-07, "loss": 0.3426, "step": 11225 }, { "epoch": 0.5423974489056385, "grad_norm": 2.1934561729431152, "learning_rate": 4.576025510943615e-07, "loss": 0.2527, "step": 11226 }, { "epoch": 0.5424457650867276, "grad_norm": 2.4951987266540527, "learning_rate": 4.5755423491327243e-07, "loss": 0.257, "step": 11227 }, { "epoch": 0.5424940812678166, "grad_norm": 2.1198062896728516, "learning_rate": 4.5750591873218337e-07, "loss": 0.2383, "step": 11228 }, { "epoch": 0.5425423974489056, "grad_norm": 4.980994701385498, "learning_rate": 4.5745760255109436e-07, "loss": 0.3512, "step": 11229 }, { "epoch": 0.5425907136299947, "grad_norm": 3.03315806388855, "learning_rate": 4.574092863700053e-07, "loss": 0.2833, "step": 11230 }, { "epoch": 0.5426390298110837, "grad_norm": 3.910083770751953, "learning_rate": 4.5736097018891623e-07, "loss": 0.31, "step": 11231 }, { "epoch": 0.5426873459921728, "grad_norm": 2.5563907623291016, "learning_rate": 4.573126540078272e-07, "loss": 0.378, "step": 11232 }, { "epoch": 0.5427356621732619, "grad_norm": 2.9899256229400635, "learning_rate": 4.5726433782673816e-07, "loss": 0.411, "step": 11233 }, { "epoch": 0.5427839783543509, "grad_norm": 2.402873992919922, "learning_rate": 4.572160216456491e-07, "loss": 0.3106, "step": 11234 }, { "epoch": 0.5428322945354399, "grad_norm": 2.4546573162078857, "learning_rate": 4.571677054645601e-07, "loss": 0.2518, "step": 11235 }, { "epoch": 0.5428806107165289, "grad_norm": 2.142484426498413, "learning_rate": 4.5711938928347097e-07, "loss": 0.2722, "step": 11236 }, { "epoch": 0.542928926897618, "grad_norm": 5.076048374176025, "learning_rate": 4.5707107310238196e-07, "loss": 0.4863, "step": 11237 }, { "epoch": 0.5429772430787071, "grad_norm": 1.9631354808807373, "learning_rate": 4.5702275692129295e-07, "loss": 0.2151, "step": 11238 }, { "epoch": 0.5430255592597961, "grad_norm": 1.9983134269714355, "learning_rate": 4.5697444074020384e-07, "loss": 0.2378, "step": 11239 }, { "epoch": 0.5430738754408851, "grad_norm": 4.5540666580200195, "learning_rate": 4.569261245591148e-07, "loss": 0.3961, "step": 11240 }, { "epoch": 0.5431221916219742, "grad_norm": 1.9989969730377197, "learning_rate": 4.5687780837802576e-07, "loss": 0.2523, "step": 11241 }, { "epoch": 0.5431705078030632, "grad_norm": 3.0103116035461426, "learning_rate": 4.5682949219693675e-07, "loss": 0.3156, "step": 11242 }, { "epoch": 0.5432188239841523, "grad_norm": 2.8594493865966797, "learning_rate": 4.567811760158477e-07, "loss": 0.293, "step": 11243 }, { "epoch": 0.5432671401652414, "grad_norm": 2.427466630935669, "learning_rate": 4.5673285983475863e-07, "loss": 0.3442, "step": 11244 }, { "epoch": 0.5433154563463304, "grad_norm": 2.2964072227478027, "learning_rate": 4.566845436536696e-07, "loss": 0.2634, "step": 11245 }, { "epoch": 0.5433637725274194, "grad_norm": 21.87584114074707, "learning_rate": 4.5663622747258056e-07, "loss": 0.2434, "step": 11246 }, { "epoch": 0.5434120887085084, "grad_norm": 2.309699535369873, "learning_rate": 4.565879112914915e-07, "loss": 0.2463, "step": 11247 }, { "epoch": 0.5434604048895976, "grad_norm": 3.336106777191162, "learning_rate": 4.565395951104025e-07, "loss": 0.3353, "step": 11248 }, { "epoch": 0.5435087210706866, "grad_norm": 3.0714337825775146, "learning_rate": 4.5649127892931337e-07, "loss": 0.2515, "step": 11249 }, { "epoch": 0.5435570372517756, "grad_norm": 3.262216091156006, "learning_rate": 4.5644296274822436e-07, "loss": 0.2166, "step": 11250 }, { "epoch": 0.5436053534328646, "grad_norm": 3.806236743927002, "learning_rate": 4.5639464656713535e-07, "loss": 0.3844, "step": 11251 }, { "epoch": 0.5436536696139537, "grad_norm": 3.0072433948516846, "learning_rate": 4.5634633038604623e-07, "loss": 0.3706, "step": 11252 }, { "epoch": 0.5437019857950428, "grad_norm": 2.8391127586364746, "learning_rate": 4.562980142049572e-07, "loss": 0.4015, "step": 11253 }, { "epoch": 0.5437503019761318, "grad_norm": 3.5440475940704346, "learning_rate": 4.5624969802386816e-07, "loss": 0.3724, "step": 11254 }, { "epoch": 0.5437986181572209, "grad_norm": 2.976571559906006, "learning_rate": 4.562013818427791e-07, "loss": 0.4271, "step": 11255 }, { "epoch": 0.5438469343383099, "grad_norm": 3.9119722843170166, "learning_rate": 4.561530656616901e-07, "loss": 0.3079, "step": 11256 }, { "epoch": 0.5438952505193989, "grad_norm": 2.8450660705566406, "learning_rate": 4.56104749480601e-07, "loss": 0.29, "step": 11257 }, { "epoch": 0.543943566700488, "grad_norm": 2.9833710193634033, "learning_rate": 4.56056433299512e-07, "loss": 0.1851, "step": 11258 }, { "epoch": 0.5439918828815771, "grad_norm": 4.299864768981934, "learning_rate": 4.5600811711842295e-07, "loss": 0.3859, "step": 11259 }, { "epoch": 0.5440401990626661, "grad_norm": 2.5225796699523926, "learning_rate": 4.559598009373339e-07, "loss": 0.2855, "step": 11260 }, { "epoch": 0.5440885152437551, "grad_norm": 2.468048572540283, "learning_rate": 4.559114847562449e-07, "loss": 0.2442, "step": 11261 }, { "epoch": 0.5441368314248441, "grad_norm": 3.3717262744903564, "learning_rate": 4.5586316857515576e-07, "loss": 0.3612, "step": 11262 }, { "epoch": 0.5441851476059332, "grad_norm": 6.645641326904297, "learning_rate": 4.5581485239406675e-07, "loss": 0.2149, "step": 11263 }, { "epoch": 0.5442334637870223, "grad_norm": 2.611156463623047, "learning_rate": 4.5576653621297774e-07, "loss": 0.3878, "step": 11264 }, { "epoch": 0.5442817799681113, "grad_norm": 3.0202701091766357, "learning_rate": 4.5571822003188863e-07, "loss": 0.3277, "step": 11265 }, { "epoch": 0.5443300961492004, "grad_norm": 2.8757596015930176, "learning_rate": 4.556699038507996e-07, "loss": 0.2853, "step": 11266 }, { "epoch": 0.5443784123302894, "grad_norm": 2.482590436935425, "learning_rate": 4.5562158766971056e-07, "loss": 0.2685, "step": 11267 }, { "epoch": 0.5444267285113784, "grad_norm": 3.9599339962005615, "learning_rate": 4.555732714886215e-07, "loss": 0.3739, "step": 11268 }, { "epoch": 0.5444750446924675, "grad_norm": 3.155184507369995, "learning_rate": 4.555249553075325e-07, "loss": 0.4378, "step": 11269 }, { "epoch": 0.5445233608735566, "grad_norm": 2.4721837043762207, "learning_rate": 4.554766391264434e-07, "loss": 0.3408, "step": 11270 }, { "epoch": 0.5445716770546456, "grad_norm": 9.304443359375, "learning_rate": 4.5542832294535436e-07, "loss": 0.3578, "step": 11271 }, { "epoch": 0.5446199932357346, "grad_norm": 2.3441367149353027, "learning_rate": 4.5538000676426535e-07, "loss": 0.2847, "step": 11272 }, { "epoch": 0.5446683094168236, "grad_norm": 3.7508232593536377, "learning_rate": 4.553316905831763e-07, "loss": 0.2697, "step": 11273 }, { "epoch": 0.5447166255979128, "grad_norm": 2.0044188499450684, "learning_rate": 4.552833744020873e-07, "loss": 0.2244, "step": 11274 }, { "epoch": 0.5447649417790018, "grad_norm": 2.0965120792388916, "learning_rate": 4.5523505822099816e-07, "loss": 0.1642, "step": 11275 }, { "epoch": 0.5448132579600908, "grad_norm": 4.902538299560547, "learning_rate": 4.5518674203990915e-07, "loss": 0.2502, "step": 11276 }, { "epoch": 0.5448615741411799, "grad_norm": 3.117410182952881, "learning_rate": 4.5513842585882014e-07, "loss": 0.3688, "step": 11277 }, { "epoch": 0.5449098903222689, "grad_norm": 3.2190005779266357, "learning_rate": 4.55090109677731e-07, "loss": 0.5302, "step": 11278 }, { "epoch": 0.544958206503358, "grad_norm": 3.4515764713287354, "learning_rate": 4.55041793496642e-07, "loss": 0.3549, "step": 11279 }, { "epoch": 0.545006522684447, "grad_norm": 17.42900276184082, "learning_rate": 4.5499347731555295e-07, "loss": 0.469, "step": 11280 }, { "epoch": 0.5450548388655361, "grad_norm": 5.153381824493408, "learning_rate": 4.549451611344639e-07, "loss": 0.3575, "step": 11281 }, { "epoch": 0.5451031550466251, "grad_norm": 2.8616201877593994, "learning_rate": 4.548968449533749e-07, "loss": 0.2789, "step": 11282 }, { "epoch": 0.5451514712277141, "grad_norm": 2.75948429107666, "learning_rate": 4.548485287722858e-07, "loss": 0.417, "step": 11283 }, { "epoch": 0.5451997874088033, "grad_norm": 2.975435733795166, "learning_rate": 4.5480021259119675e-07, "loss": 0.3035, "step": 11284 }, { "epoch": 0.5452481035898923, "grad_norm": 1.936385989189148, "learning_rate": 4.5475189641010774e-07, "loss": 0.2215, "step": 11285 }, { "epoch": 0.5452964197709813, "grad_norm": 2.8962268829345703, "learning_rate": 4.547035802290187e-07, "loss": 0.338, "step": 11286 }, { "epoch": 0.5453447359520703, "grad_norm": 3.919283628463745, "learning_rate": 4.546552640479296e-07, "loss": 0.2366, "step": 11287 }, { "epoch": 0.5453930521331594, "grad_norm": 2.254549980163574, "learning_rate": 4.5460694786684056e-07, "loss": 0.2419, "step": 11288 }, { "epoch": 0.5454413683142484, "grad_norm": 2.2727911472320557, "learning_rate": 4.5455863168575155e-07, "loss": 0.2693, "step": 11289 }, { "epoch": 0.5454896844953375, "grad_norm": 2.60182523727417, "learning_rate": 4.5451031550466254e-07, "loss": 0.3728, "step": 11290 }, { "epoch": 0.5455380006764265, "grad_norm": 2.7049970626831055, "learning_rate": 4.544619993235734e-07, "loss": 0.2689, "step": 11291 }, { "epoch": 0.5455863168575156, "grad_norm": 3.282750129699707, "learning_rate": 4.544136831424844e-07, "loss": 0.36, "step": 11292 }, { "epoch": 0.5456346330386046, "grad_norm": 3.6933090686798096, "learning_rate": 4.5436536696139535e-07, "loss": 0.3398, "step": 11293 }, { "epoch": 0.5456829492196936, "grad_norm": 2.0977602005004883, "learning_rate": 4.543170507803063e-07, "loss": 0.2103, "step": 11294 }, { "epoch": 0.5457312654007828, "grad_norm": 3.4828858375549316, "learning_rate": 4.542687345992173e-07, "loss": 0.2253, "step": 11295 }, { "epoch": 0.5457795815818718, "grad_norm": 19.247970581054688, "learning_rate": 4.542204184181282e-07, "loss": 0.3595, "step": 11296 }, { "epoch": 0.5458278977629608, "grad_norm": 3.374195098876953, "learning_rate": 4.5417210223703915e-07, "loss": 0.3249, "step": 11297 }, { "epoch": 0.5458762139440498, "grad_norm": 6.525956630706787, "learning_rate": 4.5412378605595014e-07, "loss": 0.3347, "step": 11298 }, { "epoch": 0.5459245301251389, "grad_norm": 3.203169107437134, "learning_rate": 4.540754698748611e-07, "loss": 0.2363, "step": 11299 }, { "epoch": 0.545972846306228, "grad_norm": 2.904799222946167, "learning_rate": 4.54027153693772e-07, "loss": 0.3519, "step": 11300 }, { "epoch": 0.546021162487317, "grad_norm": 2.19389009475708, "learning_rate": 4.5397883751268295e-07, "loss": 0.2837, "step": 11301 }, { "epoch": 0.546069478668406, "grad_norm": 2.5507586002349854, "learning_rate": 4.5393052133159394e-07, "loss": 0.3238, "step": 11302 }, { "epoch": 0.5461177948494951, "grad_norm": 2.6741976737976074, "learning_rate": 4.538822051505049e-07, "loss": 0.2759, "step": 11303 }, { "epoch": 0.5461661110305841, "grad_norm": 2.1336185932159424, "learning_rate": 4.538338889694158e-07, "loss": 0.2516, "step": 11304 }, { "epoch": 0.5462144272116732, "grad_norm": 4.892667293548584, "learning_rate": 4.537855727883268e-07, "loss": 0.3811, "step": 11305 }, { "epoch": 0.5462627433927623, "grad_norm": 2.8183908462524414, "learning_rate": 4.537372566072377e-07, "loss": 0.2998, "step": 11306 }, { "epoch": 0.5463110595738513, "grad_norm": 2.0700747966766357, "learning_rate": 4.536889404261487e-07, "loss": 0.2326, "step": 11307 }, { "epoch": 0.5463593757549403, "grad_norm": 2.5490317344665527, "learning_rate": 4.5364062424505967e-07, "loss": 0.2554, "step": 11308 }, { "epoch": 0.5464076919360293, "grad_norm": 2.9961376190185547, "learning_rate": 4.535923080639706e-07, "loss": 0.2146, "step": 11309 }, { "epoch": 0.5464560081171185, "grad_norm": 2.881402015686035, "learning_rate": 4.5354399188288155e-07, "loss": 0.2901, "step": 11310 }, { "epoch": 0.5465043242982075, "grad_norm": 2.340832233428955, "learning_rate": 4.5349567570179254e-07, "loss": 0.2649, "step": 11311 }, { "epoch": 0.5465526404792965, "grad_norm": 2.3431544303894043, "learning_rate": 4.534473595207035e-07, "loss": 0.2638, "step": 11312 }, { "epoch": 0.5466009566603856, "grad_norm": 2.449310779571533, "learning_rate": 4.533990433396144e-07, "loss": 0.3121, "step": 11313 }, { "epoch": 0.5466492728414746, "grad_norm": 1.8011294603347778, "learning_rate": 4.5335072715852535e-07, "loss": 0.201, "step": 11314 }, { "epoch": 0.5466975890225636, "grad_norm": 3.42830753326416, "learning_rate": 4.5330241097743634e-07, "loss": 0.2436, "step": 11315 }, { "epoch": 0.5467459052036527, "grad_norm": 5.524621963500977, "learning_rate": 4.532540947963473e-07, "loss": 0.2202, "step": 11316 }, { "epoch": 0.5467942213847418, "grad_norm": 2.2044026851654053, "learning_rate": 4.532057786152582e-07, "loss": 0.2512, "step": 11317 }, { "epoch": 0.5468425375658308, "grad_norm": 5.850228786468506, "learning_rate": 4.531574624341692e-07, "loss": 0.3183, "step": 11318 }, { "epoch": 0.5468908537469198, "grad_norm": 3.0775997638702393, "learning_rate": 4.531091462530801e-07, "loss": 0.3508, "step": 11319 }, { "epoch": 0.5469391699280088, "grad_norm": 41.60462188720703, "learning_rate": 4.530608300719911e-07, "loss": 0.3418, "step": 11320 }, { "epoch": 0.546987486109098, "grad_norm": 1.7887928485870361, "learning_rate": 4.5301251389090207e-07, "loss": 0.1752, "step": 11321 }, { "epoch": 0.547035802290187, "grad_norm": 1.803727626800537, "learning_rate": 4.5296419770981295e-07, "loss": 0.1737, "step": 11322 }, { "epoch": 0.547084118471276, "grad_norm": 4.526071071624756, "learning_rate": 4.5291588152872394e-07, "loss": 0.4269, "step": 11323 }, { "epoch": 0.547132434652365, "grad_norm": 5.371371746063232, "learning_rate": 4.5286756534763493e-07, "loss": 0.2727, "step": 11324 }, { "epoch": 0.5471807508334541, "grad_norm": 2.2301881313323975, "learning_rate": 4.5281924916654587e-07, "loss": 0.2556, "step": 11325 }, { "epoch": 0.5472290670145432, "grad_norm": 2.6167714595794678, "learning_rate": 4.527709329854568e-07, "loss": 0.3873, "step": 11326 }, { "epoch": 0.5472773831956322, "grad_norm": 3.590766191482544, "learning_rate": 4.5272261680436775e-07, "loss": 0.2729, "step": 11327 }, { "epoch": 0.5473256993767213, "grad_norm": 2.416696548461914, "learning_rate": 4.5267430062327874e-07, "loss": 0.2533, "step": 11328 }, { "epoch": 0.5473740155578103, "grad_norm": 3.1176254749298096, "learning_rate": 4.5262598444218967e-07, "loss": 0.3385, "step": 11329 }, { "epoch": 0.5474223317388993, "grad_norm": 3.5684597492218018, "learning_rate": 4.525776682611006e-07, "loss": 0.2881, "step": 11330 }, { "epoch": 0.5474706479199885, "grad_norm": 2.4662985801696777, "learning_rate": 4.525293520800116e-07, "loss": 0.275, "step": 11331 }, { "epoch": 0.5475189641010775, "grad_norm": 2.8538477420806885, "learning_rate": 4.524810358989225e-07, "loss": 0.2857, "step": 11332 }, { "epoch": 0.5475672802821665, "grad_norm": 1.3862426280975342, "learning_rate": 4.524327197178335e-07, "loss": 0.1168, "step": 11333 }, { "epoch": 0.5476155964632555, "grad_norm": 2.6906192302703857, "learning_rate": 4.5238440353674447e-07, "loss": 0.3085, "step": 11334 }, { "epoch": 0.5476639126443446, "grad_norm": 2.0307514667510986, "learning_rate": 4.5233608735565535e-07, "loss": 0.2246, "step": 11335 }, { "epoch": 0.5477122288254337, "grad_norm": 3.0424275398254395, "learning_rate": 4.5228777117456634e-07, "loss": 0.2712, "step": 11336 }, { "epoch": 0.5477605450065227, "grad_norm": 2.975740909576416, "learning_rate": 4.5223945499347733e-07, "loss": 0.4059, "step": 11337 }, { "epoch": 0.5478088611876117, "grad_norm": 2.089352607727051, "learning_rate": 4.521911388123882e-07, "loss": 0.2428, "step": 11338 }, { "epoch": 0.5478571773687008, "grad_norm": 3.0688998699188232, "learning_rate": 4.521428226312992e-07, "loss": 0.4091, "step": 11339 }, { "epoch": 0.5479054935497898, "grad_norm": 3.191941499710083, "learning_rate": 4.5209450645021014e-07, "loss": 0.3199, "step": 11340 }, { "epoch": 0.5479538097308789, "grad_norm": 2.4928112030029297, "learning_rate": 4.5204619026912113e-07, "loss": 0.3439, "step": 11341 }, { "epoch": 0.548002125911968, "grad_norm": 1.9959834814071655, "learning_rate": 4.5199787408803207e-07, "loss": 0.2151, "step": 11342 }, { "epoch": 0.548050442093057, "grad_norm": 3.8270230293273926, "learning_rate": 4.51949557906943e-07, "loss": 0.2511, "step": 11343 }, { "epoch": 0.548098758274146, "grad_norm": 2.554598331451416, "learning_rate": 4.51901241725854e-07, "loss": 0.3156, "step": 11344 }, { "epoch": 0.548147074455235, "grad_norm": 2.983684778213501, "learning_rate": 4.518529255447649e-07, "loss": 0.4127, "step": 11345 }, { "epoch": 0.548195390636324, "grad_norm": 1.9100579023361206, "learning_rate": 4.5180460936367587e-07, "loss": 0.1907, "step": 11346 }, { "epoch": 0.5482437068174132, "grad_norm": 2.717301845550537, "learning_rate": 4.5175629318258686e-07, "loss": 0.2227, "step": 11347 }, { "epoch": 0.5482920229985022, "grad_norm": 1.8137716054916382, "learning_rate": 4.5170797700149775e-07, "loss": 0.209, "step": 11348 }, { "epoch": 0.5483403391795912, "grad_norm": 2.408282518386841, "learning_rate": 4.5165966082040874e-07, "loss": 0.2923, "step": 11349 }, { "epoch": 0.5483886553606803, "grad_norm": 3.671678304672241, "learning_rate": 4.5161134463931973e-07, "loss": 0.2975, "step": 11350 }, { "epoch": 0.5484369715417693, "grad_norm": 15.534906387329102, "learning_rate": 4.515630284582306e-07, "loss": 0.3111, "step": 11351 }, { "epoch": 0.5484852877228584, "grad_norm": 2.3888051509857178, "learning_rate": 4.515147122771416e-07, "loss": 0.2581, "step": 11352 }, { "epoch": 0.5485336039039475, "grad_norm": 2.0876119136810303, "learning_rate": 4.5146639609605254e-07, "loss": 0.187, "step": 11353 }, { "epoch": 0.5485819200850365, "grad_norm": 2.379514455795288, "learning_rate": 4.514180799149635e-07, "loss": 0.235, "step": 11354 }, { "epoch": 0.5486302362661255, "grad_norm": 2.518249750137329, "learning_rate": 4.5136976373387447e-07, "loss": 0.2212, "step": 11355 }, { "epoch": 0.5486785524472145, "grad_norm": 3.1642751693725586, "learning_rate": 4.513214475527854e-07, "loss": 0.3504, "step": 11356 }, { "epoch": 0.5487268686283037, "grad_norm": 3.4573569297790527, "learning_rate": 4.512731313716964e-07, "loss": 0.3756, "step": 11357 }, { "epoch": 0.5487751848093927, "grad_norm": 3.331176280975342, "learning_rate": 4.512248151906073e-07, "loss": 0.4054, "step": 11358 }, { "epoch": 0.5488235009904817, "grad_norm": 2.8287646770477295, "learning_rate": 4.5117649900951827e-07, "loss": 0.2669, "step": 11359 }, { "epoch": 0.5488718171715707, "grad_norm": 1.9630935192108154, "learning_rate": 4.5112818282842926e-07, "loss": 0.2098, "step": 11360 }, { "epoch": 0.5489201333526598, "grad_norm": 3.9640777111053467, "learning_rate": 4.5107986664734014e-07, "loss": 0.3065, "step": 11361 }, { "epoch": 0.5489684495337489, "grad_norm": 3.2504072189331055, "learning_rate": 4.5103155046625113e-07, "loss": 0.2853, "step": 11362 }, { "epoch": 0.5490167657148379, "grad_norm": 2.111886739730835, "learning_rate": 4.509832342851621e-07, "loss": 0.26, "step": 11363 }, { "epoch": 0.549065081895927, "grad_norm": 4.936985015869141, "learning_rate": 4.50934918104073e-07, "loss": 0.2764, "step": 11364 }, { "epoch": 0.549113398077016, "grad_norm": 1.8828446865081787, "learning_rate": 4.50886601922984e-07, "loss": 0.2058, "step": 11365 }, { "epoch": 0.549161714258105, "grad_norm": 3.368924379348755, "learning_rate": 4.5083828574189493e-07, "loss": 0.2513, "step": 11366 }, { "epoch": 0.5492100304391941, "grad_norm": 5.314666271209717, "learning_rate": 4.5078996956080587e-07, "loss": 0.3661, "step": 11367 }, { "epoch": 0.5492583466202832, "grad_norm": 19.03843116760254, "learning_rate": 4.5074165337971686e-07, "loss": 0.3044, "step": 11368 }, { "epoch": 0.5493066628013722, "grad_norm": 2.4794435501098633, "learning_rate": 4.506933371986278e-07, "loss": 0.3343, "step": 11369 }, { "epoch": 0.5493549789824612, "grad_norm": 2.9574711322784424, "learning_rate": 4.5064502101753874e-07, "loss": 0.3478, "step": 11370 }, { "epoch": 0.5494032951635502, "grad_norm": 2.1984269618988037, "learning_rate": 4.505967048364497e-07, "loss": 0.2471, "step": 11371 }, { "epoch": 0.5494516113446393, "grad_norm": 2.8291056156158447, "learning_rate": 4.5054838865536066e-07, "loss": 0.3934, "step": 11372 }, { "epoch": 0.5494999275257284, "grad_norm": 2.8267300128936768, "learning_rate": 4.5050007247427165e-07, "loss": 0.4169, "step": 11373 }, { "epoch": 0.5495482437068174, "grad_norm": 4.216265678405762, "learning_rate": 4.5045175629318254e-07, "loss": 0.2087, "step": 11374 }, { "epoch": 0.5495965598879065, "grad_norm": 2.4597880840301514, "learning_rate": 4.5040344011209353e-07, "loss": 0.2456, "step": 11375 }, { "epoch": 0.5496448760689955, "grad_norm": 2.801941156387329, "learning_rate": 4.503551239310045e-07, "loss": 0.4382, "step": 11376 }, { "epoch": 0.5496931922500845, "grad_norm": 1.785897970199585, "learning_rate": 4.503068077499154e-07, "loss": 0.1782, "step": 11377 }, { "epoch": 0.5497415084311736, "grad_norm": 13.82296085357666, "learning_rate": 4.502584915688264e-07, "loss": 0.2275, "step": 11378 }, { "epoch": 0.5497898246122627, "grad_norm": 2.8005034923553467, "learning_rate": 4.5021017538773733e-07, "loss": 0.3486, "step": 11379 }, { "epoch": 0.5498381407933517, "grad_norm": 2.6788253784179688, "learning_rate": 4.5016185920664827e-07, "loss": 0.2831, "step": 11380 }, { "epoch": 0.5498864569744407, "grad_norm": 1.786482572555542, "learning_rate": 4.5011354302555926e-07, "loss": 0.15, "step": 11381 }, { "epoch": 0.5499347731555297, "grad_norm": 2.4325098991394043, "learning_rate": 4.500652268444702e-07, "loss": 0.3259, "step": 11382 }, { "epoch": 0.5499830893366189, "grad_norm": 3.861494541168213, "learning_rate": 4.5001691066338113e-07, "loss": 0.2874, "step": 11383 }, { "epoch": 0.5500314055177079, "grad_norm": 2.6396188735961914, "learning_rate": 4.4996859448229207e-07, "loss": 0.2515, "step": 11384 }, { "epoch": 0.5500797216987969, "grad_norm": 2.4449779987335205, "learning_rate": 4.4992027830120306e-07, "loss": 0.3091, "step": 11385 }, { "epoch": 0.550128037879886, "grad_norm": 2.936875581741333, "learning_rate": 4.49871962120114e-07, "loss": 0.3034, "step": 11386 }, { "epoch": 0.550176354060975, "grad_norm": 1.9977092742919922, "learning_rate": 4.4982364593902494e-07, "loss": 0.2099, "step": 11387 }, { "epoch": 0.5502246702420641, "grad_norm": 1.9060181379318237, "learning_rate": 4.497753297579359e-07, "loss": 0.2267, "step": 11388 }, { "epoch": 0.5502729864231531, "grad_norm": 2.8513705730438232, "learning_rate": 4.497270135768469e-07, "loss": 0.2945, "step": 11389 }, { "epoch": 0.5503213026042422, "grad_norm": 2.2202627658843994, "learning_rate": 4.496786973957578e-07, "loss": 0.2613, "step": 11390 }, { "epoch": 0.5503696187853312, "grad_norm": 2.5794765949249268, "learning_rate": 4.496303812146688e-07, "loss": 0.3192, "step": 11391 }, { "epoch": 0.5504179349664202, "grad_norm": 4.4634528160095215, "learning_rate": 4.4958206503357973e-07, "loss": 0.2676, "step": 11392 }, { "epoch": 0.5504662511475094, "grad_norm": 2.364454746246338, "learning_rate": 4.4953374885249066e-07, "loss": 0.2787, "step": 11393 }, { "epoch": 0.5505145673285984, "grad_norm": 2.5811378955841064, "learning_rate": 4.4948543267140165e-07, "loss": 0.2714, "step": 11394 }, { "epoch": 0.5505628835096874, "grad_norm": 3.6861987113952637, "learning_rate": 4.494371164903126e-07, "loss": 0.3837, "step": 11395 }, { "epoch": 0.5506111996907764, "grad_norm": 2.289139986038208, "learning_rate": 4.4938880030922353e-07, "loss": 0.2508, "step": 11396 }, { "epoch": 0.5506595158718655, "grad_norm": 2.170121669769287, "learning_rate": 4.4934048412813447e-07, "loss": 0.2656, "step": 11397 }, { "epoch": 0.5507078320529545, "grad_norm": 2.8938803672790527, "learning_rate": 4.4929216794704546e-07, "loss": 0.3155, "step": 11398 }, { "epoch": 0.5507561482340436, "grad_norm": 3.213045835494995, "learning_rate": 4.492438517659564e-07, "loss": 0.272, "step": 11399 }, { "epoch": 0.5508044644151326, "grad_norm": 16.960458755493164, "learning_rate": 4.4919553558486733e-07, "loss": 0.4521, "step": 11400 }, { "epoch": 0.5508527805962217, "grad_norm": 1.7591290473937988, "learning_rate": 4.491472194037783e-07, "loss": 0.1875, "step": 11401 }, { "epoch": 0.5509010967773107, "grad_norm": 2.7410507202148438, "learning_rate": 4.4909890322268926e-07, "loss": 0.3226, "step": 11402 }, { "epoch": 0.5509494129583997, "grad_norm": 5.200016021728516, "learning_rate": 4.490505870416002e-07, "loss": 0.2343, "step": 11403 }, { "epoch": 0.5509977291394889, "grad_norm": 3.156167507171631, "learning_rate": 4.490022708605112e-07, "loss": 0.3987, "step": 11404 }, { "epoch": 0.5510460453205779, "grad_norm": 4.952980995178223, "learning_rate": 4.489539546794221e-07, "loss": 0.3272, "step": 11405 }, { "epoch": 0.5510943615016669, "grad_norm": 3.240429162979126, "learning_rate": 4.4890563849833306e-07, "loss": 0.3927, "step": 11406 }, { "epoch": 0.5511426776827559, "grad_norm": 2.3053324222564697, "learning_rate": 4.4885732231724405e-07, "loss": 0.3362, "step": 11407 }, { "epoch": 0.551190993863845, "grad_norm": 2.858093738555908, "learning_rate": 4.48809006136155e-07, "loss": 0.3062, "step": 11408 }, { "epoch": 0.5512393100449341, "grad_norm": 2.975149393081665, "learning_rate": 4.487606899550659e-07, "loss": 0.2765, "step": 11409 }, { "epoch": 0.5512876262260231, "grad_norm": 1.9535119533538818, "learning_rate": 4.4871237377397686e-07, "loss": 0.2311, "step": 11410 }, { "epoch": 0.5513359424071121, "grad_norm": 2.2670135498046875, "learning_rate": 4.4866405759288785e-07, "loss": 0.235, "step": 11411 }, { "epoch": 0.5513842585882012, "grad_norm": 3.448532819747925, "learning_rate": 4.486157414117988e-07, "loss": 0.4463, "step": 11412 }, { "epoch": 0.5514325747692902, "grad_norm": 2.6327877044677734, "learning_rate": 4.4856742523070973e-07, "loss": 0.2663, "step": 11413 }, { "epoch": 0.5514808909503793, "grad_norm": 2.440131664276123, "learning_rate": 4.485191090496207e-07, "loss": 0.2518, "step": 11414 }, { "epoch": 0.5515292071314684, "grad_norm": 2.6763150691986084, "learning_rate": 4.4847079286853166e-07, "loss": 0.2416, "step": 11415 }, { "epoch": 0.5515775233125574, "grad_norm": 2.207516670227051, "learning_rate": 4.484224766874426e-07, "loss": 0.2427, "step": 11416 }, { "epoch": 0.5516258394936464, "grad_norm": 41.03973388671875, "learning_rate": 4.483741605063536e-07, "loss": 0.2164, "step": 11417 }, { "epoch": 0.5516741556747354, "grad_norm": 2.7034900188446045, "learning_rate": 4.4832584432526447e-07, "loss": 0.2998, "step": 11418 }, { "epoch": 0.5517224718558246, "grad_norm": 24.435178756713867, "learning_rate": 4.4827752814417546e-07, "loss": 0.2667, "step": 11419 }, { "epoch": 0.5517707880369136, "grad_norm": 3.1566460132598877, "learning_rate": 4.4822921196308645e-07, "loss": 0.3543, "step": 11420 }, { "epoch": 0.5518191042180026, "grad_norm": 2.666759967803955, "learning_rate": 4.481808957819974e-07, "loss": 0.3212, "step": 11421 }, { "epoch": 0.5518674203990916, "grad_norm": 2.3082802295684814, "learning_rate": 4.481325796009083e-07, "loss": 0.2957, "step": 11422 }, { "epoch": 0.5519157365801807, "grad_norm": 2.501648187637329, "learning_rate": 4.4808426341981926e-07, "loss": 0.3053, "step": 11423 }, { "epoch": 0.5519640527612697, "grad_norm": 2.4270875453948975, "learning_rate": 4.4803594723873025e-07, "loss": 0.2425, "step": 11424 }, { "epoch": 0.5520123689423588, "grad_norm": 1.9701519012451172, "learning_rate": 4.479876310576412e-07, "loss": 0.1919, "step": 11425 }, { "epoch": 0.5520606851234479, "grad_norm": 3.489234209060669, "learning_rate": 4.479393148765521e-07, "loss": 0.3204, "step": 11426 }, { "epoch": 0.5521090013045369, "grad_norm": 2.356496810913086, "learning_rate": 4.478909986954631e-07, "loss": 0.248, "step": 11427 }, { "epoch": 0.5521573174856259, "grad_norm": 2.4616010189056396, "learning_rate": 4.4784268251437405e-07, "loss": 0.3096, "step": 11428 }, { "epoch": 0.5522056336667149, "grad_norm": 5.633838653564453, "learning_rate": 4.47794366333285e-07, "loss": 0.339, "step": 11429 }, { "epoch": 0.5522539498478041, "grad_norm": 3.128075122833252, "learning_rate": 4.47746050152196e-07, "loss": 0.4062, "step": 11430 }, { "epoch": 0.5523022660288931, "grad_norm": 7.0322771072387695, "learning_rate": 4.4769773397110686e-07, "loss": 0.3067, "step": 11431 }, { "epoch": 0.5523505822099821, "grad_norm": 2.168421506881714, "learning_rate": 4.4764941779001785e-07, "loss": 0.3194, "step": 11432 }, { "epoch": 0.5523988983910711, "grad_norm": 2.3929545879364014, "learning_rate": 4.4760110160892884e-07, "loss": 0.2726, "step": 11433 }, { "epoch": 0.5524472145721602, "grad_norm": 6.916486740112305, "learning_rate": 4.4755278542783973e-07, "loss": 0.448, "step": 11434 }, { "epoch": 0.5524955307532493, "grad_norm": 2.6031253337860107, "learning_rate": 4.475044692467507e-07, "loss": 0.2764, "step": 11435 }, { "epoch": 0.5525438469343383, "grad_norm": 1.821510672569275, "learning_rate": 4.4745615306566166e-07, "loss": 0.1875, "step": 11436 }, { "epoch": 0.5525921631154274, "grad_norm": 2.411626100540161, "learning_rate": 4.4740783688457265e-07, "loss": 0.2621, "step": 11437 }, { "epoch": 0.5526404792965164, "grad_norm": 3.7629480361938477, "learning_rate": 4.473595207034836e-07, "loss": 0.4643, "step": 11438 }, { "epoch": 0.5526887954776054, "grad_norm": 2.6454241275787354, "learning_rate": 4.473112045223945e-07, "loss": 0.2835, "step": 11439 }, { "epoch": 0.5527371116586945, "grad_norm": 3.798830032348633, "learning_rate": 4.472628883413055e-07, "loss": 0.1819, "step": 11440 }, { "epoch": 0.5527854278397836, "grad_norm": 4.674692153930664, "learning_rate": 4.4721457216021645e-07, "loss": 0.2763, "step": 11441 }, { "epoch": 0.5528337440208726, "grad_norm": 2.4600954055786133, "learning_rate": 4.471662559791274e-07, "loss": 0.2992, "step": 11442 }, { "epoch": 0.5528820602019616, "grad_norm": 6.174219131469727, "learning_rate": 4.471179397980384e-07, "loss": 0.2649, "step": 11443 }, { "epoch": 0.5529303763830506, "grad_norm": 2.667372226715088, "learning_rate": 4.4706962361694926e-07, "loss": 0.3048, "step": 11444 }, { "epoch": 0.5529786925641398, "grad_norm": 2.9615676403045654, "learning_rate": 4.4702130743586025e-07, "loss": 0.3541, "step": 11445 }, { "epoch": 0.5530270087452288, "grad_norm": 4.767740726470947, "learning_rate": 4.4697299125477124e-07, "loss": 0.2504, "step": 11446 }, { "epoch": 0.5530753249263178, "grad_norm": 2.4693455696105957, "learning_rate": 4.469246750736821e-07, "loss": 0.2591, "step": 11447 }, { "epoch": 0.5531236411074069, "grad_norm": 3.4739043712615967, "learning_rate": 4.468763588925931e-07, "loss": 0.3462, "step": 11448 }, { "epoch": 0.5531719572884959, "grad_norm": 4.855652332305908, "learning_rate": 4.4682804271150405e-07, "loss": 0.3512, "step": 11449 }, { "epoch": 0.5532202734695849, "grad_norm": 3.879856586456299, "learning_rate": 4.46779726530415e-07, "loss": 0.2312, "step": 11450 }, { "epoch": 0.553268589650674, "grad_norm": 4.084598064422607, "learning_rate": 4.46731410349326e-07, "loss": 0.3167, "step": 11451 }, { "epoch": 0.5533169058317631, "grad_norm": 4.507862091064453, "learning_rate": 4.466830941682369e-07, "loss": 0.2938, "step": 11452 }, { "epoch": 0.5533652220128521, "grad_norm": 3.221575975418091, "learning_rate": 4.466347779871479e-07, "loss": 0.2878, "step": 11453 }, { "epoch": 0.5534135381939411, "grad_norm": 2.740680694580078, "learning_rate": 4.4658646180605884e-07, "loss": 0.3088, "step": 11454 }, { "epoch": 0.5534618543750301, "grad_norm": 2.445908546447754, "learning_rate": 4.465381456249698e-07, "loss": 0.2812, "step": 11455 }, { "epoch": 0.5535101705561193, "grad_norm": 3.644028425216675, "learning_rate": 4.4648982944388077e-07, "loss": 0.2572, "step": 11456 }, { "epoch": 0.5535584867372083, "grad_norm": 2.9780654907226562, "learning_rate": 4.4644151326279166e-07, "loss": 0.2334, "step": 11457 }, { "epoch": 0.5536068029182973, "grad_norm": 3.2109408378601074, "learning_rate": 4.4639319708170265e-07, "loss": 0.3955, "step": 11458 }, { "epoch": 0.5536551190993864, "grad_norm": 2.415832281112671, "learning_rate": 4.4634488090061364e-07, "loss": 0.3232, "step": 11459 }, { "epoch": 0.5537034352804754, "grad_norm": 4.125380992889404, "learning_rate": 4.462965647195245e-07, "loss": 0.2667, "step": 11460 }, { "epoch": 0.5537517514615645, "grad_norm": 1.900444746017456, "learning_rate": 4.462482485384355e-07, "loss": 0.2276, "step": 11461 }, { "epoch": 0.5538000676426535, "grad_norm": 2.4114153385162354, "learning_rate": 4.4619993235734645e-07, "loss": 0.2489, "step": 11462 }, { "epoch": 0.5538483838237426, "grad_norm": 1.892135739326477, "learning_rate": 4.461516161762574e-07, "loss": 0.225, "step": 11463 }, { "epoch": 0.5538967000048316, "grad_norm": 2.808166742324829, "learning_rate": 4.461032999951684e-07, "loss": 0.3216, "step": 11464 }, { "epoch": 0.5539450161859206, "grad_norm": 2.6141507625579834, "learning_rate": 4.460549838140793e-07, "loss": 0.3624, "step": 11465 }, { "epoch": 0.5539933323670098, "grad_norm": 1.9436448812484741, "learning_rate": 4.4600666763299025e-07, "loss": 0.2313, "step": 11466 }, { "epoch": 0.5540416485480988, "grad_norm": 2.426527261734009, "learning_rate": 4.4595835145190124e-07, "loss": 0.2005, "step": 11467 }, { "epoch": 0.5540899647291878, "grad_norm": 2.280796527862549, "learning_rate": 4.459100352708122e-07, "loss": 0.1735, "step": 11468 }, { "epoch": 0.5541382809102768, "grad_norm": 26.384687423706055, "learning_rate": 4.4586171908972317e-07, "loss": 0.3281, "step": 11469 }, { "epoch": 0.5541865970913659, "grad_norm": 2.4841997623443604, "learning_rate": 4.4581340290863405e-07, "loss": 0.3304, "step": 11470 }, { "epoch": 0.554234913272455, "grad_norm": 2.347658634185791, "learning_rate": 4.4576508672754504e-07, "loss": 0.2425, "step": 11471 }, { "epoch": 0.554283229453544, "grad_norm": 3.2983832359313965, "learning_rate": 4.4571677054645603e-07, "loss": 0.3193, "step": 11472 }, { "epoch": 0.554331545634633, "grad_norm": 2.2848174571990967, "learning_rate": 4.456684543653669e-07, "loss": 0.3124, "step": 11473 }, { "epoch": 0.5543798618157221, "grad_norm": 2.327369451522827, "learning_rate": 4.456201381842779e-07, "loss": 0.2604, "step": 11474 }, { "epoch": 0.5544281779968111, "grad_norm": 3.7612781524658203, "learning_rate": 4.4557182200318884e-07, "loss": 0.4008, "step": 11475 }, { "epoch": 0.5544764941779001, "grad_norm": 2.0761911869049072, "learning_rate": 4.455235058220998e-07, "loss": 0.176, "step": 11476 }, { "epoch": 0.5545248103589893, "grad_norm": 2.7338778972625732, "learning_rate": 4.4547518964101077e-07, "loss": 0.3499, "step": 11477 }, { "epoch": 0.5545731265400783, "grad_norm": 3.038470506668091, "learning_rate": 4.454268734599217e-07, "loss": 0.2711, "step": 11478 }, { "epoch": 0.5546214427211673, "grad_norm": 3.291261672973633, "learning_rate": 4.4537855727883265e-07, "loss": 0.431, "step": 11479 }, { "epoch": 0.5546697589022563, "grad_norm": 2.0446367263793945, "learning_rate": 4.4533024109774364e-07, "loss": 0.2141, "step": 11480 }, { "epoch": 0.5547180750833454, "grad_norm": 3.535055637359619, "learning_rate": 4.452819249166546e-07, "loss": 0.3007, "step": 11481 }, { "epoch": 0.5547663912644345, "grad_norm": 14.971390724182129, "learning_rate": 4.452336087355655e-07, "loss": 0.2385, "step": 11482 }, { "epoch": 0.5548147074455235, "grad_norm": 3.606745481491089, "learning_rate": 4.4518529255447645e-07, "loss": 0.2342, "step": 11483 }, { "epoch": 0.5548630236266126, "grad_norm": 9.324291229248047, "learning_rate": 4.4513697637338744e-07, "loss": 0.264, "step": 11484 }, { "epoch": 0.5549113398077016, "grad_norm": 2.873427152633667, "learning_rate": 4.4508866019229843e-07, "loss": 0.2921, "step": 11485 }, { "epoch": 0.5549596559887906, "grad_norm": 2.4125001430511475, "learning_rate": 4.450403440112093e-07, "loss": 0.2457, "step": 11486 }, { "epoch": 0.5550079721698797, "grad_norm": 3.7985870838165283, "learning_rate": 4.449920278301203e-07, "loss": 0.4118, "step": 11487 }, { "epoch": 0.5550562883509688, "grad_norm": 4.804101943969727, "learning_rate": 4.4494371164903124e-07, "loss": 0.3285, "step": 11488 }, { "epoch": 0.5551046045320578, "grad_norm": 2.898587703704834, "learning_rate": 4.448953954679422e-07, "loss": 0.3197, "step": 11489 }, { "epoch": 0.5551529207131468, "grad_norm": 2.414623260498047, "learning_rate": 4.4484707928685317e-07, "loss": 0.2555, "step": 11490 }, { "epoch": 0.5552012368942358, "grad_norm": 2.5433459281921387, "learning_rate": 4.447987631057641e-07, "loss": 0.2255, "step": 11491 }, { "epoch": 0.555249553075325, "grad_norm": 4.260138511657715, "learning_rate": 4.4475044692467504e-07, "loss": 0.267, "step": 11492 }, { "epoch": 0.555297869256414, "grad_norm": 3.238476514816284, "learning_rate": 4.44702130743586e-07, "loss": 0.2929, "step": 11493 }, { "epoch": 0.555346185437503, "grad_norm": 5.832372188568115, "learning_rate": 4.4465381456249697e-07, "loss": 0.3335, "step": 11494 }, { "epoch": 0.555394501618592, "grad_norm": 4.899961471557617, "learning_rate": 4.446054983814079e-07, "loss": 0.3074, "step": 11495 }, { "epoch": 0.5554428177996811, "grad_norm": 1.8560162782669067, "learning_rate": 4.4455718220031885e-07, "loss": 0.2383, "step": 11496 }, { "epoch": 0.5554911339807702, "grad_norm": 2.1846556663513184, "learning_rate": 4.4450886601922984e-07, "loss": 0.156, "step": 11497 }, { "epoch": 0.5555394501618592, "grad_norm": 67.01943969726562, "learning_rate": 4.4446054983814077e-07, "loss": 0.266, "step": 11498 }, { "epoch": 0.5555877663429483, "grad_norm": 2.6877877712249756, "learning_rate": 4.444122336570517e-07, "loss": 0.3777, "step": 11499 }, { "epoch": 0.5556360825240373, "grad_norm": 1.8089327812194824, "learning_rate": 4.443639174759627e-07, "loss": 0.1717, "step": 11500 }, { "epoch": 0.5556843987051263, "grad_norm": 3.127249002456665, "learning_rate": 4.443156012948736e-07, "loss": 0.2687, "step": 11501 }, { "epoch": 0.5557327148862153, "grad_norm": 2.427889823913574, "learning_rate": 4.442672851137846e-07, "loss": 0.2488, "step": 11502 }, { "epoch": 0.5557810310673045, "grad_norm": 4.750091552734375, "learning_rate": 4.4421896893269556e-07, "loss": 0.3117, "step": 11503 }, { "epoch": 0.5558293472483935, "grad_norm": 3.283607244491577, "learning_rate": 4.441706527516065e-07, "loss": 0.3466, "step": 11504 }, { "epoch": 0.5558776634294825, "grad_norm": 3.3675389289855957, "learning_rate": 4.4412233657051744e-07, "loss": 0.3486, "step": 11505 }, { "epoch": 0.5559259796105716, "grad_norm": 2.6070451736450195, "learning_rate": 4.440740203894284e-07, "loss": 0.322, "step": 11506 }, { "epoch": 0.5559742957916606, "grad_norm": 2.47163987159729, "learning_rate": 4.4402570420833937e-07, "loss": 0.2751, "step": 11507 }, { "epoch": 0.5560226119727497, "grad_norm": 3.2457761764526367, "learning_rate": 4.439773880272503e-07, "loss": 0.4665, "step": 11508 }, { "epoch": 0.5560709281538387, "grad_norm": 3.23415470123291, "learning_rate": 4.4392907184616124e-07, "loss": 0.2752, "step": 11509 }, { "epoch": 0.5561192443349278, "grad_norm": 2.4791207313537598, "learning_rate": 4.4388075566507223e-07, "loss": 0.269, "step": 11510 }, { "epoch": 0.5561675605160168, "grad_norm": 3.816340208053589, "learning_rate": 4.4383243948398317e-07, "loss": 0.4239, "step": 11511 }, { "epoch": 0.5562158766971058, "grad_norm": 6.7120041847229, "learning_rate": 4.437841233028941e-07, "loss": 0.3884, "step": 11512 }, { "epoch": 0.556264192878195, "grad_norm": 2.0154433250427246, "learning_rate": 4.437358071218051e-07, "loss": 0.1939, "step": 11513 }, { "epoch": 0.556312509059284, "grad_norm": 2.4183847904205322, "learning_rate": 4.43687490940716e-07, "loss": 0.2173, "step": 11514 }, { "epoch": 0.556360825240373, "grad_norm": 4.0501627922058105, "learning_rate": 4.4363917475962697e-07, "loss": 0.2517, "step": 11515 }, { "epoch": 0.556409141421462, "grad_norm": 2.6033997535705566, "learning_rate": 4.4359085857853796e-07, "loss": 0.3625, "step": 11516 }, { "epoch": 0.556457457602551, "grad_norm": 4.769340515136719, "learning_rate": 4.4354254239744885e-07, "loss": 0.2879, "step": 11517 }, { "epoch": 0.5565057737836402, "grad_norm": 1.6984617710113525, "learning_rate": 4.4349422621635984e-07, "loss": 0.1775, "step": 11518 }, { "epoch": 0.5565540899647292, "grad_norm": 2.286325693130493, "learning_rate": 4.4344591003527077e-07, "loss": 0.1842, "step": 11519 }, { "epoch": 0.5566024061458182, "grad_norm": 2.453284740447998, "learning_rate": 4.4339759385418176e-07, "loss": 0.2237, "step": 11520 }, { "epoch": 0.5566507223269073, "grad_norm": 2.7629454135894775, "learning_rate": 4.433492776730927e-07, "loss": 0.3259, "step": 11521 }, { "epoch": 0.5566990385079963, "grad_norm": 26.787935256958008, "learning_rate": 4.4330096149200364e-07, "loss": 0.3648, "step": 11522 }, { "epoch": 0.5567473546890854, "grad_norm": 1.8748406171798706, "learning_rate": 4.4325264531091463e-07, "loss": 0.265, "step": 11523 }, { "epoch": 0.5567956708701745, "grad_norm": 2.3041036128997803, "learning_rate": 4.4320432912982557e-07, "loss": 0.2503, "step": 11524 }, { "epoch": 0.5568439870512635, "grad_norm": 2.238701105117798, "learning_rate": 4.431560129487365e-07, "loss": 0.2413, "step": 11525 }, { "epoch": 0.5568923032323525, "grad_norm": 8.66459846496582, "learning_rate": 4.431076967676475e-07, "loss": 0.2367, "step": 11526 }, { "epoch": 0.5569406194134415, "grad_norm": 2.6183247566223145, "learning_rate": 4.430593805865584e-07, "loss": 0.3247, "step": 11527 }, { "epoch": 0.5569889355945306, "grad_norm": 3.02634859085083, "learning_rate": 4.4301106440546937e-07, "loss": 0.3269, "step": 11528 }, { "epoch": 0.5570372517756197, "grad_norm": 1.3962496519088745, "learning_rate": 4.4296274822438036e-07, "loss": 0.14, "step": 11529 }, { "epoch": 0.5570855679567087, "grad_norm": 2.7796201705932617, "learning_rate": 4.4291443204329124e-07, "loss": 0.2247, "step": 11530 }, { "epoch": 0.5571338841377977, "grad_norm": 3.695547580718994, "learning_rate": 4.4286611586220223e-07, "loss": 0.3207, "step": 11531 }, { "epoch": 0.5571822003188868, "grad_norm": 2.974320650100708, "learning_rate": 4.4281779968111317e-07, "loss": 0.2563, "step": 11532 }, { "epoch": 0.5572305164999758, "grad_norm": 6.75432014465332, "learning_rate": 4.427694835000241e-07, "loss": 0.2995, "step": 11533 }, { "epoch": 0.5572788326810649, "grad_norm": 1.8500374555587769, "learning_rate": 4.427211673189351e-07, "loss": 0.143, "step": 11534 }, { "epoch": 0.557327148862154, "grad_norm": 3.6061532497406006, "learning_rate": 4.4267285113784603e-07, "loss": 0.343, "step": 11535 }, { "epoch": 0.557375465043243, "grad_norm": 2.2918598651885986, "learning_rate": 4.42624534956757e-07, "loss": 0.2192, "step": 11536 }, { "epoch": 0.557423781224332, "grad_norm": 2.3472321033477783, "learning_rate": 4.4257621877566796e-07, "loss": 0.2168, "step": 11537 }, { "epoch": 0.557472097405421, "grad_norm": 2.3890559673309326, "learning_rate": 4.425279025945789e-07, "loss": 0.251, "step": 11538 }, { "epoch": 0.5575204135865102, "grad_norm": 2.201101779937744, "learning_rate": 4.424795864134899e-07, "loss": 0.2382, "step": 11539 }, { "epoch": 0.5575687297675992, "grad_norm": 2.684758424758911, "learning_rate": 4.424312702324008e-07, "loss": 0.3307, "step": 11540 }, { "epoch": 0.5576170459486882, "grad_norm": 2.7697079181671143, "learning_rate": 4.4238295405131176e-07, "loss": 0.3841, "step": 11541 }, { "epoch": 0.5576653621297772, "grad_norm": 1.75154709815979, "learning_rate": 4.4233463787022275e-07, "loss": 0.1412, "step": 11542 }, { "epoch": 0.5577136783108663, "grad_norm": 2.0269949436187744, "learning_rate": 4.4228632168913364e-07, "loss": 0.2031, "step": 11543 }, { "epoch": 0.5577619944919554, "grad_norm": 2.1120831966400146, "learning_rate": 4.4223800550804463e-07, "loss": 0.2381, "step": 11544 }, { "epoch": 0.5578103106730444, "grad_norm": 2.8662869930267334, "learning_rate": 4.4218968932695557e-07, "loss": 0.3005, "step": 11545 }, { "epoch": 0.5578586268541335, "grad_norm": 1.6934261322021484, "learning_rate": 4.421413731458665e-07, "loss": 0.1831, "step": 11546 }, { "epoch": 0.5579069430352225, "grad_norm": 3.01041579246521, "learning_rate": 4.420930569647775e-07, "loss": 0.2648, "step": 11547 }, { "epoch": 0.5579552592163115, "grad_norm": 4.319825649261475, "learning_rate": 4.4204474078368843e-07, "loss": 0.3415, "step": 11548 }, { "epoch": 0.5580035753974006, "grad_norm": 1.9930726289749146, "learning_rate": 4.4199642460259937e-07, "loss": 0.2466, "step": 11549 }, { "epoch": 0.5580518915784897, "grad_norm": 3.84977126121521, "learning_rate": 4.4194810842151036e-07, "loss": 0.3619, "step": 11550 }, { "epoch": 0.5581002077595787, "grad_norm": 3.767148733139038, "learning_rate": 4.418997922404213e-07, "loss": 0.3669, "step": 11551 }, { "epoch": 0.5581485239406677, "grad_norm": 3.9158172607421875, "learning_rate": 4.418514760593323e-07, "loss": 0.2365, "step": 11552 }, { "epoch": 0.5581968401217567, "grad_norm": 2.410715103149414, "learning_rate": 4.4180315987824317e-07, "loss": 0.2623, "step": 11553 }, { "epoch": 0.5582451563028458, "grad_norm": 3.433283805847168, "learning_rate": 4.4175484369715416e-07, "loss": 0.3846, "step": 11554 }, { "epoch": 0.5582934724839349, "grad_norm": 2.6561481952667236, "learning_rate": 4.4170652751606515e-07, "loss": 0.2244, "step": 11555 }, { "epoch": 0.5583417886650239, "grad_norm": 2.5580356121063232, "learning_rate": 4.4165821133497603e-07, "loss": 0.2766, "step": 11556 }, { "epoch": 0.558390104846113, "grad_norm": 1.5320910215377808, "learning_rate": 4.41609895153887e-07, "loss": 0.1604, "step": 11557 }, { "epoch": 0.558438421027202, "grad_norm": 1.847824215888977, "learning_rate": 4.4156157897279796e-07, "loss": 0.1585, "step": 11558 }, { "epoch": 0.558486737208291, "grad_norm": 1.967509150505066, "learning_rate": 4.415132627917089e-07, "loss": 0.2203, "step": 11559 }, { "epoch": 0.5585350533893801, "grad_norm": 2.175449848175049, "learning_rate": 4.414649466106199e-07, "loss": 0.2398, "step": 11560 }, { "epoch": 0.5585833695704692, "grad_norm": 2.9050228595733643, "learning_rate": 4.4141663042953083e-07, "loss": 0.4023, "step": 11561 }, { "epoch": 0.5586316857515582, "grad_norm": 2.697877883911133, "learning_rate": 4.4136831424844176e-07, "loss": 0.2841, "step": 11562 }, { "epoch": 0.5586800019326472, "grad_norm": 3.079087972640991, "learning_rate": 4.4131999806735275e-07, "loss": 0.2694, "step": 11563 }, { "epoch": 0.5587283181137362, "grad_norm": 3.8046298027038574, "learning_rate": 4.412716818862637e-07, "loss": 0.2592, "step": 11564 }, { "epoch": 0.5587766342948254, "grad_norm": 15.61620044708252, "learning_rate": 4.4122336570517463e-07, "loss": 0.2497, "step": 11565 }, { "epoch": 0.5588249504759144, "grad_norm": 3.458857297897339, "learning_rate": 4.4117504952408557e-07, "loss": 0.3699, "step": 11566 }, { "epoch": 0.5588732666570034, "grad_norm": 2.329951763153076, "learning_rate": 4.4112673334299656e-07, "loss": 0.2655, "step": 11567 }, { "epoch": 0.5589215828380925, "grad_norm": 2.7251486778259277, "learning_rate": 4.4107841716190755e-07, "loss": 0.3409, "step": 11568 }, { "epoch": 0.5589698990191815, "grad_norm": 2.9585752487182617, "learning_rate": 4.4103010098081843e-07, "loss": 0.362, "step": 11569 }, { "epoch": 0.5590182152002706, "grad_norm": 4.142649173736572, "learning_rate": 4.409817847997294e-07, "loss": 0.4277, "step": 11570 }, { "epoch": 0.5590665313813596, "grad_norm": 4.502340793609619, "learning_rate": 4.4093346861864036e-07, "loss": 0.4401, "step": 11571 }, { "epoch": 0.5591148475624487, "grad_norm": 2.5157153606414795, "learning_rate": 4.408851524375513e-07, "loss": 0.2689, "step": 11572 }, { "epoch": 0.5591631637435377, "grad_norm": 2.1509506702423096, "learning_rate": 4.408368362564623e-07, "loss": 0.2576, "step": 11573 }, { "epoch": 0.5592114799246267, "grad_norm": 3.587240219116211, "learning_rate": 4.407885200753732e-07, "loss": 0.3473, "step": 11574 }, { "epoch": 0.5592597961057159, "grad_norm": 1.6649765968322754, "learning_rate": 4.4074020389428416e-07, "loss": 0.1854, "step": 11575 }, { "epoch": 0.5593081122868049, "grad_norm": 5.001549243927002, "learning_rate": 4.4069188771319515e-07, "loss": 0.3857, "step": 11576 }, { "epoch": 0.5593564284678939, "grad_norm": 2.192777633666992, "learning_rate": 4.406435715321061e-07, "loss": 0.2372, "step": 11577 }, { "epoch": 0.5594047446489829, "grad_norm": 2.5108561515808105, "learning_rate": 4.40595255351017e-07, "loss": 0.3198, "step": 11578 }, { "epoch": 0.559453060830072, "grad_norm": 4.3498148918151855, "learning_rate": 4.4054693916992796e-07, "loss": 0.3213, "step": 11579 }, { "epoch": 0.559501377011161, "grad_norm": 2.4515156745910645, "learning_rate": 4.4049862298883895e-07, "loss": 0.2649, "step": 11580 }, { "epoch": 0.5595496931922501, "grad_norm": 1.957091212272644, "learning_rate": 4.404503068077499e-07, "loss": 0.2097, "step": 11581 }, { "epoch": 0.5595980093733391, "grad_norm": 1.9643640518188477, "learning_rate": 4.4040199062666083e-07, "loss": 0.2463, "step": 11582 }, { "epoch": 0.5596463255544282, "grad_norm": 2.7630159854888916, "learning_rate": 4.403536744455718e-07, "loss": 0.2904, "step": 11583 }, { "epoch": 0.5596946417355172, "grad_norm": 2.3639020919799805, "learning_rate": 4.4030535826448275e-07, "loss": 0.2933, "step": 11584 }, { "epoch": 0.5597429579166062, "grad_norm": 4.127456188201904, "learning_rate": 4.402570420833937e-07, "loss": 0.3218, "step": 11585 }, { "epoch": 0.5597912740976954, "grad_norm": 2.673417806625366, "learning_rate": 4.402087259023047e-07, "loss": 0.2209, "step": 11586 }, { "epoch": 0.5598395902787844, "grad_norm": 2.352567672729492, "learning_rate": 4.401604097212156e-07, "loss": 0.2775, "step": 11587 }, { "epoch": 0.5598879064598734, "grad_norm": 1.82127046585083, "learning_rate": 4.4011209354012656e-07, "loss": 0.2352, "step": 11588 }, { "epoch": 0.5599362226409624, "grad_norm": 4.236079216003418, "learning_rate": 4.4006377735903755e-07, "loss": 0.2852, "step": 11589 }, { "epoch": 0.5599845388220515, "grad_norm": 2.6096091270446777, "learning_rate": 4.400154611779485e-07, "loss": 0.3104, "step": 11590 }, { "epoch": 0.5600328550031406, "grad_norm": 2.0519938468933105, "learning_rate": 4.399671449968594e-07, "loss": 0.185, "step": 11591 }, { "epoch": 0.5600811711842296, "grad_norm": 1.6579231023788452, "learning_rate": 4.3991882881577036e-07, "loss": 0.1838, "step": 11592 }, { "epoch": 0.5601294873653186, "grad_norm": 1.854548692703247, "learning_rate": 4.3987051263468135e-07, "loss": 0.1868, "step": 11593 }, { "epoch": 0.5601778035464077, "grad_norm": 3.0525407791137695, "learning_rate": 4.398221964535923e-07, "loss": 0.3874, "step": 11594 }, { "epoch": 0.5602261197274967, "grad_norm": 2.583430528640747, "learning_rate": 4.397738802725032e-07, "loss": 0.342, "step": 11595 }, { "epoch": 0.5602744359085858, "grad_norm": 3.0146186351776123, "learning_rate": 4.397255640914142e-07, "loss": 0.4432, "step": 11596 }, { "epoch": 0.5603227520896749, "grad_norm": 4.346525192260742, "learning_rate": 4.396772479103251e-07, "loss": 0.2991, "step": 11597 }, { "epoch": 0.5603710682707639, "grad_norm": 1.9550522565841675, "learning_rate": 4.396289317292361e-07, "loss": 0.2667, "step": 11598 }, { "epoch": 0.5604193844518529, "grad_norm": 2.9234416484832764, "learning_rate": 4.395806155481471e-07, "loss": 0.2216, "step": 11599 }, { "epoch": 0.5604677006329419, "grad_norm": 3.404277801513672, "learning_rate": 4.39532299367058e-07, "loss": 0.283, "step": 11600 }, { "epoch": 0.5605160168140311, "grad_norm": 3.220520496368408, "learning_rate": 4.3948398318596895e-07, "loss": 0.3163, "step": 11601 }, { "epoch": 0.5605643329951201, "grad_norm": 2.4751806259155273, "learning_rate": 4.3943566700487994e-07, "loss": 0.3189, "step": 11602 }, { "epoch": 0.5606126491762091, "grad_norm": 14.087425231933594, "learning_rate": 4.393873508237909e-07, "loss": 0.2651, "step": 11603 }, { "epoch": 0.5606609653572981, "grad_norm": 2.0570967197418213, "learning_rate": 4.393390346427018e-07, "loss": 0.2058, "step": 11604 }, { "epoch": 0.5607092815383872, "grad_norm": 2.1921029090881348, "learning_rate": 4.3929071846161276e-07, "loss": 0.29, "step": 11605 }, { "epoch": 0.5607575977194762, "grad_norm": 8.262359619140625, "learning_rate": 4.3924240228052375e-07, "loss": 0.2283, "step": 11606 }, { "epoch": 0.5608059139005653, "grad_norm": 1.489568829536438, "learning_rate": 4.391940860994347e-07, "loss": 0.1453, "step": 11607 }, { "epoch": 0.5608542300816544, "grad_norm": 2.761539936065674, "learning_rate": 4.391457699183456e-07, "loss": 0.3096, "step": 11608 }, { "epoch": 0.5609025462627434, "grad_norm": 2.471275806427002, "learning_rate": 4.390974537372566e-07, "loss": 0.2865, "step": 11609 }, { "epoch": 0.5609508624438324, "grad_norm": 3.210693120956421, "learning_rate": 4.390491375561675e-07, "loss": 0.337, "step": 11610 }, { "epoch": 0.5609991786249214, "grad_norm": 2.1760411262512207, "learning_rate": 4.390008213750785e-07, "loss": 0.2515, "step": 11611 }, { "epoch": 0.5610474948060106, "grad_norm": 4.940282821655273, "learning_rate": 4.389525051939895e-07, "loss": 0.36, "step": 11612 }, { "epoch": 0.5610958109870996, "grad_norm": 3.24180269241333, "learning_rate": 4.3890418901290036e-07, "loss": 0.4765, "step": 11613 }, { "epoch": 0.5611441271681886, "grad_norm": 3.242680549621582, "learning_rate": 4.3885587283181135e-07, "loss": 0.3525, "step": 11614 }, { "epoch": 0.5611924433492776, "grad_norm": 2.4547338485717773, "learning_rate": 4.3880755665072234e-07, "loss": 0.2552, "step": 11615 }, { "epoch": 0.5612407595303667, "grad_norm": 3.431802749633789, "learning_rate": 4.387592404696333e-07, "loss": 0.2368, "step": 11616 }, { "epoch": 0.5612890757114558, "grad_norm": 3.2729129791259766, "learning_rate": 4.387109242885442e-07, "loss": 0.317, "step": 11617 }, { "epoch": 0.5613373918925448, "grad_norm": 4.031475067138672, "learning_rate": 4.3866260810745515e-07, "loss": 0.2397, "step": 11618 }, { "epoch": 0.5613857080736339, "grad_norm": 1.3485429286956787, "learning_rate": 4.3861429192636614e-07, "loss": 0.1501, "step": 11619 }, { "epoch": 0.5614340242547229, "grad_norm": 2.6813607215881348, "learning_rate": 4.385659757452771e-07, "loss": 0.2404, "step": 11620 }, { "epoch": 0.5614823404358119, "grad_norm": 1.9960525035858154, "learning_rate": 4.38517659564188e-07, "loss": 0.219, "step": 11621 }, { "epoch": 0.561530656616901, "grad_norm": 41.39948272705078, "learning_rate": 4.38469343383099e-07, "loss": 0.2135, "step": 11622 }, { "epoch": 0.5615789727979901, "grad_norm": 3.2568914890289307, "learning_rate": 4.384210272020099e-07, "loss": 0.2452, "step": 11623 }, { "epoch": 0.5616272889790791, "grad_norm": 2.277395248413086, "learning_rate": 4.383727110209209e-07, "loss": 0.2347, "step": 11624 }, { "epoch": 0.5616756051601681, "grad_norm": 3.781888246536255, "learning_rate": 4.3832439483983187e-07, "loss": 0.2428, "step": 11625 }, { "epoch": 0.5617239213412571, "grad_norm": 2.6080374717712402, "learning_rate": 4.3827607865874276e-07, "loss": 0.247, "step": 11626 }, { "epoch": 0.5617722375223463, "grad_norm": 2.63634991645813, "learning_rate": 4.3822776247765375e-07, "loss": 0.2111, "step": 11627 }, { "epoch": 0.5618205537034353, "grad_norm": 2.1338701248168945, "learning_rate": 4.3817944629656474e-07, "loss": 0.1886, "step": 11628 }, { "epoch": 0.5618688698845243, "grad_norm": 3.8119139671325684, "learning_rate": 4.381311301154756e-07, "loss": 0.4587, "step": 11629 }, { "epoch": 0.5619171860656134, "grad_norm": 9.354852676391602, "learning_rate": 4.380828139343866e-07, "loss": 0.3478, "step": 11630 }, { "epoch": 0.5619655022467024, "grad_norm": 3.578465700149536, "learning_rate": 4.3803449775329755e-07, "loss": 0.2305, "step": 11631 }, { "epoch": 0.5620138184277915, "grad_norm": 1.8536269664764404, "learning_rate": 4.3798618157220854e-07, "loss": 0.1823, "step": 11632 }, { "epoch": 0.5620621346088805, "grad_norm": 3.116821527481079, "learning_rate": 4.379378653911195e-07, "loss": 0.4671, "step": 11633 }, { "epoch": 0.5621104507899696, "grad_norm": 2.1752798557281494, "learning_rate": 4.378895492100304e-07, "loss": 0.2758, "step": 11634 }, { "epoch": 0.5621587669710586, "grad_norm": 1.8130418062210083, "learning_rate": 4.378412330289414e-07, "loss": 0.2338, "step": 11635 }, { "epoch": 0.5622070831521476, "grad_norm": 3.5626795291900635, "learning_rate": 4.377929168478523e-07, "loss": 0.3823, "step": 11636 }, { "epoch": 0.5622553993332366, "grad_norm": 4.064435958862305, "learning_rate": 4.377446006667633e-07, "loss": 0.2865, "step": 11637 }, { "epoch": 0.5623037155143258, "grad_norm": 3.221595287322998, "learning_rate": 4.3769628448567427e-07, "loss": 0.4438, "step": 11638 }, { "epoch": 0.5623520316954148, "grad_norm": 2.738309621810913, "learning_rate": 4.3764796830458515e-07, "loss": 0.2823, "step": 11639 }, { "epoch": 0.5624003478765038, "grad_norm": 2.3109071254730225, "learning_rate": 4.3759965212349614e-07, "loss": 0.1674, "step": 11640 }, { "epoch": 0.5624486640575929, "grad_norm": 3.942148447036743, "learning_rate": 4.3755133594240713e-07, "loss": 0.2518, "step": 11641 }, { "epoch": 0.5624969802386819, "grad_norm": 2.8102331161499023, "learning_rate": 4.37503019761318e-07, "loss": 0.2818, "step": 11642 }, { "epoch": 0.562545296419771, "grad_norm": 2.9061734676361084, "learning_rate": 4.37454703580229e-07, "loss": 0.2966, "step": 11643 }, { "epoch": 0.56259361260086, "grad_norm": 2.5410103797912598, "learning_rate": 4.3740638739913994e-07, "loss": 0.3545, "step": 11644 }, { "epoch": 0.5626419287819491, "grad_norm": 3.336886405944824, "learning_rate": 4.373580712180509e-07, "loss": 0.2972, "step": 11645 }, { "epoch": 0.5626902449630381, "grad_norm": 4.671755313873291, "learning_rate": 4.3730975503696187e-07, "loss": 0.5233, "step": 11646 }, { "epoch": 0.5627385611441271, "grad_norm": 2.6713595390319824, "learning_rate": 4.372614388558728e-07, "loss": 0.2576, "step": 11647 }, { "epoch": 0.5627868773252163, "grad_norm": 2.4740357398986816, "learning_rate": 4.372131226747838e-07, "loss": 0.3309, "step": 11648 }, { "epoch": 0.5628351935063053, "grad_norm": 2.4021546840667725, "learning_rate": 4.371648064936947e-07, "loss": 0.2683, "step": 11649 }, { "epoch": 0.5628835096873943, "grad_norm": 8.029510498046875, "learning_rate": 4.371164903126057e-07, "loss": 0.3183, "step": 11650 }, { "epoch": 0.5629318258684833, "grad_norm": 2.9557790756225586, "learning_rate": 4.3706817413151666e-07, "loss": 0.3157, "step": 11651 }, { "epoch": 0.5629801420495724, "grad_norm": 2.9029383659362793, "learning_rate": 4.3701985795042755e-07, "loss": 0.2229, "step": 11652 }, { "epoch": 0.5630284582306615, "grad_norm": 2.6343884468078613, "learning_rate": 4.3697154176933854e-07, "loss": 0.223, "step": 11653 }, { "epoch": 0.5630767744117505, "grad_norm": 2.155778646469116, "learning_rate": 4.3692322558824953e-07, "loss": 0.1941, "step": 11654 }, { "epoch": 0.5631250905928396, "grad_norm": 3.4628210067749023, "learning_rate": 4.368749094071604e-07, "loss": 0.3132, "step": 11655 }, { "epoch": 0.5631734067739286, "grad_norm": 2.3372273445129395, "learning_rate": 4.368265932260714e-07, "loss": 0.1877, "step": 11656 }, { "epoch": 0.5632217229550176, "grad_norm": 2.5565571784973145, "learning_rate": 4.3677827704498234e-07, "loss": 0.3353, "step": 11657 }, { "epoch": 0.5632700391361067, "grad_norm": 2.7786614894866943, "learning_rate": 4.367299608638933e-07, "loss": 0.2863, "step": 11658 }, { "epoch": 0.5633183553171958, "grad_norm": 2.1864771842956543, "learning_rate": 4.3668164468280427e-07, "loss": 0.2875, "step": 11659 }, { "epoch": 0.5633666714982848, "grad_norm": 3.5244486331939697, "learning_rate": 4.366333285017152e-07, "loss": 0.3036, "step": 11660 }, { "epoch": 0.5634149876793738, "grad_norm": 3.124659299850464, "learning_rate": 4.3658501232062614e-07, "loss": 0.3486, "step": 11661 }, { "epoch": 0.5634633038604628, "grad_norm": 3.173102617263794, "learning_rate": 4.365366961395371e-07, "loss": 0.4509, "step": 11662 }, { "epoch": 0.5635116200415519, "grad_norm": 3.00801157951355, "learning_rate": 4.3648837995844807e-07, "loss": 0.3602, "step": 11663 }, { "epoch": 0.563559936222641, "grad_norm": 2.5901381969451904, "learning_rate": 4.3644006377735906e-07, "loss": 0.1933, "step": 11664 }, { "epoch": 0.56360825240373, "grad_norm": 2.0217530727386475, "learning_rate": 4.3639174759626994e-07, "loss": 0.2071, "step": 11665 }, { "epoch": 0.563656568584819, "grad_norm": 11.769667625427246, "learning_rate": 4.3634343141518093e-07, "loss": 0.4287, "step": 11666 }, { "epoch": 0.5637048847659081, "grad_norm": 2.9789490699768066, "learning_rate": 4.362951152340919e-07, "loss": 0.2833, "step": 11667 }, { "epoch": 0.5637532009469971, "grad_norm": 2.4137909412384033, "learning_rate": 4.362467990530028e-07, "loss": 0.2561, "step": 11668 }, { "epoch": 0.5638015171280862, "grad_norm": 5.4242730140686035, "learning_rate": 4.361984828719138e-07, "loss": 0.3701, "step": 11669 }, { "epoch": 0.5638498333091753, "grad_norm": 3.4081921577453613, "learning_rate": 4.3615016669082474e-07, "loss": 0.4019, "step": 11670 }, { "epoch": 0.5638981494902643, "grad_norm": 2.4009106159210205, "learning_rate": 4.361018505097357e-07, "loss": 0.2882, "step": 11671 }, { "epoch": 0.5639464656713533, "grad_norm": 2.8069114685058594, "learning_rate": 4.3605353432864666e-07, "loss": 0.3105, "step": 11672 }, { "epoch": 0.5639947818524423, "grad_norm": 2.6682612895965576, "learning_rate": 4.360052181475576e-07, "loss": 0.263, "step": 11673 }, { "epoch": 0.5640430980335315, "grad_norm": 2.9621973037719727, "learning_rate": 4.3595690196646854e-07, "loss": 0.3677, "step": 11674 }, { "epoch": 0.5640914142146205, "grad_norm": 1.735671877861023, "learning_rate": 4.359085857853795e-07, "loss": 0.188, "step": 11675 }, { "epoch": 0.5641397303957095, "grad_norm": 2.5849802494049072, "learning_rate": 4.3586026960429047e-07, "loss": 0.2681, "step": 11676 }, { "epoch": 0.5641880465767986, "grad_norm": 1.6153535842895508, "learning_rate": 4.358119534232014e-07, "loss": 0.1496, "step": 11677 }, { "epoch": 0.5642363627578876, "grad_norm": 4.044699192047119, "learning_rate": 4.3576363724211234e-07, "loss": 0.3051, "step": 11678 }, { "epoch": 0.5642846789389767, "grad_norm": 2.4019265174865723, "learning_rate": 4.3571532106102333e-07, "loss": 0.2736, "step": 11679 }, { "epoch": 0.5643329951200657, "grad_norm": 2.907017707824707, "learning_rate": 4.356670048799343e-07, "loss": 0.4019, "step": 11680 }, { "epoch": 0.5643813113011548, "grad_norm": 8.588872909545898, "learning_rate": 4.356186886988452e-07, "loss": 0.3503, "step": 11681 }, { "epoch": 0.5644296274822438, "grad_norm": 14.791533470153809, "learning_rate": 4.355703725177562e-07, "loss": 0.2419, "step": 11682 }, { "epoch": 0.5644779436633328, "grad_norm": 2.1726107597351074, "learning_rate": 4.3552205633666713e-07, "loss": 0.2749, "step": 11683 }, { "epoch": 0.564526259844422, "grad_norm": 2.3157904148101807, "learning_rate": 4.3547374015557807e-07, "loss": 0.2606, "step": 11684 }, { "epoch": 0.564574576025511, "grad_norm": 3.187971591949463, "learning_rate": 4.3542542397448906e-07, "loss": 0.2139, "step": 11685 }, { "epoch": 0.5646228922066, "grad_norm": 2.7175004482269287, "learning_rate": 4.353771077934e-07, "loss": 0.2637, "step": 11686 }, { "epoch": 0.564671208387689, "grad_norm": 78.69185638427734, "learning_rate": 4.3532879161231094e-07, "loss": 0.3071, "step": 11687 }, { "epoch": 0.564719524568778, "grad_norm": 4.120550155639648, "learning_rate": 4.3528047543122187e-07, "loss": 0.2571, "step": 11688 }, { "epoch": 0.5647678407498671, "grad_norm": 3.1194369792938232, "learning_rate": 4.3523215925013286e-07, "loss": 0.2955, "step": 11689 }, { "epoch": 0.5648161569309562, "grad_norm": 2.5126307010650635, "learning_rate": 4.351838430690438e-07, "loss": 0.2468, "step": 11690 }, { "epoch": 0.5648644731120452, "grad_norm": 3.520214319229126, "learning_rate": 4.3513552688795474e-07, "loss": 0.3412, "step": 11691 }, { "epoch": 0.5649127892931343, "grad_norm": 3.2178311347961426, "learning_rate": 4.3508721070686573e-07, "loss": 0.364, "step": 11692 }, { "epoch": 0.5649611054742233, "grad_norm": 2.173058271408081, "learning_rate": 4.3503889452577666e-07, "loss": 0.2715, "step": 11693 }, { "epoch": 0.5650094216553123, "grad_norm": 2.9923174381256104, "learning_rate": 4.349905783446876e-07, "loss": 0.2533, "step": 11694 }, { "epoch": 0.5650577378364015, "grad_norm": 2.3240067958831787, "learning_rate": 4.349422621635986e-07, "loss": 0.2493, "step": 11695 }, { "epoch": 0.5651060540174905, "grad_norm": 2.073356866836548, "learning_rate": 4.348939459825095e-07, "loss": 0.1901, "step": 11696 }, { "epoch": 0.5651543701985795, "grad_norm": 3.9909451007843018, "learning_rate": 4.3484562980142047e-07, "loss": 0.1922, "step": 11697 }, { "epoch": 0.5652026863796685, "grad_norm": 2.336299419403076, "learning_rate": 4.3479731362033146e-07, "loss": 0.2009, "step": 11698 }, { "epoch": 0.5652510025607576, "grad_norm": 3.1556236743927, "learning_rate": 4.347489974392424e-07, "loss": 0.2583, "step": 11699 }, { "epoch": 0.5652993187418467, "grad_norm": 2.772512912750244, "learning_rate": 4.3470068125815333e-07, "loss": 0.3694, "step": 11700 }, { "epoch": 0.5653476349229357, "grad_norm": 5.291601181030273, "learning_rate": 4.3465236507706427e-07, "loss": 0.1789, "step": 11701 }, { "epoch": 0.5653959511040247, "grad_norm": 2.3040552139282227, "learning_rate": 4.3460404889597526e-07, "loss": 0.2441, "step": 11702 }, { "epoch": 0.5654442672851138, "grad_norm": 2.4349355697631836, "learning_rate": 4.345557327148862e-07, "loss": 0.2968, "step": 11703 }, { "epoch": 0.5654925834662028, "grad_norm": 2.370035409927368, "learning_rate": 4.3450741653379713e-07, "loss": 0.2541, "step": 11704 }, { "epoch": 0.5655408996472919, "grad_norm": 5.802189350128174, "learning_rate": 4.344591003527081e-07, "loss": 0.4986, "step": 11705 }, { "epoch": 0.565589215828381, "grad_norm": 2.25146746635437, "learning_rate": 4.3441078417161906e-07, "loss": 0.2222, "step": 11706 }, { "epoch": 0.56563753200947, "grad_norm": 2.723297357559204, "learning_rate": 4.3436246799053e-07, "loss": 0.3067, "step": 11707 }, { "epoch": 0.565685848190559, "grad_norm": 2.0810024738311768, "learning_rate": 4.34314151809441e-07, "loss": 0.2028, "step": 11708 }, { "epoch": 0.565734164371648, "grad_norm": 3.042710065841675, "learning_rate": 4.3426583562835187e-07, "loss": 0.3619, "step": 11709 }, { "epoch": 0.5657824805527372, "grad_norm": 2.2482857704162598, "learning_rate": 4.3421751944726286e-07, "loss": 0.2768, "step": 11710 }, { "epoch": 0.5658307967338262, "grad_norm": 4.2743096351623535, "learning_rate": 4.3416920326617385e-07, "loss": 0.3623, "step": 11711 }, { "epoch": 0.5658791129149152, "grad_norm": 2.0807013511657715, "learning_rate": 4.3412088708508474e-07, "loss": 0.2285, "step": 11712 }, { "epoch": 0.5659274290960042, "grad_norm": 2.2309017181396484, "learning_rate": 4.3407257090399573e-07, "loss": 0.2649, "step": 11713 }, { "epoch": 0.5659757452770933, "grad_norm": 2.095330238342285, "learning_rate": 4.3402425472290667e-07, "loss": 0.3203, "step": 11714 }, { "epoch": 0.5660240614581823, "grad_norm": 1.8356877565383911, "learning_rate": 4.3397593854181766e-07, "loss": 0.1834, "step": 11715 }, { "epoch": 0.5660723776392714, "grad_norm": 5.069091320037842, "learning_rate": 4.339276223607286e-07, "loss": 0.3784, "step": 11716 }, { "epoch": 0.5661206938203605, "grad_norm": 2.3821544647216797, "learning_rate": 4.3387930617963953e-07, "loss": 0.2548, "step": 11717 }, { "epoch": 0.5661690100014495, "grad_norm": 3.4795167446136475, "learning_rate": 4.338309899985505e-07, "loss": 0.1593, "step": 11718 }, { "epoch": 0.5662173261825385, "grad_norm": 2.8556931018829346, "learning_rate": 4.3378267381746146e-07, "loss": 0.2724, "step": 11719 }, { "epoch": 0.5662656423636275, "grad_norm": 2.5168850421905518, "learning_rate": 4.337343576363724e-07, "loss": 0.3038, "step": 11720 }, { "epoch": 0.5663139585447167, "grad_norm": 2.2049508094787598, "learning_rate": 4.336860414552834e-07, "loss": 0.281, "step": 11721 }, { "epoch": 0.5663622747258057, "grad_norm": 2.582406520843506, "learning_rate": 4.3363772527419427e-07, "loss": 0.144, "step": 11722 }, { "epoch": 0.5664105909068947, "grad_norm": 5.19855260848999, "learning_rate": 4.3358940909310526e-07, "loss": 0.2434, "step": 11723 }, { "epoch": 0.5664589070879837, "grad_norm": 2.3260610103607178, "learning_rate": 4.3354109291201625e-07, "loss": 0.2503, "step": 11724 }, { "epoch": 0.5665072232690728, "grad_norm": 4.801332950592041, "learning_rate": 4.3349277673092713e-07, "loss": 0.2485, "step": 11725 }, { "epoch": 0.5665555394501619, "grad_norm": 2.7231173515319824, "learning_rate": 4.334444605498381e-07, "loss": 0.298, "step": 11726 }, { "epoch": 0.5666038556312509, "grad_norm": 2.6210291385650635, "learning_rate": 4.3339614436874906e-07, "loss": 0.229, "step": 11727 }, { "epoch": 0.56665217181234, "grad_norm": 2.068650484085083, "learning_rate": 4.3334782818766e-07, "loss": 0.3012, "step": 11728 }, { "epoch": 0.566700487993429, "grad_norm": 2.8157706260681152, "learning_rate": 4.33299512006571e-07, "loss": 0.1778, "step": 11729 }, { "epoch": 0.566748804174518, "grad_norm": 46.66412353515625, "learning_rate": 4.3325119582548193e-07, "loss": 0.3528, "step": 11730 }, { "epoch": 0.5667971203556071, "grad_norm": 2.5623016357421875, "learning_rate": 4.332028796443929e-07, "loss": 0.2789, "step": 11731 }, { "epoch": 0.5668454365366962, "grad_norm": 2.821687936782837, "learning_rate": 4.3315456346330385e-07, "loss": 0.3453, "step": 11732 }, { "epoch": 0.5668937527177852, "grad_norm": 4.065003871917725, "learning_rate": 4.331062472822148e-07, "loss": 0.3174, "step": 11733 }, { "epoch": 0.5669420688988742, "grad_norm": 2.3179495334625244, "learning_rate": 4.330579311011258e-07, "loss": 0.2697, "step": 11734 }, { "epoch": 0.5669903850799632, "grad_norm": 2.7939980030059814, "learning_rate": 4.3300961492003667e-07, "loss": 0.2705, "step": 11735 }, { "epoch": 0.5670387012610524, "grad_norm": 2.1321895122528076, "learning_rate": 4.3296129873894766e-07, "loss": 0.2465, "step": 11736 }, { "epoch": 0.5670870174421414, "grad_norm": 2.7090728282928467, "learning_rate": 4.3291298255785865e-07, "loss": 0.1844, "step": 11737 }, { "epoch": 0.5671353336232304, "grad_norm": 2.7989084720611572, "learning_rate": 4.3286466637676953e-07, "loss": 0.3121, "step": 11738 }, { "epoch": 0.5671836498043195, "grad_norm": 2.796473741531372, "learning_rate": 4.328163501956805e-07, "loss": 0.1827, "step": 11739 }, { "epoch": 0.5672319659854085, "grad_norm": 5.341859817504883, "learning_rate": 4.3276803401459146e-07, "loss": 0.286, "step": 11740 }, { "epoch": 0.5672802821664975, "grad_norm": 2.473978281021118, "learning_rate": 4.327197178335024e-07, "loss": 0.302, "step": 11741 }, { "epoch": 0.5673285983475866, "grad_norm": 4.045364856719971, "learning_rate": 4.326714016524134e-07, "loss": 0.3186, "step": 11742 }, { "epoch": 0.5673769145286757, "grad_norm": 3.3218777179718018, "learning_rate": 4.326230854713243e-07, "loss": 0.3438, "step": 11743 }, { "epoch": 0.5674252307097647, "grad_norm": 2.1272201538085938, "learning_rate": 4.3257476929023526e-07, "loss": 0.2435, "step": 11744 }, { "epoch": 0.5674735468908537, "grad_norm": 2.9443225860595703, "learning_rate": 4.3252645310914625e-07, "loss": 0.265, "step": 11745 }, { "epoch": 0.5675218630719427, "grad_norm": 2.3684194087982178, "learning_rate": 4.324781369280572e-07, "loss": 0.2937, "step": 11746 }, { "epoch": 0.5675701792530319, "grad_norm": 2.5004889965057373, "learning_rate": 4.324298207469682e-07, "loss": 0.3199, "step": 11747 }, { "epoch": 0.5676184954341209, "grad_norm": 5.234831809997559, "learning_rate": 4.3238150456587906e-07, "loss": 0.1938, "step": 11748 }, { "epoch": 0.5676668116152099, "grad_norm": 2.4286794662475586, "learning_rate": 4.3233318838479005e-07, "loss": 0.3001, "step": 11749 }, { "epoch": 0.567715127796299, "grad_norm": 2.4121010303497314, "learning_rate": 4.3228487220370104e-07, "loss": 0.2521, "step": 11750 }, { "epoch": 0.567763443977388, "grad_norm": 7.808499813079834, "learning_rate": 4.3223655602261193e-07, "loss": 0.3133, "step": 11751 }, { "epoch": 0.5678117601584771, "grad_norm": 3.103938102722168, "learning_rate": 4.321882398415229e-07, "loss": 0.3979, "step": 11752 }, { "epoch": 0.5678600763395661, "grad_norm": 2.6964592933654785, "learning_rate": 4.3213992366043385e-07, "loss": 0.3227, "step": 11753 }, { "epoch": 0.5679083925206552, "grad_norm": 2.8757829666137695, "learning_rate": 4.320916074793448e-07, "loss": 0.3616, "step": 11754 }, { "epoch": 0.5679567087017442, "grad_norm": 3.4268620014190674, "learning_rate": 4.320432912982558e-07, "loss": 0.2959, "step": 11755 }, { "epoch": 0.5680050248828332, "grad_norm": 3.4059629440307617, "learning_rate": 4.319949751171667e-07, "loss": 0.3046, "step": 11756 }, { "epoch": 0.5680533410639224, "grad_norm": 10.87148666381836, "learning_rate": 4.3194665893607766e-07, "loss": 0.4272, "step": 11757 }, { "epoch": 0.5681016572450114, "grad_norm": 3.144644260406494, "learning_rate": 4.3189834275498865e-07, "loss": 0.2814, "step": 11758 }, { "epoch": 0.5681499734261004, "grad_norm": 3.249079704284668, "learning_rate": 4.318500265738996e-07, "loss": 0.3018, "step": 11759 }, { "epoch": 0.5681982896071894, "grad_norm": 1.7360680103302002, "learning_rate": 4.318017103928105e-07, "loss": 0.1946, "step": 11760 }, { "epoch": 0.5682466057882785, "grad_norm": 2.252443790435791, "learning_rate": 4.3175339421172146e-07, "loss": 0.2913, "step": 11761 }, { "epoch": 0.5682949219693676, "grad_norm": 2.8290042877197266, "learning_rate": 4.3170507803063245e-07, "loss": 0.432, "step": 11762 }, { "epoch": 0.5683432381504566, "grad_norm": 3.3000762462615967, "learning_rate": 4.3165676184954344e-07, "loss": 0.2642, "step": 11763 }, { "epoch": 0.5683915543315456, "grad_norm": 7.618345260620117, "learning_rate": 4.316084456684543e-07, "loss": 0.4267, "step": 11764 }, { "epoch": 0.5684398705126347, "grad_norm": 5.858697414398193, "learning_rate": 4.315601294873653e-07, "loss": 0.3307, "step": 11765 }, { "epoch": 0.5684881866937237, "grad_norm": 2.317204713821411, "learning_rate": 4.3151181330627625e-07, "loss": 0.2101, "step": 11766 }, { "epoch": 0.5685365028748127, "grad_norm": 6.294442176818848, "learning_rate": 4.314634971251872e-07, "loss": 0.2733, "step": 11767 }, { "epoch": 0.5685848190559019, "grad_norm": 3.711618661880493, "learning_rate": 4.314151809440982e-07, "loss": 0.4028, "step": 11768 }, { "epoch": 0.5686331352369909, "grad_norm": 2.298293113708496, "learning_rate": 4.313668647630091e-07, "loss": 0.255, "step": 11769 }, { "epoch": 0.5686814514180799, "grad_norm": 2.229863166809082, "learning_rate": 4.3131854858192005e-07, "loss": 0.2686, "step": 11770 }, { "epoch": 0.5687297675991689, "grad_norm": 2.2245547771453857, "learning_rate": 4.31270232400831e-07, "loss": 0.2529, "step": 11771 }, { "epoch": 0.568778083780258, "grad_norm": 13.23229694366455, "learning_rate": 4.31221916219742e-07, "loss": 0.3161, "step": 11772 }, { "epoch": 0.5688263999613471, "grad_norm": 4.177785396575928, "learning_rate": 4.311736000386529e-07, "loss": 0.2881, "step": 11773 }, { "epoch": 0.5688747161424361, "grad_norm": 2.739412546157837, "learning_rate": 4.3112528385756385e-07, "loss": 0.2826, "step": 11774 }, { "epoch": 0.5689230323235251, "grad_norm": 2.871499538421631, "learning_rate": 4.3107696767647485e-07, "loss": 0.287, "step": 11775 }, { "epoch": 0.5689713485046142, "grad_norm": 3.372121810913086, "learning_rate": 4.3102865149538584e-07, "loss": 0.3448, "step": 11776 }, { "epoch": 0.5690196646857032, "grad_norm": 1.8902010917663574, "learning_rate": 4.309803353142967e-07, "loss": 0.1922, "step": 11777 }, { "epoch": 0.5690679808667923, "grad_norm": 2.8393421173095703, "learning_rate": 4.309320191332077e-07, "loss": 0.3917, "step": 11778 }, { "epoch": 0.5691162970478814, "grad_norm": 2.234314441680908, "learning_rate": 4.3088370295211865e-07, "loss": 0.2031, "step": 11779 }, { "epoch": 0.5691646132289704, "grad_norm": 2.116647243499756, "learning_rate": 4.308353867710296e-07, "loss": 0.2339, "step": 11780 }, { "epoch": 0.5692129294100594, "grad_norm": 11.341480255126953, "learning_rate": 4.307870705899406e-07, "loss": 0.3408, "step": 11781 }, { "epoch": 0.5692612455911484, "grad_norm": 4.666808128356934, "learning_rate": 4.307387544088515e-07, "loss": 0.3479, "step": 11782 }, { "epoch": 0.5693095617722376, "grad_norm": 2.1097288131713867, "learning_rate": 4.3069043822776245e-07, "loss": 0.241, "step": 11783 }, { "epoch": 0.5693578779533266, "grad_norm": 2.3631978034973145, "learning_rate": 4.306421220466734e-07, "loss": 0.3565, "step": 11784 }, { "epoch": 0.5694061941344156, "grad_norm": 3.5221917629241943, "learning_rate": 4.305938058655844e-07, "loss": 0.3744, "step": 11785 }, { "epoch": 0.5694545103155046, "grad_norm": 2.737119436264038, "learning_rate": 4.305454896844953e-07, "loss": 0.2886, "step": 11786 }, { "epoch": 0.5695028264965937, "grad_norm": 1.9550155401229858, "learning_rate": 4.3049717350340625e-07, "loss": 0.239, "step": 11787 }, { "epoch": 0.5695511426776828, "grad_norm": 4.6197896003723145, "learning_rate": 4.3044885732231724e-07, "loss": 0.3303, "step": 11788 }, { "epoch": 0.5695994588587718, "grad_norm": 3.0040698051452637, "learning_rate": 4.304005411412282e-07, "loss": 0.3091, "step": 11789 }, { "epoch": 0.5696477750398609, "grad_norm": 1.9363977909088135, "learning_rate": 4.303522249601391e-07, "loss": 0.2276, "step": 11790 }, { "epoch": 0.5696960912209499, "grad_norm": 2.3748416900634766, "learning_rate": 4.303039087790501e-07, "loss": 0.2509, "step": 11791 }, { "epoch": 0.5697444074020389, "grad_norm": 2.7424895763397217, "learning_rate": 4.30255592597961e-07, "loss": 0.2443, "step": 11792 }, { "epoch": 0.5697927235831279, "grad_norm": 2.8014464378356934, "learning_rate": 4.30207276416872e-07, "loss": 0.3069, "step": 11793 }, { "epoch": 0.5698410397642171, "grad_norm": 8.47980785369873, "learning_rate": 4.3015896023578297e-07, "loss": 0.3926, "step": 11794 }, { "epoch": 0.5698893559453061, "grad_norm": 1.3695117235183716, "learning_rate": 4.301106440546939e-07, "loss": 0.1553, "step": 11795 }, { "epoch": 0.5699376721263951, "grad_norm": 2.755897283554077, "learning_rate": 4.3006232787360485e-07, "loss": 0.3252, "step": 11796 }, { "epoch": 0.5699859883074841, "grad_norm": 2.649254560470581, "learning_rate": 4.300140116925158e-07, "loss": 0.2851, "step": 11797 }, { "epoch": 0.5700343044885732, "grad_norm": 3.576483964920044, "learning_rate": 4.2996569551142677e-07, "loss": 0.3299, "step": 11798 }, { "epoch": 0.5700826206696623, "grad_norm": 4.316655158996582, "learning_rate": 4.299173793303377e-07, "loss": 0.1921, "step": 11799 }, { "epoch": 0.5701309368507513, "grad_norm": 2.379359006881714, "learning_rate": 4.2986906314924865e-07, "loss": 0.2261, "step": 11800 }, { "epoch": 0.5701792530318404, "grad_norm": 2.9528629779815674, "learning_rate": 4.2982074696815964e-07, "loss": 0.4294, "step": 11801 }, { "epoch": 0.5702275692129294, "grad_norm": 2.629312515258789, "learning_rate": 4.297724307870706e-07, "loss": 0.3194, "step": 11802 }, { "epoch": 0.5702758853940184, "grad_norm": 2.9309134483337402, "learning_rate": 4.297241146059815e-07, "loss": 0.313, "step": 11803 }, { "epoch": 0.5703242015751075, "grad_norm": 3.4506664276123047, "learning_rate": 4.296757984248925e-07, "loss": 0.2561, "step": 11804 }, { "epoch": 0.5703725177561966, "grad_norm": 2.8969063758850098, "learning_rate": 4.296274822438034e-07, "loss": 0.1974, "step": 11805 }, { "epoch": 0.5704208339372856, "grad_norm": 2.9579529762268066, "learning_rate": 4.295791660627144e-07, "loss": 0.2542, "step": 11806 }, { "epoch": 0.5704691501183746, "grad_norm": 2.5523550510406494, "learning_rate": 4.2953084988162537e-07, "loss": 0.2333, "step": 11807 }, { "epoch": 0.5705174662994636, "grad_norm": 2.7815983295440674, "learning_rate": 4.2948253370053625e-07, "loss": 0.3136, "step": 11808 }, { "epoch": 0.5705657824805528, "grad_norm": 2.827129364013672, "learning_rate": 4.2943421751944724e-07, "loss": 0.3638, "step": 11809 }, { "epoch": 0.5706140986616418, "grad_norm": 2.315199613571167, "learning_rate": 4.293859013383582e-07, "loss": 0.2425, "step": 11810 }, { "epoch": 0.5706624148427308, "grad_norm": 1.6755149364471436, "learning_rate": 4.2933758515726917e-07, "loss": 0.19, "step": 11811 }, { "epoch": 0.5707107310238199, "grad_norm": 5.26893424987793, "learning_rate": 4.292892689761801e-07, "loss": 0.3638, "step": 11812 }, { "epoch": 0.5707590472049089, "grad_norm": 2.0345304012298584, "learning_rate": 4.2924095279509104e-07, "loss": 0.247, "step": 11813 }, { "epoch": 0.570807363385998, "grad_norm": 2.4786508083343506, "learning_rate": 4.2919263661400203e-07, "loss": 0.2509, "step": 11814 }, { "epoch": 0.570855679567087, "grad_norm": 2.472844362258911, "learning_rate": 4.2914432043291297e-07, "loss": 0.2892, "step": 11815 }, { "epoch": 0.5709039957481761, "grad_norm": 1.6569774150848389, "learning_rate": 4.290960042518239e-07, "loss": 0.1672, "step": 11816 }, { "epoch": 0.5709523119292651, "grad_norm": 4.088841915130615, "learning_rate": 4.290476880707349e-07, "loss": 0.2843, "step": 11817 }, { "epoch": 0.5710006281103541, "grad_norm": 2.6946206092834473, "learning_rate": 4.289993718896458e-07, "loss": 0.3378, "step": 11818 }, { "epoch": 0.5710489442914432, "grad_norm": 2.301945447921753, "learning_rate": 4.2895105570855677e-07, "loss": 0.1535, "step": 11819 }, { "epoch": 0.5710972604725323, "grad_norm": 2.8112452030181885, "learning_rate": 4.2890273952746776e-07, "loss": 0.3013, "step": 11820 }, { "epoch": 0.5711455766536213, "grad_norm": 5.249285697937012, "learning_rate": 4.2885442334637865e-07, "loss": 0.3245, "step": 11821 }, { "epoch": 0.5711938928347103, "grad_norm": 34.9744758605957, "learning_rate": 4.2880610716528964e-07, "loss": 0.4439, "step": 11822 }, { "epoch": 0.5712422090157994, "grad_norm": 3.9129648208618164, "learning_rate": 4.287577909842006e-07, "loss": 0.2975, "step": 11823 }, { "epoch": 0.5712905251968884, "grad_norm": 3.2788891792297363, "learning_rate": 4.287094748031115e-07, "loss": 0.434, "step": 11824 }, { "epoch": 0.5713388413779775, "grad_norm": 13.67642879486084, "learning_rate": 4.286611586220225e-07, "loss": 0.3409, "step": 11825 }, { "epoch": 0.5713871575590665, "grad_norm": 7.101381301879883, "learning_rate": 4.2861284244093344e-07, "loss": 0.2368, "step": 11826 }, { "epoch": 0.5714354737401556, "grad_norm": 3.1850812435150146, "learning_rate": 4.2856452625984443e-07, "loss": 0.3382, "step": 11827 }, { "epoch": 0.5714837899212446, "grad_norm": 2.5276870727539062, "learning_rate": 4.2851621007875537e-07, "loss": 0.2465, "step": 11828 }, { "epoch": 0.5715321061023336, "grad_norm": 3.6343374252319336, "learning_rate": 4.284678938976663e-07, "loss": 0.3438, "step": 11829 }, { "epoch": 0.5715804222834228, "grad_norm": 2.5406763553619385, "learning_rate": 4.284195777165773e-07, "loss": 0.2485, "step": 11830 }, { "epoch": 0.5716287384645118, "grad_norm": 2.4532570838928223, "learning_rate": 4.283712615354882e-07, "loss": 0.2222, "step": 11831 }, { "epoch": 0.5716770546456008, "grad_norm": 2.243007183074951, "learning_rate": 4.2832294535439917e-07, "loss": 0.2681, "step": 11832 }, { "epoch": 0.5717253708266898, "grad_norm": 3.437478542327881, "learning_rate": 4.2827462917331016e-07, "loss": 0.3551, "step": 11833 }, { "epoch": 0.5717736870077789, "grad_norm": 3.2222583293914795, "learning_rate": 4.2822631299222104e-07, "loss": 0.3698, "step": 11834 }, { "epoch": 0.571822003188868, "grad_norm": 4.286189556121826, "learning_rate": 4.2817799681113203e-07, "loss": 0.3504, "step": 11835 }, { "epoch": 0.571870319369957, "grad_norm": 3.915632963180542, "learning_rate": 4.2812968063004297e-07, "loss": 0.1852, "step": 11836 }, { "epoch": 0.571918635551046, "grad_norm": 2.73232364654541, "learning_rate": 4.280813644489539e-07, "loss": 0.3072, "step": 11837 }, { "epoch": 0.5719669517321351, "grad_norm": 11.8609619140625, "learning_rate": 4.280330482678649e-07, "loss": 0.3786, "step": 11838 }, { "epoch": 0.5720152679132241, "grad_norm": 2.536038875579834, "learning_rate": 4.2798473208677584e-07, "loss": 0.2739, "step": 11839 }, { "epoch": 0.5720635840943132, "grad_norm": 2.4376182556152344, "learning_rate": 4.279364159056868e-07, "loss": 0.272, "step": 11840 }, { "epoch": 0.5721119002754023, "grad_norm": 3.662065267562866, "learning_rate": 4.2788809972459776e-07, "loss": 0.3254, "step": 11841 }, { "epoch": 0.5721602164564913, "grad_norm": 3.3612749576568604, "learning_rate": 4.278397835435087e-07, "loss": 0.2027, "step": 11842 }, { "epoch": 0.5722085326375803, "grad_norm": 2.8864667415618896, "learning_rate": 4.277914673624197e-07, "loss": 0.3406, "step": 11843 }, { "epoch": 0.5722568488186693, "grad_norm": 2.086742401123047, "learning_rate": 4.277431511813306e-07, "loss": 0.2161, "step": 11844 }, { "epoch": 0.5723051649997584, "grad_norm": 2.5821969509124756, "learning_rate": 4.2769483500024157e-07, "loss": 0.3159, "step": 11845 }, { "epoch": 0.5723534811808475, "grad_norm": 12.167991638183594, "learning_rate": 4.2764651881915256e-07, "loss": 0.2607, "step": 11846 }, { "epoch": 0.5724017973619365, "grad_norm": 4.83479118347168, "learning_rate": 4.2759820263806344e-07, "loss": 0.3157, "step": 11847 }, { "epoch": 0.5724501135430256, "grad_norm": 2.5131661891937256, "learning_rate": 4.2754988645697443e-07, "loss": 0.2436, "step": 11848 }, { "epoch": 0.5724984297241146, "grad_norm": 3.3393025398254395, "learning_rate": 4.2750157027588537e-07, "loss": 0.3092, "step": 11849 }, { "epoch": 0.5725467459052036, "grad_norm": 2.6467418670654297, "learning_rate": 4.274532540947963e-07, "loss": 0.1818, "step": 11850 }, { "epoch": 0.5725950620862927, "grad_norm": 2.350578546524048, "learning_rate": 4.274049379137073e-07, "loss": 0.3044, "step": 11851 }, { "epoch": 0.5726433782673818, "grad_norm": 4.847958087921143, "learning_rate": 4.2735662173261823e-07, "loss": 0.3549, "step": 11852 }, { "epoch": 0.5726916944484708, "grad_norm": 2.950700283050537, "learning_rate": 4.2730830555152917e-07, "loss": 0.3109, "step": 11853 }, { "epoch": 0.5727400106295598, "grad_norm": 2.424278497695923, "learning_rate": 4.2725998937044016e-07, "loss": 0.2216, "step": 11854 }, { "epoch": 0.5727883268106488, "grad_norm": 2.1390151977539062, "learning_rate": 4.272116731893511e-07, "loss": 0.2536, "step": 11855 }, { "epoch": 0.572836642991738, "grad_norm": 3.4568114280700684, "learning_rate": 4.2716335700826203e-07, "loss": 0.3186, "step": 11856 }, { "epoch": 0.572884959172827, "grad_norm": 3.383934259414673, "learning_rate": 4.2711504082717297e-07, "loss": 0.4078, "step": 11857 }, { "epoch": 0.572933275353916, "grad_norm": 2.184326171875, "learning_rate": 4.2706672464608396e-07, "loss": 0.2126, "step": 11858 }, { "epoch": 0.572981591535005, "grad_norm": 5.233061790466309, "learning_rate": 4.2701840846499495e-07, "loss": 0.2801, "step": 11859 }, { "epoch": 0.5730299077160941, "grad_norm": 3.184283971786499, "learning_rate": 4.2697009228390584e-07, "loss": 0.4647, "step": 11860 }, { "epoch": 0.5730782238971832, "grad_norm": 2.7349424362182617, "learning_rate": 4.2692177610281683e-07, "loss": 0.2626, "step": 11861 }, { "epoch": 0.5731265400782722, "grad_norm": 4.972206115722656, "learning_rate": 4.2687345992172776e-07, "loss": 0.5062, "step": 11862 }, { "epoch": 0.5731748562593613, "grad_norm": 2.290166139602661, "learning_rate": 4.268251437406387e-07, "loss": 0.2122, "step": 11863 }, { "epoch": 0.5732231724404503, "grad_norm": 2.071769952774048, "learning_rate": 4.267768275595497e-07, "loss": 0.2291, "step": 11864 }, { "epoch": 0.5732714886215393, "grad_norm": 3.0899598598480225, "learning_rate": 4.2672851137846063e-07, "loss": 0.366, "step": 11865 }, { "epoch": 0.5733198048026285, "grad_norm": 3.8646955490112305, "learning_rate": 4.2668019519737157e-07, "loss": 0.2933, "step": 11866 }, { "epoch": 0.5733681209837175, "grad_norm": 1.6957794427871704, "learning_rate": 4.2663187901628256e-07, "loss": 0.1854, "step": 11867 }, { "epoch": 0.5734164371648065, "grad_norm": 2.856666088104248, "learning_rate": 4.265835628351935e-07, "loss": 0.3296, "step": 11868 }, { "epoch": 0.5734647533458955, "grad_norm": 2.881798267364502, "learning_rate": 4.2653524665410443e-07, "loss": 0.3716, "step": 11869 }, { "epoch": 0.5735130695269846, "grad_norm": 2.334895133972168, "learning_rate": 4.2648693047301537e-07, "loss": 0.2501, "step": 11870 }, { "epoch": 0.5735613857080736, "grad_norm": 4.125622749328613, "learning_rate": 4.2643861429192636e-07, "loss": 0.3628, "step": 11871 }, { "epoch": 0.5736097018891627, "grad_norm": 2.385890245437622, "learning_rate": 4.263902981108373e-07, "loss": 0.2696, "step": 11872 }, { "epoch": 0.5736580180702517, "grad_norm": 2.1178863048553467, "learning_rate": 4.2634198192974823e-07, "loss": 0.251, "step": 11873 }, { "epoch": 0.5737063342513408, "grad_norm": 9.589387893676758, "learning_rate": 4.262936657486592e-07, "loss": 0.2395, "step": 11874 }, { "epoch": 0.5737546504324298, "grad_norm": 3.1888206005096436, "learning_rate": 4.262453495675701e-07, "loss": 0.4461, "step": 11875 }, { "epoch": 0.5738029666135188, "grad_norm": 22.549427032470703, "learning_rate": 4.261970333864811e-07, "loss": 0.5318, "step": 11876 }, { "epoch": 0.573851282794608, "grad_norm": 1.913182020187378, "learning_rate": 4.261487172053921e-07, "loss": 0.2392, "step": 11877 }, { "epoch": 0.573899598975697, "grad_norm": 3.4333603382110596, "learning_rate": 4.26100401024303e-07, "loss": 0.2466, "step": 11878 }, { "epoch": 0.573947915156786, "grad_norm": 2.4538185596466064, "learning_rate": 4.2605208484321396e-07, "loss": 0.3369, "step": 11879 }, { "epoch": 0.573996231337875, "grad_norm": 2.6296379566192627, "learning_rate": 4.2600376866212495e-07, "loss": 0.2375, "step": 11880 }, { "epoch": 0.574044547518964, "grad_norm": 2.710991621017456, "learning_rate": 4.259554524810359e-07, "loss": 0.2719, "step": 11881 }, { "epoch": 0.5740928637000532, "grad_norm": 2.6788482666015625, "learning_rate": 4.2590713629994683e-07, "loss": 0.2775, "step": 11882 }, { "epoch": 0.5741411798811422, "grad_norm": 3.317957639694214, "learning_rate": 4.2585882011885776e-07, "loss": 0.3188, "step": 11883 }, { "epoch": 0.5741894960622312, "grad_norm": 2.942824125289917, "learning_rate": 4.2581050393776876e-07, "loss": 0.364, "step": 11884 }, { "epoch": 0.5742378122433203, "grad_norm": 16.333036422729492, "learning_rate": 4.257621877566797e-07, "loss": 0.3373, "step": 11885 }, { "epoch": 0.5742861284244093, "grad_norm": 2.6171939373016357, "learning_rate": 4.2571387157559063e-07, "loss": 0.2844, "step": 11886 }, { "epoch": 0.5743344446054984, "grad_norm": 2.013319969177246, "learning_rate": 4.256655553945016e-07, "loss": 0.2036, "step": 11887 }, { "epoch": 0.5743827607865875, "grad_norm": 2.6145663261413574, "learning_rate": 4.256172392134125e-07, "loss": 0.3562, "step": 11888 }, { "epoch": 0.5744310769676765, "grad_norm": 1.593787670135498, "learning_rate": 4.255689230323235e-07, "loss": 0.1478, "step": 11889 }, { "epoch": 0.5744793931487655, "grad_norm": 4.415423393249512, "learning_rate": 4.255206068512345e-07, "loss": 0.3415, "step": 11890 }, { "epoch": 0.5745277093298545, "grad_norm": 2.6548821926116943, "learning_rate": 4.2547229067014537e-07, "loss": 0.3973, "step": 11891 }, { "epoch": 0.5745760255109437, "grad_norm": 2.117166519165039, "learning_rate": 4.2542397448905636e-07, "loss": 0.2032, "step": 11892 }, { "epoch": 0.5746243416920327, "grad_norm": 2.865861654281616, "learning_rate": 4.2537565830796735e-07, "loss": 0.3779, "step": 11893 }, { "epoch": 0.5746726578731217, "grad_norm": 3.201143503189087, "learning_rate": 4.253273421268783e-07, "loss": 0.2846, "step": 11894 }, { "epoch": 0.5747209740542107, "grad_norm": 3.5727367401123047, "learning_rate": 4.252790259457892e-07, "loss": 0.3456, "step": 11895 }, { "epoch": 0.5747692902352998, "grad_norm": 2.4155356884002686, "learning_rate": 4.2523070976470016e-07, "loss": 0.2772, "step": 11896 }, { "epoch": 0.5748176064163888, "grad_norm": 3.0214996337890625, "learning_rate": 4.2518239358361115e-07, "loss": 0.294, "step": 11897 }, { "epoch": 0.5748659225974779, "grad_norm": 5.819912910461426, "learning_rate": 4.251340774025221e-07, "loss": 0.2364, "step": 11898 }, { "epoch": 0.574914238778567, "grad_norm": 3.3639824390411377, "learning_rate": 4.25085761221433e-07, "loss": 0.2032, "step": 11899 }, { "epoch": 0.574962554959656, "grad_norm": 2.9153668880462646, "learning_rate": 4.25037445040344e-07, "loss": 0.2781, "step": 11900 }, { "epoch": 0.575010871140745, "grad_norm": 6.011216163635254, "learning_rate": 4.249891288592549e-07, "loss": 0.3425, "step": 11901 }, { "epoch": 0.575059187321834, "grad_norm": 2.920621395111084, "learning_rate": 4.249408126781659e-07, "loss": 0.3388, "step": 11902 }, { "epoch": 0.5751075035029232, "grad_norm": 2.1276485919952393, "learning_rate": 4.248924964970769e-07, "loss": 0.2018, "step": 11903 }, { "epoch": 0.5751558196840122, "grad_norm": 2.6660871505737305, "learning_rate": 4.2484418031598777e-07, "loss": 0.3316, "step": 11904 }, { "epoch": 0.5752041358651012, "grad_norm": 2.640925168991089, "learning_rate": 4.2479586413489876e-07, "loss": 0.3538, "step": 11905 }, { "epoch": 0.5752524520461902, "grad_norm": 33.92139434814453, "learning_rate": 4.2474754795380975e-07, "loss": 0.2504, "step": 11906 }, { "epoch": 0.5753007682272793, "grad_norm": 1.5853052139282227, "learning_rate": 4.2469923177272063e-07, "loss": 0.1703, "step": 11907 }, { "epoch": 0.5753490844083684, "grad_norm": 5.095920085906982, "learning_rate": 4.246509155916316e-07, "loss": 0.4525, "step": 11908 }, { "epoch": 0.5753974005894574, "grad_norm": 3.8991565704345703, "learning_rate": 4.2460259941054256e-07, "loss": 0.3551, "step": 11909 }, { "epoch": 0.5754457167705465, "grad_norm": 5.063607692718506, "learning_rate": 4.2455428322945355e-07, "loss": 0.2172, "step": 11910 }, { "epoch": 0.5754940329516355, "grad_norm": 2.8799338340759277, "learning_rate": 4.245059670483645e-07, "loss": 0.3263, "step": 11911 }, { "epoch": 0.5755423491327245, "grad_norm": 3.784125328063965, "learning_rate": 4.244576508672754e-07, "loss": 0.4017, "step": 11912 }, { "epoch": 0.5755906653138136, "grad_norm": 2.1230361461639404, "learning_rate": 4.244093346861864e-07, "loss": 0.2774, "step": 11913 }, { "epoch": 0.5756389814949027, "grad_norm": 2.1197893619537354, "learning_rate": 4.243610185050973e-07, "loss": 0.2596, "step": 11914 }, { "epoch": 0.5756872976759917, "grad_norm": 3.5376830101013184, "learning_rate": 4.243127023240083e-07, "loss": 0.2596, "step": 11915 }, { "epoch": 0.5757356138570807, "grad_norm": 6.1579790115356445, "learning_rate": 4.242643861429193e-07, "loss": 0.2521, "step": 11916 }, { "epoch": 0.5757839300381697, "grad_norm": 2.733825445175171, "learning_rate": 4.2421606996183016e-07, "loss": 0.3244, "step": 11917 }, { "epoch": 0.5758322462192589, "grad_norm": 5.090945243835449, "learning_rate": 4.2416775378074115e-07, "loss": 0.3425, "step": 11918 }, { "epoch": 0.5758805624003479, "grad_norm": 4.742954730987549, "learning_rate": 4.2411943759965214e-07, "loss": 0.5042, "step": 11919 }, { "epoch": 0.5759288785814369, "grad_norm": 3.2531609535217285, "learning_rate": 4.2407112141856303e-07, "loss": 0.278, "step": 11920 }, { "epoch": 0.575977194762526, "grad_norm": 2.625727891921997, "learning_rate": 4.24022805237474e-07, "loss": 0.395, "step": 11921 }, { "epoch": 0.576025510943615, "grad_norm": 2.492300271987915, "learning_rate": 4.2397448905638495e-07, "loss": 0.2887, "step": 11922 }, { "epoch": 0.5760738271247041, "grad_norm": 2.5420289039611816, "learning_rate": 4.239261728752959e-07, "loss": 0.3271, "step": 11923 }, { "epoch": 0.5761221433057931, "grad_norm": 2.035287380218506, "learning_rate": 4.238778566942069e-07, "loss": 0.1788, "step": 11924 }, { "epoch": 0.5761704594868822, "grad_norm": 2.269474983215332, "learning_rate": 4.238295405131178e-07, "loss": 0.2289, "step": 11925 }, { "epoch": 0.5762187756679712, "grad_norm": 2.648885488510132, "learning_rate": 4.237812243320288e-07, "loss": 0.2618, "step": 11926 }, { "epoch": 0.5762670918490602, "grad_norm": 2.303840398788452, "learning_rate": 4.237329081509397e-07, "loss": 0.1919, "step": 11927 }, { "epoch": 0.5763154080301492, "grad_norm": 9.855355262756348, "learning_rate": 4.236845919698507e-07, "loss": 0.3449, "step": 11928 }, { "epoch": 0.5763637242112384, "grad_norm": 2.073181629180908, "learning_rate": 4.236362757887617e-07, "loss": 0.2241, "step": 11929 }, { "epoch": 0.5764120403923274, "grad_norm": 2.7511653900146484, "learning_rate": 4.2358795960767256e-07, "loss": 0.2617, "step": 11930 }, { "epoch": 0.5764603565734164, "grad_norm": 2.103592872619629, "learning_rate": 4.2353964342658355e-07, "loss": 0.2208, "step": 11931 }, { "epoch": 0.5765086727545055, "grad_norm": 2.228209972381592, "learning_rate": 4.2349132724549454e-07, "loss": 0.1735, "step": 11932 }, { "epoch": 0.5765569889355945, "grad_norm": 2.9943997859954834, "learning_rate": 4.234430110644054e-07, "loss": 0.2282, "step": 11933 }, { "epoch": 0.5766053051166836, "grad_norm": 3.0190165042877197, "learning_rate": 4.233946948833164e-07, "loss": 0.3225, "step": 11934 }, { "epoch": 0.5766536212977726, "grad_norm": 41.46586608886719, "learning_rate": 4.2334637870222735e-07, "loss": 0.2783, "step": 11935 }, { "epoch": 0.5767019374788617, "grad_norm": 6.884325981140137, "learning_rate": 4.232980625211383e-07, "loss": 0.3498, "step": 11936 }, { "epoch": 0.5767502536599507, "grad_norm": 2.1741719245910645, "learning_rate": 4.232497463400493e-07, "loss": 0.2745, "step": 11937 }, { "epoch": 0.5767985698410397, "grad_norm": 7.470798015594482, "learning_rate": 4.232014301589602e-07, "loss": 0.2989, "step": 11938 }, { "epoch": 0.5768468860221289, "grad_norm": 3.2257962226867676, "learning_rate": 4.231531139778712e-07, "loss": 0.3037, "step": 11939 }, { "epoch": 0.5768952022032179, "grad_norm": 22.792896270751953, "learning_rate": 4.231047977967821e-07, "loss": 0.3348, "step": 11940 }, { "epoch": 0.5769435183843069, "grad_norm": 3.1535122394561768, "learning_rate": 4.230564816156931e-07, "loss": 0.3298, "step": 11941 }, { "epoch": 0.5769918345653959, "grad_norm": 5.5711798667907715, "learning_rate": 4.2300816543460407e-07, "loss": 0.3272, "step": 11942 }, { "epoch": 0.577040150746485, "grad_norm": 2.5053398609161377, "learning_rate": 4.2295984925351495e-07, "loss": 0.2888, "step": 11943 }, { "epoch": 0.5770884669275741, "grad_norm": 2.8008058071136475, "learning_rate": 4.2291153307242594e-07, "loss": 0.1708, "step": 11944 }, { "epoch": 0.5771367831086631, "grad_norm": 2.458904981613159, "learning_rate": 4.2286321689133693e-07, "loss": 0.2531, "step": 11945 }, { "epoch": 0.5771850992897521, "grad_norm": 2.603060007095337, "learning_rate": 4.228149007102478e-07, "loss": 0.244, "step": 11946 }, { "epoch": 0.5772334154708412, "grad_norm": 2.2600486278533936, "learning_rate": 4.227665845291588e-07, "loss": 0.2631, "step": 11947 }, { "epoch": 0.5772817316519302, "grad_norm": 2.8982014656066895, "learning_rate": 4.2271826834806975e-07, "loss": 0.2328, "step": 11948 }, { "epoch": 0.5773300478330193, "grad_norm": 2.7548811435699463, "learning_rate": 4.226699521669807e-07, "loss": 0.3424, "step": 11949 }, { "epoch": 0.5773783640141084, "grad_norm": 2.4139580726623535, "learning_rate": 4.226216359858917e-07, "loss": 0.3296, "step": 11950 }, { "epoch": 0.5774266801951974, "grad_norm": 11.861725807189941, "learning_rate": 4.225733198048026e-07, "loss": 0.3351, "step": 11951 }, { "epoch": 0.5774749963762864, "grad_norm": 3.13265323638916, "learning_rate": 4.2252500362371355e-07, "loss": 0.3539, "step": 11952 }, { "epoch": 0.5775233125573754, "grad_norm": 1.9961596727371216, "learning_rate": 4.224766874426245e-07, "loss": 0.1991, "step": 11953 }, { "epoch": 0.5775716287384645, "grad_norm": 9.869168281555176, "learning_rate": 4.224283712615355e-07, "loss": 0.3433, "step": 11954 }, { "epoch": 0.5776199449195536, "grad_norm": 3.8589532375335693, "learning_rate": 4.2238005508044647e-07, "loss": 0.2975, "step": 11955 }, { "epoch": 0.5776682611006426, "grad_norm": 2.3708744049072266, "learning_rate": 4.2233173889935735e-07, "loss": 0.3281, "step": 11956 }, { "epoch": 0.5777165772817316, "grad_norm": 2.5098774433135986, "learning_rate": 4.2228342271826834e-07, "loss": 0.2672, "step": 11957 }, { "epoch": 0.5777648934628207, "grad_norm": 3.0905325412750244, "learning_rate": 4.2223510653717933e-07, "loss": 0.3613, "step": 11958 }, { "epoch": 0.5778132096439097, "grad_norm": 2.8940956592559814, "learning_rate": 4.221867903560902e-07, "loss": 0.3142, "step": 11959 }, { "epoch": 0.5778615258249988, "grad_norm": 2.6263880729675293, "learning_rate": 4.221384741750012e-07, "loss": 0.3143, "step": 11960 }, { "epoch": 0.5779098420060879, "grad_norm": 2.6950738430023193, "learning_rate": 4.2209015799391214e-07, "loss": 0.2689, "step": 11961 }, { "epoch": 0.5779581581871769, "grad_norm": 4.595891952514648, "learning_rate": 4.220418418128231e-07, "loss": 0.2337, "step": 11962 }, { "epoch": 0.5780064743682659, "grad_norm": 2.9494712352752686, "learning_rate": 4.2199352563173407e-07, "loss": 0.4421, "step": 11963 }, { "epoch": 0.5780547905493549, "grad_norm": 4.007003307342529, "learning_rate": 4.21945209450645e-07, "loss": 0.2614, "step": 11964 }, { "epoch": 0.5781031067304441, "grad_norm": 3.1133840084075928, "learning_rate": 4.2189689326955595e-07, "loss": 0.2566, "step": 11965 }, { "epoch": 0.5781514229115331, "grad_norm": 3.71687912940979, "learning_rate": 4.218485770884669e-07, "loss": 0.2042, "step": 11966 }, { "epoch": 0.5781997390926221, "grad_norm": 3.009054183959961, "learning_rate": 4.2180026090737787e-07, "loss": 0.3269, "step": 11967 }, { "epoch": 0.5782480552737111, "grad_norm": 2.5531058311462402, "learning_rate": 4.217519447262888e-07, "loss": 0.3033, "step": 11968 }, { "epoch": 0.5782963714548002, "grad_norm": 1.8910776376724243, "learning_rate": 4.2170362854519975e-07, "loss": 0.1891, "step": 11969 }, { "epoch": 0.5783446876358893, "grad_norm": 2.9154562950134277, "learning_rate": 4.2165531236411074e-07, "loss": 0.3319, "step": 11970 }, { "epoch": 0.5783930038169783, "grad_norm": 2.1698176860809326, "learning_rate": 4.2160699618302173e-07, "loss": 0.2272, "step": 11971 }, { "epoch": 0.5784413199980674, "grad_norm": 2.8630611896514893, "learning_rate": 4.215586800019326e-07, "loss": 0.2784, "step": 11972 }, { "epoch": 0.5784896361791564, "grad_norm": 4.238105773925781, "learning_rate": 4.215103638208436e-07, "loss": 0.3426, "step": 11973 }, { "epoch": 0.5785379523602454, "grad_norm": 2.3875796794891357, "learning_rate": 4.2146204763975454e-07, "loss": 0.2438, "step": 11974 }, { "epoch": 0.5785862685413345, "grad_norm": 2.711252212524414, "learning_rate": 4.214137314586655e-07, "loss": 0.1934, "step": 11975 }, { "epoch": 0.5786345847224236, "grad_norm": 4.800364017486572, "learning_rate": 4.2136541527757647e-07, "loss": 0.3087, "step": 11976 }, { "epoch": 0.5786829009035126, "grad_norm": 3.6676387786865234, "learning_rate": 4.213170990964874e-07, "loss": 0.3108, "step": 11977 }, { "epoch": 0.5787312170846016, "grad_norm": 4.336536884307861, "learning_rate": 4.2126878291539834e-07, "loss": 0.342, "step": 11978 }, { "epoch": 0.5787795332656906, "grad_norm": 3.1302030086517334, "learning_rate": 4.212204667343093e-07, "loss": 0.4327, "step": 11979 }, { "epoch": 0.5788278494467797, "grad_norm": 3.4187588691711426, "learning_rate": 4.2117215055322027e-07, "loss": 0.2206, "step": 11980 }, { "epoch": 0.5788761656278688, "grad_norm": 2.0631237030029297, "learning_rate": 4.211238343721312e-07, "loss": 0.2791, "step": 11981 }, { "epoch": 0.5789244818089578, "grad_norm": 2.5539445877075195, "learning_rate": 4.2107551819104214e-07, "loss": 0.3236, "step": 11982 }, { "epoch": 0.5789727979900469, "grad_norm": 2.7751684188842773, "learning_rate": 4.2102720200995313e-07, "loss": 0.3735, "step": 11983 }, { "epoch": 0.5790211141711359, "grad_norm": 1.8909660577774048, "learning_rate": 4.2097888582886407e-07, "loss": 0.2103, "step": 11984 }, { "epoch": 0.5790694303522249, "grad_norm": 2.697354793548584, "learning_rate": 4.20930569647775e-07, "loss": 0.3075, "step": 11985 }, { "epoch": 0.579117746533314, "grad_norm": 2.6771061420440674, "learning_rate": 4.20882253466686e-07, "loss": 0.3014, "step": 11986 }, { "epoch": 0.5791660627144031, "grad_norm": 2.382510185241699, "learning_rate": 4.208339372855969e-07, "loss": 0.2426, "step": 11987 }, { "epoch": 0.5792143788954921, "grad_norm": 2.003923177719116, "learning_rate": 4.2078562110450787e-07, "loss": 0.1545, "step": 11988 }, { "epoch": 0.5792626950765811, "grad_norm": 2.0313804149627686, "learning_rate": 4.2073730492341886e-07, "loss": 0.2375, "step": 11989 }, { "epoch": 0.5793110112576701, "grad_norm": 7.370113372802734, "learning_rate": 4.206889887423298e-07, "loss": 0.4071, "step": 11990 }, { "epoch": 0.5793593274387593, "grad_norm": 4.82843017578125, "learning_rate": 4.2064067256124074e-07, "loss": 0.2454, "step": 11991 }, { "epoch": 0.5794076436198483, "grad_norm": 3.89200758934021, "learning_rate": 4.205923563801517e-07, "loss": 0.2965, "step": 11992 }, { "epoch": 0.5794559598009373, "grad_norm": 2.9982874393463135, "learning_rate": 4.2054404019906267e-07, "loss": 0.2078, "step": 11993 }, { "epoch": 0.5795042759820264, "grad_norm": 9.303696632385254, "learning_rate": 4.204957240179736e-07, "loss": 0.3131, "step": 11994 }, { "epoch": 0.5795525921631154, "grad_norm": 3.279536724090576, "learning_rate": 4.2044740783688454e-07, "loss": 0.2915, "step": 11995 }, { "epoch": 0.5796009083442045, "grad_norm": 1.9927736520767212, "learning_rate": 4.2039909165579553e-07, "loss": 0.281, "step": 11996 }, { "epoch": 0.5796492245252935, "grad_norm": 2.996523857116699, "learning_rate": 4.2035077547470647e-07, "loss": 0.2609, "step": 11997 }, { "epoch": 0.5796975407063826, "grad_norm": 2.8951284885406494, "learning_rate": 4.203024592936174e-07, "loss": 0.2997, "step": 11998 }, { "epoch": 0.5797458568874716, "grad_norm": 2.7978153228759766, "learning_rate": 4.202541431125284e-07, "loss": 0.3359, "step": 11999 }, { "epoch": 0.5797941730685606, "grad_norm": 6.759016990661621, "learning_rate": 4.202058269314393e-07, "loss": 0.3365, "step": 12000 }, { "epoch": 0.5798424892496498, "grad_norm": 3.2630088329315186, "learning_rate": 4.2015751075035027e-07, "loss": 0.2771, "step": 12001 }, { "epoch": 0.5798908054307388, "grad_norm": 7.618697643280029, "learning_rate": 4.2010919456926126e-07, "loss": 0.1924, "step": 12002 }, { "epoch": 0.5799391216118278, "grad_norm": 1.9104571342468262, "learning_rate": 4.2006087838817214e-07, "loss": 0.1983, "step": 12003 }, { "epoch": 0.5799874377929168, "grad_norm": 5.767370700836182, "learning_rate": 4.2001256220708313e-07, "loss": 0.3527, "step": 12004 }, { "epoch": 0.5800357539740059, "grad_norm": 4.143055438995361, "learning_rate": 4.1996424602599407e-07, "loss": 0.226, "step": 12005 }, { "epoch": 0.5800840701550949, "grad_norm": 2.6163594722747803, "learning_rate": 4.1991592984490506e-07, "loss": 0.3399, "step": 12006 }, { "epoch": 0.580132386336184, "grad_norm": 3.1028947830200195, "learning_rate": 4.19867613663816e-07, "loss": 0.3682, "step": 12007 }, { "epoch": 0.580180702517273, "grad_norm": 1.3083521127700806, "learning_rate": 4.1981929748272694e-07, "loss": 0.1369, "step": 12008 }, { "epoch": 0.5802290186983621, "grad_norm": 3.089644193649292, "learning_rate": 4.197709813016379e-07, "loss": 0.2889, "step": 12009 }, { "epoch": 0.5802773348794511, "grad_norm": 2.2764828205108643, "learning_rate": 4.1972266512054886e-07, "loss": 0.1939, "step": 12010 }, { "epoch": 0.5803256510605401, "grad_norm": 3.225078582763672, "learning_rate": 4.196743489394598e-07, "loss": 0.2889, "step": 12011 }, { "epoch": 0.5803739672416293, "grad_norm": 2.7945594787597656, "learning_rate": 4.196260327583708e-07, "loss": 0.3213, "step": 12012 }, { "epoch": 0.5804222834227183, "grad_norm": 2.7785592079162598, "learning_rate": 4.195777165772817e-07, "loss": 0.426, "step": 12013 }, { "epoch": 0.5804705996038073, "grad_norm": 3.6112430095672607, "learning_rate": 4.1952940039619267e-07, "loss": 0.2458, "step": 12014 }, { "epoch": 0.5805189157848963, "grad_norm": 2.3867716789245605, "learning_rate": 4.1948108421510366e-07, "loss": 0.2253, "step": 12015 }, { "epoch": 0.5805672319659854, "grad_norm": 2.0977604389190674, "learning_rate": 4.1943276803401454e-07, "loss": 0.2048, "step": 12016 }, { "epoch": 0.5806155481470745, "grad_norm": 2.9738271236419678, "learning_rate": 4.1938445185292553e-07, "loss": 0.2183, "step": 12017 }, { "epoch": 0.5806638643281635, "grad_norm": 3.8023111820220947, "learning_rate": 4.1933613567183647e-07, "loss": 0.2424, "step": 12018 }, { "epoch": 0.5807121805092526, "grad_norm": 2.604247570037842, "learning_rate": 4.192878194907474e-07, "loss": 0.3405, "step": 12019 }, { "epoch": 0.5807604966903416, "grad_norm": 3.382974624633789, "learning_rate": 4.192395033096584e-07, "loss": 0.33, "step": 12020 }, { "epoch": 0.5808088128714306, "grad_norm": 2.923346757888794, "learning_rate": 4.1919118712856933e-07, "loss": 0.3531, "step": 12021 }, { "epoch": 0.5808571290525197, "grad_norm": 2.098912239074707, "learning_rate": 4.191428709474803e-07, "loss": 0.2071, "step": 12022 }, { "epoch": 0.5809054452336088, "grad_norm": 5.797247886657715, "learning_rate": 4.1909455476639126e-07, "loss": 0.3741, "step": 12023 }, { "epoch": 0.5809537614146978, "grad_norm": 2.344841957092285, "learning_rate": 4.190462385853022e-07, "loss": 0.2645, "step": 12024 }, { "epoch": 0.5810020775957868, "grad_norm": 2.634622097015381, "learning_rate": 4.189979224042132e-07, "loss": 0.3061, "step": 12025 }, { "epoch": 0.5810503937768758, "grad_norm": 4.536335468292236, "learning_rate": 4.1894960622312407e-07, "loss": 0.4371, "step": 12026 }, { "epoch": 0.581098709957965, "grad_norm": 2.9331188201904297, "learning_rate": 4.1890129004203506e-07, "loss": 0.3863, "step": 12027 }, { "epoch": 0.581147026139054, "grad_norm": 2.907749652862549, "learning_rate": 4.1885297386094605e-07, "loss": 0.2958, "step": 12028 }, { "epoch": 0.581195342320143, "grad_norm": 8.05711555480957, "learning_rate": 4.1880465767985694e-07, "loss": 0.2962, "step": 12029 }, { "epoch": 0.581243658501232, "grad_norm": 4.001550197601318, "learning_rate": 4.1875634149876793e-07, "loss": 0.273, "step": 12030 }, { "epoch": 0.5812919746823211, "grad_norm": 1.3990591764450073, "learning_rate": 4.1870802531767886e-07, "loss": 0.1389, "step": 12031 }, { "epoch": 0.5813402908634101, "grad_norm": 2.3649346828460693, "learning_rate": 4.186597091365898e-07, "loss": 0.2203, "step": 12032 }, { "epoch": 0.5813886070444992, "grad_norm": 3.7972397804260254, "learning_rate": 4.186113929555008e-07, "loss": 0.3909, "step": 12033 }, { "epoch": 0.5814369232255883, "grad_norm": 5.620264530181885, "learning_rate": 4.1856307677441173e-07, "loss": 0.2323, "step": 12034 }, { "epoch": 0.5814852394066773, "grad_norm": 3.1314313411712646, "learning_rate": 4.1851476059332267e-07, "loss": 0.4602, "step": 12035 }, { "epoch": 0.5815335555877663, "grad_norm": 2.753434896469116, "learning_rate": 4.1846644441223366e-07, "loss": 0.2702, "step": 12036 }, { "epoch": 0.5815818717688553, "grad_norm": 3.4172046184539795, "learning_rate": 4.184181282311446e-07, "loss": 0.2604, "step": 12037 }, { "epoch": 0.5816301879499445, "grad_norm": 2.087343692779541, "learning_rate": 4.183698120500556e-07, "loss": 0.2012, "step": 12038 }, { "epoch": 0.5816785041310335, "grad_norm": 2.542470932006836, "learning_rate": 4.1832149586896647e-07, "loss": 0.2579, "step": 12039 }, { "epoch": 0.5817268203121225, "grad_norm": 1.7711987495422363, "learning_rate": 4.1827317968787746e-07, "loss": 0.1843, "step": 12040 }, { "epoch": 0.5817751364932116, "grad_norm": 3.7626731395721436, "learning_rate": 4.1822486350678845e-07, "loss": 0.3419, "step": 12041 }, { "epoch": 0.5818234526743006, "grad_norm": 3.7299628257751465, "learning_rate": 4.1817654732569933e-07, "loss": 0.2523, "step": 12042 }, { "epoch": 0.5818717688553897, "grad_norm": 2.6445858478546143, "learning_rate": 4.181282311446103e-07, "loss": 0.2363, "step": 12043 }, { "epoch": 0.5819200850364787, "grad_norm": 3.5045289993286133, "learning_rate": 4.1807991496352126e-07, "loss": 0.3531, "step": 12044 }, { "epoch": 0.5819684012175678, "grad_norm": 4.560112476348877, "learning_rate": 4.180315987824322e-07, "loss": 0.3544, "step": 12045 }, { "epoch": 0.5820167173986568, "grad_norm": 3.649758815765381, "learning_rate": 4.179832826013432e-07, "loss": 0.3554, "step": 12046 }, { "epoch": 0.5820650335797458, "grad_norm": 3.4634904861450195, "learning_rate": 4.179349664202541e-07, "loss": 0.3119, "step": 12047 }, { "epoch": 0.582113349760835, "grad_norm": 4.1743245124816895, "learning_rate": 4.1788665023916506e-07, "loss": 0.2974, "step": 12048 }, { "epoch": 0.582161665941924, "grad_norm": 2.6763906478881836, "learning_rate": 4.1783833405807605e-07, "loss": 0.4115, "step": 12049 }, { "epoch": 0.582209982123013, "grad_norm": 3.8680198192596436, "learning_rate": 4.17790017876987e-07, "loss": 0.2796, "step": 12050 }, { "epoch": 0.582258298304102, "grad_norm": 3.417343854904175, "learning_rate": 4.1774170169589793e-07, "loss": 0.2639, "step": 12051 }, { "epoch": 0.582306614485191, "grad_norm": 3.0092570781707764, "learning_rate": 4.1769338551480886e-07, "loss": 0.267, "step": 12052 }, { "epoch": 0.5823549306662802, "grad_norm": 4.43224573135376, "learning_rate": 4.1764506933371985e-07, "loss": 0.4054, "step": 12053 }, { "epoch": 0.5824032468473692, "grad_norm": 3.019082546234131, "learning_rate": 4.1759675315263084e-07, "loss": 0.3766, "step": 12054 }, { "epoch": 0.5824515630284582, "grad_norm": 2.7797558307647705, "learning_rate": 4.1754843697154173e-07, "loss": 0.4305, "step": 12055 }, { "epoch": 0.5824998792095473, "grad_norm": 2.545466899871826, "learning_rate": 4.175001207904527e-07, "loss": 0.2144, "step": 12056 }, { "epoch": 0.5825481953906363, "grad_norm": 2.0583174228668213, "learning_rate": 4.1745180460936366e-07, "loss": 0.2102, "step": 12057 }, { "epoch": 0.5825965115717253, "grad_norm": 2.2708492279052734, "learning_rate": 4.174034884282746e-07, "loss": 0.2612, "step": 12058 }, { "epoch": 0.5826448277528145, "grad_norm": 1.806723952293396, "learning_rate": 4.173551722471856e-07, "loss": 0.1723, "step": 12059 }, { "epoch": 0.5826931439339035, "grad_norm": 2.485072135925293, "learning_rate": 4.173068560660965e-07, "loss": 0.2476, "step": 12060 }, { "epoch": 0.5827414601149925, "grad_norm": 3.3274331092834473, "learning_rate": 4.1725853988500746e-07, "loss": 0.1759, "step": 12061 }, { "epoch": 0.5827897762960815, "grad_norm": 1.7324925661087036, "learning_rate": 4.172102237039184e-07, "loss": 0.1763, "step": 12062 }, { "epoch": 0.5828380924771706, "grad_norm": 2.343071699142456, "learning_rate": 4.171619075228294e-07, "loss": 0.3404, "step": 12063 }, { "epoch": 0.5828864086582597, "grad_norm": 2.9240078926086426, "learning_rate": 4.171135913417403e-07, "loss": 0.4514, "step": 12064 }, { "epoch": 0.5829347248393487, "grad_norm": 2.9120850563049316, "learning_rate": 4.1706527516065126e-07, "loss": 0.368, "step": 12065 }, { "epoch": 0.5829830410204377, "grad_norm": 2.7227466106414795, "learning_rate": 4.1701695897956225e-07, "loss": 0.2965, "step": 12066 }, { "epoch": 0.5830313572015268, "grad_norm": 2.761509656906128, "learning_rate": 4.169686427984732e-07, "loss": 0.3988, "step": 12067 }, { "epoch": 0.5830796733826158, "grad_norm": 2.5107650756835938, "learning_rate": 4.169203266173841e-07, "loss": 0.2925, "step": 12068 }, { "epoch": 0.5831279895637049, "grad_norm": 2.508835554122925, "learning_rate": 4.168720104362951e-07, "loss": 0.3006, "step": 12069 }, { "epoch": 0.583176305744794, "grad_norm": 2.1968605518341064, "learning_rate": 4.16823694255206e-07, "loss": 0.237, "step": 12070 }, { "epoch": 0.583224621925883, "grad_norm": 4.5523905754089355, "learning_rate": 4.16775378074117e-07, "loss": 0.2034, "step": 12071 }, { "epoch": 0.583272938106972, "grad_norm": 3.520881175994873, "learning_rate": 4.16727061893028e-07, "loss": 0.3046, "step": 12072 }, { "epoch": 0.583321254288061, "grad_norm": 2.6448326110839844, "learning_rate": 4.166787457119389e-07, "loss": 0.2172, "step": 12073 }, { "epoch": 0.5833695704691502, "grad_norm": 2.2682178020477295, "learning_rate": 4.1663042953084986e-07, "loss": 0.2726, "step": 12074 }, { "epoch": 0.5834178866502392, "grad_norm": 2.38915753364563, "learning_rate": 4.165821133497608e-07, "loss": 0.2255, "step": 12075 }, { "epoch": 0.5834662028313282, "grad_norm": 19.345439910888672, "learning_rate": 4.165337971686718e-07, "loss": 0.4052, "step": 12076 }, { "epoch": 0.5835145190124172, "grad_norm": 1.7257295846939087, "learning_rate": 4.164854809875827e-07, "loss": 0.1919, "step": 12077 }, { "epoch": 0.5835628351935063, "grad_norm": 5.5953545570373535, "learning_rate": 4.1643716480649366e-07, "loss": 0.2645, "step": 12078 }, { "epoch": 0.5836111513745954, "grad_norm": 3.537101984024048, "learning_rate": 4.1638884862540465e-07, "loss": 0.3269, "step": 12079 }, { "epoch": 0.5836594675556844, "grad_norm": 3.2112526893615723, "learning_rate": 4.163405324443156e-07, "loss": 0.2492, "step": 12080 }, { "epoch": 0.5837077837367735, "grad_norm": 3.473742961883545, "learning_rate": 4.162922162632265e-07, "loss": 0.3099, "step": 12081 }, { "epoch": 0.5837560999178625, "grad_norm": 2.6756997108459473, "learning_rate": 4.162439000821375e-07, "loss": 0.2923, "step": 12082 }, { "epoch": 0.5838044160989515, "grad_norm": 2.921539306640625, "learning_rate": 4.161955839010484e-07, "loss": 0.3741, "step": 12083 }, { "epoch": 0.5838527322800405, "grad_norm": 1.2587875127792358, "learning_rate": 4.161472677199594e-07, "loss": 0.1359, "step": 12084 }, { "epoch": 0.5839010484611297, "grad_norm": 3.1144723892211914, "learning_rate": 4.160989515388704e-07, "loss": 0.2554, "step": 12085 }, { "epoch": 0.5839493646422187, "grad_norm": 3.2776949405670166, "learning_rate": 4.1605063535778126e-07, "loss": 0.3228, "step": 12086 }, { "epoch": 0.5839976808233077, "grad_norm": 2.9045469760894775, "learning_rate": 4.1600231917669225e-07, "loss": 0.2084, "step": 12087 }, { "epoch": 0.5840459970043967, "grad_norm": 3.0094690322875977, "learning_rate": 4.159540029956032e-07, "loss": 0.2846, "step": 12088 }, { "epoch": 0.5840943131854858, "grad_norm": 2.6773178577423096, "learning_rate": 4.159056868145142e-07, "loss": 0.2425, "step": 12089 }, { "epoch": 0.5841426293665749, "grad_norm": 3.5313847064971924, "learning_rate": 4.158573706334251e-07, "loss": 0.2366, "step": 12090 }, { "epoch": 0.5841909455476639, "grad_norm": 2.0809555053710938, "learning_rate": 4.1580905445233605e-07, "loss": 0.2562, "step": 12091 }, { "epoch": 0.584239261728753, "grad_norm": 4.013155937194824, "learning_rate": 4.1576073827124704e-07, "loss": 0.3435, "step": 12092 }, { "epoch": 0.584287577909842, "grad_norm": 3.01274037361145, "learning_rate": 4.15712422090158e-07, "loss": 0.2584, "step": 12093 }, { "epoch": 0.584335894090931, "grad_norm": 3.416640043258667, "learning_rate": 4.156641059090689e-07, "loss": 0.4472, "step": 12094 }, { "epoch": 0.5843842102720201, "grad_norm": 3.426119089126587, "learning_rate": 4.156157897279799e-07, "loss": 0.3071, "step": 12095 }, { "epoch": 0.5844325264531092, "grad_norm": 3.088221788406372, "learning_rate": 4.155674735468908e-07, "loss": 0.148, "step": 12096 }, { "epoch": 0.5844808426341982, "grad_norm": 2.4244329929351807, "learning_rate": 4.155191573658018e-07, "loss": 0.1935, "step": 12097 }, { "epoch": 0.5845291588152872, "grad_norm": 3.6313507556915283, "learning_rate": 4.1547084118471277e-07, "loss": 0.4029, "step": 12098 }, { "epoch": 0.5845774749963762, "grad_norm": 3.1929335594177246, "learning_rate": 4.1542252500362366e-07, "loss": 0.3936, "step": 12099 }, { "epoch": 0.5846257911774654, "grad_norm": 4.204725742340088, "learning_rate": 4.1537420882253465e-07, "loss": 0.3109, "step": 12100 }, { "epoch": 0.5846741073585544, "grad_norm": 5.843654632568359, "learning_rate": 4.153258926414456e-07, "loss": 0.2185, "step": 12101 }, { "epoch": 0.5847224235396434, "grad_norm": 4.084404468536377, "learning_rate": 4.152775764603566e-07, "loss": 0.2167, "step": 12102 }, { "epoch": 0.5847707397207325, "grad_norm": 3.0081281661987305, "learning_rate": 4.152292602792675e-07, "loss": 0.3286, "step": 12103 }, { "epoch": 0.5848190559018215, "grad_norm": 2.3908040523529053, "learning_rate": 4.1518094409817845e-07, "loss": 0.2295, "step": 12104 }, { "epoch": 0.5848673720829106, "grad_norm": 9.419170379638672, "learning_rate": 4.1513262791708944e-07, "loss": 0.2925, "step": 12105 }, { "epoch": 0.5849156882639996, "grad_norm": 3.2776801586151123, "learning_rate": 4.150843117360004e-07, "loss": 0.3075, "step": 12106 }, { "epoch": 0.5849640044450887, "grad_norm": 2.4580063819885254, "learning_rate": 4.150359955549113e-07, "loss": 0.2551, "step": 12107 }, { "epoch": 0.5850123206261777, "grad_norm": 2.456099271774292, "learning_rate": 4.149876793738223e-07, "loss": 0.1885, "step": 12108 }, { "epoch": 0.5850606368072667, "grad_norm": 2.2161624431610107, "learning_rate": 4.149393631927332e-07, "loss": 0.2403, "step": 12109 }, { "epoch": 0.5851089529883557, "grad_norm": 2.1520206928253174, "learning_rate": 4.148910470116442e-07, "loss": 0.2784, "step": 12110 }, { "epoch": 0.5851572691694449, "grad_norm": 4.084197998046875, "learning_rate": 4.1484273083055517e-07, "loss": 0.2473, "step": 12111 }, { "epoch": 0.5852055853505339, "grad_norm": 2.7385590076446533, "learning_rate": 4.1479441464946605e-07, "loss": 0.2496, "step": 12112 }, { "epoch": 0.5852539015316229, "grad_norm": 2.026190996170044, "learning_rate": 4.1474609846837704e-07, "loss": 0.2486, "step": 12113 }, { "epoch": 0.585302217712712, "grad_norm": 2.1287474632263184, "learning_rate": 4.14697782287288e-07, "loss": 0.3161, "step": 12114 }, { "epoch": 0.585350533893801, "grad_norm": 2.6714351177215576, "learning_rate": 4.146494661061989e-07, "loss": 0.3187, "step": 12115 }, { "epoch": 0.5853988500748901, "grad_norm": 3.408905029296875, "learning_rate": 4.146011499251099e-07, "loss": 0.3008, "step": 12116 }, { "epoch": 0.5854471662559791, "grad_norm": 2.4739437103271484, "learning_rate": 4.1455283374402085e-07, "loss": 0.2985, "step": 12117 }, { "epoch": 0.5854954824370682, "grad_norm": 2.2340281009674072, "learning_rate": 4.1450451756293184e-07, "loss": 0.2095, "step": 12118 }, { "epoch": 0.5855437986181572, "grad_norm": 2.9378368854522705, "learning_rate": 4.144562013818428e-07, "loss": 0.3901, "step": 12119 }, { "epoch": 0.5855921147992462, "grad_norm": 2.622002124786377, "learning_rate": 4.144078852007537e-07, "loss": 0.3419, "step": 12120 }, { "epoch": 0.5856404309803354, "grad_norm": 3.3413617610931396, "learning_rate": 4.143595690196647e-07, "loss": 0.2467, "step": 12121 }, { "epoch": 0.5856887471614244, "grad_norm": 3.0190696716308594, "learning_rate": 4.143112528385756e-07, "loss": 0.4229, "step": 12122 }, { "epoch": 0.5857370633425134, "grad_norm": 2.537010908126831, "learning_rate": 4.142629366574866e-07, "loss": 0.3014, "step": 12123 }, { "epoch": 0.5857853795236024, "grad_norm": 3.0711400508880615, "learning_rate": 4.1421462047639757e-07, "loss": 0.381, "step": 12124 }, { "epoch": 0.5858336957046915, "grad_norm": 2.589021921157837, "learning_rate": 4.1416630429530845e-07, "loss": 0.2567, "step": 12125 }, { "epoch": 0.5858820118857806, "grad_norm": 2.968266487121582, "learning_rate": 4.1411798811421944e-07, "loss": 0.3984, "step": 12126 }, { "epoch": 0.5859303280668696, "grad_norm": 3.177851438522339, "learning_rate": 4.140696719331304e-07, "loss": 0.3814, "step": 12127 }, { "epoch": 0.5859786442479586, "grad_norm": 1.6380336284637451, "learning_rate": 4.140213557520413e-07, "loss": 0.1588, "step": 12128 }, { "epoch": 0.5860269604290477, "grad_norm": 2.5437896251678467, "learning_rate": 4.139730395709523e-07, "loss": 0.2492, "step": 12129 }, { "epoch": 0.5860752766101367, "grad_norm": 3.279822826385498, "learning_rate": 4.1392472338986324e-07, "loss": 0.4413, "step": 12130 }, { "epoch": 0.5861235927912258, "grad_norm": 5.992620944976807, "learning_rate": 4.138764072087742e-07, "loss": 0.2535, "step": 12131 }, { "epoch": 0.5861719089723149, "grad_norm": 1.5338534116744995, "learning_rate": 4.1382809102768517e-07, "loss": 0.1456, "step": 12132 }, { "epoch": 0.5862202251534039, "grad_norm": 10.442559242248535, "learning_rate": 4.137797748465961e-07, "loss": 0.3222, "step": 12133 }, { "epoch": 0.5862685413344929, "grad_norm": 1.9930822849273682, "learning_rate": 4.137314586655071e-07, "loss": 0.2235, "step": 12134 }, { "epoch": 0.5863168575155819, "grad_norm": 3.1004433631896973, "learning_rate": 4.13683142484418e-07, "loss": 0.2493, "step": 12135 }, { "epoch": 0.586365173696671, "grad_norm": 1.991328239440918, "learning_rate": 4.1363482630332897e-07, "loss": 0.2255, "step": 12136 }, { "epoch": 0.5864134898777601, "grad_norm": 2.265964984893799, "learning_rate": 4.1358651012223996e-07, "loss": 0.2195, "step": 12137 }, { "epoch": 0.5864618060588491, "grad_norm": 2.7584962844848633, "learning_rate": 4.1353819394115085e-07, "loss": 0.3317, "step": 12138 }, { "epoch": 0.5865101222399381, "grad_norm": 3.929474353790283, "learning_rate": 4.1348987776006184e-07, "loss": 0.3797, "step": 12139 }, { "epoch": 0.5865584384210272, "grad_norm": 2.7603840827941895, "learning_rate": 4.134415615789728e-07, "loss": 0.2681, "step": 12140 }, { "epoch": 0.5866067546021162, "grad_norm": 2.3104872703552246, "learning_rate": 4.133932453978837e-07, "loss": 0.3207, "step": 12141 }, { "epoch": 0.5866550707832053, "grad_norm": 3.0730338096618652, "learning_rate": 4.133449292167947e-07, "loss": 0.2335, "step": 12142 }, { "epoch": 0.5867033869642944, "grad_norm": 2.3707187175750732, "learning_rate": 4.1329661303570564e-07, "loss": 0.2975, "step": 12143 }, { "epoch": 0.5867517031453834, "grad_norm": 14.91954517364502, "learning_rate": 4.132482968546166e-07, "loss": 0.2828, "step": 12144 }, { "epoch": 0.5868000193264724, "grad_norm": 8.215417861938477, "learning_rate": 4.1319998067352757e-07, "loss": 0.2864, "step": 12145 }, { "epoch": 0.5868483355075614, "grad_norm": 2.370112895965576, "learning_rate": 4.131516644924385e-07, "loss": 0.2941, "step": 12146 }, { "epoch": 0.5868966516886506, "grad_norm": 3.560324192047119, "learning_rate": 4.1310334831134944e-07, "loss": 0.2646, "step": 12147 }, { "epoch": 0.5869449678697396, "grad_norm": 2.499504566192627, "learning_rate": 4.130550321302604e-07, "loss": 0.2847, "step": 12148 }, { "epoch": 0.5869932840508286, "grad_norm": 4.718276500701904, "learning_rate": 4.1300671594917137e-07, "loss": 0.3499, "step": 12149 }, { "epoch": 0.5870416002319176, "grad_norm": 1.8133912086486816, "learning_rate": 4.1295839976808236e-07, "loss": 0.1471, "step": 12150 }, { "epoch": 0.5870899164130067, "grad_norm": 3.464707136154175, "learning_rate": 4.1291008358699324e-07, "loss": 0.3094, "step": 12151 }, { "epoch": 0.5871382325940958, "grad_norm": 2.6090312004089355, "learning_rate": 4.1286176740590423e-07, "loss": 0.4101, "step": 12152 }, { "epoch": 0.5871865487751848, "grad_norm": 3.6628024578094482, "learning_rate": 4.1281345122481517e-07, "loss": 0.3157, "step": 12153 }, { "epoch": 0.5872348649562739, "grad_norm": 1.8914437294006348, "learning_rate": 4.127651350437261e-07, "loss": 0.1651, "step": 12154 }, { "epoch": 0.5872831811373629, "grad_norm": 3.2956483364105225, "learning_rate": 4.127168188626371e-07, "loss": 0.2948, "step": 12155 }, { "epoch": 0.5873314973184519, "grad_norm": 2.8747658729553223, "learning_rate": 4.1266850268154804e-07, "loss": 0.2991, "step": 12156 }, { "epoch": 0.587379813499541, "grad_norm": 2.953009605407715, "learning_rate": 4.1262018650045897e-07, "loss": 0.3238, "step": 12157 }, { "epoch": 0.5874281296806301, "grad_norm": 2.366567611694336, "learning_rate": 4.1257187031936996e-07, "loss": 0.2789, "step": 12158 }, { "epoch": 0.5874764458617191, "grad_norm": 2.6465086936950684, "learning_rate": 4.125235541382809e-07, "loss": 0.3355, "step": 12159 }, { "epoch": 0.5875247620428081, "grad_norm": 4.004485130310059, "learning_rate": 4.1247523795719184e-07, "loss": 0.2188, "step": 12160 }, { "epoch": 0.5875730782238971, "grad_norm": 2.968439817428589, "learning_rate": 4.124269217761028e-07, "loss": 0.2807, "step": 12161 }, { "epoch": 0.5876213944049862, "grad_norm": 2.386914014816284, "learning_rate": 4.1237860559501376e-07, "loss": 0.2757, "step": 12162 }, { "epoch": 0.5876697105860753, "grad_norm": 2.510204315185547, "learning_rate": 4.123302894139247e-07, "loss": 0.2606, "step": 12163 }, { "epoch": 0.5877180267671643, "grad_norm": 5.935659885406494, "learning_rate": 4.1228197323283564e-07, "loss": 0.2626, "step": 12164 }, { "epoch": 0.5877663429482534, "grad_norm": 2.928192615509033, "learning_rate": 4.1223365705174663e-07, "loss": 0.3167, "step": 12165 }, { "epoch": 0.5878146591293424, "grad_norm": 10.323373794555664, "learning_rate": 4.121853408706575e-07, "loss": 0.3841, "step": 12166 }, { "epoch": 0.5878629753104314, "grad_norm": 2.0603771209716797, "learning_rate": 4.121370246895685e-07, "loss": 0.359, "step": 12167 }, { "epoch": 0.5879112914915205, "grad_norm": 2.8013012409210205, "learning_rate": 4.120887085084795e-07, "loss": 0.319, "step": 12168 }, { "epoch": 0.5879596076726096, "grad_norm": 8.628283500671387, "learning_rate": 4.1204039232739043e-07, "loss": 0.2554, "step": 12169 }, { "epoch": 0.5880079238536986, "grad_norm": 1.4187546968460083, "learning_rate": 4.1199207614630137e-07, "loss": 0.1967, "step": 12170 }, { "epoch": 0.5880562400347876, "grad_norm": 2.1970911026000977, "learning_rate": 4.1194375996521236e-07, "loss": 0.2592, "step": 12171 }, { "epoch": 0.5881045562158767, "grad_norm": 3.3035669326782227, "learning_rate": 4.118954437841233e-07, "loss": 0.4286, "step": 12172 }, { "epoch": 0.5881528723969658, "grad_norm": 7.052720546722412, "learning_rate": 4.1184712760303423e-07, "loss": 0.2372, "step": 12173 }, { "epoch": 0.5882011885780548, "grad_norm": 2.6147122383117676, "learning_rate": 4.1179881142194517e-07, "loss": 0.3093, "step": 12174 }, { "epoch": 0.5882495047591438, "grad_norm": 4.831766605377197, "learning_rate": 4.1175049524085616e-07, "loss": 0.2529, "step": 12175 }, { "epoch": 0.5882978209402329, "grad_norm": 1.916547417640686, "learning_rate": 4.117021790597671e-07, "loss": 0.1619, "step": 12176 }, { "epoch": 0.5883461371213219, "grad_norm": 2.8889591693878174, "learning_rate": 4.1165386287867804e-07, "loss": 0.3351, "step": 12177 }, { "epoch": 0.588394453302411, "grad_norm": 2.2972354888916016, "learning_rate": 4.11605546697589e-07, "loss": 0.2692, "step": 12178 }, { "epoch": 0.5884427694835, "grad_norm": 2.4591870307922363, "learning_rate": 4.115572305164999e-07, "loss": 0.3272, "step": 12179 }, { "epoch": 0.5884910856645891, "grad_norm": 2.6602964401245117, "learning_rate": 4.115089143354109e-07, "loss": 0.2304, "step": 12180 }, { "epoch": 0.5885394018456781, "grad_norm": 7.4469428062438965, "learning_rate": 4.114605981543219e-07, "loss": 0.2069, "step": 12181 }, { "epoch": 0.5885877180267671, "grad_norm": 2.562385082244873, "learning_rate": 4.114122819732328e-07, "loss": 0.2743, "step": 12182 }, { "epoch": 0.5886360342078563, "grad_norm": 7.727176666259766, "learning_rate": 4.1136396579214377e-07, "loss": 0.4385, "step": 12183 }, { "epoch": 0.5886843503889453, "grad_norm": 2.138529062271118, "learning_rate": 4.1131564961105476e-07, "loss": 0.1964, "step": 12184 }, { "epoch": 0.5887326665700343, "grad_norm": 2.9218645095825195, "learning_rate": 4.112673334299657e-07, "loss": 0.2078, "step": 12185 }, { "epoch": 0.5887809827511233, "grad_norm": 3.25693678855896, "learning_rate": 4.1121901724887663e-07, "loss": 0.3363, "step": 12186 }, { "epoch": 0.5888292989322124, "grad_norm": 2.4214417934417725, "learning_rate": 4.1117070106778757e-07, "loss": 0.2637, "step": 12187 }, { "epoch": 0.5888776151133014, "grad_norm": 5.883080005645752, "learning_rate": 4.1112238488669856e-07, "loss": 0.1568, "step": 12188 }, { "epoch": 0.5889259312943905, "grad_norm": 2.8216195106506348, "learning_rate": 4.110740687056095e-07, "loss": 0.3165, "step": 12189 }, { "epoch": 0.5889742474754796, "grad_norm": 3.0046234130859375, "learning_rate": 4.1102575252452043e-07, "loss": 0.252, "step": 12190 }, { "epoch": 0.5890225636565686, "grad_norm": 6.4524078369140625, "learning_rate": 4.109774363434314e-07, "loss": 0.2946, "step": 12191 }, { "epoch": 0.5890708798376576, "grad_norm": 2.3607358932495117, "learning_rate": 4.109291201623423e-07, "loss": 0.2397, "step": 12192 }, { "epoch": 0.5891191960187466, "grad_norm": 2.2241034507751465, "learning_rate": 4.108808039812533e-07, "loss": 0.2567, "step": 12193 }, { "epoch": 0.5891675121998358, "grad_norm": 3.12199068069458, "learning_rate": 4.108324878001643e-07, "loss": 0.3168, "step": 12194 }, { "epoch": 0.5892158283809248, "grad_norm": 14.130622863769531, "learning_rate": 4.1078417161907517e-07, "loss": 0.3459, "step": 12195 }, { "epoch": 0.5892641445620138, "grad_norm": 4.07889986038208, "learning_rate": 4.1073585543798616e-07, "loss": 0.4324, "step": 12196 }, { "epoch": 0.5893124607431028, "grad_norm": 2.270822286605835, "learning_rate": 4.1068753925689715e-07, "loss": 0.2704, "step": 12197 }, { "epoch": 0.5893607769241919, "grad_norm": 2.9419877529144287, "learning_rate": 4.1063922307580804e-07, "loss": 0.363, "step": 12198 }, { "epoch": 0.589409093105281, "grad_norm": 1.9889620542526245, "learning_rate": 4.10590906894719e-07, "loss": 0.2144, "step": 12199 }, { "epoch": 0.58945740928637, "grad_norm": 3.3628287315368652, "learning_rate": 4.1054259071362996e-07, "loss": 0.3362, "step": 12200 }, { "epoch": 0.589505725467459, "grad_norm": 2.5570220947265625, "learning_rate": 4.1049427453254095e-07, "loss": 0.3207, "step": 12201 }, { "epoch": 0.5895540416485481, "grad_norm": 3.252758264541626, "learning_rate": 4.104459583514519e-07, "loss": 0.2401, "step": 12202 }, { "epoch": 0.5896023578296371, "grad_norm": 3.260214328765869, "learning_rate": 4.1039764217036283e-07, "loss": 0.2564, "step": 12203 }, { "epoch": 0.5896506740107262, "grad_norm": 2.1256439685821533, "learning_rate": 4.103493259892738e-07, "loss": 0.2842, "step": 12204 }, { "epoch": 0.5896989901918153, "grad_norm": 6.090517997741699, "learning_rate": 4.103010098081847e-07, "loss": 0.383, "step": 12205 }, { "epoch": 0.5897473063729043, "grad_norm": 2.947739601135254, "learning_rate": 4.102526936270957e-07, "loss": 0.3065, "step": 12206 }, { "epoch": 0.5897956225539933, "grad_norm": 4.000668048858643, "learning_rate": 4.102043774460067e-07, "loss": 0.2964, "step": 12207 }, { "epoch": 0.5898439387350823, "grad_norm": 4.742715835571289, "learning_rate": 4.1015606126491757e-07, "loss": 0.2885, "step": 12208 }, { "epoch": 0.5898922549161715, "grad_norm": 2.556589126586914, "learning_rate": 4.1010774508382856e-07, "loss": 0.2969, "step": 12209 }, { "epoch": 0.5899405710972605, "grad_norm": 2.163238525390625, "learning_rate": 4.1005942890273955e-07, "loss": 0.2455, "step": 12210 }, { "epoch": 0.5899888872783495, "grad_norm": 2.7524302005767822, "learning_rate": 4.1001111272165043e-07, "loss": 0.2746, "step": 12211 }, { "epoch": 0.5900372034594386, "grad_norm": 3.3382649421691895, "learning_rate": 4.099627965405614e-07, "loss": 0.3434, "step": 12212 }, { "epoch": 0.5900855196405276, "grad_norm": 3.8199563026428223, "learning_rate": 4.0991448035947236e-07, "loss": 0.3974, "step": 12213 }, { "epoch": 0.5901338358216167, "grad_norm": 2.145977258682251, "learning_rate": 4.098661641783833e-07, "loss": 0.2069, "step": 12214 }, { "epoch": 0.5901821520027057, "grad_norm": 2.0951473712921143, "learning_rate": 4.098178479972943e-07, "loss": 0.2115, "step": 12215 }, { "epoch": 0.5902304681837948, "grad_norm": 2.2704355716705322, "learning_rate": 4.097695318162052e-07, "loss": 0.2236, "step": 12216 }, { "epoch": 0.5902787843648838, "grad_norm": 7.821933746337891, "learning_rate": 4.097212156351162e-07, "loss": 0.2423, "step": 12217 }, { "epoch": 0.5903271005459728, "grad_norm": 3.6662275791168213, "learning_rate": 4.096728994540271e-07, "loss": 0.1925, "step": 12218 }, { "epoch": 0.5903754167270618, "grad_norm": 3.5428121089935303, "learning_rate": 4.096245832729381e-07, "loss": 0.2172, "step": 12219 }, { "epoch": 0.590423732908151, "grad_norm": 5.300175666809082, "learning_rate": 4.095762670918491e-07, "loss": 0.387, "step": 12220 }, { "epoch": 0.59047204908924, "grad_norm": 3.309992790222168, "learning_rate": 4.0952795091075996e-07, "loss": 0.4288, "step": 12221 }, { "epoch": 0.590520365270329, "grad_norm": 4.249866962432861, "learning_rate": 4.0947963472967095e-07, "loss": 0.2823, "step": 12222 }, { "epoch": 0.590568681451418, "grad_norm": 2.102761745452881, "learning_rate": 4.0943131854858194e-07, "loss": 0.2219, "step": 12223 }, { "epoch": 0.5906169976325071, "grad_norm": 1.9543625116348267, "learning_rate": 4.0938300236749283e-07, "loss": 0.1921, "step": 12224 }, { "epoch": 0.5906653138135962, "grad_norm": 2.5235326290130615, "learning_rate": 4.093346861864038e-07, "loss": 0.2902, "step": 12225 }, { "epoch": 0.5907136299946852, "grad_norm": 4.5780253410339355, "learning_rate": 4.0928637000531476e-07, "loss": 0.3697, "step": 12226 }, { "epoch": 0.5907619461757743, "grad_norm": 4.390435218811035, "learning_rate": 4.092380538242257e-07, "loss": 0.2896, "step": 12227 }, { "epoch": 0.5908102623568633, "grad_norm": 2.4697425365448, "learning_rate": 4.091897376431367e-07, "loss": 0.2874, "step": 12228 }, { "epoch": 0.5908585785379523, "grad_norm": 3.9200878143310547, "learning_rate": 4.091414214620476e-07, "loss": 0.4219, "step": 12229 }, { "epoch": 0.5909068947190415, "grad_norm": 3.2656707763671875, "learning_rate": 4.0909310528095856e-07, "loss": 0.2642, "step": 12230 }, { "epoch": 0.5909552109001305, "grad_norm": 2.668172597885132, "learning_rate": 4.090447890998695e-07, "loss": 0.2095, "step": 12231 }, { "epoch": 0.5910035270812195, "grad_norm": 3.6436574459075928, "learning_rate": 4.089964729187805e-07, "loss": 0.2319, "step": 12232 }, { "epoch": 0.5910518432623085, "grad_norm": 2.1185154914855957, "learning_rate": 4.089481567376915e-07, "loss": 0.1967, "step": 12233 }, { "epoch": 0.5911001594433976, "grad_norm": 17.028820037841797, "learning_rate": 4.0889984055660236e-07, "loss": 0.2272, "step": 12234 }, { "epoch": 0.5911484756244867, "grad_norm": 2.7512753009796143, "learning_rate": 4.0885152437551335e-07, "loss": 0.3207, "step": 12235 }, { "epoch": 0.5911967918055757, "grad_norm": 2.9540202617645264, "learning_rate": 4.0880320819442434e-07, "loss": 0.196, "step": 12236 }, { "epoch": 0.5912451079866647, "grad_norm": 2.071176290512085, "learning_rate": 4.087548920133352e-07, "loss": 0.1879, "step": 12237 }, { "epoch": 0.5912934241677538, "grad_norm": 2.939566135406494, "learning_rate": 4.087065758322462e-07, "loss": 0.3802, "step": 12238 }, { "epoch": 0.5913417403488428, "grad_norm": 95.1744155883789, "learning_rate": 4.0865825965115715e-07, "loss": 0.2161, "step": 12239 }, { "epoch": 0.5913900565299319, "grad_norm": 2.8541572093963623, "learning_rate": 4.086099434700681e-07, "loss": 0.3005, "step": 12240 }, { "epoch": 0.591438372711021, "grad_norm": 3.0786995887756348, "learning_rate": 4.085616272889791e-07, "loss": 0.277, "step": 12241 }, { "epoch": 0.59148668889211, "grad_norm": 2.775181293487549, "learning_rate": 4.0851331110789e-07, "loss": 0.3166, "step": 12242 }, { "epoch": 0.591535005073199, "grad_norm": 1.928443431854248, "learning_rate": 4.0846499492680095e-07, "loss": 0.1785, "step": 12243 }, { "epoch": 0.591583321254288, "grad_norm": 3.095503568649292, "learning_rate": 4.084166787457119e-07, "loss": 0.2902, "step": 12244 }, { "epoch": 0.5916316374353771, "grad_norm": 2.7617955207824707, "learning_rate": 4.083683625646229e-07, "loss": 0.2039, "step": 12245 }, { "epoch": 0.5916799536164662, "grad_norm": 2.8214035034179688, "learning_rate": 4.083200463835338e-07, "loss": 0.3087, "step": 12246 }, { "epoch": 0.5917282697975552, "grad_norm": 2.2633514404296875, "learning_rate": 4.0827173020244476e-07, "loss": 0.2808, "step": 12247 }, { "epoch": 0.5917765859786442, "grad_norm": 1.856781005859375, "learning_rate": 4.0822341402135575e-07, "loss": 0.1952, "step": 12248 }, { "epoch": 0.5918249021597333, "grad_norm": 3.430091142654419, "learning_rate": 4.0817509784026674e-07, "loss": 0.373, "step": 12249 }, { "epoch": 0.5918732183408223, "grad_norm": 2.6743593215942383, "learning_rate": 4.081267816591776e-07, "loss": 0.3304, "step": 12250 }, { "epoch": 0.5919215345219114, "grad_norm": 2.503664016723633, "learning_rate": 4.080784654780886e-07, "loss": 0.3053, "step": 12251 }, { "epoch": 0.5919698507030005, "grad_norm": 3.0553531646728516, "learning_rate": 4.0803014929699955e-07, "loss": 0.4296, "step": 12252 }, { "epoch": 0.5920181668840895, "grad_norm": 3.621530532836914, "learning_rate": 4.079818331159105e-07, "loss": 0.2795, "step": 12253 }, { "epoch": 0.5920664830651785, "grad_norm": 2.3998491764068604, "learning_rate": 4.079335169348215e-07, "loss": 0.2647, "step": 12254 }, { "epoch": 0.5921147992462675, "grad_norm": 2.78355073928833, "learning_rate": 4.078852007537324e-07, "loss": 0.3371, "step": 12255 }, { "epoch": 0.5921631154273567, "grad_norm": 3.785720109939575, "learning_rate": 4.0783688457264335e-07, "loss": 0.2291, "step": 12256 }, { "epoch": 0.5922114316084457, "grad_norm": 2.095125675201416, "learning_rate": 4.077885683915543e-07, "loss": 0.2087, "step": 12257 }, { "epoch": 0.5922597477895347, "grad_norm": 4.90974235534668, "learning_rate": 4.077402522104653e-07, "loss": 0.3837, "step": 12258 }, { "epoch": 0.5923080639706237, "grad_norm": 3.4946606159210205, "learning_rate": 4.076919360293762e-07, "loss": 0.3465, "step": 12259 }, { "epoch": 0.5923563801517128, "grad_norm": 6.665116310119629, "learning_rate": 4.0764361984828715e-07, "loss": 0.3895, "step": 12260 }, { "epoch": 0.5924046963328019, "grad_norm": 2.379082441329956, "learning_rate": 4.0759530366719814e-07, "loss": 0.2664, "step": 12261 }, { "epoch": 0.5924530125138909, "grad_norm": 2.433722734451294, "learning_rate": 4.075469874861091e-07, "loss": 0.1931, "step": 12262 }, { "epoch": 0.59250132869498, "grad_norm": 4.911206245422363, "learning_rate": 4.0749867130502e-07, "loss": 0.2829, "step": 12263 }, { "epoch": 0.592549644876069, "grad_norm": 2.6686532497406006, "learning_rate": 4.07450355123931e-07, "loss": 0.369, "step": 12264 }, { "epoch": 0.592597961057158, "grad_norm": 2.542240619659424, "learning_rate": 4.0740203894284195e-07, "loss": 0.2504, "step": 12265 }, { "epoch": 0.5926462772382471, "grad_norm": 3.0886852741241455, "learning_rate": 4.073537227617529e-07, "loss": 0.2627, "step": 12266 }, { "epoch": 0.5926945934193362, "grad_norm": 2.24922513961792, "learning_rate": 4.0730540658066387e-07, "loss": 0.2642, "step": 12267 }, { "epoch": 0.5927429096004252, "grad_norm": 2.0648093223571777, "learning_rate": 4.072570903995748e-07, "loss": 0.251, "step": 12268 }, { "epoch": 0.5927912257815142, "grad_norm": 4.801638603210449, "learning_rate": 4.0720877421848575e-07, "loss": 0.274, "step": 12269 }, { "epoch": 0.5928395419626032, "grad_norm": 2.1266930103302, "learning_rate": 4.071604580373967e-07, "loss": 0.224, "step": 12270 }, { "epoch": 0.5928878581436923, "grad_norm": 2.8750345706939697, "learning_rate": 4.071121418563077e-07, "loss": 0.3932, "step": 12271 }, { "epoch": 0.5929361743247814, "grad_norm": 3.0653529167175293, "learning_rate": 4.070638256752186e-07, "loss": 0.333, "step": 12272 }, { "epoch": 0.5929844905058704, "grad_norm": 2.6357874870300293, "learning_rate": 4.0701550949412955e-07, "loss": 0.3418, "step": 12273 }, { "epoch": 0.5930328066869595, "grad_norm": 4.411324977874756, "learning_rate": 4.0696719331304054e-07, "loss": 0.4317, "step": 12274 }, { "epoch": 0.5930811228680485, "grad_norm": 3.7524924278259277, "learning_rate": 4.069188771319515e-07, "loss": 0.3094, "step": 12275 }, { "epoch": 0.5931294390491375, "grad_norm": 3.139678716659546, "learning_rate": 4.068705609508624e-07, "loss": 0.3367, "step": 12276 }, { "epoch": 0.5931777552302266, "grad_norm": 2.596158981323242, "learning_rate": 4.068222447697734e-07, "loss": 0.2742, "step": 12277 }, { "epoch": 0.5932260714113157, "grad_norm": 2.3138530254364014, "learning_rate": 4.067739285886843e-07, "loss": 0.2729, "step": 12278 }, { "epoch": 0.5932743875924047, "grad_norm": 2.5292370319366455, "learning_rate": 4.067256124075953e-07, "loss": 0.3668, "step": 12279 }, { "epoch": 0.5933227037734937, "grad_norm": 2.6186749935150146, "learning_rate": 4.0667729622650627e-07, "loss": 0.2146, "step": 12280 }, { "epoch": 0.5933710199545827, "grad_norm": 2.510640859603882, "learning_rate": 4.066289800454172e-07, "loss": 0.3086, "step": 12281 }, { "epoch": 0.5934193361356719, "grad_norm": 1.6407114267349243, "learning_rate": 4.0658066386432814e-07, "loss": 0.179, "step": 12282 }, { "epoch": 0.5934676523167609, "grad_norm": 3.8704824447631836, "learning_rate": 4.065323476832391e-07, "loss": 0.271, "step": 12283 }, { "epoch": 0.5935159684978499, "grad_norm": 5.567883491516113, "learning_rate": 4.0648403150215007e-07, "loss": 0.3064, "step": 12284 }, { "epoch": 0.593564284678939, "grad_norm": 2.6766409873962402, "learning_rate": 4.06435715321061e-07, "loss": 0.2418, "step": 12285 }, { "epoch": 0.593612600860028, "grad_norm": 4.350426197052002, "learning_rate": 4.0638739913997195e-07, "loss": 0.4199, "step": 12286 }, { "epoch": 0.5936609170411171, "grad_norm": 3.1960911750793457, "learning_rate": 4.0633908295888294e-07, "loss": 0.2219, "step": 12287 }, { "epoch": 0.5937092332222061, "grad_norm": 5.99599027633667, "learning_rate": 4.0629076677779387e-07, "loss": 0.3278, "step": 12288 }, { "epoch": 0.5937575494032952, "grad_norm": 3.165911912918091, "learning_rate": 4.062424505967048e-07, "loss": 0.3725, "step": 12289 }, { "epoch": 0.5938058655843842, "grad_norm": 2.0082173347473145, "learning_rate": 4.061941344156158e-07, "loss": 0.2323, "step": 12290 }, { "epoch": 0.5938541817654732, "grad_norm": 3.429403781890869, "learning_rate": 4.061458182345267e-07, "loss": 0.2683, "step": 12291 }, { "epoch": 0.5939024979465624, "grad_norm": 2.2241151332855225, "learning_rate": 4.060975020534377e-07, "loss": 0.2058, "step": 12292 }, { "epoch": 0.5939508141276514, "grad_norm": 2.69570255279541, "learning_rate": 4.0604918587234867e-07, "loss": 0.2641, "step": 12293 }, { "epoch": 0.5939991303087404, "grad_norm": 1.705622911453247, "learning_rate": 4.0600086969125955e-07, "loss": 0.1924, "step": 12294 }, { "epoch": 0.5940474464898294, "grad_norm": 2.520524740219116, "learning_rate": 4.0595255351017054e-07, "loss": 0.2825, "step": 12295 }, { "epoch": 0.5940957626709185, "grad_norm": 2.3654985427856445, "learning_rate": 4.059042373290815e-07, "loss": 0.2461, "step": 12296 }, { "epoch": 0.5941440788520075, "grad_norm": 26.19826889038086, "learning_rate": 4.0585592114799247e-07, "loss": 0.3941, "step": 12297 }, { "epoch": 0.5941923950330966, "grad_norm": 5.856659889221191, "learning_rate": 4.058076049669034e-07, "loss": 0.3589, "step": 12298 }, { "epoch": 0.5942407112141856, "grad_norm": 2.4591641426086426, "learning_rate": 4.0575928878581434e-07, "loss": 0.2587, "step": 12299 }, { "epoch": 0.5942890273952747, "grad_norm": 8.855609893798828, "learning_rate": 4.0571097260472533e-07, "loss": 0.4718, "step": 12300 }, { "epoch": 0.5943373435763637, "grad_norm": 2.972647190093994, "learning_rate": 4.0566265642363627e-07, "loss": 0.2666, "step": 12301 }, { "epoch": 0.5943856597574527, "grad_norm": 2.2895734310150146, "learning_rate": 4.056143402425472e-07, "loss": 0.277, "step": 12302 }, { "epoch": 0.5944339759385419, "grad_norm": 2.5736820697784424, "learning_rate": 4.055660240614582e-07, "loss": 0.2238, "step": 12303 }, { "epoch": 0.5944822921196309, "grad_norm": 3.0263683795928955, "learning_rate": 4.055177078803691e-07, "loss": 0.3576, "step": 12304 }, { "epoch": 0.5945306083007199, "grad_norm": 3.0053038597106934, "learning_rate": 4.0546939169928007e-07, "loss": 0.2726, "step": 12305 }, { "epoch": 0.5945789244818089, "grad_norm": 3.404087781906128, "learning_rate": 4.0542107551819106e-07, "loss": 0.3112, "step": 12306 }, { "epoch": 0.594627240662898, "grad_norm": 2.7212467193603516, "learning_rate": 4.0537275933710195e-07, "loss": 0.2327, "step": 12307 }, { "epoch": 0.5946755568439871, "grad_norm": 3.038541316986084, "learning_rate": 4.0532444315601294e-07, "loss": 0.389, "step": 12308 }, { "epoch": 0.5947238730250761, "grad_norm": 3.0640571117401123, "learning_rate": 4.052761269749239e-07, "loss": 0.3402, "step": 12309 }, { "epoch": 0.5947721892061651, "grad_norm": 1.8082855939865112, "learning_rate": 4.052278107938348e-07, "loss": 0.195, "step": 12310 }, { "epoch": 0.5948205053872542, "grad_norm": 4.414855480194092, "learning_rate": 4.051794946127458e-07, "loss": 0.2832, "step": 12311 }, { "epoch": 0.5948688215683432, "grad_norm": 2.4628617763519287, "learning_rate": 4.0513117843165674e-07, "loss": 0.2044, "step": 12312 }, { "epoch": 0.5949171377494323, "grad_norm": 2.4143970012664795, "learning_rate": 4.0508286225056773e-07, "loss": 0.2714, "step": 12313 }, { "epoch": 0.5949654539305214, "grad_norm": 3.272094488143921, "learning_rate": 4.0503454606947867e-07, "loss": 0.3033, "step": 12314 }, { "epoch": 0.5950137701116104, "grad_norm": 2.294191837310791, "learning_rate": 4.049862298883896e-07, "loss": 0.3007, "step": 12315 }, { "epoch": 0.5950620862926994, "grad_norm": 3.0147740840911865, "learning_rate": 4.049379137073006e-07, "loss": 0.4229, "step": 12316 }, { "epoch": 0.5951104024737884, "grad_norm": 3.1340932846069336, "learning_rate": 4.048895975262115e-07, "loss": 0.3866, "step": 12317 }, { "epoch": 0.5951587186548776, "grad_norm": 3.9297122955322266, "learning_rate": 4.0484128134512247e-07, "loss": 0.3452, "step": 12318 }, { "epoch": 0.5952070348359666, "grad_norm": 1.772500991821289, "learning_rate": 4.0479296516403346e-07, "loss": 0.2049, "step": 12319 }, { "epoch": 0.5952553510170556, "grad_norm": 1.599677324295044, "learning_rate": 4.0474464898294434e-07, "loss": 0.1941, "step": 12320 }, { "epoch": 0.5953036671981446, "grad_norm": 3.240204334259033, "learning_rate": 4.0469633280185533e-07, "loss": 0.3848, "step": 12321 }, { "epoch": 0.5953519833792337, "grad_norm": 5.454595565795898, "learning_rate": 4.0464801662076627e-07, "loss": 0.2777, "step": 12322 }, { "epoch": 0.5954002995603227, "grad_norm": 2.2622408866882324, "learning_rate": 4.045997004396772e-07, "loss": 0.2736, "step": 12323 }, { "epoch": 0.5954486157414118, "grad_norm": 2.3837921619415283, "learning_rate": 4.045513842585882e-07, "loss": 0.2838, "step": 12324 }, { "epoch": 0.5954969319225009, "grad_norm": 4.471991062164307, "learning_rate": 4.0450306807749913e-07, "loss": 0.2667, "step": 12325 }, { "epoch": 0.5955452481035899, "grad_norm": 2.408933162689209, "learning_rate": 4.0445475189641007e-07, "loss": 0.2778, "step": 12326 }, { "epoch": 0.5955935642846789, "grad_norm": 2.526846170425415, "learning_rate": 4.0440643571532106e-07, "loss": 0.2205, "step": 12327 }, { "epoch": 0.5956418804657679, "grad_norm": 11.502030372619629, "learning_rate": 4.04358119534232e-07, "loss": 0.4284, "step": 12328 }, { "epoch": 0.5956901966468571, "grad_norm": 2.2610864639282227, "learning_rate": 4.04309803353143e-07, "loss": 0.2813, "step": 12329 }, { "epoch": 0.5957385128279461, "grad_norm": 2.341017484664917, "learning_rate": 4.042614871720539e-07, "loss": 0.2482, "step": 12330 }, { "epoch": 0.5957868290090351, "grad_norm": 2.845374345779419, "learning_rate": 4.0421317099096486e-07, "loss": 0.3229, "step": 12331 }, { "epoch": 0.5958351451901241, "grad_norm": 2.6818079948425293, "learning_rate": 4.0416485480987585e-07, "loss": 0.3358, "step": 12332 }, { "epoch": 0.5958834613712132, "grad_norm": 1.8581924438476562, "learning_rate": 4.0411653862878674e-07, "loss": 0.1882, "step": 12333 }, { "epoch": 0.5959317775523023, "grad_norm": 3.0187361240386963, "learning_rate": 4.0406822244769773e-07, "loss": 0.2247, "step": 12334 }, { "epoch": 0.5959800937333913, "grad_norm": 17.370563507080078, "learning_rate": 4.0401990626660867e-07, "loss": 0.3327, "step": 12335 }, { "epoch": 0.5960284099144804, "grad_norm": 3.0283682346343994, "learning_rate": 4.039715900855196e-07, "loss": 0.3528, "step": 12336 }, { "epoch": 0.5960767260955694, "grad_norm": 2.4467413425445557, "learning_rate": 4.039232739044306e-07, "loss": 0.2342, "step": 12337 }, { "epoch": 0.5961250422766584, "grad_norm": 1.8613970279693604, "learning_rate": 4.0387495772334153e-07, "loss": 0.2263, "step": 12338 }, { "epoch": 0.5961733584577475, "grad_norm": 1.7227497100830078, "learning_rate": 4.0382664154225247e-07, "loss": 0.1939, "step": 12339 }, { "epoch": 0.5962216746388366, "grad_norm": 3.160949230194092, "learning_rate": 4.037783253611634e-07, "loss": 0.3329, "step": 12340 }, { "epoch": 0.5962699908199256, "grad_norm": 2.8657259941101074, "learning_rate": 4.037300091800744e-07, "loss": 0.3187, "step": 12341 }, { "epoch": 0.5963183070010146, "grad_norm": 2.349712371826172, "learning_rate": 4.0368169299898533e-07, "loss": 0.2935, "step": 12342 }, { "epoch": 0.5963666231821037, "grad_norm": 3.142625570297241, "learning_rate": 4.0363337681789627e-07, "loss": 0.3419, "step": 12343 }, { "epoch": 0.5964149393631928, "grad_norm": 2.995971202850342, "learning_rate": 4.0358506063680726e-07, "loss": 0.23, "step": 12344 }, { "epoch": 0.5964632555442818, "grad_norm": 4.612610340118408, "learning_rate": 4.0353674445571825e-07, "loss": 0.2267, "step": 12345 }, { "epoch": 0.5965115717253708, "grad_norm": 4.7246599197387695, "learning_rate": 4.0348842827462914e-07, "loss": 0.2444, "step": 12346 }, { "epoch": 0.5965598879064599, "grad_norm": 3.218679666519165, "learning_rate": 4.034401120935401e-07, "loss": 0.3805, "step": 12347 }, { "epoch": 0.5966082040875489, "grad_norm": 3.6826822757720947, "learning_rate": 4.0339179591245106e-07, "loss": 0.2592, "step": 12348 }, { "epoch": 0.5966565202686379, "grad_norm": 7.994897842407227, "learning_rate": 4.03343479731362e-07, "loss": 0.2492, "step": 12349 }, { "epoch": 0.596704836449727, "grad_norm": 3.258057117462158, "learning_rate": 4.03295163550273e-07, "loss": 0.319, "step": 12350 }, { "epoch": 0.5967531526308161, "grad_norm": 2.6982288360595703, "learning_rate": 4.0324684736918393e-07, "loss": 0.3965, "step": 12351 }, { "epoch": 0.5968014688119051, "grad_norm": 3.169865846633911, "learning_rate": 4.0319853118809486e-07, "loss": 0.27, "step": 12352 }, { "epoch": 0.5968497849929941, "grad_norm": 2.742779016494751, "learning_rate": 4.031502150070058e-07, "loss": 0.3509, "step": 12353 }, { "epoch": 0.5968981011740832, "grad_norm": 4.485811233520508, "learning_rate": 4.031018988259168e-07, "loss": 0.3352, "step": 12354 }, { "epoch": 0.5969464173551723, "grad_norm": 4.070005416870117, "learning_rate": 4.0305358264482773e-07, "loss": 0.3566, "step": 12355 }, { "epoch": 0.5969947335362613, "grad_norm": 3.2853429317474365, "learning_rate": 4.0300526646373867e-07, "loss": 0.3514, "step": 12356 }, { "epoch": 0.5970430497173503, "grad_norm": 2.347677230834961, "learning_rate": 4.0295695028264966e-07, "loss": 0.203, "step": 12357 }, { "epoch": 0.5970913658984394, "grad_norm": 2.759735345840454, "learning_rate": 4.029086341015606e-07, "loss": 0.2513, "step": 12358 }, { "epoch": 0.5971396820795284, "grad_norm": 2.853109836578369, "learning_rate": 4.0286031792047153e-07, "loss": 0.2844, "step": 12359 }, { "epoch": 0.5971879982606175, "grad_norm": 4.02602481842041, "learning_rate": 4.028120017393825e-07, "loss": 0.4794, "step": 12360 }, { "epoch": 0.5972363144417066, "grad_norm": 2.7902731895446777, "learning_rate": 4.027636855582934e-07, "loss": 0.233, "step": 12361 }, { "epoch": 0.5972846306227956, "grad_norm": 4.5323333740234375, "learning_rate": 4.027153693772044e-07, "loss": 0.1518, "step": 12362 }, { "epoch": 0.5973329468038846, "grad_norm": 2.350830078125, "learning_rate": 4.026670531961154e-07, "loss": 0.2875, "step": 12363 }, { "epoch": 0.5973812629849736, "grad_norm": 5.247110843658447, "learning_rate": 4.026187370150263e-07, "loss": 0.3521, "step": 12364 }, { "epoch": 0.5974295791660628, "grad_norm": 1.9354544878005981, "learning_rate": 4.0257042083393726e-07, "loss": 0.1433, "step": 12365 }, { "epoch": 0.5974778953471518, "grad_norm": 3.6469833850860596, "learning_rate": 4.025221046528482e-07, "loss": 0.2986, "step": 12366 }, { "epoch": 0.5975262115282408, "grad_norm": 1.7664687633514404, "learning_rate": 4.024737884717592e-07, "loss": 0.2242, "step": 12367 }, { "epoch": 0.5975745277093298, "grad_norm": 2.779604911804199, "learning_rate": 4.024254722906701e-07, "loss": 0.2242, "step": 12368 }, { "epoch": 0.5976228438904189, "grad_norm": 1.607045292854309, "learning_rate": 4.0237715610958106e-07, "loss": 0.1258, "step": 12369 }, { "epoch": 0.597671160071508, "grad_norm": 3.2539827823638916, "learning_rate": 4.0232883992849205e-07, "loss": 0.4147, "step": 12370 }, { "epoch": 0.597719476252597, "grad_norm": 3.171757936477661, "learning_rate": 4.02280523747403e-07, "loss": 0.384, "step": 12371 }, { "epoch": 0.597767792433686, "grad_norm": 2.1068520545959473, "learning_rate": 4.0223220756631393e-07, "loss": 0.2569, "step": 12372 }, { "epoch": 0.5978161086147751, "grad_norm": 3.089312791824341, "learning_rate": 4.021838913852249e-07, "loss": 0.2706, "step": 12373 }, { "epoch": 0.5978644247958641, "grad_norm": 6.123800277709961, "learning_rate": 4.021355752041358e-07, "loss": 0.3179, "step": 12374 }, { "epoch": 0.5979127409769531, "grad_norm": 2.7963509559631348, "learning_rate": 4.020872590230468e-07, "loss": 0.237, "step": 12375 }, { "epoch": 0.5979610571580423, "grad_norm": 2.9702095985412598, "learning_rate": 4.020389428419578e-07, "loss": 0.4036, "step": 12376 }, { "epoch": 0.5980093733391313, "grad_norm": 3.050459861755371, "learning_rate": 4.0199062666086867e-07, "loss": 0.3113, "step": 12377 }, { "epoch": 0.5980576895202203, "grad_norm": 2.6469292640686035, "learning_rate": 4.0194231047977966e-07, "loss": 0.3039, "step": 12378 }, { "epoch": 0.5981060057013093, "grad_norm": 2.3482682704925537, "learning_rate": 4.018939942986906e-07, "loss": 0.2633, "step": 12379 }, { "epoch": 0.5981543218823984, "grad_norm": 2.8500101566314697, "learning_rate": 4.018456781176016e-07, "loss": 0.3887, "step": 12380 }, { "epoch": 0.5982026380634875, "grad_norm": 2.865846633911133, "learning_rate": 4.017973619365125e-07, "loss": 0.2716, "step": 12381 }, { "epoch": 0.5982509542445765, "grad_norm": 2.184779644012451, "learning_rate": 4.0174904575542346e-07, "loss": 0.2185, "step": 12382 }, { "epoch": 0.5982992704256656, "grad_norm": 2.1888673305511475, "learning_rate": 4.0170072957433445e-07, "loss": 0.1706, "step": 12383 }, { "epoch": 0.5983475866067546, "grad_norm": 16.27253532409668, "learning_rate": 4.016524133932454e-07, "loss": 0.3151, "step": 12384 }, { "epoch": 0.5983959027878436, "grad_norm": 3.6954803466796875, "learning_rate": 4.016040972121563e-07, "loss": 0.4015, "step": 12385 }, { "epoch": 0.5984442189689327, "grad_norm": 3.2896029949188232, "learning_rate": 4.015557810310673e-07, "loss": 0.4071, "step": 12386 }, { "epoch": 0.5984925351500218, "grad_norm": 2.8550069332122803, "learning_rate": 4.015074648499782e-07, "loss": 0.3736, "step": 12387 }, { "epoch": 0.5985408513311108, "grad_norm": 2.185718059539795, "learning_rate": 4.014591486688892e-07, "loss": 0.2385, "step": 12388 }, { "epoch": 0.5985891675121998, "grad_norm": 6.439850807189941, "learning_rate": 4.014108324878002e-07, "loss": 0.2627, "step": 12389 }, { "epoch": 0.5986374836932888, "grad_norm": 19.418245315551758, "learning_rate": 4.0136251630671106e-07, "loss": 0.3415, "step": 12390 }, { "epoch": 0.598685799874378, "grad_norm": 2.624178886413574, "learning_rate": 4.0131420012562205e-07, "loss": 0.3647, "step": 12391 }, { "epoch": 0.598734116055467, "grad_norm": 6.303853511810303, "learning_rate": 4.01265883944533e-07, "loss": 0.3164, "step": 12392 }, { "epoch": 0.598782432236556, "grad_norm": 1.98371422290802, "learning_rate": 4.0121756776344393e-07, "loss": 0.1794, "step": 12393 }, { "epoch": 0.598830748417645, "grad_norm": 3.3008649349212646, "learning_rate": 4.011692515823549e-07, "loss": 0.3808, "step": 12394 }, { "epoch": 0.5988790645987341, "grad_norm": 2.4801173210144043, "learning_rate": 4.0112093540126586e-07, "loss": 0.2789, "step": 12395 }, { "epoch": 0.5989273807798232, "grad_norm": 3.2163853645324707, "learning_rate": 4.0107261922017685e-07, "loss": 0.4137, "step": 12396 }, { "epoch": 0.5989756969609122, "grad_norm": 2.4759361743927, "learning_rate": 4.010243030390878e-07, "loss": 0.3063, "step": 12397 }, { "epoch": 0.5990240131420013, "grad_norm": 8.015291213989258, "learning_rate": 4.009759868579987e-07, "loss": 0.3581, "step": 12398 }, { "epoch": 0.5990723293230903, "grad_norm": 15.740612983703613, "learning_rate": 4.009276706769097e-07, "loss": 0.3386, "step": 12399 }, { "epoch": 0.5991206455041793, "grad_norm": 2.838660717010498, "learning_rate": 4.008793544958206e-07, "loss": 0.3485, "step": 12400 }, { "epoch": 0.5991689616852683, "grad_norm": 3.089395761489868, "learning_rate": 4.008310383147316e-07, "loss": 0.2614, "step": 12401 }, { "epoch": 0.5992172778663575, "grad_norm": 2.5232529640197754, "learning_rate": 4.007827221336426e-07, "loss": 0.2929, "step": 12402 }, { "epoch": 0.5992655940474465, "grad_norm": 4.851789474487305, "learning_rate": 4.0073440595255346e-07, "loss": 0.2675, "step": 12403 }, { "epoch": 0.5993139102285355, "grad_norm": 4.170388698577881, "learning_rate": 4.0068608977146445e-07, "loss": 0.1834, "step": 12404 }, { "epoch": 0.5993622264096246, "grad_norm": 2.695854663848877, "learning_rate": 4.006377735903754e-07, "loss": 0.3062, "step": 12405 }, { "epoch": 0.5994105425907136, "grad_norm": 3.3130314350128174, "learning_rate": 4.005894574092863e-07, "loss": 0.3303, "step": 12406 }, { "epoch": 0.5994588587718027, "grad_norm": 4.9568095207214355, "learning_rate": 4.005411412281973e-07, "loss": 0.2766, "step": 12407 }, { "epoch": 0.5995071749528917, "grad_norm": 2.4405717849731445, "learning_rate": 4.0049282504710825e-07, "loss": 0.3745, "step": 12408 }, { "epoch": 0.5995554911339808, "grad_norm": 3.2790205478668213, "learning_rate": 4.004445088660192e-07, "loss": 0.2961, "step": 12409 }, { "epoch": 0.5996038073150698, "grad_norm": 1.2223079204559326, "learning_rate": 4.003961926849302e-07, "loss": 0.1599, "step": 12410 }, { "epoch": 0.5996521234961588, "grad_norm": 2.298358201980591, "learning_rate": 4.003478765038411e-07, "loss": 0.2923, "step": 12411 }, { "epoch": 0.599700439677248, "grad_norm": 2.311894416809082, "learning_rate": 4.002995603227521e-07, "loss": 0.2723, "step": 12412 }, { "epoch": 0.599748755858337, "grad_norm": 3.448204517364502, "learning_rate": 4.00251244141663e-07, "loss": 0.1877, "step": 12413 }, { "epoch": 0.599797072039426, "grad_norm": 2.5534605979919434, "learning_rate": 4.00202927960574e-07, "loss": 0.2517, "step": 12414 }, { "epoch": 0.599845388220515, "grad_norm": 2.304760456085205, "learning_rate": 4.0015461177948497e-07, "loss": 0.3247, "step": 12415 }, { "epoch": 0.599893704401604, "grad_norm": 3.374873638153076, "learning_rate": 4.0010629559839586e-07, "loss": 0.3795, "step": 12416 }, { "epoch": 0.5999420205826932, "grad_norm": 4.208839416503906, "learning_rate": 4.0005797941730685e-07, "loss": 0.3555, "step": 12417 }, { "epoch": 0.5999903367637822, "grad_norm": 2.898864984512329, "learning_rate": 4.000096632362178e-07, "loss": 0.2241, "step": 12418 }, { "epoch": 0.6000386529448712, "grad_norm": 2.739431619644165, "learning_rate": 3.999613470551287e-07, "loss": 0.3222, "step": 12419 }, { "epoch": 0.6000869691259603, "grad_norm": 3.2332582473754883, "learning_rate": 3.999130308740397e-07, "loss": 0.324, "step": 12420 }, { "epoch": 0.6001352853070493, "grad_norm": 2.1169893741607666, "learning_rate": 3.9986471469295065e-07, "loss": 0.1923, "step": 12421 }, { "epoch": 0.6001836014881384, "grad_norm": 3.319882392883301, "learning_rate": 3.998163985118616e-07, "loss": 0.1998, "step": 12422 }, { "epoch": 0.6002319176692275, "grad_norm": 2.718949317932129, "learning_rate": 3.997680823307726e-07, "loss": 0.2154, "step": 12423 }, { "epoch": 0.6002802338503165, "grad_norm": 6.420773983001709, "learning_rate": 3.997197661496835e-07, "loss": 0.2049, "step": 12424 }, { "epoch": 0.6003285500314055, "grad_norm": 3.8480467796325684, "learning_rate": 3.9967144996859445e-07, "loss": 0.2541, "step": 12425 }, { "epoch": 0.6003768662124945, "grad_norm": 2.298924684524536, "learning_rate": 3.996231337875054e-07, "loss": 0.257, "step": 12426 }, { "epoch": 0.6004251823935836, "grad_norm": 2.319838285446167, "learning_rate": 3.995748176064164e-07, "loss": 0.2468, "step": 12427 }, { "epoch": 0.6004734985746727, "grad_norm": 2.2985148429870605, "learning_rate": 3.9952650142532737e-07, "loss": 0.2082, "step": 12428 }, { "epoch": 0.6005218147557617, "grad_norm": 2.7156527042388916, "learning_rate": 3.9947818524423825e-07, "loss": 0.2371, "step": 12429 }, { "epoch": 0.6005701309368507, "grad_norm": 2.6202187538146973, "learning_rate": 3.9942986906314924e-07, "loss": 0.1659, "step": 12430 }, { "epoch": 0.6006184471179398, "grad_norm": 2.545574188232422, "learning_rate": 3.993815528820602e-07, "loss": 0.2956, "step": 12431 }, { "epoch": 0.6006667632990288, "grad_norm": 3.1587467193603516, "learning_rate": 3.993332367009711e-07, "loss": 0.4782, "step": 12432 }, { "epoch": 0.6007150794801179, "grad_norm": 2.7290399074554443, "learning_rate": 3.992849205198821e-07, "loss": 0.3445, "step": 12433 }, { "epoch": 0.600763395661207, "grad_norm": 2.7553298473358154, "learning_rate": 3.9923660433879304e-07, "loss": 0.3266, "step": 12434 }, { "epoch": 0.600811711842296, "grad_norm": 4.854439735412598, "learning_rate": 3.99188288157704e-07, "loss": 0.193, "step": 12435 }, { "epoch": 0.600860028023385, "grad_norm": 4.098382472991943, "learning_rate": 3.9913997197661497e-07, "loss": 0.2333, "step": 12436 }, { "epoch": 0.600908344204474, "grad_norm": 2.656813144683838, "learning_rate": 3.990916557955259e-07, "loss": 0.356, "step": 12437 }, { "epoch": 0.6009566603855632, "grad_norm": 3.73848557472229, "learning_rate": 3.9904333961443685e-07, "loss": 0.3538, "step": 12438 }, { "epoch": 0.6010049765666522, "grad_norm": 3.7207789421081543, "learning_rate": 3.989950234333478e-07, "loss": 0.3748, "step": 12439 }, { "epoch": 0.6010532927477412, "grad_norm": 2.088346004486084, "learning_rate": 3.989467072522588e-07, "loss": 0.1854, "step": 12440 }, { "epoch": 0.6011016089288302, "grad_norm": 3.246910333633423, "learning_rate": 3.988983910711697e-07, "loss": 0.2617, "step": 12441 }, { "epoch": 0.6011499251099193, "grad_norm": 3.1960113048553467, "learning_rate": 3.9885007489008065e-07, "loss": 0.2798, "step": 12442 }, { "epoch": 0.6011982412910084, "grad_norm": 5.2998480796813965, "learning_rate": 3.9880175870899164e-07, "loss": 0.3323, "step": 12443 }, { "epoch": 0.6012465574720974, "grad_norm": 2.4033215045928955, "learning_rate": 3.987534425279026e-07, "loss": 0.3602, "step": 12444 }, { "epoch": 0.6012948736531865, "grad_norm": 2.5153846740722656, "learning_rate": 3.987051263468135e-07, "loss": 0.3189, "step": 12445 }, { "epoch": 0.6013431898342755, "grad_norm": 4.3501200675964355, "learning_rate": 3.986568101657245e-07, "loss": 0.3628, "step": 12446 }, { "epoch": 0.6013915060153645, "grad_norm": 6.117180824279785, "learning_rate": 3.9860849398463544e-07, "loss": 0.249, "step": 12447 }, { "epoch": 0.6014398221964536, "grad_norm": 2.1199333667755127, "learning_rate": 3.985601778035464e-07, "loss": 0.2656, "step": 12448 }, { "epoch": 0.6014881383775427, "grad_norm": 2.678361415863037, "learning_rate": 3.9851186162245737e-07, "loss": 0.3741, "step": 12449 }, { "epoch": 0.6015364545586317, "grad_norm": 2.4576921463012695, "learning_rate": 3.984635454413683e-07, "loss": 0.2655, "step": 12450 }, { "epoch": 0.6015847707397207, "grad_norm": 2.878490447998047, "learning_rate": 3.9841522926027924e-07, "loss": 0.3152, "step": 12451 }, { "epoch": 0.6016330869208097, "grad_norm": 2.0525567531585693, "learning_rate": 3.983669130791902e-07, "loss": 0.2236, "step": 12452 }, { "epoch": 0.6016814031018988, "grad_norm": 3.3279972076416016, "learning_rate": 3.9831859689810117e-07, "loss": 0.3385, "step": 12453 }, { "epoch": 0.6017297192829879, "grad_norm": 2.8516695499420166, "learning_rate": 3.982702807170121e-07, "loss": 0.299, "step": 12454 }, { "epoch": 0.6017780354640769, "grad_norm": 3.3190064430236816, "learning_rate": 3.9822196453592305e-07, "loss": 0.2553, "step": 12455 }, { "epoch": 0.601826351645166, "grad_norm": 3.0573532581329346, "learning_rate": 3.9817364835483404e-07, "loss": 0.3271, "step": 12456 }, { "epoch": 0.601874667826255, "grad_norm": 2.609179735183716, "learning_rate": 3.981253321737449e-07, "loss": 0.3003, "step": 12457 }, { "epoch": 0.601922984007344, "grad_norm": 2.707737684249878, "learning_rate": 3.980770159926559e-07, "loss": 0.2941, "step": 12458 }, { "epoch": 0.6019713001884331, "grad_norm": 1.6576863527297974, "learning_rate": 3.980286998115669e-07, "loss": 0.1542, "step": 12459 }, { "epoch": 0.6020196163695222, "grad_norm": 2.8077969551086426, "learning_rate": 3.9798038363047784e-07, "loss": 0.2704, "step": 12460 }, { "epoch": 0.6020679325506112, "grad_norm": 1.6295759677886963, "learning_rate": 3.979320674493888e-07, "loss": 0.153, "step": 12461 }, { "epoch": 0.6021162487317002, "grad_norm": 2.916841745376587, "learning_rate": 3.9788375126829977e-07, "loss": 0.2673, "step": 12462 }, { "epoch": 0.6021645649127892, "grad_norm": 4.8824944496154785, "learning_rate": 3.978354350872107e-07, "loss": 0.4206, "step": 12463 }, { "epoch": 0.6022128810938784, "grad_norm": 2.997738838195801, "learning_rate": 3.9778711890612164e-07, "loss": 0.3731, "step": 12464 }, { "epoch": 0.6022611972749674, "grad_norm": 1.9340453147888184, "learning_rate": 3.977388027250326e-07, "loss": 0.1946, "step": 12465 }, { "epoch": 0.6023095134560564, "grad_norm": 13.446962356567383, "learning_rate": 3.9769048654394357e-07, "loss": 0.2613, "step": 12466 }, { "epoch": 0.6023578296371455, "grad_norm": 2.6936604976654053, "learning_rate": 3.976421703628545e-07, "loss": 0.3051, "step": 12467 }, { "epoch": 0.6024061458182345, "grad_norm": 7.884354591369629, "learning_rate": 3.9759385418176544e-07, "loss": 0.1879, "step": 12468 }, { "epoch": 0.6024544619993236, "grad_norm": 3.256683111190796, "learning_rate": 3.9754553800067643e-07, "loss": 0.241, "step": 12469 }, { "epoch": 0.6025027781804126, "grad_norm": 1.6546075344085693, "learning_rate": 3.974972218195873e-07, "loss": 0.2117, "step": 12470 }, { "epoch": 0.6025510943615017, "grad_norm": 21.29914665222168, "learning_rate": 3.974489056384983e-07, "loss": 0.3845, "step": 12471 }, { "epoch": 0.6025994105425907, "grad_norm": 2.788618564605713, "learning_rate": 3.974005894574093e-07, "loss": 0.3977, "step": 12472 }, { "epoch": 0.6026477267236797, "grad_norm": 1.9984889030456543, "learning_rate": 3.973522732763202e-07, "loss": 0.2288, "step": 12473 }, { "epoch": 0.6026960429047689, "grad_norm": 5.061539649963379, "learning_rate": 3.9730395709523117e-07, "loss": 0.2745, "step": 12474 }, { "epoch": 0.6027443590858579, "grad_norm": 2.9445743560791016, "learning_rate": 3.9725564091414216e-07, "loss": 0.291, "step": 12475 }, { "epoch": 0.6027926752669469, "grad_norm": 2.62324857711792, "learning_rate": 3.972073247330531e-07, "loss": 0.2577, "step": 12476 }, { "epoch": 0.6028409914480359, "grad_norm": 2.7444519996643066, "learning_rate": 3.9715900855196404e-07, "loss": 0.2953, "step": 12477 }, { "epoch": 0.602889307629125, "grad_norm": 2.389143943786621, "learning_rate": 3.9711069237087497e-07, "loss": 0.2223, "step": 12478 }, { "epoch": 0.6029376238102141, "grad_norm": 2.2869482040405273, "learning_rate": 3.9706237618978596e-07, "loss": 0.2602, "step": 12479 }, { "epoch": 0.6029859399913031, "grad_norm": 2.9441983699798584, "learning_rate": 3.970140600086969e-07, "loss": 0.3947, "step": 12480 }, { "epoch": 0.6030342561723921, "grad_norm": 2.775989055633545, "learning_rate": 3.9696574382760784e-07, "loss": 0.2966, "step": 12481 }, { "epoch": 0.6030825723534812, "grad_norm": 8.398551940917969, "learning_rate": 3.9691742764651883e-07, "loss": 0.282, "step": 12482 }, { "epoch": 0.6031308885345702, "grad_norm": 1.7366266250610352, "learning_rate": 3.968691114654297e-07, "loss": 0.1629, "step": 12483 }, { "epoch": 0.6031792047156592, "grad_norm": 3.7524237632751465, "learning_rate": 3.968207952843407e-07, "loss": 0.3514, "step": 12484 }, { "epoch": 0.6032275208967484, "grad_norm": 2.0080533027648926, "learning_rate": 3.967724791032517e-07, "loss": 0.2243, "step": 12485 }, { "epoch": 0.6032758370778374, "grad_norm": 3.1449368000030518, "learning_rate": 3.967241629221626e-07, "loss": 0.3769, "step": 12486 }, { "epoch": 0.6033241532589264, "grad_norm": 4.23905086517334, "learning_rate": 3.9667584674107357e-07, "loss": 0.4259, "step": 12487 }, { "epoch": 0.6033724694400154, "grad_norm": 2.466736078262329, "learning_rate": 3.9662753055998456e-07, "loss": 0.2772, "step": 12488 }, { "epoch": 0.6034207856211045, "grad_norm": 2.1089844703674316, "learning_rate": 3.9657921437889544e-07, "loss": 0.2218, "step": 12489 }, { "epoch": 0.6034691018021936, "grad_norm": 2.021524667739868, "learning_rate": 3.9653089819780643e-07, "loss": 0.1908, "step": 12490 }, { "epoch": 0.6035174179832826, "grad_norm": 2.758685827255249, "learning_rate": 3.9648258201671737e-07, "loss": 0.2692, "step": 12491 }, { "epoch": 0.6035657341643716, "grad_norm": 2.7790019512176514, "learning_rate": 3.9643426583562836e-07, "loss": 0.3509, "step": 12492 }, { "epoch": 0.6036140503454607, "grad_norm": 2.8643531799316406, "learning_rate": 3.963859496545393e-07, "loss": 0.3255, "step": 12493 }, { "epoch": 0.6036623665265497, "grad_norm": 2.5162980556488037, "learning_rate": 3.9633763347345023e-07, "loss": 0.237, "step": 12494 }, { "epoch": 0.6037106827076388, "grad_norm": 4.085603713989258, "learning_rate": 3.962893172923612e-07, "loss": 0.2853, "step": 12495 }, { "epoch": 0.6037589988887279, "grad_norm": 2.3126888275146484, "learning_rate": 3.962410011112721e-07, "loss": 0.2375, "step": 12496 }, { "epoch": 0.6038073150698169, "grad_norm": 2.7195119857788086, "learning_rate": 3.961926849301831e-07, "loss": 0.2545, "step": 12497 }, { "epoch": 0.6038556312509059, "grad_norm": 3.275412082672119, "learning_rate": 3.961443687490941e-07, "loss": 0.3568, "step": 12498 }, { "epoch": 0.6039039474319949, "grad_norm": 3.1590917110443115, "learning_rate": 3.96096052568005e-07, "loss": 0.1988, "step": 12499 }, { "epoch": 0.6039522636130841, "grad_norm": 3.1191911697387695, "learning_rate": 3.9604773638691596e-07, "loss": 0.2597, "step": 12500 }, { "epoch": 0.6040005797941731, "grad_norm": 3.088071823120117, "learning_rate": 3.9599942020582695e-07, "loss": 0.3071, "step": 12501 }, { "epoch": 0.6040488959752621, "grad_norm": 4.5961012840271, "learning_rate": 3.9595110402473784e-07, "loss": 0.3312, "step": 12502 }, { "epoch": 0.6040972121563511, "grad_norm": 1.8292171955108643, "learning_rate": 3.9590278784364883e-07, "loss": 0.1537, "step": 12503 }, { "epoch": 0.6041455283374402, "grad_norm": 2.7192697525024414, "learning_rate": 3.9585447166255977e-07, "loss": 0.2916, "step": 12504 }, { "epoch": 0.6041938445185293, "grad_norm": 2.717012882232666, "learning_rate": 3.958061554814707e-07, "loss": 0.2099, "step": 12505 }, { "epoch": 0.6042421606996183, "grad_norm": 3.0009195804595947, "learning_rate": 3.957578393003817e-07, "loss": 0.4348, "step": 12506 }, { "epoch": 0.6042904768807074, "grad_norm": 1.8870078325271606, "learning_rate": 3.9570952311929263e-07, "loss": 0.1793, "step": 12507 }, { "epoch": 0.6043387930617964, "grad_norm": 6.085685729980469, "learning_rate": 3.956612069382036e-07, "loss": 0.3526, "step": 12508 }, { "epoch": 0.6043871092428854, "grad_norm": 1.3212727308273315, "learning_rate": 3.956128907571145e-07, "loss": 0.1249, "step": 12509 }, { "epoch": 0.6044354254239744, "grad_norm": 3.358394145965576, "learning_rate": 3.955645745760255e-07, "loss": 0.3153, "step": 12510 }, { "epoch": 0.6044837416050636, "grad_norm": 3.7117819786071777, "learning_rate": 3.955162583949365e-07, "loss": 0.4024, "step": 12511 }, { "epoch": 0.6045320577861526, "grad_norm": 2.8962996006011963, "learning_rate": 3.9546794221384737e-07, "loss": 0.1947, "step": 12512 }, { "epoch": 0.6045803739672416, "grad_norm": 2.7779130935668945, "learning_rate": 3.9541962603275836e-07, "loss": 0.263, "step": 12513 }, { "epoch": 0.6046286901483306, "grad_norm": 2.6280338764190674, "learning_rate": 3.9537130985166935e-07, "loss": 0.2255, "step": 12514 }, { "epoch": 0.6046770063294197, "grad_norm": 2.525156259536743, "learning_rate": 3.9532299367058023e-07, "loss": 0.2638, "step": 12515 }, { "epoch": 0.6047253225105088, "grad_norm": 2.744539499282837, "learning_rate": 3.952746774894912e-07, "loss": 0.2955, "step": 12516 }, { "epoch": 0.6047736386915978, "grad_norm": 3.3635170459747314, "learning_rate": 3.9522636130840216e-07, "loss": 0.2743, "step": 12517 }, { "epoch": 0.6048219548726869, "grad_norm": 3.4332196712493896, "learning_rate": 3.951780451273131e-07, "loss": 0.379, "step": 12518 }, { "epoch": 0.6048702710537759, "grad_norm": 6.4639177322387695, "learning_rate": 3.951297289462241e-07, "loss": 0.3057, "step": 12519 }, { "epoch": 0.6049185872348649, "grad_norm": 3.1771559715270996, "learning_rate": 3.9508141276513503e-07, "loss": 0.2279, "step": 12520 }, { "epoch": 0.604966903415954, "grad_norm": 2.3900673389434814, "learning_rate": 3.9503309658404596e-07, "loss": 0.3055, "step": 12521 }, { "epoch": 0.6050152195970431, "grad_norm": 1.6613049507141113, "learning_rate": 3.949847804029569e-07, "loss": 0.1704, "step": 12522 }, { "epoch": 0.6050635357781321, "grad_norm": 3.6599855422973633, "learning_rate": 3.949364642218679e-07, "loss": 0.4054, "step": 12523 }, { "epoch": 0.6051118519592211, "grad_norm": 1.6466325521469116, "learning_rate": 3.948881480407789e-07, "loss": 0.151, "step": 12524 }, { "epoch": 0.6051601681403102, "grad_norm": 2.594395637512207, "learning_rate": 3.9483983185968977e-07, "loss": 0.1943, "step": 12525 }, { "epoch": 0.6052084843213993, "grad_norm": 4.289935111999512, "learning_rate": 3.9479151567860076e-07, "loss": 0.3298, "step": 12526 }, { "epoch": 0.6052568005024883, "grad_norm": 2.8180577754974365, "learning_rate": 3.9474319949751175e-07, "loss": 0.3724, "step": 12527 }, { "epoch": 0.6053051166835773, "grad_norm": 2.716385841369629, "learning_rate": 3.9469488331642263e-07, "loss": 0.277, "step": 12528 }, { "epoch": 0.6053534328646664, "grad_norm": 3.3442132472991943, "learning_rate": 3.946465671353336e-07, "loss": 0.4564, "step": 12529 }, { "epoch": 0.6054017490457554, "grad_norm": 2.4029271602630615, "learning_rate": 3.9459825095424456e-07, "loss": 0.2179, "step": 12530 }, { "epoch": 0.6054500652268445, "grad_norm": 1.8625447750091553, "learning_rate": 3.945499347731555e-07, "loss": 0.2232, "step": 12531 }, { "epoch": 0.6054983814079336, "grad_norm": 1.9204022884368896, "learning_rate": 3.945016185920665e-07, "loss": 0.2093, "step": 12532 }, { "epoch": 0.6055466975890226, "grad_norm": 1.9817692041397095, "learning_rate": 3.944533024109774e-07, "loss": 0.1861, "step": 12533 }, { "epoch": 0.6055950137701116, "grad_norm": 2.8423454761505127, "learning_rate": 3.9440498622988836e-07, "loss": 0.2778, "step": 12534 }, { "epoch": 0.6056433299512006, "grad_norm": 4.466928005218506, "learning_rate": 3.943566700487993e-07, "loss": 0.3859, "step": 12535 }, { "epoch": 0.6056916461322897, "grad_norm": 3.4465785026550293, "learning_rate": 3.943083538677103e-07, "loss": 0.223, "step": 12536 }, { "epoch": 0.6057399623133788, "grad_norm": 2.0202107429504395, "learning_rate": 3.942600376866212e-07, "loss": 0.2218, "step": 12537 }, { "epoch": 0.6057882784944678, "grad_norm": 4.593092918395996, "learning_rate": 3.9421172150553216e-07, "loss": 0.5012, "step": 12538 }, { "epoch": 0.6058365946755568, "grad_norm": 2.5739290714263916, "learning_rate": 3.9416340532444315e-07, "loss": 0.2999, "step": 12539 }, { "epoch": 0.6058849108566459, "grad_norm": 2.194343328475952, "learning_rate": 3.9411508914335414e-07, "loss": 0.1909, "step": 12540 }, { "epoch": 0.6059332270377349, "grad_norm": 3.268247604370117, "learning_rate": 3.9406677296226503e-07, "loss": 0.3056, "step": 12541 }, { "epoch": 0.605981543218824, "grad_norm": 3.483109474182129, "learning_rate": 3.94018456781176e-07, "loss": 0.2515, "step": 12542 }, { "epoch": 0.606029859399913, "grad_norm": 2.864438772201538, "learning_rate": 3.9397014060008696e-07, "loss": 0.3004, "step": 12543 }, { "epoch": 0.6060781755810021, "grad_norm": 2.248262405395508, "learning_rate": 3.939218244189979e-07, "loss": 0.2959, "step": 12544 }, { "epoch": 0.6061264917620911, "grad_norm": 2.0553934574127197, "learning_rate": 3.938735082379089e-07, "loss": 0.2381, "step": 12545 }, { "epoch": 0.6061748079431801, "grad_norm": 3.493319272994995, "learning_rate": 3.938251920568198e-07, "loss": 0.4426, "step": 12546 }, { "epoch": 0.6062231241242693, "grad_norm": 18.32815933227539, "learning_rate": 3.9377687587573076e-07, "loss": 0.358, "step": 12547 }, { "epoch": 0.6062714403053583, "grad_norm": 5.842501163482666, "learning_rate": 3.937285596946417e-07, "loss": 0.3551, "step": 12548 }, { "epoch": 0.6063197564864473, "grad_norm": 3.1259970664978027, "learning_rate": 3.936802435135527e-07, "loss": 0.2205, "step": 12549 }, { "epoch": 0.6063680726675363, "grad_norm": 2.104174852371216, "learning_rate": 3.936319273324636e-07, "loss": 0.2975, "step": 12550 }, { "epoch": 0.6064163888486254, "grad_norm": 3.553281307220459, "learning_rate": 3.9358361115137456e-07, "loss": 0.3594, "step": 12551 }, { "epoch": 0.6064647050297145, "grad_norm": 2.2538132667541504, "learning_rate": 3.9353529497028555e-07, "loss": 0.2856, "step": 12552 }, { "epoch": 0.6065130212108035, "grad_norm": 2.8331491947174072, "learning_rate": 3.934869787891965e-07, "loss": 0.2857, "step": 12553 }, { "epoch": 0.6065613373918926, "grad_norm": 5.197621822357178, "learning_rate": 3.934386626081074e-07, "loss": 0.3572, "step": 12554 }, { "epoch": 0.6066096535729816, "grad_norm": 2.288271188735962, "learning_rate": 3.933903464270184e-07, "loss": 0.2995, "step": 12555 }, { "epoch": 0.6066579697540706, "grad_norm": 2.294064521789551, "learning_rate": 3.933420302459293e-07, "loss": 0.2643, "step": 12556 }, { "epoch": 0.6067062859351597, "grad_norm": 3.175037145614624, "learning_rate": 3.932937140648403e-07, "loss": 0.4148, "step": 12557 }, { "epoch": 0.6067546021162488, "grad_norm": 2.7435641288757324, "learning_rate": 3.932453978837513e-07, "loss": 0.3532, "step": 12558 }, { "epoch": 0.6068029182973378, "grad_norm": 3.154123306274414, "learning_rate": 3.931970817026622e-07, "loss": 0.3365, "step": 12559 }, { "epoch": 0.6068512344784268, "grad_norm": 12.545835494995117, "learning_rate": 3.9314876552157315e-07, "loss": 0.2077, "step": 12560 }, { "epoch": 0.6068995506595158, "grad_norm": 2.388293504714966, "learning_rate": 3.931004493404841e-07, "loss": 0.2434, "step": 12561 }, { "epoch": 0.6069478668406049, "grad_norm": 4.268959999084473, "learning_rate": 3.930521331593951e-07, "loss": 0.3652, "step": 12562 }, { "epoch": 0.606996183021694, "grad_norm": 2.9650869369506836, "learning_rate": 3.93003816978306e-07, "loss": 0.3354, "step": 12563 }, { "epoch": 0.607044499202783, "grad_norm": 3.8052427768707275, "learning_rate": 3.9295550079721696e-07, "loss": 0.2539, "step": 12564 }, { "epoch": 0.607092815383872, "grad_norm": 4.481335163116455, "learning_rate": 3.9290718461612795e-07, "loss": 0.3079, "step": 12565 }, { "epoch": 0.6071411315649611, "grad_norm": 16.152904510498047, "learning_rate": 3.928588684350389e-07, "loss": 0.2656, "step": 12566 }, { "epoch": 0.6071894477460501, "grad_norm": 4.199753761291504, "learning_rate": 3.928105522539498e-07, "loss": 0.2765, "step": 12567 }, { "epoch": 0.6072377639271392, "grad_norm": 3.056013584136963, "learning_rate": 3.927622360728608e-07, "loss": 0.2762, "step": 12568 }, { "epoch": 0.6072860801082283, "grad_norm": 3.7175967693328857, "learning_rate": 3.927139198917717e-07, "loss": 0.2712, "step": 12569 }, { "epoch": 0.6073343962893173, "grad_norm": 2.619049072265625, "learning_rate": 3.926656037106827e-07, "loss": 0.257, "step": 12570 }, { "epoch": 0.6073827124704063, "grad_norm": 2.1538069248199463, "learning_rate": 3.926172875295937e-07, "loss": 0.2181, "step": 12571 }, { "epoch": 0.6074310286514953, "grad_norm": 3.259753704071045, "learning_rate": 3.9256897134850456e-07, "loss": 0.3291, "step": 12572 }, { "epoch": 0.6074793448325845, "grad_norm": 1.5384591817855835, "learning_rate": 3.9252065516741555e-07, "loss": 0.174, "step": 12573 }, { "epoch": 0.6075276610136735, "grad_norm": 2.649299144744873, "learning_rate": 3.924723389863265e-07, "loss": 0.2775, "step": 12574 }, { "epoch": 0.6075759771947625, "grad_norm": 2.7120981216430664, "learning_rate": 3.924240228052375e-07, "loss": 0.3372, "step": 12575 }, { "epoch": 0.6076242933758516, "grad_norm": 1.9088329076766968, "learning_rate": 3.923757066241484e-07, "loss": 0.1872, "step": 12576 }, { "epoch": 0.6076726095569406, "grad_norm": 7.609114646911621, "learning_rate": 3.9232739044305935e-07, "loss": 0.2788, "step": 12577 }, { "epoch": 0.6077209257380297, "grad_norm": 3.224933385848999, "learning_rate": 3.9227907426197034e-07, "loss": 0.3627, "step": 12578 }, { "epoch": 0.6077692419191187, "grad_norm": 2.286954641342163, "learning_rate": 3.922307580808813e-07, "loss": 0.2531, "step": 12579 }, { "epoch": 0.6078175581002078, "grad_norm": 2.6280646324157715, "learning_rate": 3.921824418997922e-07, "loss": 0.3378, "step": 12580 }, { "epoch": 0.6078658742812968, "grad_norm": 2.2762458324432373, "learning_rate": 3.921341257187032e-07, "loss": 0.2395, "step": 12581 }, { "epoch": 0.6079141904623858, "grad_norm": 2.3830857276916504, "learning_rate": 3.920858095376141e-07, "loss": 0.2772, "step": 12582 }, { "epoch": 0.607962506643475, "grad_norm": 4.699297904968262, "learning_rate": 3.920374933565251e-07, "loss": 0.3725, "step": 12583 }, { "epoch": 0.608010822824564, "grad_norm": 2.679370880126953, "learning_rate": 3.9198917717543607e-07, "loss": 0.323, "step": 12584 }, { "epoch": 0.608059139005653, "grad_norm": 3.645944595336914, "learning_rate": 3.9194086099434696e-07, "loss": 0.3695, "step": 12585 }, { "epoch": 0.608107455186742, "grad_norm": 2.5613934993743896, "learning_rate": 3.9189254481325795e-07, "loss": 0.2716, "step": 12586 }, { "epoch": 0.608155771367831, "grad_norm": 21.25505828857422, "learning_rate": 3.918442286321689e-07, "loss": 0.2926, "step": 12587 }, { "epoch": 0.6082040875489201, "grad_norm": 1.8625408411026, "learning_rate": 3.917959124510798e-07, "loss": 0.193, "step": 12588 }, { "epoch": 0.6082524037300092, "grad_norm": 2.2515838146209717, "learning_rate": 3.917475962699908e-07, "loss": 0.2621, "step": 12589 }, { "epoch": 0.6083007199110982, "grad_norm": 2.3808534145355225, "learning_rate": 3.9169928008890175e-07, "loss": 0.2612, "step": 12590 }, { "epoch": 0.6083490360921873, "grad_norm": 3.510277271270752, "learning_rate": 3.9165096390781274e-07, "loss": 0.3878, "step": 12591 }, { "epoch": 0.6083973522732763, "grad_norm": 2.132646322250366, "learning_rate": 3.916026477267237e-07, "loss": 0.2824, "step": 12592 }, { "epoch": 0.6084456684543653, "grad_norm": 3.0505688190460205, "learning_rate": 3.915543315456346e-07, "loss": 0.395, "step": 12593 }, { "epoch": 0.6084939846354545, "grad_norm": 3.421133518218994, "learning_rate": 3.915060153645456e-07, "loss": 0.248, "step": 12594 }, { "epoch": 0.6085423008165435, "grad_norm": 1.9595506191253662, "learning_rate": 3.914576991834565e-07, "loss": 0.2295, "step": 12595 }, { "epoch": 0.6085906169976325, "grad_norm": 3.032902956008911, "learning_rate": 3.914093830023675e-07, "loss": 0.2147, "step": 12596 }, { "epoch": 0.6086389331787215, "grad_norm": 3.613071918487549, "learning_rate": 3.9136106682127847e-07, "loss": 0.4652, "step": 12597 }, { "epoch": 0.6086872493598106, "grad_norm": 1.4322859048843384, "learning_rate": 3.9131275064018935e-07, "loss": 0.1508, "step": 12598 }, { "epoch": 0.6087355655408997, "grad_norm": 4.507907390594482, "learning_rate": 3.9126443445910034e-07, "loss": 0.3658, "step": 12599 }, { "epoch": 0.6087838817219887, "grad_norm": 10.134127616882324, "learning_rate": 3.912161182780113e-07, "loss": 0.248, "step": 12600 }, { "epoch": 0.6088321979030777, "grad_norm": 5.264978408813477, "learning_rate": 3.911678020969222e-07, "loss": 0.3109, "step": 12601 }, { "epoch": 0.6088805140841668, "grad_norm": 9.009804725646973, "learning_rate": 3.911194859158332e-07, "loss": 0.3695, "step": 12602 }, { "epoch": 0.6089288302652558, "grad_norm": 4.18695068359375, "learning_rate": 3.9107116973474414e-07, "loss": 0.2975, "step": 12603 }, { "epoch": 0.6089771464463449, "grad_norm": 2.462092876434326, "learning_rate": 3.910228535536551e-07, "loss": 0.2804, "step": 12604 }, { "epoch": 0.609025462627434, "grad_norm": 2.4056296348571777, "learning_rate": 3.9097453737256607e-07, "loss": 0.2825, "step": 12605 }, { "epoch": 0.609073778808523, "grad_norm": 2.182957410812378, "learning_rate": 3.90926221191477e-07, "loss": 0.2076, "step": 12606 }, { "epoch": 0.609122094989612, "grad_norm": 2.540820360183716, "learning_rate": 3.90877905010388e-07, "loss": 0.3865, "step": 12607 }, { "epoch": 0.609170411170701, "grad_norm": 2.6396853923797607, "learning_rate": 3.908295888292989e-07, "loss": 0.2846, "step": 12608 }, { "epoch": 0.6092187273517902, "grad_norm": 3.2390034198760986, "learning_rate": 3.907812726482099e-07, "loss": 0.4208, "step": 12609 }, { "epoch": 0.6092670435328792, "grad_norm": 1.8559691905975342, "learning_rate": 3.9073295646712086e-07, "loss": 0.2058, "step": 12610 }, { "epoch": 0.6093153597139682, "grad_norm": 1.7099545001983643, "learning_rate": 3.9068464028603175e-07, "loss": 0.1763, "step": 12611 }, { "epoch": 0.6093636758950572, "grad_norm": 2.975878953933716, "learning_rate": 3.9063632410494274e-07, "loss": 0.278, "step": 12612 }, { "epoch": 0.6094119920761463, "grad_norm": 6.440308094024658, "learning_rate": 3.905880079238537e-07, "loss": 0.3028, "step": 12613 }, { "epoch": 0.6094603082572353, "grad_norm": 2.3707196712493896, "learning_rate": 3.905396917427646e-07, "loss": 0.206, "step": 12614 }, { "epoch": 0.6095086244383244, "grad_norm": 4.244413375854492, "learning_rate": 3.904913755616756e-07, "loss": 0.3566, "step": 12615 }, { "epoch": 0.6095569406194135, "grad_norm": 4.612421035766602, "learning_rate": 3.9044305938058654e-07, "loss": 0.3304, "step": 12616 }, { "epoch": 0.6096052568005025, "grad_norm": 5.006315231323242, "learning_rate": 3.903947431994975e-07, "loss": 0.3386, "step": 12617 }, { "epoch": 0.6096535729815915, "grad_norm": 10.639466285705566, "learning_rate": 3.903464270184084e-07, "loss": 0.2419, "step": 12618 }, { "epoch": 0.6097018891626805, "grad_norm": 1.9999719858169556, "learning_rate": 3.902981108373194e-07, "loss": 0.1872, "step": 12619 }, { "epoch": 0.6097502053437697, "grad_norm": 2.5632309913635254, "learning_rate": 3.9024979465623034e-07, "loss": 0.3462, "step": 12620 }, { "epoch": 0.6097985215248587, "grad_norm": 2.939870595932007, "learning_rate": 3.902014784751413e-07, "loss": 0.2261, "step": 12621 }, { "epoch": 0.6098468377059477, "grad_norm": 2.2251877784729004, "learning_rate": 3.9015316229405227e-07, "loss": 0.2358, "step": 12622 }, { "epoch": 0.6098951538870367, "grad_norm": 3.3833670616149902, "learning_rate": 3.9010484611296326e-07, "loss": 0.3912, "step": 12623 }, { "epoch": 0.6099434700681258, "grad_norm": 3.4575700759887695, "learning_rate": 3.9005652993187414e-07, "loss": 0.3816, "step": 12624 }, { "epoch": 0.6099917862492149, "grad_norm": 1.9631569385528564, "learning_rate": 3.9000821375078514e-07, "loss": 0.2226, "step": 12625 }, { "epoch": 0.6100401024303039, "grad_norm": 3.2612226009368896, "learning_rate": 3.8995989756969607e-07, "loss": 0.3716, "step": 12626 }, { "epoch": 0.610088418611393, "grad_norm": 2.8897972106933594, "learning_rate": 3.89911581388607e-07, "loss": 0.3329, "step": 12627 }, { "epoch": 0.610136734792482, "grad_norm": 2.249537706375122, "learning_rate": 3.89863265207518e-07, "loss": 0.2806, "step": 12628 }, { "epoch": 0.610185050973571, "grad_norm": 2.411689281463623, "learning_rate": 3.8981494902642894e-07, "loss": 0.26, "step": 12629 }, { "epoch": 0.6102333671546601, "grad_norm": 2.5476601123809814, "learning_rate": 3.897666328453399e-07, "loss": 0.3108, "step": 12630 }, { "epoch": 0.6102816833357492, "grad_norm": 4.75396728515625, "learning_rate": 3.897183166642508e-07, "loss": 0.3314, "step": 12631 }, { "epoch": 0.6103299995168382, "grad_norm": 8.11172866821289, "learning_rate": 3.896700004831618e-07, "loss": 0.2345, "step": 12632 }, { "epoch": 0.6103783156979272, "grad_norm": 2.07373309135437, "learning_rate": 3.8962168430207274e-07, "loss": 0.2145, "step": 12633 }, { "epoch": 0.6104266318790162, "grad_norm": 2.5471696853637695, "learning_rate": 3.895733681209837e-07, "loss": 0.314, "step": 12634 }, { "epoch": 0.6104749480601054, "grad_norm": 6.5599846839904785, "learning_rate": 3.8952505193989467e-07, "loss": 0.5411, "step": 12635 }, { "epoch": 0.6105232642411944, "grad_norm": 2.2093000411987305, "learning_rate": 3.894767357588056e-07, "loss": 0.1943, "step": 12636 }, { "epoch": 0.6105715804222834, "grad_norm": 2.6025478839874268, "learning_rate": 3.8942841957771654e-07, "loss": 0.3095, "step": 12637 }, { "epoch": 0.6106198966033725, "grad_norm": 2.6649978160858154, "learning_rate": 3.8938010339662753e-07, "loss": 0.2691, "step": 12638 }, { "epoch": 0.6106682127844615, "grad_norm": 2.25036883354187, "learning_rate": 3.8933178721553847e-07, "loss": 0.2306, "step": 12639 }, { "epoch": 0.6107165289655505, "grad_norm": 2.6976988315582275, "learning_rate": 3.892834710344494e-07, "loss": 0.3854, "step": 12640 }, { "epoch": 0.6107648451466396, "grad_norm": 2.847179889678955, "learning_rate": 3.892351548533604e-07, "loss": 0.3056, "step": 12641 }, { "epoch": 0.6108131613277287, "grad_norm": 2.96117901802063, "learning_rate": 3.8918683867227133e-07, "loss": 0.2691, "step": 12642 }, { "epoch": 0.6108614775088177, "grad_norm": 2.6948249340057373, "learning_rate": 3.8913852249118227e-07, "loss": 0.2239, "step": 12643 }, { "epoch": 0.6109097936899067, "grad_norm": 6.090638160705566, "learning_rate": 3.890902063100932e-07, "loss": 0.443, "step": 12644 }, { "epoch": 0.6109581098709957, "grad_norm": 6.793923854827881, "learning_rate": 3.890418901290042e-07, "loss": 0.2331, "step": 12645 }, { "epoch": 0.6110064260520849, "grad_norm": 3.902559280395508, "learning_rate": 3.8899357394791514e-07, "loss": 0.2603, "step": 12646 }, { "epoch": 0.6110547422331739, "grad_norm": 4.124622344970703, "learning_rate": 3.8894525776682607e-07, "loss": 0.1432, "step": 12647 }, { "epoch": 0.6111030584142629, "grad_norm": 2.1768951416015625, "learning_rate": 3.8889694158573706e-07, "loss": 0.2864, "step": 12648 }, { "epoch": 0.611151374595352, "grad_norm": 5.76569938659668, "learning_rate": 3.88848625404648e-07, "loss": 0.3686, "step": 12649 }, { "epoch": 0.611199690776441, "grad_norm": 2.7974436283111572, "learning_rate": 3.8880030922355894e-07, "loss": 0.3193, "step": 12650 }, { "epoch": 0.6112480069575301, "grad_norm": 2.1839029788970947, "learning_rate": 3.8875199304246993e-07, "loss": 0.3026, "step": 12651 }, { "epoch": 0.6112963231386191, "grad_norm": 2.6951732635498047, "learning_rate": 3.887036768613808e-07, "loss": 0.311, "step": 12652 }, { "epoch": 0.6113446393197082, "grad_norm": 4.848851203918457, "learning_rate": 3.886553606802918e-07, "loss": 0.3167, "step": 12653 }, { "epoch": 0.6113929555007972, "grad_norm": 3.186781644821167, "learning_rate": 3.886070444992028e-07, "loss": 0.3276, "step": 12654 }, { "epoch": 0.6114412716818862, "grad_norm": 1.9892792701721191, "learning_rate": 3.8855872831811373e-07, "loss": 0.1967, "step": 12655 }, { "epoch": 0.6114895878629754, "grad_norm": 3.491640567779541, "learning_rate": 3.8851041213702467e-07, "loss": 0.171, "step": 12656 }, { "epoch": 0.6115379040440644, "grad_norm": 37.623512268066406, "learning_rate": 3.884620959559356e-07, "loss": 0.1941, "step": 12657 }, { "epoch": 0.6115862202251534, "grad_norm": 4.8074259757995605, "learning_rate": 3.884137797748466e-07, "loss": 0.2507, "step": 12658 }, { "epoch": 0.6116345364062424, "grad_norm": 2.303020715713501, "learning_rate": 3.8836546359375753e-07, "loss": 0.2595, "step": 12659 }, { "epoch": 0.6116828525873315, "grad_norm": 4.170516490936279, "learning_rate": 3.8831714741266847e-07, "loss": 0.3663, "step": 12660 }, { "epoch": 0.6117311687684206, "grad_norm": 4.443815231323242, "learning_rate": 3.8826883123157946e-07, "loss": 0.2656, "step": 12661 }, { "epoch": 0.6117794849495096, "grad_norm": 3.6608898639678955, "learning_rate": 3.882205150504904e-07, "loss": 0.3903, "step": 12662 }, { "epoch": 0.6118278011305986, "grad_norm": 2.4142332077026367, "learning_rate": 3.8817219886940133e-07, "loss": 0.2298, "step": 12663 }, { "epoch": 0.6118761173116877, "grad_norm": 2.976806879043579, "learning_rate": 3.881238826883123e-07, "loss": 0.2638, "step": 12664 }, { "epoch": 0.6119244334927767, "grad_norm": 10.014049530029297, "learning_rate": 3.880755665072232e-07, "loss": 0.2778, "step": 12665 }, { "epoch": 0.6119727496738657, "grad_norm": 2.555924654006958, "learning_rate": 3.880272503261342e-07, "loss": 0.3047, "step": 12666 }, { "epoch": 0.6120210658549549, "grad_norm": 2.4285085201263428, "learning_rate": 3.879789341450452e-07, "loss": 0.2205, "step": 12667 }, { "epoch": 0.6120693820360439, "grad_norm": 20.24486541748047, "learning_rate": 3.8793061796395607e-07, "loss": 0.3108, "step": 12668 }, { "epoch": 0.6121176982171329, "grad_norm": 4.602148056030273, "learning_rate": 3.8788230178286706e-07, "loss": 0.2273, "step": 12669 }, { "epoch": 0.6121660143982219, "grad_norm": 4.286997318267822, "learning_rate": 3.87833985601778e-07, "loss": 0.2696, "step": 12670 }, { "epoch": 0.612214330579311, "grad_norm": 2.970102071762085, "learning_rate": 3.87785669420689e-07, "loss": 0.3705, "step": 12671 }, { "epoch": 0.6122626467604001, "grad_norm": 2.383927822113037, "learning_rate": 3.8773735323959993e-07, "loss": 0.2691, "step": 12672 }, { "epoch": 0.6123109629414891, "grad_norm": 2.3664801120758057, "learning_rate": 3.8768903705851087e-07, "loss": 0.2433, "step": 12673 }, { "epoch": 0.6123592791225781, "grad_norm": 3.713566780090332, "learning_rate": 3.8764072087742186e-07, "loss": 0.3914, "step": 12674 }, { "epoch": 0.6124075953036672, "grad_norm": 2.8017418384552, "learning_rate": 3.875924046963328e-07, "loss": 0.2559, "step": 12675 }, { "epoch": 0.6124559114847562, "grad_norm": 3.0396361351013184, "learning_rate": 3.8754408851524373e-07, "loss": 0.4103, "step": 12676 }, { "epoch": 0.6125042276658453, "grad_norm": 3.568864583969116, "learning_rate": 3.874957723341547e-07, "loss": 0.2619, "step": 12677 }, { "epoch": 0.6125525438469344, "grad_norm": 3.581317186355591, "learning_rate": 3.874474561530656e-07, "loss": 0.2735, "step": 12678 }, { "epoch": 0.6126008600280234, "grad_norm": 2.076972246170044, "learning_rate": 3.873991399719766e-07, "loss": 0.2338, "step": 12679 }, { "epoch": 0.6126491762091124, "grad_norm": 2.5221357345581055, "learning_rate": 3.873508237908876e-07, "loss": 0.1996, "step": 12680 }, { "epoch": 0.6126974923902014, "grad_norm": 2.502967119216919, "learning_rate": 3.8730250760979847e-07, "loss": 0.2274, "step": 12681 }, { "epoch": 0.6127458085712906, "grad_norm": 5.254246711730957, "learning_rate": 3.8725419142870946e-07, "loss": 0.2941, "step": 12682 }, { "epoch": 0.6127941247523796, "grad_norm": 2.859093427658081, "learning_rate": 3.872058752476204e-07, "loss": 0.2315, "step": 12683 }, { "epoch": 0.6128424409334686, "grad_norm": 5.345643043518066, "learning_rate": 3.8715755906653133e-07, "loss": 0.2881, "step": 12684 }, { "epoch": 0.6128907571145576, "grad_norm": 2.6353089809417725, "learning_rate": 3.871092428854423e-07, "loss": 0.3647, "step": 12685 }, { "epoch": 0.6129390732956467, "grad_norm": 3.0548105239868164, "learning_rate": 3.8706092670435326e-07, "loss": 0.3206, "step": 12686 }, { "epoch": 0.6129873894767358, "grad_norm": 2.1936516761779785, "learning_rate": 3.8701261052326425e-07, "loss": 0.2453, "step": 12687 }, { "epoch": 0.6130357056578248, "grad_norm": 3.1883127689361572, "learning_rate": 3.869642943421752e-07, "loss": 0.3483, "step": 12688 }, { "epoch": 0.6130840218389139, "grad_norm": 2.632674217224121, "learning_rate": 3.8691597816108613e-07, "loss": 0.3024, "step": 12689 }, { "epoch": 0.6131323380200029, "grad_norm": 3.8082900047302246, "learning_rate": 3.868676619799971e-07, "loss": 0.2073, "step": 12690 }, { "epoch": 0.6131806542010919, "grad_norm": 2.719454288482666, "learning_rate": 3.86819345798908e-07, "loss": 0.2969, "step": 12691 }, { "epoch": 0.6132289703821809, "grad_norm": 6.3508381843566895, "learning_rate": 3.86771029617819e-07, "loss": 0.4032, "step": 12692 }, { "epoch": 0.6132772865632701, "grad_norm": 2.209737539291382, "learning_rate": 3.8672271343673e-07, "loss": 0.2404, "step": 12693 }, { "epoch": 0.6133256027443591, "grad_norm": 2.0972814559936523, "learning_rate": 3.8667439725564087e-07, "loss": 0.2361, "step": 12694 }, { "epoch": 0.6133739189254481, "grad_norm": 2.0131053924560547, "learning_rate": 3.8662608107455186e-07, "loss": 0.204, "step": 12695 }, { "epoch": 0.6134222351065372, "grad_norm": 2.8025407791137695, "learning_rate": 3.865777648934628e-07, "loss": 0.2985, "step": 12696 }, { "epoch": 0.6134705512876262, "grad_norm": 6.214917182922363, "learning_rate": 3.8652944871237373e-07, "loss": 0.3511, "step": 12697 }, { "epoch": 0.6135188674687153, "grad_norm": 2.470736265182495, "learning_rate": 3.864811325312847e-07, "loss": 0.2875, "step": 12698 }, { "epoch": 0.6135671836498043, "grad_norm": 3.437976121902466, "learning_rate": 3.8643281635019566e-07, "loss": 0.3186, "step": 12699 }, { "epoch": 0.6136154998308934, "grad_norm": 2.0319292545318604, "learning_rate": 3.863845001691066e-07, "loss": 0.1963, "step": 12700 }, { "epoch": 0.6136638160119824, "grad_norm": 24.753501892089844, "learning_rate": 3.863361839880176e-07, "loss": 0.2923, "step": 12701 }, { "epoch": 0.6137121321930714, "grad_norm": 2.3905858993530273, "learning_rate": 3.862878678069285e-07, "loss": 0.2143, "step": 12702 }, { "epoch": 0.6137604483741605, "grad_norm": 2.5817887783050537, "learning_rate": 3.862395516258395e-07, "loss": 0.3307, "step": 12703 }, { "epoch": 0.6138087645552496, "grad_norm": 2.5076663494110107, "learning_rate": 3.861912354447504e-07, "loss": 0.1865, "step": 12704 }, { "epoch": 0.6138570807363386, "grad_norm": 5.078788757324219, "learning_rate": 3.861429192636614e-07, "loss": 0.4424, "step": 12705 }, { "epoch": 0.6139053969174276, "grad_norm": 4.778254985809326, "learning_rate": 3.860946030825724e-07, "loss": 0.3138, "step": 12706 }, { "epoch": 0.6139537130985167, "grad_norm": 3.3376035690307617, "learning_rate": 3.8604628690148326e-07, "loss": 0.3741, "step": 12707 }, { "epoch": 0.6140020292796058, "grad_norm": 3.20204758644104, "learning_rate": 3.8599797072039425e-07, "loss": 0.4541, "step": 12708 }, { "epoch": 0.6140503454606948, "grad_norm": 3.2441139221191406, "learning_rate": 3.859496545393052e-07, "loss": 0.4425, "step": 12709 }, { "epoch": 0.6140986616417838, "grad_norm": 3.178999423980713, "learning_rate": 3.8590133835821613e-07, "loss": 0.4943, "step": 12710 }, { "epoch": 0.6141469778228729, "grad_norm": 2.436933755874634, "learning_rate": 3.858530221771271e-07, "loss": 0.2776, "step": 12711 }, { "epoch": 0.6141952940039619, "grad_norm": 3.1617934703826904, "learning_rate": 3.8580470599603805e-07, "loss": 0.3522, "step": 12712 }, { "epoch": 0.614243610185051, "grad_norm": 2.34730863571167, "learning_rate": 3.85756389814949e-07, "loss": 0.2449, "step": 12713 }, { "epoch": 0.61429192636614, "grad_norm": 2.8161821365356445, "learning_rate": 3.8570807363386e-07, "loss": 0.3697, "step": 12714 }, { "epoch": 0.6143402425472291, "grad_norm": 3.210136651992798, "learning_rate": 3.856597574527709e-07, "loss": 0.3989, "step": 12715 }, { "epoch": 0.6143885587283181, "grad_norm": 2.8021717071533203, "learning_rate": 3.8561144127168186e-07, "loss": 0.2702, "step": 12716 }, { "epoch": 0.6144368749094071, "grad_norm": 2.7642359733581543, "learning_rate": 3.855631250905928e-07, "loss": 0.1928, "step": 12717 }, { "epoch": 0.6144851910904962, "grad_norm": 1.4751304388046265, "learning_rate": 3.855148089095038e-07, "loss": 0.1905, "step": 12718 }, { "epoch": 0.6145335072715853, "grad_norm": 6.158595085144043, "learning_rate": 3.854664927284148e-07, "loss": 0.3947, "step": 12719 }, { "epoch": 0.6145818234526743, "grad_norm": 3.1895954608917236, "learning_rate": 3.8541817654732566e-07, "loss": 0.5374, "step": 12720 }, { "epoch": 0.6146301396337633, "grad_norm": 5.324985504150391, "learning_rate": 3.8536986036623665e-07, "loss": 0.5236, "step": 12721 }, { "epoch": 0.6146784558148524, "grad_norm": 2.9380667209625244, "learning_rate": 3.853215441851476e-07, "loss": 0.2092, "step": 12722 }, { "epoch": 0.6147267719959414, "grad_norm": 3.4418416023254395, "learning_rate": 3.852732280040585e-07, "loss": 0.2638, "step": 12723 }, { "epoch": 0.6147750881770305, "grad_norm": 3.009269952774048, "learning_rate": 3.852249118229695e-07, "loss": 0.2593, "step": 12724 }, { "epoch": 0.6148234043581196, "grad_norm": 2.4031078815460205, "learning_rate": 3.8517659564188045e-07, "loss": 0.2794, "step": 12725 }, { "epoch": 0.6148717205392086, "grad_norm": 5.295119285583496, "learning_rate": 3.851282794607914e-07, "loss": 0.2602, "step": 12726 }, { "epoch": 0.6149200367202976, "grad_norm": 3.5839617252349854, "learning_rate": 3.850799632797024e-07, "loss": 0.33, "step": 12727 }, { "epoch": 0.6149683529013866, "grad_norm": 2.1313540935516357, "learning_rate": 3.850316470986133e-07, "loss": 0.2006, "step": 12728 }, { "epoch": 0.6150166690824758, "grad_norm": 3.8412959575653076, "learning_rate": 3.8498333091752425e-07, "loss": 0.2353, "step": 12729 }, { "epoch": 0.6150649852635648, "grad_norm": 1.7246495485305786, "learning_rate": 3.849350147364352e-07, "loss": 0.1387, "step": 12730 }, { "epoch": 0.6151133014446538, "grad_norm": 2.410538673400879, "learning_rate": 3.848866985553462e-07, "loss": 0.2698, "step": 12731 }, { "epoch": 0.6151616176257428, "grad_norm": 2.127316951751709, "learning_rate": 3.848383823742571e-07, "loss": 0.1763, "step": 12732 }, { "epoch": 0.6152099338068319, "grad_norm": 1.8182497024536133, "learning_rate": 3.8479006619316806e-07, "loss": 0.2154, "step": 12733 }, { "epoch": 0.615258249987921, "grad_norm": 1.8946361541748047, "learning_rate": 3.8474175001207905e-07, "loss": 0.2141, "step": 12734 }, { "epoch": 0.61530656616901, "grad_norm": 2.6888396739959717, "learning_rate": 3.8469343383098993e-07, "loss": 0.4182, "step": 12735 }, { "epoch": 0.615354882350099, "grad_norm": 3.677342414855957, "learning_rate": 3.846451176499009e-07, "loss": 0.3107, "step": 12736 }, { "epoch": 0.6154031985311881, "grad_norm": 3.122534990310669, "learning_rate": 3.845968014688119e-07, "loss": 0.3244, "step": 12737 }, { "epoch": 0.6154515147122771, "grad_norm": 2.2482216358184814, "learning_rate": 3.8454848528772285e-07, "loss": 0.2189, "step": 12738 }, { "epoch": 0.6154998308933662, "grad_norm": 2.9537346363067627, "learning_rate": 3.845001691066338e-07, "loss": 0.4079, "step": 12739 }, { "epoch": 0.6155481470744553, "grad_norm": 4.753889083862305, "learning_rate": 3.844518529255448e-07, "loss": 0.2899, "step": 12740 }, { "epoch": 0.6155964632555443, "grad_norm": 3.1941399574279785, "learning_rate": 3.844035367444557e-07, "loss": 0.3143, "step": 12741 }, { "epoch": 0.6156447794366333, "grad_norm": 1.700071930885315, "learning_rate": 3.8435522056336665e-07, "loss": 0.1748, "step": 12742 }, { "epoch": 0.6156930956177223, "grad_norm": 2.068410873413086, "learning_rate": 3.843069043822776e-07, "loss": 0.2125, "step": 12743 }, { "epoch": 0.6157414117988114, "grad_norm": 2.4040920734405518, "learning_rate": 3.842585882011886e-07, "loss": 0.3093, "step": 12744 }, { "epoch": 0.6157897279799005, "grad_norm": 2.3476154804229736, "learning_rate": 3.842102720200995e-07, "loss": 0.2667, "step": 12745 }, { "epoch": 0.6158380441609895, "grad_norm": 3.4173474311828613, "learning_rate": 3.8416195583901045e-07, "loss": 0.2204, "step": 12746 }, { "epoch": 0.6158863603420786, "grad_norm": 2.0589654445648193, "learning_rate": 3.8411363965792144e-07, "loss": 0.2293, "step": 12747 }, { "epoch": 0.6159346765231676, "grad_norm": 2.675664186477661, "learning_rate": 3.840653234768323e-07, "loss": 0.2654, "step": 12748 }, { "epoch": 0.6159829927042566, "grad_norm": 37.87917709350586, "learning_rate": 3.840170072957433e-07, "loss": 0.4368, "step": 12749 }, { "epoch": 0.6160313088853457, "grad_norm": 3.095759868621826, "learning_rate": 3.839686911146543e-07, "loss": 0.2791, "step": 12750 }, { "epoch": 0.6160796250664348, "grad_norm": 2.178361415863037, "learning_rate": 3.839203749335652e-07, "loss": 0.2175, "step": 12751 }, { "epoch": 0.6161279412475238, "grad_norm": 6.118105888366699, "learning_rate": 3.838720587524762e-07, "loss": 0.3423, "step": 12752 }, { "epoch": 0.6161762574286128, "grad_norm": 2.3082525730133057, "learning_rate": 3.8382374257138717e-07, "loss": 0.2831, "step": 12753 }, { "epoch": 0.6162245736097018, "grad_norm": 2.5079355239868164, "learning_rate": 3.837754263902981e-07, "loss": 0.2801, "step": 12754 }, { "epoch": 0.616272889790791, "grad_norm": 3.35368013381958, "learning_rate": 3.8372711020920905e-07, "loss": 0.3688, "step": 12755 }, { "epoch": 0.61632120597188, "grad_norm": 3.1955862045288086, "learning_rate": 3.8367879402812e-07, "loss": 0.3891, "step": 12756 }, { "epoch": 0.616369522152969, "grad_norm": 2.13405442237854, "learning_rate": 3.8363047784703097e-07, "loss": 0.2145, "step": 12757 }, { "epoch": 0.616417838334058, "grad_norm": 1.7342849969863892, "learning_rate": 3.835821616659419e-07, "loss": 0.1682, "step": 12758 }, { "epoch": 0.6164661545151471, "grad_norm": 2.4069149494171143, "learning_rate": 3.8353384548485285e-07, "loss": 0.2719, "step": 12759 }, { "epoch": 0.6165144706962362, "grad_norm": 3.306288480758667, "learning_rate": 3.8348552930376384e-07, "loss": 0.3307, "step": 12760 }, { "epoch": 0.6165627868773252, "grad_norm": 2.0428531169891357, "learning_rate": 3.834372131226747e-07, "loss": 0.2596, "step": 12761 }, { "epoch": 0.6166111030584143, "grad_norm": 2.3073065280914307, "learning_rate": 3.833888969415857e-07, "loss": 0.2368, "step": 12762 }, { "epoch": 0.6166594192395033, "grad_norm": 9.58055591583252, "learning_rate": 3.833405807604967e-07, "loss": 0.3753, "step": 12763 }, { "epoch": 0.6167077354205923, "grad_norm": 2.7430508136749268, "learning_rate": 3.832922645794076e-07, "loss": 0.361, "step": 12764 }, { "epoch": 0.6167560516016815, "grad_norm": 1.6349964141845703, "learning_rate": 3.832439483983186e-07, "loss": 0.1375, "step": 12765 }, { "epoch": 0.6168043677827705, "grad_norm": 2.4897899627685547, "learning_rate": 3.8319563221722957e-07, "loss": 0.354, "step": 12766 }, { "epoch": 0.6168526839638595, "grad_norm": 2.2548956871032715, "learning_rate": 3.8314731603614045e-07, "loss": 0.3007, "step": 12767 }, { "epoch": 0.6169010001449485, "grad_norm": 2.0838210582733154, "learning_rate": 3.8309899985505144e-07, "loss": 0.2236, "step": 12768 }, { "epoch": 0.6169493163260376, "grad_norm": 2.290642499923706, "learning_rate": 3.830506836739624e-07, "loss": 0.2012, "step": 12769 }, { "epoch": 0.6169976325071267, "grad_norm": 3.5988259315490723, "learning_rate": 3.8300236749287337e-07, "loss": 0.4, "step": 12770 }, { "epoch": 0.6170459486882157, "grad_norm": 1.496537685394287, "learning_rate": 3.829540513117843e-07, "loss": 0.1415, "step": 12771 }, { "epoch": 0.6170942648693047, "grad_norm": 2.8808369636535645, "learning_rate": 3.8290573513069524e-07, "loss": 0.3233, "step": 12772 }, { "epoch": 0.6171425810503938, "grad_norm": 1.2575106620788574, "learning_rate": 3.8285741894960623e-07, "loss": 0.1307, "step": 12773 }, { "epoch": 0.6171908972314828, "grad_norm": 2.7933502197265625, "learning_rate": 3.828091027685171e-07, "loss": 0.3348, "step": 12774 }, { "epoch": 0.6172392134125718, "grad_norm": 2.426835060119629, "learning_rate": 3.827607865874281e-07, "loss": 0.3133, "step": 12775 }, { "epoch": 0.617287529593661, "grad_norm": 2.6440749168395996, "learning_rate": 3.827124704063391e-07, "loss": 0.3508, "step": 12776 }, { "epoch": 0.61733584577475, "grad_norm": 4.961085796356201, "learning_rate": 3.8266415422525e-07, "loss": 0.4204, "step": 12777 }, { "epoch": 0.617384161955839, "grad_norm": 2.3154149055480957, "learning_rate": 3.82615838044161e-07, "loss": 0.3037, "step": 12778 }, { "epoch": 0.617432478136928, "grad_norm": 2.909625768661499, "learning_rate": 3.8256752186307196e-07, "loss": 0.3478, "step": 12779 }, { "epoch": 0.6174807943180171, "grad_norm": 2.049848794937134, "learning_rate": 3.8251920568198285e-07, "loss": 0.1816, "step": 12780 }, { "epoch": 0.6175291104991062, "grad_norm": 3.11090087890625, "learning_rate": 3.8247088950089384e-07, "loss": 0.3635, "step": 12781 }, { "epoch": 0.6175774266801952, "grad_norm": 2.5507562160491943, "learning_rate": 3.824225733198048e-07, "loss": 0.3384, "step": 12782 }, { "epoch": 0.6176257428612842, "grad_norm": 2.7460145950317383, "learning_rate": 3.823742571387157e-07, "loss": 0.2981, "step": 12783 }, { "epoch": 0.6176740590423733, "grad_norm": 3.1891696453094482, "learning_rate": 3.823259409576267e-07, "loss": 0.2857, "step": 12784 }, { "epoch": 0.6177223752234623, "grad_norm": 2.356933832168579, "learning_rate": 3.8227762477653764e-07, "loss": 0.2812, "step": 12785 }, { "epoch": 0.6177706914045514, "grad_norm": 4.574784278869629, "learning_rate": 3.8222930859544863e-07, "loss": 0.245, "step": 12786 }, { "epoch": 0.6178190075856405, "grad_norm": 2.4909331798553467, "learning_rate": 3.821809924143595e-07, "loss": 0.3555, "step": 12787 }, { "epoch": 0.6178673237667295, "grad_norm": 4.856515884399414, "learning_rate": 3.821326762332705e-07, "loss": 0.3272, "step": 12788 }, { "epoch": 0.6179156399478185, "grad_norm": 3.6720752716064453, "learning_rate": 3.820843600521815e-07, "loss": 0.2992, "step": 12789 }, { "epoch": 0.6179639561289075, "grad_norm": 2.218951463699341, "learning_rate": 3.820360438710924e-07, "loss": 0.2651, "step": 12790 }, { "epoch": 0.6180122723099967, "grad_norm": 3.396559953689575, "learning_rate": 3.8198772769000337e-07, "loss": 0.2971, "step": 12791 }, { "epoch": 0.6180605884910857, "grad_norm": 2.884323835372925, "learning_rate": 3.8193941150891436e-07, "loss": 0.3519, "step": 12792 }, { "epoch": 0.6181089046721747, "grad_norm": 2.3808951377868652, "learning_rate": 3.8189109532782524e-07, "loss": 0.3756, "step": 12793 }, { "epoch": 0.6181572208532637, "grad_norm": 16.119871139526367, "learning_rate": 3.8184277914673623e-07, "loss": 0.3885, "step": 12794 }, { "epoch": 0.6182055370343528, "grad_norm": 2.969318389892578, "learning_rate": 3.8179446296564717e-07, "loss": 0.3238, "step": 12795 }, { "epoch": 0.6182538532154419, "grad_norm": 2.6441304683685303, "learning_rate": 3.817461467845581e-07, "loss": 0.3017, "step": 12796 }, { "epoch": 0.6183021693965309, "grad_norm": 2.375910758972168, "learning_rate": 3.816978306034691e-07, "loss": 0.2099, "step": 12797 }, { "epoch": 0.61835048557762, "grad_norm": 2.878849506378174, "learning_rate": 3.8164951442238004e-07, "loss": 0.2709, "step": 12798 }, { "epoch": 0.618398801758709, "grad_norm": 3.8557851314544678, "learning_rate": 3.81601198241291e-07, "loss": 0.1867, "step": 12799 }, { "epoch": 0.618447117939798, "grad_norm": 2.6122889518737793, "learning_rate": 3.815528820602019e-07, "loss": 0.298, "step": 12800 }, { "epoch": 0.618495434120887, "grad_norm": 2.8466241359710693, "learning_rate": 3.815045658791129e-07, "loss": 0.2544, "step": 12801 }, { "epoch": 0.6185437503019762, "grad_norm": 6.352092266082764, "learning_rate": 3.814562496980239e-07, "loss": 0.2137, "step": 12802 }, { "epoch": 0.6185920664830652, "grad_norm": 2.6910297870635986, "learning_rate": 3.814079335169348e-07, "loss": 0.3427, "step": 12803 }, { "epoch": 0.6186403826641542, "grad_norm": 1.8617898225784302, "learning_rate": 3.8135961733584577e-07, "loss": 0.1911, "step": 12804 }, { "epoch": 0.6186886988452432, "grad_norm": 2.566324234008789, "learning_rate": 3.8131130115475676e-07, "loss": 0.3329, "step": 12805 }, { "epoch": 0.6187370150263323, "grad_norm": 2.1288352012634277, "learning_rate": 3.8126298497366764e-07, "loss": 0.1903, "step": 12806 }, { "epoch": 0.6187853312074214, "grad_norm": 3.4050827026367188, "learning_rate": 3.8121466879257863e-07, "loss": 0.3717, "step": 12807 }, { "epoch": 0.6188336473885104, "grad_norm": 2.8065149784088135, "learning_rate": 3.8116635261148957e-07, "loss": 0.2902, "step": 12808 }, { "epoch": 0.6188819635695995, "grad_norm": 2.3809375762939453, "learning_rate": 3.811180364304005e-07, "loss": 0.2453, "step": 12809 }, { "epoch": 0.6189302797506885, "grad_norm": 2.3878886699676514, "learning_rate": 3.810697202493115e-07, "loss": 0.3015, "step": 12810 }, { "epoch": 0.6189785959317775, "grad_norm": 3.462517261505127, "learning_rate": 3.8102140406822243e-07, "loss": 0.3128, "step": 12811 }, { "epoch": 0.6190269121128666, "grad_norm": 2.762543201446533, "learning_rate": 3.8097308788713337e-07, "loss": 0.3003, "step": 12812 }, { "epoch": 0.6190752282939557, "grad_norm": 10.913030624389648, "learning_rate": 3.809247717060443e-07, "loss": 0.3568, "step": 12813 }, { "epoch": 0.6191235444750447, "grad_norm": 5.4141764640808105, "learning_rate": 3.808764555249553e-07, "loss": 0.2483, "step": 12814 }, { "epoch": 0.6191718606561337, "grad_norm": 3.8338398933410645, "learning_rate": 3.8082813934386624e-07, "loss": 0.3202, "step": 12815 }, { "epoch": 0.6192201768372227, "grad_norm": 1.9945836067199707, "learning_rate": 3.8077982316277717e-07, "loss": 0.2162, "step": 12816 }, { "epoch": 0.6192684930183119, "grad_norm": 3.399837017059326, "learning_rate": 3.8073150698168816e-07, "loss": 0.3491, "step": 12817 }, { "epoch": 0.6193168091994009, "grad_norm": 3.339707136154175, "learning_rate": 3.8068319080059915e-07, "loss": 0.4052, "step": 12818 }, { "epoch": 0.6193651253804899, "grad_norm": 2.170485496520996, "learning_rate": 3.8063487461951004e-07, "loss": 0.2402, "step": 12819 }, { "epoch": 0.619413441561579, "grad_norm": 2.7700021266937256, "learning_rate": 3.8058655843842103e-07, "loss": 0.2385, "step": 12820 }, { "epoch": 0.619461757742668, "grad_norm": 3.7322022914886475, "learning_rate": 3.8053824225733196e-07, "loss": 0.4073, "step": 12821 }, { "epoch": 0.6195100739237571, "grad_norm": 2.0921244621276855, "learning_rate": 3.804899260762429e-07, "loss": 0.1271, "step": 12822 }, { "epoch": 0.6195583901048461, "grad_norm": 6.047909259796143, "learning_rate": 3.804416098951539e-07, "loss": 0.2887, "step": 12823 }, { "epoch": 0.6196067062859352, "grad_norm": 3.1249990463256836, "learning_rate": 3.8039329371406483e-07, "loss": 0.1918, "step": 12824 }, { "epoch": 0.6196550224670242, "grad_norm": 2.749859571456909, "learning_rate": 3.8034497753297577e-07, "loss": 0.257, "step": 12825 }, { "epoch": 0.6197033386481132, "grad_norm": 3.2927191257476807, "learning_rate": 3.802966613518867e-07, "loss": 0.2137, "step": 12826 }, { "epoch": 0.6197516548292022, "grad_norm": 2.7610104084014893, "learning_rate": 3.802483451707977e-07, "loss": 0.2937, "step": 12827 }, { "epoch": 0.6197999710102914, "grad_norm": 2.7796740531921387, "learning_rate": 3.8020002898970863e-07, "loss": 0.2566, "step": 12828 }, { "epoch": 0.6198482871913804, "grad_norm": 3.2197563648223877, "learning_rate": 3.8015171280861957e-07, "loss": 0.2625, "step": 12829 }, { "epoch": 0.6198966033724694, "grad_norm": 5.4662699699401855, "learning_rate": 3.8010339662753056e-07, "loss": 0.3515, "step": 12830 }, { "epoch": 0.6199449195535585, "grad_norm": 2.5036780834198, "learning_rate": 3.800550804464415e-07, "loss": 0.238, "step": 12831 }, { "epoch": 0.6199932357346475, "grad_norm": 2.3916070461273193, "learning_rate": 3.8000676426535243e-07, "loss": 0.1798, "step": 12832 }, { "epoch": 0.6200415519157366, "grad_norm": 7.6702656745910645, "learning_rate": 3.799584480842634e-07, "loss": 0.3161, "step": 12833 }, { "epoch": 0.6200898680968256, "grad_norm": 110.59855651855469, "learning_rate": 3.7991013190317436e-07, "loss": 0.3523, "step": 12834 }, { "epoch": 0.6201381842779147, "grad_norm": 2.2451884746551514, "learning_rate": 3.798618157220853e-07, "loss": 0.2733, "step": 12835 }, { "epoch": 0.6201865004590037, "grad_norm": 3.5794475078582764, "learning_rate": 3.798134995409963e-07, "loss": 0.3591, "step": 12836 }, { "epoch": 0.6202348166400927, "grad_norm": 2.157606601715088, "learning_rate": 3.797651833599072e-07, "loss": 0.1854, "step": 12837 }, { "epoch": 0.6202831328211819, "grad_norm": 2.389122486114502, "learning_rate": 3.7971686717881816e-07, "loss": 0.3088, "step": 12838 }, { "epoch": 0.6203314490022709, "grad_norm": 2.492769956588745, "learning_rate": 3.796685509977291e-07, "loss": 0.2765, "step": 12839 }, { "epoch": 0.6203797651833599, "grad_norm": 3.65537428855896, "learning_rate": 3.796202348166401e-07, "loss": 0.3303, "step": 12840 }, { "epoch": 0.6204280813644489, "grad_norm": 2.578143358230591, "learning_rate": 3.7957191863555103e-07, "loss": 0.4008, "step": 12841 }, { "epoch": 0.620476397545538, "grad_norm": 7.004425048828125, "learning_rate": 3.7952360245446197e-07, "loss": 0.3637, "step": 12842 }, { "epoch": 0.6205247137266271, "grad_norm": 2.6798627376556396, "learning_rate": 3.7947528627337296e-07, "loss": 0.352, "step": 12843 }, { "epoch": 0.6205730299077161, "grad_norm": 2.9783833026885986, "learning_rate": 3.794269700922839e-07, "loss": 0.2988, "step": 12844 }, { "epoch": 0.6206213460888051, "grad_norm": 2.478508949279785, "learning_rate": 3.7937865391119483e-07, "loss": 0.282, "step": 12845 }, { "epoch": 0.6206696622698942, "grad_norm": 2.9458024501800537, "learning_rate": 3.793303377301058e-07, "loss": 0.2376, "step": 12846 }, { "epoch": 0.6207179784509832, "grad_norm": 6.173056125640869, "learning_rate": 3.792820215490167e-07, "loss": 0.3794, "step": 12847 }, { "epoch": 0.6207662946320723, "grad_norm": 2.2209439277648926, "learning_rate": 3.792337053679277e-07, "loss": 0.263, "step": 12848 }, { "epoch": 0.6208146108131614, "grad_norm": 1.709399938583374, "learning_rate": 3.791853891868387e-07, "loss": 0.1482, "step": 12849 }, { "epoch": 0.6208629269942504, "grad_norm": 3.593118190765381, "learning_rate": 3.791370730057496e-07, "loss": 0.3314, "step": 12850 }, { "epoch": 0.6209112431753394, "grad_norm": 2.8743896484375, "learning_rate": 3.7908875682466056e-07, "loss": 0.3718, "step": 12851 }, { "epoch": 0.6209595593564284, "grad_norm": 5.237276554107666, "learning_rate": 3.790404406435715e-07, "loss": 0.2354, "step": 12852 }, { "epoch": 0.6210078755375175, "grad_norm": 2.9772167205810547, "learning_rate": 3.789921244624825e-07, "loss": 0.3318, "step": 12853 }, { "epoch": 0.6210561917186066, "grad_norm": 2.2744908332824707, "learning_rate": 3.789438082813934e-07, "loss": 0.1588, "step": 12854 }, { "epoch": 0.6211045078996956, "grad_norm": 2.3267269134521484, "learning_rate": 3.7889549210030436e-07, "loss": 0.2596, "step": 12855 }, { "epoch": 0.6211528240807846, "grad_norm": 3.400538921356201, "learning_rate": 3.7884717591921535e-07, "loss": 0.418, "step": 12856 }, { "epoch": 0.6212011402618737, "grad_norm": 3.8355777263641357, "learning_rate": 3.787988597381263e-07, "loss": 0.2471, "step": 12857 }, { "epoch": 0.6212494564429627, "grad_norm": 6.024219989776611, "learning_rate": 3.787505435570372e-07, "loss": 0.3129, "step": 12858 }, { "epoch": 0.6212977726240518, "grad_norm": 3.236572742462158, "learning_rate": 3.787022273759482e-07, "loss": 0.2499, "step": 12859 }, { "epoch": 0.6213460888051409, "grad_norm": 4.874373912811279, "learning_rate": 3.786539111948591e-07, "loss": 0.3193, "step": 12860 }, { "epoch": 0.6213944049862299, "grad_norm": 2.041367769241333, "learning_rate": 3.786055950137701e-07, "loss": 0.2394, "step": 12861 }, { "epoch": 0.6214427211673189, "grad_norm": 2.2859890460968018, "learning_rate": 3.785572788326811e-07, "loss": 0.2537, "step": 12862 }, { "epoch": 0.6214910373484079, "grad_norm": 5.396285533905029, "learning_rate": 3.7850896265159197e-07, "loss": 0.2653, "step": 12863 }, { "epoch": 0.6215393535294971, "grad_norm": 2.4944586753845215, "learning_rate": 3.7846064647050296e-07, "loss": 0.2564, "step": 12864 }, { "epoch": 0.6215876697105861, "grad_norm": 3.2782390117645264, "learning_rate": 3.784123302894139e-07, "loss": 0.1436, "step": 12865 }, { "epoch": 0.6216359858916751, "grad_norm": 2.6923935413360596, "learning_rate": 3.783640141083249e-07, "loss": 0.362, "step": 12866 }, { "epoch": 0.6216843020727642, "grad_norm": 3.1143746376037598, "learning_rate": 3.783156979272358e-07, "loss": 0.1633, "step": 12867 }, { "epoch": 0.6217326182538532, "grad_norm": 2.51446270942688, "learning_rate": 3.7826738174614676e-07, "loss": 0.2465, "step": 12868 }, { "epoch": 0.6217809344349423, "grad_norm": 3.4439046382904053, "learning_rate": 3.7821906556505775e-07, "loss": 0.3942, "step": 12869 }, { "epoch": 0.6218292506160313, "grad_norm": 2.141637086868286, "learning_rate": 3.781707493839687e-07, "loss": 0.2166, "step": 12870 }, { "epoch": 0.6218775667971204, "grad_norm": 3.0326786041259766, "learning_rate": 3.781224332028796e-07, "loss": 0.3651, "step": 12871 }, { "epoch": 0.6219258829782094, "grad_norm": 3.1518547534942627, "learning_rate": 3.780741170217906e-07, "loss": 0.2247, "step": 12872 }, { "epoch": 0.6219741991592984, "grad_norm": 8.266386032104492, "learning_rate": 3.780258008407015e-07, "loss": 0.2829, "step": 12873 }, { "epoch": 0.6220225153403875, "grad_norm": 3.98152494430542, "learning_rate": 3.779774846596125e-07, "loss": 0.4091, "step": 12874 }, { "epoch": 0.6220708315214766, "grad_norm": 2.105114459991455, "learning_rate": 3.779291684785235e-07, "loss": 0.1637, "step": 12875 }, { "epoch": 0.6221191477025656, "grad_norm": 2.4128901958465576, "learning_rate": 3.7788085229743436e-07, "loss": 0.2605, "step": 12876 }, { "epoch": 0.6221674638836546, "grad_norm": 2.165421962738037, "learning_rate": 3.7783253611634535e-07, "loss": 0.256, "step": 12877 }, { "epoch": 0.6222157800647437, "grad_norm": 3.1431148052215576, "learning_rate": 3.777842199352563e-07, "loss": 0.3801, "step": 12878 }, { "epoch": 0.6222640962458327, "grad_norm": 2.0469160079956055, "learning_rate": 3.7773590375416723e-07, "loss": 0.2111, "step": 12879 }, { "epoch": 0.6223124124269218, "grad_norm": 4.496734142303467, "learning_rate": 3.776875875730782e-07, "loss": 0.2047, "step": 12880 }, { "epoch": 0.6223607286080108, "grad_norm": 3.241135835647583, "learning_rate": 3.7763927139198915e-07, "loss": 0.3847, "step": 12881 }, { "epoch": 0.6224090447890999, "grad_norm": 5.030508518218994, "learning_rate": 3.7759095521090014e-07, "loss": 0.257, "step": 12882 }, { "epoch": 0.6224573609701889, "grad_norm": 3.5417213439941406, "learning_rate": 3.775426390298111e-07, "loss": 0.1575, "step": 12883 }, { "epoch": 0.6225056771512779, "grad_norm": 2.0452866554260254, "learning_rate": 3.77494322848722e-07, "loss": 0.1899, "step": 12884 }, { "epoch": 0.622553993332367, "grad_norm": 2.145585775375366, "learning_rate": 3.77446006667633e-07, "loss": 0.2915, "step": 12885 }, { "epoch": 0.6226023095134561, "grad_norm": 3.2796523571014404, "learning_rate": 3.773976904865439e-07, "loss": 0.3082, "step": 12886 }, { "epoch": 0.6226506256945451, "grad_norm": 4.209521293640137, "learning_rate": 3.773493743054549e-07, "loss": 0.3223, "step": 12887 }, { "epoch": 0.6226989418756341, "grad_norm": 2.6660144329071045, "learning_rate": 3.773010581243659e-07, "loss": 0.3052, "step": 12888 }, { "epoch": 0.6227472580567232, "grad_norm": 2.8729681968688965, "learning_rate": 3.7725274194327676e-07, "loss": 0.2732, "step": 12889 }, { "epoch": 0.6227955742378123, "grad_norm": 3.284407138824463, "learning_rate": 3.7720442576218775e-07, "loss": 0.2972, "step": 12890 }, { "epoch": 0.6228438904189013, "grad_norm": 2.6383039951324463, "learning_rate": 3.771561095810987e-07, "loss": 0.2164, "step": 12891 }, { "epoch": 0.6228922065999903, "grad_norm": 2.700059413909912, "learning_rate": 3.771077934000096e-07, "loss": 0.2884, "step": 12892 }, { "epoch": 0.6229405227810794, "grad_norm": 3.5504417419433594, "learning_rate": 3.770594772189206e-07, "loss": 0.38, "step": 12893 }, { "epoch": 0.6229888389621684, "grad_norm": 4.0087056159973145, "learning_rate": 3.7701116103783155e-07, "loss": 0.3511, "step": 12894 }, { "epoch": 0.6230371551432575, "grad_norm": 7.652207374572754, "learning_rate": 3.769628448567425e-07, "loss": 0.2591, "step": 12895 }, { "epoch": 0.6230854713243466, "grad_norm": 2.3300411701202393, "learning_rate": 3.769145286756535e-07, "loss": 0.2267, "step": 12896 }, { "epoch": 0.6231337875054356, "grad_norm": 2.6147124767303467, "learning_rate": 3.768662124945644e-07, "loss": 0.3661, "step": 12897 }, { "epoch": 0.6231821036865246, "grad_norm": 1.6835377216339111, "learning_rate": 3.768178963134754e-07, "loss": 0.1706, "step": 12898 }, { "epoch": 0.6232304198676136, "grad_norm": 3.1561226844787598, "learning_rate": 3.767695801323863e-07, "loss": 0.3798, "step": 12899 }, { "epoch": 0.6232787360487028, "grad_norm": 4.107435703277588, "learning_rate": 3.767212639512973e-07, "loss": 0.2178, "step": 12900 }, { "epoch": 0.6233270522297918, "grad_norm": 1.9528273344039917, "learning_rate": 3.7667294777020827e-07, "loss": 0.1822, "step": 12901 }, { "epoch": 0.6233753684108808, "grad_norm": 2.3306641578674316, "learning_rate": 3.7662463158911915e-07, "loss": 0.2126, "step": 12902 }, { "epoch": 0.6234236845919698, "grad_norm": 4.535329341888428, "learning_rate": 3.7657631540803014e-07, "loss": 0.3146, "step": 12903 }, { "epoch": 0.6234720007730589, "grad_norm": 2.0997562408447266, "learning_rate": 3.765279992269411e-07, "loss": 0.1895, "step": 12904 }, { "epoch": 0.6235203169541479, "grad_norm": 2.8664352893829346, "learning_rate": 3.76479683045852e-07, "loss": 0.2723, "step": 12905 }, { "epoch": 0.623568633135237, "grad_norm": 2.244412899017334, "learning_rate": 3.76431366864763e-07, "loss": 0.2395, "step": 12906 }, { "epoch": 0.623616949316326, "grad_norm": 4.894256114959717, "learning_rate": 3.7638305068367395e-07, "loss": 0.3943, "step": 12907 }, { "epoch": 0.6236652654974151, "grad_norm": 6.235065937042236, "learning_rate": 3.763347345025849e-07, "loss": 0.4948, "step": 12908 }, { "epoch": 0.6237135816785041, "grad_norm": 4.413879871368408, "learning_rate": 3.762864183214958e-07, "loss": 0.2005, "step": 12909 }, { "epoch": 0.6237618978595931, "grad_norm": 2.5830302238464355, "learning_rate": 3.762381021404068e-07, "loss": 0.3424, "step": 12910 }, { "epoch": 0.6238102140406823, "grad_norm": 2.431334972381592, "learning_rate": 3.7618978595931775e-07, "loss": 0.2955, "step": 12911 }, { "epoch": 0.6238585302217713, "grad_norm": 1.8389514684677124, "learning_rate": 3.761414697782287e-07, "loss": 0.2255, "step": 12912 }, { "epoch": 0.6239068464028603, "grad_norm": 3.0629611015319824, "learning_rate": 3.760931535971397e-07, "loss": 0.263, "step": 12913 }, { "epoch": 0.6239551625839493, "grad_norm": 2.4082014560699463, "learning_rate": 3.7604483741605067e-07, "loss": 0.2544, "step": 12914 }, { "epoch": 0.6240034787650384, "grad_norm": 3.316124439239502, "learning_rate": 3.7599652123496155e-07, "loss": 0.3193, "step": 12915 }, { "epoch": 0.6240517949461275, "grad_norm": 6.026618003845215, "learning_rate": 3.7594820505387254e-07, "loss": 0.2833, "step": 12916 }, { "epoch": 0.6241001111272165, "grad_norm": 7.318263053894043, "learning_rate": 3.758998888727835e-07, "loss": 0.4027, "step": 12917 }, { "epoch": 0.6241484273083056, "grad_norm": 2.6410953998565674, "learning_rate": 3.758515726916944e-07, "loss": 0.2023, "step": 12918 }, { "epoch": 0.6241967434893946, "grad_norm": 3.02213716506958, "learning_rate": 3.758032565106054e-07, "loss": 0.2932, "step": 12919 }, { "epoch": 0.6242450596704836, "grad_norm": 20.21794319152832, "learning_rate": 3.7575494032951634e-07, "loss": 0.3291, "step": 12920 }, { "epoch": 0.6242933758515727, "grad_norm": 2.1985061168670654, "learning_rate": 3.757066241484273e-07, "loss": 0.201, "step": 12921 }, { "epoch": 0.6243416920326618, "grad_norm": 2.733480215072632, "learning_rate": 3.756583079673382e-07, "loss": 0.2123, "step": 12922 }, { "epoch": 0.6243900082137508, "grad_norm": 3.929287910461426, "learning_rate": 3.756099917862492e-07, "loss": 0.2974, "step": 12923 }, { "epoch": 0.6244383243948398, "grad_norm": 1.9876196384429932, "learning_rate": 3.7556167560516015e-07, "loss": 0.2037, "step": 12924 }, { "epoch": 0.6244866405759288, "grad_norm": 2.4064483642578125, "learning_rate": 3.755133594240711e-07, "loss": 0.2724, "step": 12925 }, { "epoch": 0.624534956757018, "grad_norm": 2.055720567703247, "learning_rate": 3.7546504324298207e-07, "loss": 0.2316, "step": 12926 }, { "epoch": 0.624583272938107, "grad_norm": 3.850525140762329, "learning_rate": 3.75416727061893e-07, "loss": 0.2415, "step": 12927 }, { "epoch": 0.624631589119196, "grad_norm": 2.238941192626953, "learning_rate": 3.7536841088080395e-07, "loss": 0.2322, "step": 12928 }, { "epoch": 0.624679905300285, "grad_norm": 2.284766435623169, "learning_rate": 3.7532009469971494e-07, "loss": 0.2244, "step": 12929 }, { "epoch": 0.6247282214813741, "grad_norm": 4.967028617858887, "learning_rate": 3.752717785186258e-07, "loss": 0.2179, "step": 12930 }, { "epoch": 0.6247765376624631, "grad_norm": 3.3182363510131836, "learning_rate": 3.752234623375368e-07, "loss": 0.5492, "step": 12931 }, { "epoch": 0.6248248538435522, "grad_norm": 2.425407886505127, "learning_rate": 3.751751461564478e-07, "loss": 0.1874, "step": 12932 }, { "epoch": 0.6248731700246413, "grad_norm": 5.111147880554199, "learning_rate": 3.7512682997535874e-07, "loss": 0.382, "step": 12933 }, { "epoch": 0.6249214862057303, "grad_norm": 2.4876906871795654, "learning_rate": 3.750785137942697e-07, "loss": 0.3451, "step": 12934 }, { "epoch": 0.6249698023868193, "grad_norm": 2.5890705585479736, "learning_rate": 3.750301976131806e-07, "loss": 0.2246, "step": 12935 }, { "epoch": 0.6250181185679083, "grad_norm": 2.608522653579712, "learning_rate": 3.749818814320916e-07, "loss": 0.3229, "step": 12936 }, { "epoch": 0.6250664347489975, "grad_norm": 3.787924289703369, "learning_rate": 3.7493356525100254e-07, "loss": 0.2796, "step": 12937 }, { "epoch": 0.6251147509300865, "grad_norm": 2.191312074661255, "learning_rate": 3.748852490699135e-07, "loss": 0.21, "step": 12938 }, { "epoch": 0.6251630671111755, "grad_norm": 3.538278818130493, "learning_rate": 3.7483693288882447e-07, "loss": 0.2841, "step": 12939 }, { "epoch": 0.6252113832922646, "grad_norm": 2.391847848892212, "learning_rate": 3.747886167077354e-07, "loss": 0.3385, "step": 12940 }, { "epoch": 0.6252596994733536, "grad_norm": 2.475785732269287, "learning_rate": 3.7474030052664634e-07, "loss": 0.2051, "step": 12941 }, { "epoch": 0.6253080156544427, "grad_norm": 4.223189353942871, "learning_rate": 3.7469198434555733e-07, "loss": 0.2295, "step": 12942 }, { "epoch": 0.6253563318355317, "grad_norm": 2.833926200866699, "learning_rate": 3.746436681644682e-07, "loss": 0.3234, "step": 12943 }, { "epoch": 0.6254046480166208, "grad_norm": 5.319383144378662, "learning_rate": 3.745953519833792e-07, "loss": 0.3499, "step": 12944 }, { "epoch": 0.6254529641977098, "grad_norm": 3.4684348106384277, "learning_rate": 3.745470358022902e-07, "loss": 0.2292, "step": 12945 }, { "epoch": 0.6255012803787988, "grad_norm": 2.6990115642547607, "learning_rate": 3.744987196212011e-07, "loss": 0.344, "step": 12946 }, { "epoch": 0.625549596559888, "grad_norm": 6.397397518157959, "learning_rate": 3.7445040344011207e-07, "loss": 0.4072, "step": 12947 }, { "epoch": 0.625597912740977, "grad_norm": 6.132441997528076, "learning_rate": 3.74402087259023e-07, "loss": 0.274, "step": 12948 }, { "epoch": 0.625646228922066, "grad_norm": 18.546403884887695, "learning_rate": 3.74353771077934e-07, "loss": 0.1924, "step": 12949 }, { "epoch": 0.625694545103155, "grad_norm": 3.581169366836548, "learning_rate": 3.7430545489684494e-07, "loss": 0.4501, "step": 12950 }, { "epoch": 0.6257428612842441, "grad_norm": 3.7933690547943115, "learning_rate": 3.742571387157559e-07, "loss": 0.3345, "step": 12951 }, { "epoch": 0.6257911774653332, "grad_norm": 4.055501461029053, "learning_rate": 3.7420882253466687e-07, "loss": 0.3564, "step": 12952 }, { "epoch": 0.6258394936464222, "grad_norm": 2.268188953399658, "learning_rate": 3.741605063535778e-07, "loss": 0.2244, "step": 12953 }, { "epoch": 0.6258878098275112, "grad_norm": 2.2698612213134766, "learning_rate": 3.7411219017248874e-07, "loss": 0.2173, "step": 12954 }, { "epoch": 0.6259361260086003, "grad_norm": 2.472855806350708, "learning_rate": 3.7406387399139973e-07, "loss": 0.274, "step": 12955 }, { "epoch": 0.6259844421896893, "grad_norm": 3.7156615257263184, "learning_rate": 3.740155578103106e-07, "loss": 0.3894, "step": 12956 }, { "epoch": 0.6260327583707783, "grad_norm": 2.7050974369049072, "learning_rate": 3.739672416292216e-07, "loss": 0.3305, "step": 12957 }, { "epoch": 0.6260810745518675, "grad_norm": 3.2442479133605957, "learning_rate": 3.739189254481326e-07, "loss": 0.3314, "step": 12958 }, { "epoch": 0.6261293907329565, "grad_norm": 29.106380462646484, "learning_rate": 3.738706092670435e-07, "loss": 0.468, "step": 12959 }, { "epoch": 0.6261777069140455, "grad_norm": 2.225191354751587, "learning_rate": 3.7382229308595447e-07, "loss": 0.265, "step": 12960 }, { "epoch": 0.6262260230951345, "grad_norm": 2.8455722332000732, "learning_rate": 3.737739769048654e-07, "loss": 0.2215, "step": 12961 }, { "epoch": 0.6262743392762236, "grad_norm": 1.490976333618164, "learning_rate": 3.7372566072377634e-07, "loss": 0.1436, "step": 12962 }, { "epoch": 0.6263226554573127, "grad_norm": 4.5813727378845215, "learning_rate": 3.7367734454268733e-07, "loss": 0.2873, "step": 12963 }, { "epoch": 0.6263709716384017, "grad_norm": 2.5354156494140625, "learning_rate": 3.7362902836159827e-07, "loss": 0.1944, "step": 12964 }, { "epoch": 0.6264192878194907, "grad_norm": 3.074681282043457, "learning_rate": 3.7358071218050926e-07, "loss": 0.2058, "step": 12965 }, { "epoch": 0.6264676040005798, "grad_norm": 2.552239418029785, "learning_rate": 3.735323959994202e-07, "loss": 0.296, "step": 12966 }, { "epoch": 0.6265159201816688, "grad_norm": 1.5285530090332031, "learning_rate": 3.7348407981833114e-07, "loss": 0.1614, "step": 12967 }, { "epoch": 0.6265642363627579, "grad_norm": 3.1019227504730225, "learning_rate": 3.7343576363724213e-07, "loss": 0.2425, "step": 12968 }, { "epoch": 0.626612552543847, "grad_norm": 1.7419651746749878, "learning_rate": 3.73387447456153e-07, "loss": 0.1447, "step": 12969 }, { "epoch": 0.626660868724936, "grad_norm": 2.6345059871673584, "learning_rate": 3.73339131275064e-07, "loss": 0.2851, "step": 12970 }, { "epoch": 0.626709184906025, "grad_norm": 2.8232421875, "learning_rate": 3.73290815093975e-07, "loss": 0.3076, "step": 12971 }, { "epoch": 0.626757501087114, "grad_norm": 3.24857497215271, "learning_rate": 3.732424989128859e-07, "loss": 0.4776, "step": 12972 }, { "epoch": 0.6268058172682032, "grad_norm": 5.220333576202393, "learning_rate": 3.7319418273179687e-07, "loss": 0.3089, "step": 12973 }, { "epoch": 0.6268541334492922, "grad_norm": 4.987797737121582, "learning_rate": 3.731458665507078e-07, "loss": 0.148, "step": 12974 }, { "epoch": 0.6269024496303812, "grad_norm": 2.9570770263671875, "learning_rate": 3.7309755036961874e-07, "loss": 0.3892, "step": 12975 }, { "epoch": 0.6269507658114702, "grad_norm": 3.2920916080474854, "learning_rate": 3.7304923418852973e-07, "loss": 0.2235, "step": 12976 }, { "epoch": 0.6269990819925593, "grad_norm": 2.7830147743225098, "learning_rate": 3.7300091800744067e-07, "loss": 0.3646, "step": 12977 }, { "epoch": 0.6270473981736484, "grad_norm": 2.7058498859405518, "learning_rate": 3.729526018263516e-07, "loss": 0.2992, "step": 12978 }, { "epoch": 0.6270957143547374, "grad_norm": 2.2931575775146484, "learning_rate": 3.729042856452626e-07, "loss": 0.2837, "step": 12979 }, { "epoch": 0.6271440305358265, "grad_norm": 1.6789829730987549, "learning_rate": 3.7285596946417353e-07, "loss": 0.181, "step": 12980 }, { "epoch": 0.6271923467169155, "grad_norm": 3.22617506980896, "learning_rate": 3.728076532830845e-07, "loss": 0.257, "step": 12981 }, { "epoch": 0.6272406628980045, "grad_norm": 2.9175918102264404, "learning_rate": 3.727593371019954e-07, "loss": 0.3773, "step": 12982 }, { "epoch": 0.6272889790790935, "grad_norm": 5.08018159866333, "learning_rate": 3.727110209209064e-07, "loss": 0.4626, "step": 12983 }, { "epoch": 0.6273372952601827, "grad_norm": 2.5128793716430664, "learning_rate": 3.726627047398174e-07, "loss": 0.315, "step": 12984 }, { "epoch": 0.6273856114412717, "grad_norm": 2.738150119781494, "learning_rate": 3.7261438855872827e-07, "loss": 0.2196, "step": 12985 }, { "epoch": 0.6274339276223607, "grad_norm": 2.4802563190460205, "learning_rate": 3.7256607237763926e-07, "loss": 0.272, "step": 12986 }, { "epoch": 0.6274822438034497, "grad_norm": 2.4535491466522217, "learning_rate": 3.725177561965502e-07, "loss": 0.2858, "step": 12987 }, { "epoch": 0.6275305599845388, "grad_norm": 2.0813987255096436, "learning_rate": 3.7246944001546114e-07, "loss": 0.2928, "step": 12988 }, { "epoch": 0.6275788761656279, "grad_norm": 10.176084518432617, "learning_rate": 3.7242112383437213e-07, "loss": 0.3042, "step": 12989 }, { "epoch": 0.6276271923467169, "grad_norm": 2.985858201980591, "learning_rate": 3.7237280765328306e-07, "loss": 0.2376, "step": 12990 }, { "epoch": 0.627675508527806, "grad_norm": 2.4682822227478027, "learning_rate": 3.72324491472194e-07, "loss": 0.2977, "step": 12991 }, { "epoch": 0.627723824708895, "grad_norm": 4.278869152069092, "learning_rate": 3.72276175291105e-07, "loss": 0.231, "step": 12992 }, { "epoch": 0.627772140889984, "grad_norm": 2.143277168273926, "learning_rate": 3.7222785911001593e-07, "loss": 0.2482, "step": 12993 }, { "epoch": 0.6278204570710731, "grad_norm": 2.1323354244232178, "learning_rate": 3.721795429289269e-07, "loss": 0.2756, "step": 12994 }, { "epoch": 0.6278687732521622, "grad_norm": 1.7006796598434448, "learning_rate": 3.721312267478378e-07, "loss": 0.189, "step": 12995 }, { "epoch": 0.6279170894332512, "grad_norm": 2.7739808559417725, "learning_rate": 3.720829105667488e-07, "loss": 0.3099, "step": 12996 }, { "epoch": 0.6279654056143402, "grad_norm": 2.430692434310913, "learning_rate": 3.720345943856598e-07, "loss": 0.1743, "step": 12997 }, { "epoch": 0.6280137217954292, "grad_norm": 3.3375790119171143, "learning_rate": 3.7198627820457067e-07, "loss": 0.3248, "step": 12998 }, { "epoch": 0.6280620379765184, "grad_norm": 3.089299440383911, "learning_rate": 3.7193796202348166e-07, "loss": 0.2918, "step": 12999 }, { "epoch": 0.6281103541576074, "grad_norm": 8.703694343566895, "learning_rate": 3.718896458423926e-07, "loss": 0.239, "step": 13000 }, { "epoch": 0.6281586703386964, "grad_norm": 2.8340916633605957, "learning_rate": 3.7184132966130353e-07, "loss": 0.3388, "step": 13001 }, { "epoch": 0.6282069865197855, "grad_norm": 2.7411913871765137, "learning_rate": 3.717930134802145e-07, "loss": 0.2833, "step": 13002 }, { "epoch": 0.6282553027008745, "grad_norm": 2.509129047393799, "learning_rate": 3.7174469729912546e-07, "loss": 0.244, "step": 13003 }, { "epoch": 0.6283036188819636, "grad_norm": 2.7189652919769287, "learning_rate": 3.716963811180364e-07, "loss": 0.3349, "step": 13004 }, { "epoch": 0.6283519350630526, "grad_norm": 2.4074952602386475, "learning_rate": 3.716480649369474e-07, "loss": 0.2574, "step": 13005 }, { "epoch": 0.6284002512441417, "grad_norm": 4.5830512046813965, "learning_rate": 3.715997487558583e-07, "loss": 0.3192, "step": 13006 }, { "epoch": 0.6284485674252307, "grad_norm": 2.2842659950256348, "learning_rate": 3.7155143257476926e-07, "loss": 0.2256, "step": 13007 }, { "epoch": 0.6284968836063197, "grad_norm": 89.13543701171875, "learning_rate": 3.715031163936802e-07, "loss": 0.2711, "step": 13008 }, { "epoch": 0.6285451997874087, "grad_norm": 2.504208564758301, "learning_rate": 3.714548002125912e-07, "loss": 0.2668, "step": 13009 }, { "epoch": 0.6285935159684979, "grad_norm": 2.575239658355713, "learning_rate": 3.714064840315022e-07, "loss": 0.3608, "step": 13010 }, { "epoch": 0.6286418321495869, "grad_norm": 2.934993267059326, "learning_rate": 3.7135816785041306e-07, "loss": 0.2456, "step": 13011 }, { "epoch": 0.6286901483306759, "grad_norm": 2.348262310028076, "learning_rate": 3.7130985166932405e-07, "loss": 0.2979, "step": 13012 }, { "epoch": 0.628738464511765, "grad_norm": 2.695310354232788, "learning_rate": 3.71261535488235e-07, "loss": 0.3125, "step": 13013 }, { "epoch": 0.628786780692854, "grad_norm": 1.965016484260559, "learning_rate": 3.7121321930714593e-07, "loss": 0.1976, "step": 13014 }, { "epoch": 0.6288350968739431, "grad_norm": 2.2525017261505127, "learning_rate": 3.711649031260569e-07, "loss": 0.1877, "step": 13015 }, { "epoch": 0.6288834130550321, "grad_norm": 2.166658878326416, "learning_rate": 3.7111658694496786e-07, "loss": 0.2796, "step": 13016 }, { "epoch": 0.6289317292361212, "grad_norm": 2.932152032852173, "learning_rate": 3.710682707638788e-07, "loss": 0.3378, "step": 13017 }, { "epoch": 0.6289800454172102, "grad_norm": 3.011969804763794, "learning_rate": 3.710199545827898e-07, "loss": 0.1881, "step": 13018 }, { "epoch": 0.6290283615982992, "grad_norm": 2.583674669265747, "learning_rate": 3.709716384017007e-07, "loss": 0.2357, "step": 13019 }, { "epoch": 0.6290766777793884, "grad_norm": 1.8473464250564575, "learning_rate": 3.7092332222061166e-07, "loss": 0.2235, "step": 13020 }, { "epoch": 0.6291249939604774, "grad_norm": 1.9618157148361206, "learning_rate": 3.708750060395226e-07, "loss": 0.1987, "step": 13021 }, { "epoch": 0.6291733101415664, "grad_norm": 3.4280896186828613, "learning_rate": 3.708266898584336e-07, "loss": 0.4639, "step": 13022 }, { "epoch": 0.6292216263226554, "grad_norm": 2.9246156215667725, "learning_rate": 3.707783736773445e-07, "loss": 0.4348, "step": 13023 }, { "epoch": 0.6292699425037445, "grad_norm": 1.8065382242202759, "learning_rate": 3.7073005749625546e-07, "loss": 0.227, "step": 13024 }, { "epoch": 0.6293182586848336, "grad_norm": 2.139963150024414, "learning_rate": 3.7068174131516645e-07, "loss": 0.2243, "step": 13025 }, { "epoch": 0.6293665748659226, "grad_norm": 3.0743672847747803, "learning_rate": 3.7063342513407734e-07, "loss": 0.2697, "step": 13026 }, { "epoch": 0.6294148910470116, "grad_norm": 3.011687994003296, "learning_rate": 3.705851089529883e-07, "loss": 0.3194, "step": 13027 }, { "epoch": 0.6294632072281007, "grad_norm": 2.9457051753997803, "learning_rate": 3.705367927718993e-07, "loss": 0.3342, "step": 13028 }, { "epoch": 0.6295115234091897, "grad_norm": 2.2931275367736816, "learning_rate": 3.7048847659081025e-07, "loss": 0.2697, "step": 13029 }, { "epoch": 0.6295598395902788, "grad_norm": 2.112813711166382, "learning_rate": 3.704401604097212e-07, "loss": 0.2712, "step": 13030 }, { "epoch": 0.6296081557713679, "grad_norm": 2.2016258239746094, "learning_rate": 3.703918442286322e-07, "loss": 0.2204, "step": 13031 }, { "epoch": 0.6296564719524569, "grad_norm": 3.8054697513580322, "learning_rate": 3.703435280475431e-07, "loss": 0.3208, "step": 13032 }, { "epoch": 0.6297047881335459, "grad_norm": 2.7224106788635254, "learning_rate": 3.7029521186645406e-07, "loss": 0.3654, "step": 13033 }, { "epoch": 0.6297531043146349, "grad_norm": 3.1653354167938232, "learning_rate": 3.70246895685365e-07, "loss": 0.3635, "step": 13034 }, { "epoch": 0.629801420495724, "grad_norm": 2.8167741298675537, "learning_rate": 3.70198579504276e-07, "loss": 0.3176, "step": 13035 }, { "epoch": 0.6298497366768131, "grad_norm": 14.21369743347168, "learning_rate": 3.701502633231869e-07, "loss": 0.2032, "step": 13036 }, { "epoch": 0.6298980528579021, "grad_norm": 2.615792989730835, "learning_rate": 3.7010194714209786e-07, "loss": 0.3685, "step": 13037 }, { "epoch": 0.6299463690389911, "grad_norm": 3.201286554336548, "learning_rate": 3.7005363096100885e-07, "loss": 0.2899, "step": 13038 }, { "epoch": 0.6299946852200802, "grad_norm": 2.1068429946899414, "learning_rate": 3.7000531477991973e-07, "loss": 0.2322, "step": 13039 }, { "epoch": 0.6300430014011692, "grad_norm": 2.4475343227386475, "learning_rate": 3.699569985988307e-07, "loss": 0.3576, "step": 13040 }, { "epoch": 0.6300913175822583, "grad_norm": 2.8167920112609863, "learning_rate": 3.699086824177417e-07, "loss": 0.2826, "step": 13041 }, { "epoch": 0.6301396337633474, "grad_norm": 2.3722586631774902, "learning_rate": 3.698603662366526e-07, "loss": 0.3048, "step": 13042 }, { "epoch": 0.6301879499444364, "grad_norm": 2.37144136428833, "learning_rate": 3.698120500555636e-07, "loss": 0.261, "step": 13043 }, { "epoch": 0.6302362661255254, "grad_norm": 2.274711847305298, "learning_rate": 3.697637338744746e-07, "loss": 0.2271, "step": 13044 }, { "epoch": 0.6302845823066144, "grad_norm": 1.7283064126968384, "learning_rate": 3.697154176933855e-07, "loss": 0.1675, "step": 13045 }, { "epoch": 0.6303328984877036, "grad_norm": 18.67049217224121, "learning_rate": 3.6966710151229645e-07, "loss": 0.2287, "step": 13046 }, { "epoch": 0.6303812146687926, "grad_norm": 2.547945261001587, "learning_rate": 3.696187853312074e-07, "loss": 0.2664, "step": 13047 }, { "epoch": 0.6304295308498816, "grad_norm": 5.629641532897949, "learning_rate": 3.695704691501184e-07, "loss": 0.3298, "step": 13048 }, { "epoch": 0.6304778470309707, "grad_norm": 1.8986411094665527, "learning_rate": 3.695221529690293e-07, "loss": 0.1478, "step": 13049 }, { "epoch": 0.6305261632120597, "grad_norm": 2.4814369678497314, "learning_rate": 3.6947383678794025e-07, "loss": 0.2794, "step": 13050 }, { "epoch": 0.6305744793931488, "grad_norm": 2.401355266571045, "learning_rate": 3.6942552060685124e-07, "loss": 0.2541, "step": 13051 }, { "epoch": 0.6306227955742378, "grad_norm": 8.342366218566895, "learning_rate": 3.6937720442576213e-07, "loss": 0.2654, "step": 13052 }, { "epoch": 0.6306711117553269, "grad_norm": 6.079695701599121, "learning_rate": 3.693288882446731e-07, "loss": 0.2413, "step": 13053 }, { "epoch": 0.6307194279364159, "grad_norm": 3.287168502807617, "learning_rate": 3.692805720635841e-07, "loss": 0.2952, "step": 13054 }, { "epoch": 0.6307677441175049, "grad_norm": 2.209874391555786, "learning_rate": 3.69232255882495e-07, "loss": 0.2258, "step": 13055 }, { "epoch": 0.630816060298594, "grad_norm": 127.88399505615234, "learning_rate": 3.69183939701406e-07, "loss": 0.3937, "step": 13056 }, { "epoch": 0.6308643764796831, "grad_norm": 2.534834861755371, "learning_rate": 3.6913562352031697e-07, "loss": 0.2749, "step": 13057 }, { "epoch": 0.6309126926607721, "grad_norm": 2.9722142219543457, "learning_rate": 3.6908730733922786e-07, "loss": 0.2294, "step": 13058 }, { "epoch": 0.6309610088418611, "grad_norm": 2.396900177001953, "learning_rate": 3.6903899115813885e-07, "loss": 0.2747, "step": 13059 }, { "epoch": 0.6310093250229502, "grad_norm": 2.764209270477295, "learning_rate": 3.689906749770498e-07, "loss": 0.2524, "step": 13060 }, { "epoch": 0.6310576412040393, "grad_norm": 4.517242908477783, "learning_rate": 3.689423587959608e-07, "loss": 0.3021, "step": 13061 }, { "epoch": 0.6311059573851283, "grad_norm": 3.0383787155151367, "learning_rate": 3.688940426148717e-07, "loss": 0.3148, "step": 13062 }, { "epoch": 0.6311542735662173, "grad_norm": 2.0848374366760254, "learning_rate": 3.6884572643378265e-07, "loss": 0.2172, "step": 13063 }, { "epoch": 0.6312025897473064, "grad_norm": 3.845257043838501, "learning_rate": 3.6879741025269364e-07, "loss": 0.2995, "step": 13064 }, { "epoch": 0.6312509059283954, "grad_norm": 4.079063415527344, "learning_rate": 3.687490940716045e-07, "loss": 0.269, "step": 13065 }, { "epoch": 0.6312992221094844, "grad_norm": 7.825928688049316, "learning_rate": 3.687007778905155e-07, "loss": 0.3758, "step": 13066 }, { "epoch": 0.6313475382905736, "grad_norm": 3.6342403888702393, "learning_rate": 3.686524617094265e-07, "loss": 0.3332, "step": 13067 }, { "epoch": 0.6313958544716626, "grad_norm": 2.7593302726745605, "learning_rate": 3.686041455283374e-07, "loss": 0.229, "step": 13068 }, { "epoch": 0.6314441706527516, "grad_norm": 2.2089719772338867, "learning_rate": 3.685558293472484e-07, "loss": 0.1837, "step": 13069 }, { "epoch": 0.6314924868338406, "grad_norm": 3.0923938751220703, "learning_rate": 3.6850751316615937e-07, "loss": 0.3791, "step": 13070 }, { "epoch": 0.6315408030149297, "grad_norm": 2.629274368286133, "learning_rate": 3.6845919698507025e-07, "loss": 0.3478, "step": 13071 }, { "epoch": 0.6315891191960188, "grad_norm": 5.158860683441162, "learning_rate": 3.6841088080398124e-07, "loss": 0.3734, "step": 13072 }, { "epoch": 0.6316374353771078, "grad_norm": 3.048067331314087, "learning_rate": 3.683625646228922e-07, "loss": 0.2996, "step": 13073 }, { "epoch": 0.6316857515581968, "grad_norm": 2.2714717388153076, "learning_rate": 3.683142484418031e-07, "loss": 0.2066, "step": 13074 }, { "epoch": 0.6317340677392859, "grad_norm": 2.4116241931915283, "learning_rate": 3.682659322607141e-07, "loss": 0.2813, "step": 13075 }, { "epoch": 0.6317823839203749, "grad_norm": 2.7768125534057617, "learning_rate": 3.6821761607962505e-07, "loss": 0.2928, "step": 13076 }, { "epoch": 0.631830700101464, "grad_norm": 2.833554983139038, "learning_rate": 3.6816929989853604e-07, "loss": 0.2526, "step": 13077 }, { "epoch": 0.631879016282553, "grad_norm": 18.100435256958008, "learning_rate": 3.681209837174469e-07, "loss": 0.2927, "step": 13078 }, { "epoch": 0.6319273324636421, "grad_norm": 3.3640265464782715, "learning_rate": 3.680726675363579e-07, "loss": 0.3091, "step": 13079 }, { "epoch": 0.6319756486447311, "grad_norm": 3.468384265899658, "learning_rate": 3.680243513552689e-07, "loss": 0.2554, "step": 13080 }, { "epoch": 0.6320239648258201, "grad_norm": 5.228979110717773, "learning_rate": 3.679760351741798e-07, "loss": 0.2361, "step": 13081 }, { "epoch": 0.6320722810069093, "grad_norm": 2.0702004432678223, "learning_rate": 3.679277189930908e-07, "loss": 0.2501, "step": 13082 }, { "epoch": 0.6321205971879983, "grad_norm": 3.2951812744140625, "learning_rate": 3.6787940281200177e-07, "loss": 0.2149, "step": 13083 }, { "epoch": 0.6321689133690873, "grad_norm": 8.817387580871582, "learning_rate": 3.6783108663091265e-07, "loss": 0.3815, "step": 13084 }, { "epoch": 0.6322172295501763, "grad_norm": 2.6504218578338623, "learning_rate": 3.6778277044982364e-07, "loss": 0.2274, "step": 13085 }, { "epoch": 0.6322655457312654, "grad_norm": 2.015744686126709, "learning_rate": 3.677344542687346e-07, "loss": 0.2126, "step": 13086 }, { "epoch": 0.6323138619123545, "grad_norm": 3.992389678955078, "learning_rate": 3.676861380876455e-07, "loss": 0.4179, "step": 13087 }, { "epoch": 0.6323621780934435, "grad_norm": 2.9503865242004395, "learning_rate": 3.676378219065565e-07, "loss": 0.4523, "step": 13088 }, { "epoch": 0.6324104942745326, "grad_norm": 37.73478698730469, "learning_rate": 3.6758950572546744e-07, "loss": 0.2538, "step": 13089 }, { "epoch": 0.6324588104556216, "grad_norm": 2.754258632659912, "learning_rate": 3.675411895443784e-07, "loss": 0.3197, "step": 13090 }, { "epoch": 0.6325071266367106, "grad_norm": 2.688976287841797, "learning_rate": 3.674928733632893e-07, "loss": 0.3887, "step": 13091 }, { "epoch": 0.6325554428177996, "grad_norm": 2.776764392852783, "learning_rate": 3.674445571822003e-07, "loss": 0.3825, "step": 13092 }, { "epoch": 0.6326037589988888, "grad_norm": 2.861147403717041, "learning_rate": 3.673962410011113e-07, "loss": 0.3808, "step": 13093 }, { "epoch": 0.6326520751799778, "grad_norm": 2.9853744506835938, "learning_rate": 3.673479248200222e-07, "loss": 0.3244, "step": 13094 }, { "epoch": 0.6327003913610668, "grad_norm": 2.5202975273132324, "learning_rate": 3.6729960863893317e-07, "loss": 0.2954, "step": 13095 }, { "epoch": 0.6327487075421558, "grad_norm": 2.8087408542633057, "learning_rate": 3.6725129245784416e-07, "loss": 0.3333, "step": 13096 }, { "epoch": 0.6327970237232449, "grad_norm": 4.631856441497803, "learning_rate": 3.6720297627675505e-07, "loss": 0.2347, "step": 13097 }, { "epoch": 0.632845339904334, "grad_norm": 2.092496871948242, "learning_rate": 3.6715466009566604e-07, "loss": 0.2473, "step": 13098 }, { "epoch": 0.632893656085423, "grad_norm": 2.825253963470459, "learning_rate": 3.67106343914577e-07, "loss": 0.276, "step": 13099 }, { "epoch": 0.632941972266512, "grad_norm": 2.465099573135376, "learning_rate": 3.670580277334879e-07, "loss": 0.2553, "step": 13100 }, { "epoch": 0.6329902884476011, "grad_norm": 2.5346930027008057, "learning_rate": 3.670097115523989e-07, "loss": 0.2913, "step": 13101 }, { "epoch": 0.6330386046286901, "grad_norm": 8.827157974243164, "learning_rate": 3.6696139537130984e-07, "loss": 0.1646, "step": 13102 }, { "epoch": 0.6330869208097792, "grad_norm": 3.3826098442077637, "learning_rate": 3.669130791902208e-07, "loss": 0.3782, "step": 13103 }, { "epoch": 0.6331352369908683, "grad_norm": 2.3657920360565186, "learning_rate": 3.668647630091317e-07, "loss": 0.3039, "step": 13104 }, { "epoch": 0.6331835531719573, "grad_norm": 2.5335700511932373, "learning_rate": 3.668164468280427e-07, "loss": 0.3214, "step": 13105 }, { "epoch": 0.6332318693530463, "grad_norm": 3.165440797805786, "learning_rate": 3.6676813064695364e-07, "loss": 0.307, "step": 13106 }, { "epoch": 0.6332801855341353, "grad_norm": 2.081882953643799, "learning_rate": 3.667198144658646e-07, "loss": 0.2224, "step": 13107 }, { "epoch": 0.6333285017152245, "grad_norm": 2.352142095565796, "learning_rate": 3.6667149828477557e-07, "loss": 0.2816, "step": 13108 }, { "epoch": 0.6333768178963135, "grad_norm": 2.6230270862579346, "learning_rate": 3.6662318210368656e-07, "loss": 0.2842, "step": 13109 }, { "epoch": 0.6334251340774025, "grad_norm": 10.699450492858887, "learning_rate": 3.6657486592259744e-07, "loss": 0.4932, "step": 13110 }, { "epoch": 0.6334734502584916, "grad_norm": 3.3828184604644775, "learning_rate": 3.6652654974150843e-07, "loss": 0.4208, "step": 13111 }, { "epoch": 0.6335217664395806, "grad_norm": 3.2788991928100586, "learning_rate": 3.6647823356041937e-07, "loss": 0.2475, "step": 13112 }, { "epoch": 0.6335700826206697, "grad_norm": 2.075151205062866, "learning_rate": 3.664299173793303e-07, "loss": 0.2346, "step": 13113 }, { "epoch": 0.6336183988017587, "grad_norm": 2.3052725791931152, "learning_rate": 3.663816011982413e-07, "loss": 0.2344, "step": 13114 }, { "epoch": 0.6336667149828478, "grad_norm": 1.9072768688201904, "learning_rate": 3.6633328501715224e-07, "loss": 0.1757, "step": 13115 }, { "epoch": 0.6337150311639368, "grad_norm": 3.631747245788574, "learning_rate": 3.6628496883606317e-07, "loss": 0.3445, "step": 13116 }, { "epoch": 0.6337633473450258, "grad_norm": 3.4243509769439697, "learning_rate": 3.662366526549741e-07, "loss": 0.3658, "step": 13117 }, { "epoch": 0.6338116635261148, "grad_norm": 2.9043710231781006, "learning_rate": 3.661883364738851e-07, "loss": 0.3714, "step": 13118 }, { "epoch": 0.633859979707204, "grad_norm": 2.210693359375, "learning_rate": 3.6614002029279604e-07, "loss": 0.3058, "step": 13119 }, { "epoch": 0.633908295888293, "grad_norm": 5.903290271759033, "learning_rate": 3.66091704111707e-07, "loss": 0.2584, "step": 13120 }, { "epoch": 0.633956612069382, "grad_norm": 2.5382003784179688, "learning_rate": 3.6604338793061797e-07, "loss": 0.319, "step": 13121 }, { "epoch": 0.6340049282504711, "grad_norm": 3.717616558074951, "learning_rate": 3.659950717495289e-07, "loss": 0.3011, "step": 13122 }, { "epoch": 0.6340532444315601, "grad_norm": 4.05738639831543, "learning_rate": 3.6594675556843984e-07, "loss": 0.276, "step": 13123 }, { "epoch": 0.6341015606126492, "grad_norm": 3.338144063949585, "learning_rate": 3.6589843938735083e-07, "loss": 0.2521, "step": 13124 }, { "epoch": 0.6341498767937382, "grad_norm": 2.061936378479004, "learning_rate": 3.658501232062617e-07, "loss": 0.1935, "step": 13125 }, { "epoch": 0.6341981929748273, "grad_norm": 2.2635107040405273, "learning_rate": 3.658018070251727e-07, "loss": 0.274, "step": 13126 }, { "epoch": 0.6342465091559163, "grad_norm": 3.125488519668579, "learning_rate": 3.657534908440837e-07, "loss": 0.3815, "step": 13127 }, { "epoch": 0.6342948253370053, "grad_norm": 2.3769874572753906, "learning_rate": 3.6570517466299463e-07, "loss": 0.2785, "step": 13128 }, { "epoch": 0.6343431415180945, "grad_norm": 2.997175693511963, "learning_rate": 3.6565685848190557e-07, "loss": 0.4086, "step": 13129 }, { "epoch": 0.6343914576991835, "grad_norm": 3.0291919708251953, "learning_rate": 3.656085423008165e-07, "loss": 0.4829, "step": 13130 }, { "epoch": 0.6344397738802725, "grad_norm": 2.3187177181243896, "learning_rate": 3.655602261197275e-07, "loss": 0.2811, "step": 13131 }, { "epoch": 0.6344880900613615, "grad_norm": 2.525519609451294, "learning_rate": 3.6551190993863843e-07, "loss": 0.3226, "step": 13132 }, { "epoch": 0.6345364062424506, "grad_norm": 2.618565797805786, "learning_rate": 3.6546359375754937e-07, "loss": 0.3154, "step": 13133 }, { "epoch": 0.6345847224235397, "grad_norm": 3.487060546875, "learning_rate": 3.6541527757646036e-07, "loss": 0.2449, "step": 13134 }, { "epoch": 0.6346330386046287, "grad_norm": 2.8922524452209473, "learning_rate": 3.653669613953713e-07, "loss": 0.398, "step": 13135 }, { "epoch": 0.6346813547857177, "grad_norm": 4.470203876495361, "learning_rate": 3.6531864521428224e-07, "loss": 0.3042, "step": 13136 }, { "epoch": 0.6347296709668068, "grad_norm": 2.4794061183929443, "learning_rate": 3.652703290331932e-07, "loss": 0.277, "step": 13137 }, { "epoch": 0.6347779871478958, "grad_norm": 2.608452796936035, "learning_rate": 3.652220128521041e-07, "loss": 0.2815, "step": 13138 }, { "epoch": 0.6348263033289849, "grad_norm": 3.5793538093566895, "learning_rate": 3.651736966710151e-07, "loss": 0.3375, "step": 13139 }, { "epoch": 0.634874619510074, "grad_norm": 3.0503900051116943, "learning_rate": 3.651253804899261e-07, "loss": 0.3382, "step": 13140 }, { "epoch": 0.634922935691163, "grad_norm": 3.167848825454712, "learning_rate": 3.65077064308837e-07, "loss": 0.2907, "step": 13141 }, { "epoch": 0.634971251872252, "grad_norm": 4.51633882522583, "learning_rate": 3.6502874812774797e-07, "loss": 0.4987, "step": 13142 }, { "epoch": 0.635019568053341, "grad_norm": 2.027824640274048, "learning_rate": 3.649804319466589e-07, "loss": 0.1592, "step": 13143 }, { "epoch": 0.6350678842344301, "grad_norm": 3.092111110687256, "learning_rate": 3.649321157655699e-07, "loss": 0.2772, "step": 13144 }, { "epoch": 0.6351162004155192, "grad_norm": 2.921827554702759, "learning_rate": 3.6488379958448083e-07, "loss": 0.371, "step": 13145 }, { "epoch": 0.6351645165966082, "grad_norm": 2.1705639362335205, "learning_rate": 3.6483548340339177e-07, "loss": 0.2024, "step": 13146 }, { "epoch": 0.6352128327776972, "grad_norm": 2.499624490737915, "learning_rate": 3.6478716722230276e-07, "loss": 0.3286, "step": 13147 }, { "epoch": 0.6352611489587863, "grad_norm": 2.9220902919769287, "learning_rate": 3.647388510412137e-07, "loss": 0.3075, "step": 13148 }, { "epoch": 0.6353094651398753, "grad_norm": 3.0416224002838135, "learning_rate": 3.6469053486012463e-07, "loss": 0.472, "step": 13149 }, { "epoch": 0.6353577813209644, "grad_norm": 6.475177764892578, "learning_rate": 3.646422186790356e-07, "loss": 0.3609, "step": 13150 }, { "epoch": 0.6354060975020535, "grad_norm": 1.9978272914886475, "learning_rate": 3.645939024979465e-07, "loss": 0.2481, "step": 13151 }, { "epoch": 0.6354544136831425, "grad_norm": 1.5685484409332275, "learning_rate": 3.645455863168575e-07, "loss": 0.1573, "step": 13152 }, { "epoch": 0.6355027298642315, "grad_norm": 2.64511775970459, "learning_rate": 3.644972701357685e-07, "loss": 0.3589, "step": 13153 }, { "epoch": 0.6355510460453205, "grad_norm": 2.1256191730499268, "learning_rate": 3.6444895395467937e-07, "loss": 0.1844, "step": 13154 }, { "epoch": 0.6355993622264097, "grad_norm": 12.937128067016602, "learning_rate": 3.6440063777359036e-07, "loss": 0.289, "step": 13155 }, { "epoch": 0.6356476784074987, "grad_norm": 3.5334994792938232, "learning_rate": 3.643523215925013e-07, "loss": 0.4505, "step": 13156 }, { "epoch": 0.6356959945885877, "grad_norm": 3.044126272201538, "learning_rate": 3.643040054114123e-07, "loss": 0.3199, "step": 13157 }, { "epoch": 0.6357443107696767, "grad_norm": 2.496814727783203, "learning_rate": 3.6425568923032323e-07, "loss": 0.2375, "step": 13158 }, { "epoch": 0.6357926269507658, "grad_norm": 3.1955792903900146, "learning_rate": 3.6420737304923416e-07, "loss": 0.3192, "step": 13159 }, { "epoch": 0.6358409431318549, "grad_norm": 4.2574028968811035, "learning_rate": 3.6415905686814515e-07, "loss": 0.407, "step": 13160 }, { "epoch": 0.6358892593129439, "grad_norm": 3.188448667526245, "learning_rate": 3.641107406870561e-07, "loss": 0.3395, "step": 13161 }, { "epoch": 0.635937575494033, "grad_norm": 2.743495464324951, "learning_rate": 3.6406242450596703e-07, "loss": 0.2573, "step": 13162 }, { "epoch": 0.635985891675122, "grad_norm": 2.232551336288452, "learning_rate": 3.64014108324878e-07, "loss": 0.2241, "step": 13163 }, { "epoch": 0.636034207856211, "grad_norm": 2.1384098529815674, "learning_rate": 3.639657921437889e-07, "loss": 0.2529, "step": 13164 }, { "epoch": 0.6360825240373001, "grad_norm": 2.604316473007202, "learning_rate": 3.639174759626999e-07, "loss": 0.4121, "step": 13165 }, { "epoch": 0.6361308402183892, "grad_norm": 3.1036903858184814, "learning_rate": 3.638691597816109e-07, "loss": 0.3886, "step": 13166 }, { "epoch": 0.6361791563994782, "grad_norm": 2.6998205184936523, "learning_rate": 3.6382084360052177e-07, "loss": 0.3167, "step": 13167 }, { "epoch": 0.6362274725805672, "grad_norm": 2.700953245162964, "learning_rate": 3.6377252741943276e-07, "loss": 0.3334, "step": 13168 }, { "epoch": 0.6362757887616562, "grad_norm": 3.119662046432495, "learning_rate": 3.637242112383437e-07, "loss": 0.44, "step": 13169 }, { "epoch": 0.6363241049427453, "grad_norm": 3.117424726486206, "learning_rate": 3.6367589505725463e-07, "loss": 0.3352, "step": 13170 }, { "epoch": 0.6363724211238344, "grad_norm": 4.529543876647949, "learning_rate": 3.636275788761656e-07, "loss": 0.3731, "step": 13171 }, { "epoch": 0.6364207373049234, "grad_norm": 6.931138038635254, "learning_rate": 3.6357926269507656e-07, "loss": 0.27, "step": 13172 }, { "epoch": 0.6364690534860125, "grad_norm": 4.115479946136475, "learning_rate": 3.6353094651398755e-07, "loss": 0.3155, "step": 13173 }, { "epoch": 0.6365173696671015, "grad_norm": 6.524108409881592, "learning_rate": 3.634826303328985e-07, "loss": 0.4728, "step": 13174 }, { "epoch": 0.6365656858481905, "grad_norm": 2.8177554607391357, "learning_rate": 3.634343141518094e-07, "loss": 0.3833, "step": 13175 }, { "epoch": 0.6366140020292796, "grad_norm": 14.739608764648438, "learning_rate": 3.633859979707204e-07, "loss": 0.2904, "step": 13176 }, { "epoch": 0.6366623182103687, "grad_norm": 3.56424617767334, "learning_rate": 3.633376817896313e-07, "loss": 0.2221, "step": 13177 }, { "epoch": 0.6367106343914577, "grad_norm": 3.2281594276428223, "learning_rate": 3.632893656085423e-07, "loss": 0.4193, "step": 13178 }, { "epoch": 0.6367589505725467, "grad_norm": 3.387289524078369, "learning_rate": 3.632410494274533e-07, "loss": 0.3044, "step": 13179 }, { "epoch": 0.6368072667536357, "grad_norm": 2.916421890258789, "learning_rate": 3.6319273324636416e-07, "loss": 0.2802, "step": 13180 }, { "epoch": 0.6368555829347249, "grad_norm": 2.9750993251800537, "learning_rate": 3.6314441706527515e-07, "loss": 0.3315, "step": 13181 }, { "epoch": 0.6369038991158139, "grad_norm": 2.6342780590057373, "learning_rate": 3.630961008841861e-07, "loss": 0.2862, "step": 13182 }, { "epoch": 0.6369522152969029, "grad_norm": 2.908515214920044, "learning_rate": 3.6304778470309703e-07, "loss": 0.2829, "step": 13183 }, { "epoch": 0.637000531477992, "grad_norm": 2.0034446716308594, "learning_rate": 3.62999468522008e-07, "loss": 0.2402, "step": 13184 }, { "epoch": 0.637048847659081, "grad_norm": 6.870665073394775, "learning_rate": 3.6295115234091896e-07, "loss": 0.2088, "step": 13185 }, { "epoch": 0.6370971638401701, "grad_norm": 3.3436245918273926, "learning_rate": 3.629028361598299e-07, "loss": 0.368, "step": 13186 }, { "epoch": 0.6371454800212591, "grad_norm": 2.2954909801483154, "learning_rate": 3.6285451997874083e-07, "loss": 0.3232, "step": 13187 }, { "epoch": 0.6371937962023482, "grad_norm": 9.164920806884766, "learning_rate": 3.628062037976518e-07, "loss": 0.4003, "step": 13188 }, { "epoch": 0.6372421123834372, "grad_norm": 2.6083192825317383, "learning_rate": 3.627578876165628e-07, "loss": 0.3159, "step": 13189 }, { "epoch": 0.6372904285645262, "grad_norm": 2.3159117698669434, "learning_rate": 3.627095714354737e-07, "loss": 0.1942, "step": 13190 }, { "epoch": 0.6373387447456154, "grad_norm": 2.3106565475463867, "learning_rate": 3.626612552543847e-07, "loss": 0.2676, "step": 13191 }, { "epoch": 0.6373870609267044, "grad_norm": 2.636875867843628, "learning_rate": 3.626129390732957e-07, "loss": 0.319, "step": 13192 }, { "epoch": 0.6374353771077934, "grad_norm": 3.907691478729248, "learning_rate": 3.6256462289220656e-07, "loss": 0.3267, "step": 13193 }, { "epoch": 0.6374836932888824, "grad_norm": 2.2108423709869385, "learning_rate": 3.6251630671111755e-07, "loss": 0.2624, "step": 13194 }, { "epoch": 0.6375320094699715, "grad_norm": 2.318192958831787, "learning_rate": 3.624679905300285e-07, "loss": 0.2498, "step": 13195 }, { "epoch": 0.6375803256510605, "grad_norm": 3.3359274864196777, "learning_rate": 3.624196743489394e-07, "loss": 0.2794, "step": 13196 }, { "epoch": 0.6376286418321496, "grad_norm": 2.588564395904541, "learning_rate": 3.623713581678504e-07, "loss": 0.2624, "step": 13197 }, { "epoch": 0.6376769580132386, "grad_norm": 3.0715219974517822, "learning_rate": 3.6232304198676135e-07, "loss": 0.316, "step": 13198 }, { "epoch": 0.6377252741943277, "grad_norm": 2.061708927154541, "learning_rate": 3.622747258056723e-07, "loss": 0.2465, "step": 13199 }, { "epoch": 0.6377735903754167, "grad_norm": 3.0371081829071045, "learning_rate": 3.6222640962458323e-07, "loss": 0.2828, "step": 13200 }, { "epoch": 0.6378219065565057, "grad_norm": 5.204116344451904, "learning_rate": 3.621780934434942e-07, "loss": 0.2943, "step": 13201 }, { "epoch": 0.6378702227375949, "grad_norm": 2.3236939907073975, "learning_rate": 3.6212977726240515e-07, "loss": 0.268, "step": 13202 }, { "epoch": 0.6379185389186839, "grad_norm": 3.1338043212890625, "learning_rate": 3.620814610813161e-07, "loss": 0.2695, "step": 13203 }, { "epoch": 0.6379668550997729, "grad_norm": 2.7034428119659424, "learning_rate": 3.620331449002271e-07, "loss": 0.3266, "step": 13204 }, { "epoch": 0.6380151712808619, "grad_norm": 4.573214530944824, "learning_rate": 3.6198482871913807e-07, "loss": 0.4376, "step": 13205 }, { "epoch": 0.638063487461951, "grad_norm": 1.9961583614349365, "learning_rate": 3.6193651253804896e-07, "loss": 0.2101, "step": 13206 }, { "epoch": 0.6381118036430401, "grad_norm": 3.170923948287964, "learning_rate": 3.6188819635695995e-07, "loss": 0.3783, "step": 13207 }, { "epoch": 0.6381601198241291, "grad_norm": 2.9406468868255615, "learning_rate": 3.618398801758709e-07, "loss": 0.34, "step": 13208 }, { "epoch": 0.6382084360052181, "grad_norm": 2.5615200996398926, "learning_rate": 3.617915639947818e-07, "loss": 0.3351, "step": 13209 }, { "epoch": 0.6382567521863072, "grad_norm": 3.102447986602783, "learning_rate": 3.617432478136928e-07, "loss": 0.2469, "step": 13210 }, { "epoch": 0.6383050683673962, "grad_norm": 2.4312939643859863, "learning_rate": 3.6169493163260375e-07, "loss": 0.2698, "step": 13211 }, { "epoch": 0.6383533845484853, "grad_norm": 4.279616355895996, "learning_rate": 3.616466154515147e-07, "loss": 0.2408, "step": 13212 }, { "epoch": 0.6384017007295744, "grad_norm": 2.3608319759368896, "learning_rate": 3.615982992704256e-07, "loss": 0.2293, "step": 13213 }, { "epoch": 0.6384500169106634, "grad_norm": 2.941378593444824, "learning_rate": 3.615499830893366e-07, "loss": 0.3952, "step": 13214 }, { "epoch": 0.6384983330917524, "grad_norm": 3.152799367904663, "learning_rate": 3.6150166690824755e-07, "loss": 0.2097, "step": 13215 }, { "epoch": 0.6385466492728414, "grad_norm": 4.278576850891113, "learning_rate": 3.614533507271585e-07, "loss": 0.2737, "step": 13216 }, { "epoch": 0.6385949654539306, "grad_norm": 2.6728639602661133, "learning_rate": 3.614050345460695e-07, "loss": 0.3413, "step": 13217 }, { "epoch": 0.6386432816350196, "grad_norm": 2.943659782409668, "learning_rate": 3.613567183649804e-07, "loss": 0.3062, "step": 13218 }, { "epoch": 0.6386915978161086, "grad_norm": 3.819606065750122, "learning_rate": 3.6130840218389135e-07, "loss": 0.3582, "step": 13219 }, { "epoch": 0.6387399139971977, "grad_norm": 3.2855944633483887, "learning_rate": 3.6126008600280234e-07, "loss": 0.2464, "step": 13220 }, { "epoch": 0.6387882301782867, "grad_norm": 3.6261112689971924, "learning_rate": 3.6121176982171323e-07, "loss": 0.2301, "step": 13221 }, { "epoch": 0.6388365463593757, "grad_norm": 1.796501874923706, "learning_rate": 3.611634536406242e-07, "loss": 0.2481, "step": 13222 }, { "epoch": 0.6388848625404648, "grad_norm": 2.410658597946167, "learning_rate": 3.611151374595352e-07, "loss": 0.2942, "step": 13223 }, { "epoch": 0.6389331787215539, "grad_norm": 3.546971321105957, "learning_rate": 3.6106682127844615e-07, "loss": 0.2765, "step": 13224 }, { "epoch": 0.6389814949026429, "grad_norm": 3.1459262371063232, "learning_rate": 3.610185050973571e-07, "loss": 0.3007, "step": 13225 }, { "epoch": 0.6390298110837319, "grad_norm": 3.258234739303589, "learning_rate": 3.60970188916268e-07, "loss": 0.2534, "step": 13226 }, { "epoch": 0.6390781272648209, "grad_norm": 3.3928604125976562, "learning_rate": 3.60921872735179e-07, "loss": 0.3515, "step": 13227 }, { "epoch": 0.6391264434459101, "grad_norm": 1.9740782976150513, "learning_rate": 3.6087355655408995e-07, "loss": 0.1806, "step": 13228 }, { "epoch": 0.6391747596269991, "grad_norm": 2.8447279930114746, "learning_rate": 3.608252403730009e-07, "loss": 0.296, "step": 13229 }, { "epoch": 0.6392230758080881, "grad_norm": 2.617326259613037, "learning_rate": 3.607769241919119e-07, "loss": 0.2678, "step": 13230 }, { "epoch": 0.6392713919891772, "grad_norm": 2.5459327697753906, "learning_rate": 3.607286080108228e-07, "loss": 0.2417, "step": 13231 }, { "epoch": 0.6393197081702662, "grad_norm": 2.411334753036499, "learning_rate": 3.6068029182973375e-07, "loss": 0.2478, "step": 13232 }, { "epoch": 0.6393680243513553, "grad_norm": 3.567690372467041, "learning_rate": 3.6063197564864474e-07, "loss": 0.4153, "step": 13233 }, { "epoch": 0.6394163405324443, "grad_norm": 8.691303253173828, "learning_rate": 3.605836594675556e-07, "loss": 0.3329, "step": 13234 }, { "epoch": 0.6394646567135334, "grad_norm": 2.9409570693969727, "learning_rate": 3.605353432864666e-07, "loss": 0.2429, "step": 13235 }, { "epoch": 0.6395129728946224, "grad_norm": 2.217280626296997, "learning_rate": 3.604870271053776e-07, "loss": 0.2528, "step": 13236 }, { "epoch": 0.6395612890757114, "grad_norm": 8.163900375366211, "learning_rate": 3.604387109242885e-07, "loss": 0.3328, "step": 13237 }, { "epoch": 0.6396096052568006, "grad_norm": 2.3305208683013916, "learning_rate": 3.603903947431995e-07, "loss": 0.2791, "step": 13238 }, { "epoch": 0.6396579214378896, "grad_norm": 1.9732861518859863, "learning_rate": 3.603420785621104e-07, "loss": 0.2127, "step": 13239 }, { "epoch": 0.6397062376189786, "grad_norm": 2.6255786418914795, "learning_rate": 3.602937623810214e-07, "loss": 0.3726, "step": 13240 }, { "epoch": 0.6397545538000676, "grad_norm": 2.438598155975342, "learning_rate": 3.6024544619993234e-07, "loss": 0.3535, "step": 13241 }, { "epoch": 0.6398028699811567, "grad_norm": 4.815239429473877, "learning_rate": 3.601971300188433e-07, "loss": 0.2908, "step": 13242 }, { "epoch": 0.6398511861622458, "grad_norm": 3.70050048828125, "learning_rate": 3.6014881383775427e-07, "loss": 0.2844, "step": 13243 }, { "epoch": 0.6398995023433348, "grad_norm": 3.6734631061553955, "learning_rate": 3.601004976566652e-07, "loss": 0.357, "step": 13244 }, { "epoch": 0.6399478185244238, "grad_norm": 3.294917345046997, "learning_rate": 3.6005218147557615e-07, "loss": 0.3037, "step": 13245 }, { "epoch": 0.6399961347055129, "grad_norm": 3.0892271995544434, "learning_rate": 3.6000386529448714e-07, "loss": 0.3149, "step": 13246 }, { "epoch": 0.6400444508866019, "grad_norm": 2.9693450927734375, "learning_rate": 3.59955549113398e-07, "loss": 0.1995, "step": 13247 }, { "epoch": 0.6400927670676909, "grad_norm": 2.628037929534912, "learning_rate": 3.59907232932309e-07, "loss": 0.3292, "step": 13248 }, { "epoch": 0.64014108324878, "grad_norm": 2.002326011657715, "learning_rate": 3.5985891675122e-07, "loss": 0.2232, "step": 13249 }, { "epoch": 0.6401893994298691, "grad_norm": 3.2270851135253906, "learning_rate": 3.598106005701309e-07, "loss": 0.4858, "step": 13250 }, { "epoch": 0.6402377156109581, "grad_norm": 2.8376221656799316, "learning_rate": 3.597622843890419e-07, "loss": 0.3281, "step": 13251 }, { "epoch": 0.6402860317920471, "grad_norm": 7.318587779998779, "learning_rate": 3.597139682079528e-07, "loss": 0.2847, "step": 13252 }, { "epoch": 0.6403343479731362, "grad_norm": 3.331087350845337, "learning_rate": 3.5966565202686375e-07, "loss": 0.2992, "step": 13253 }, { "epoch": 0.6403826641542253, "grad_norm": 3.306572914123535, "learning_rate": 3.5961733584577474e-07, "loss": 0.1544, "step": 13254 }, { "epoch": 0.6404309803353143, "grad_norm": 2.0921263694763184, "learning_rate": 3.595690196646857e-07, "loss": 0.2291, "step": 13255 }, { "epoch": 0.6404792965164033, "grad_norm": 3.4255428314208984, "learning_rate": 3.5952070348359667e-07, "loss": 0.4525, "step": 13256 }, { "epoch": 0.6405276126974924, "grad_norm": 2.168473243713379, "learning_rate": 3.594723873025076e-07, "loss": 0.2352, "step": 13257 }, { "epoch": 0.6405759288785814, "grad_norm": 2.606693983078003, "learning_rate": 3.5942407112141854e-07, "loss": 0.3161, "step": 13258 }, { "epoch": 0.6406242450596705, "grad_norm": 2.5911717414855957, "learning_rate": 3.5937575494032953e-07, "loss": 0.2973, "step": 13259 }, { "epoch": 0.6406725612407596, "grad_norm": 2.9957945346832275, "learning_rate": 3.593274387592404e-07, "loss": 0.3599, "step": 13260 }, { "epoch": 0.6407208774218486, "grad_norm": 1.6104886531829834, "learning_rate": 3.592791225781514e-07, "loss": 0.1519, "step": 13261 }, { "epoch": 0.6407691936029376, "grad_norm": 2.5141074657440186, "learning_rate": 3.592308063970624e-07, "loss": 0.2476, "step": 13262 }, { "epoch": 0.6408175097840266, "grad_norm": 2.735610008239746, "learning_rate": 3.591824902159733e-07, "loss": 0.3688, "step": 13263 }, { "epoch": 0.6408658259651158, "grad_norm": 2.8148679733276367, "learning_rate": 3.5913417403488427e-07, "loss": 0.3318, "step": 13264 }, { "epoch": 0.6409141421462048, "grad_norm": 6.923121929168701, "learning_rate": 3.590858578537952e-07, "loss": 0.4438, "step": 13265 }, { "epoch": 0.6409624583272938, "grad_norm": 1.6906070709228516, "learning_rate": 3.5903754167270615e-07, "loss": 0.1932, "step": 13266 }, { "epoch": 0.6410107745083828, "grad_norm": 6.695126533508301, "learning_rate": 3.5898922549161714e-07, "loss": 0.3749, "step": 13267 }, { "epoch": 0.6410590906894719, "grad_norm": 3.3399863243103027, "learning_rate": 3.589409093105281e-07, "loss": 0.2655, "step": 13268 }, { "epoch": 0.641107406870561, "grad_norm": 2.6624093055725098, "learning_rate": 3.58892593129439e-07, "loss": 0.2843, "step": 13269 }, { "epoch": 0.64115572305165, "grad_norm": 2.7702527046203613, "learning_rate": 3.5884427694835e-07, "loss": 0.379, "step": 13270 }, { "epoch": 0.641204039232739, "grad_norm": 5.206377029418945, "learning_rate": 3.5879596076726094e-07, "loss": 0.2837, "step": 13271 }, { "epoch": 0.6412523554138281, "grad_norm": 3.2790191173553467, "learning_rate": 3.5874764458617193e-07, "loss": 0.2857, "step": 13272 }, { "epoch": 0.6413006715949171, "grad_norm": 1.2148100137710571, "learning_rate": 3.586993284050828e-07, "loss": 0.1382, "step": 13273 }, { "epoch": 0.6413489877760061, "grad_norm": 4.715356826782227, "learning_rate": 3.586510122239938e-07, "loss": 0.2987, "step": 13274 }, { "epoch": 0.6413973039570953, "grad_norm": 2.447688579559326, "learning_rate": 3.586026960429048e-07, "loss": 0.3295, "step": 13275 }, { "epoch": 0.6414456201381843, "grad_norm": 2.509965181350708, "learning_rate": 3.585543798618157e-07, "loss": 0.3169, "step": 13276 }, { "epoch": 0.6414939363192733, "grad_norm": 2.7022814750671387, "learning_rate": 3.5850606368072667e-07, "loss": 0.2241, "step": 13277 }, { "epoch": 0.6415422525003623, "grad_norm": 3.288844108581543, "learning_rate": 3.584577474996376e-07, "loss": 0.2402, "step": 13278 }, { "epoch": 0.6415905686814514, "grad_norm": 2.3157546520233154, "learning_rate": 3.5840943131854854e-07, "loss": 0.2741, "step": 13279 }, { "epoch": 0.6416388848625405, "grad_norm": 3.2365217208862305, "learning_rate": 3.5836111513745953e-07, "loss": 0.2638, "step": 13280 }, { "epoch": 0.6416872010436295, "grad_norm": 2.88206148147583, "learning_rate": 3.5831279895637047e-07, "loss": 0.2722, "step": 13281 }, { "epoch": 0.6417355172247186, "grad_norm": 2.5054306983947754, "learning_rate": 3.582644827752814e-07, "loss": 0.2983, "step": 13282 }, { "epoch": 0.6417838334058076, "grad_norm": 2.6436092853546143, "learning_rate": 3.582161665941924e-07, "loss": 0.382, "step": 13283 }, { "epoch": 0.6418321495868966, "grad_norm": 6.940496444702148, "learning_rate": 3.5816785041310334e-07, "loss": 0.2967, "step": 13284 }, { "epoch": 0.6418804657679857, "grad_norm": 5.040204048156738, "learning_rate": 3.5811953423201427e-07, "loss": 0.3144, "step": 13285 }, { "epoch": 0.6419287819490748, "grad_norm": 2.641178846359253, "learning_rate": 3.580712180509252e-07, "loss": 0.2753, "step": 13286 }, { "epoch": 0.6419770981301638, "grad_norm": 3.329773187637329, "learning_rate": 3.580229018698362e-07, "loss": 0.2933, "step": 13287 }, { "epoch": 0.6420254143112528, "grad_norm": 2.17143177986145, "learning_rate": 3.579745856887472e-07, "loss": 0.2504, "step": 13288 }, { "epoch": 0.6420737304923418, "grad_norm": 1.3442881107330322, "learning_rate": 3.579262695076581e-07, "loss": 0.1447, "step": 13289 }, { "epoch": 0.642122046673431, "grad_norm": 2.041593313217163, "learning_rate": 3.5787795332656906e-07, "loss": 0.2506, "step": 13290 }, { "epoch": 0.64217036285452, "grad_norm": 2.6017367839813232, "learning_rate": 3.5782963714548e-07, "loss": 0.2776, "step": 13291 }, { "epoch": 0.642218679035609, "grad_norm": 2.4829256534576416, "learning_rate": 3.5778132096439094e-07, "loss": 0.2799, "step": 13292 }, { "epoch": 0.6422669952166981, "grad_norm": 2.848651885986328, "learning_rate": 3.5773300478330193e-07, "loss": 0.2879, "step": 13293 }, { "epoch": 0.6423153113977871, "grad_norm": 2.53350567817688, "learning_rate": 3.5768468860221287e-07, "loss": 0.2124, "step": 13294 }, { "epoch": 0.6423636275788762, "grad_norm": 3.727538824081421, "learning_rate": 3.576363724211238e-07, "loss": 0.1971, "step": 13295 }, { "epoch": 0.6424119437599652, "grad_norm": 2.724578619003296, "learning_rate": 3.575880562400348e-07, "loss": 0.2765, "step": 13296 }, { "epoch": 0.6424602599410543, "grad_norm": 2.383347511291504, "learning_rate": 3.5753974005894573e-07, "loss": 0.3154, "step": 13297 }, { "epoch": 0.6425085761221433, "grad_norm": 2.408235788345337, "learning_rate": 3.5749142387785667e-07, "loss": 0.2471, "step": 13298 }, { "epoch": 0.6425568923032323, "grad_norm": 2.561607837677002, "learning_rate": 3.574431076967676e-07, "loss": 0.3486, "step": 13299 }, { "epoch": 0.6426052084843213, "grad_norm": 2.2220115661621094, "learning_rate": 3.573947915156786e-07, "loss": 0.253, "step": 13300 }, { "epoch": 0.6426535246654105, "grad_norm": 1.6706335544586182, "learning_rate": 3.5734647533458953e-07, "loss": 0.1837, "step": 13301 }, { "epoch": 0.6427018408464995, "grad_norm": 4.035999298095703, "learning_rate": 3.5729815915350047e-07, "loss": 0.228, "step": 13302 }, { "epoch": 0.6427501570275885, "grad_norm": 2.560514211654663, "learning_rate": 3.5724984297241146e-07, "loss": 0.3261, "step": 13303 }, { "epoch": 0.6427984732086776, "grad_norm": 5.081149578094482, "learning_rate": 3.5720152679132235e-07, "loss": 0.3038, "step": 13304 }, { "epoch": 0.6428467893897666, "grad_norm": 2.891866683959961, "learning_rate": 3.5715321061023334e-07, "loss": 0.2894, "step": 13305 }, { "epoch": 0.6428951055708557, "grad_norm": 2.856487989425659, "learning_rate": 3.571048944291443e-07, "loss": 0.2285, "step": 13306 }, { "epoch": 0.6429434217519447, "grad_norm": 2.616302967071533, "learning_rate": 3.5705657824805526e-07, "loss": 0.3137, "step": 13307 }, { "epoch": 0.6429917379330338, "grad_norm": 2.77500319480896, "learning_rate": 3.570082620669662e-07, "loss": 0.3121, "step": 13308 }, { "epoch": 0.6430400541141228, "grad_norm": 3.066582202911377, "learning_rate": 3.569599458858772e-07, "loss": 0.2682, "step": 13309 }, { "epoch": 0.6430883702952118, "grad_norm": 3.941694498062134, "learning_rate": 3.5691162970478813e-07, "loss": 0.3188, "step": 13310 }, { "epoch": 0.643136686476301, "grad_norm": 2.9035844802856445, "learning_rate": 3.5686331352369907e-07, "loss": 0.3243, "step": 13311 }, { "epoch": 0.64318500265739, "grad_norm": 2.1244466304779053, "learning_rate": 3.5681499734261e-07, "loss": 0.2739, "step": 13312 }, { "epoch": 0.643233318838479, "grad_norm": 3.403635263442993, "learning_rate": 3.56766681161521e-07, "loss": 0.211, "step": 13313 }, { "epoch": 0.643281635019568, "grad_norm": 2.273967981338501, "learning_rate": 3.5671836498043193e-07, "loss": 0.2316, "step": 13314 }, { "epoch": 0.6433299512006571, "grad_norm": 2.8363277912139893, "learning_rate": 3.5667004879934287e-07, "loss": 0.2372, "step": 13315 }, { "epoch": 0.6433782673817462, "grad_norm": 2.7138261795043945, "learning_rate": 3.5662173261825386e-07, "loss": 0.2832, "step": 13316 }, { "epoch": 0.6434265835628352, "grad_norm": 6.813179969787598, "learning_rate": 3.5657341643716474e-07, "loss": 0.3359, "step": 13317 }, { "epoch": 0.6434748997439242, "grad_norm": 3.0864830017089844, "learning_rate": 3.5652510025607573e-07, "loss": 0.2946, "step": 13318 }, { "epoch": 0.6435232159250133, "grad_norm": 3.0446763038635254, "learning_rate": 3.564767840749867e-07, "loss": 0.3059, "step": 13319 }, { "epoch": 0.6435715321061023, "grad_norm": 2.5219709873199463, "learning_rate": 3.5642846789389766e-07, "loss": 0.3011, "step": 13320 }, { "epoch": 0.6436198482871914, "grad_norm": 4.654045581817627, "learning_rate": 3.563801517128086e-07, "loss": 0.3561, "step": 13321 }, { "epoch": 0.6436681644682805, "grad_norm": 2.268461227416992, "learning_rate": 3.563318355317196e-07, "loss": 0.2616, "step": 13322 }, { "epoch": 0.6437164806493695, "grad_norm": 2.4979684352874756, "learning_rate": 3.562835193506305e-07, "loss": 0.2375, "step": 13323 }, { "epoch": 0.6437647968304585, "grad_norm": 2.4979774951934814, "learning_rate": 3.5623520316954146e-07, "loss": 0.1934, "step": 13324 }, { "epoch": 0.6438131130115475, "grad_norm": 2.5233213901519775, "learning_rate": 3.561868869884524e-07, "loss": 0.2845, "step": 13325 }, { "epoch": 0.6438614291926367, "grad_norm": 1.8916367292404175, "learning_rate": 3.561385708073634e-07, "loss": 0.1653, "step": 13326 }, { "epoch": 0.6439097453737257, "grad_norm": 2.3014323711395264, "learning_rate": 3.560902546262743e-07, "loss": 0.2285, "step": 13327 }, { "epoch": 0.6439580615548147, "grad_norm": 2.516306161880493, "learning_rate": 3.5604193844518526e-07, "loss": 0.2248, "step": 13328 }, { "epoch": 0.6440063777359037, "grad_norm": 3.606070041656494, "learning_rate": 3.5599362226409625e-07, "loss": 0.313, "step": 13329 }, { "epoch": 0.6440546939169928, "grad_norm": 3.4957149028778076, "learning_rate": 3.5594530608300714e-07, "loss": 0.2953, "step": 13330 }, { "epoch": 0.6441030100980818, "grad_norm": 2.760745048522949, "learning_rate": 3.5589698990191813e-07, "loss": 0.3285, "step": 13331 }, { "epoch": 0.6441513262791709, "grad_norm": 2.738039016723633, "learning_rate": 3.558486737208291e-07, "loss": 0.2661, "step": 13332 }, { "epoch": 0.64419964246026, "grad_norm": 3.1358020305633545, "learning_rate": 3.5580035753974e-07, "loss": 0.3223, "step": 13333 }, { "epoch": 0.644247958641349, "grad_norm": 3.8121042251586914, "learning_rate": 3.55752041358651e-07, "loss": 0.2802, "step": 13334 }, { "epoch": 0.644296274822438, "grad_norm": 3.870309829711914, "learning_rate": 3.55703725177562e-07, "loss": 0.2612, "step": 13335 }, { "epoch": 0.644344591003527, "grad_norm": 3.574596643447876, "learning_rate": 3.556554089964729e-07, "loss": 0.4632, "step": 13336 }, { "epoch": 0.6443929071846162, "grad_norm": 2.729862928390503, "learning_rate": 3.5560709281538386e-07, "loss": 0.2855, "step": 13337 }, { "epoch": 0.6444412233657052, "grad_norm": 1.8987517356872559, "learning_rate": 3.555587766342948e-07, "loss": 0.1575, "step": 13338 }, { "epoch": 0.6444895395467942, "grad_norm": 4.389464855194092, "learning_rate": 3.555104604532058e-07, "loss": 0.4204, "step": 13339 }, { "epoch": 0.6445378557278832, "grad_norm": 2.9945693016052246, "learning_rate": 3.554621442721167e-07, "loss": 0.2994, "step": 13340 }, { "epoch": 0.6445861719089723, "grad_norm": 4.629089832305908, "learning_rate": 3.5541382809102766e-07, "loss": 0.3214, "step": 13341 }, { "epoch": 0.6446344880900614, "grad_norm": 5.264679431915283, "learning_rate": 3.5536551190993865e-07, "loss": 0.2968, "step": 13342 }, { "epoch": 0.6446828042711504, "grad_norm": 2.757375478744507, "learning_rate": 3.5531719572884953e-07, "loss": 0.2101, "step": 13343 }, { "epoch": 0.6447311204522395, "grad_norm": 6.247621059417725, "learning_rate": 3.552688795477605e-07, "loss": 0.2315, "step": 13344 }, { "epoch": 0.6447794366333285, "grad_norm": 2.5696041584014893, "learning_rate": 3.552205633666715e-07, "loss": 0.339, "step": 13345 }, { "epoch": 0.6448277528144175, "grad_norm": 2.7207729816436768, "learning_rate": 3.551722471855824e-07, "loss": 0.3234, "step": 13346 }, { "epoch": 0.6448760689955066, "grad_norm": 1.7215220928192139, "learning_rate": 3.551239310044934e-07, "loss": 0.1362, "step": 13347 }, { "epoch": 0.6449243851765957, "grad_norm": 2.12788987159729, "learning_rate": 3.550756148234044e-07, "loss": 0.2436, "step": 13348 }, { "epoch": 0.6449727013576847, "grad_norm": 2.4060757160186768, "learning_rate": 3.5502729864231526e-07, "loss": 0.2611, "step": 13349 }, { "epoch": 0.6450210175387737, "grad_norm": 2.3634305000305176, "learning_rate": 3.5497898246122625e-07, "loss": 0.2648, "step": 13350 }, { "epoch": 0.6450693337198627, "grad_norm": 2.945993661880493, "learning_rate": 3.549306662801372e-07, "loss": 0.3512, "step": 13351 }, { "epoch": 0.6451176499009519, "grad_norm": 2.154310464859009, "learning_rate": 3.548823500990482e-07, "loss": 0.2237, "step": 13352 }, { "epoch": 0.6451659660820409, "grad_norm": 2.3130393028259277, "learning_rate": 3.548340339179591e-07, "loss": 0.1836, "step": 13353 }, { "epoch": 0.6452142822631299, "grad_norm": 3.994028091430664, "learning_rate": 3.5478571773687006e-07, "loss": 0.3187, "step": 13354 }, { "epoch": 0.645262598444219, "grad_norm": 2.3542017936706543, "learning_rate": 3.5473740155578105e-07, "loss": 0.1908, "step": 13355 }, { "epoch": 0.645310914625308, "grad_norm": 3.227339267730713, "learning_rate": 3.5468908537469193e-07, "loss": 0.226, "step": 13356 }, { "epoch": 0.645359230806397, "grad_norm": 5.745959281921387, "learning_rate": 3.546407691936029e-07, "loss": 0.4843, "step": 13357 }, { "epoch": 0.6454075469874861, "grad_norm": 3.6637728214263916, "learning_rate": 3.545924530125139e-07, "loss": 0.2591, "step": 13358 }, { "epoch": 0.6454558631685752, "grad_norm": 3.997335433959961, "learning_rate": 3.545441368314248e-07, "loss": 0.4806, "step": 13359 }, { "epoch": 0.6455041793496642, "grad_norm": 3.49214506149292, "learning_rate": 3.544958206503358e-07, "loss": 0.3858, "step": 13360 }, { "epoch": 0.6455524955307532, "grad_norm": 2.5146520137786865, "learning_rate": 3.544475044692468e-07, "loss": 0.2976, "step": 13361 }, { "epoch": 0.6456008117118422, "grad_norm": 2.009756088256836, "learning_rate": 3.5439918828815766e-07, "loss": 0.2522, "step": 13362 }, { "epoch": 0.6456491278929314, "grad_norm": 2.5001797676086426, "learning_rate": 3.5435087210706865e-07, "loss": 0.1918, "step": 13363 }, { "epoch": 0.6456974440740204, "grad_norm": 3.3209762573242188, "learning_rate": 3.543025559259796e-07, "loss": 0.2875, "step": 13364 }, { "epoch": 0.6457457602551094, "grad_norm": 4.276852607727051, "learning_rate": 3.542542397448905e-07, "loss": 0.3592, "step": 13365 }, { "epoch": 0.6457940764361985, "grad_norm": 2.3834118843078613, "learning_rate": 3.542059235638015e-07, "loss": 0.2366, "step": 13366 }, { "epoch": 0.6458423926172875, "grad_norm": 2.910930633544922, "learning_rate": 3.5415760738271245e-07, "loss": 0.3569, "step": 13367 }, { "epoch": 0.6458907087983766, "grad_norm": 2.118081569671631, "learning_rate": 3.5410929120162344e-07, "loss": 0.2477, "step": 13368 }, { "epoch": 0.6459390249794656, "grad_norm": 2.954474687576294, "learning_rate": 3.5406097502053433e-07, "loss": 0.2634, "step": 13369 }, { "epoch": 0.6459873411605547, "grad_norm": 2.123591661453247, "learning_rate": 3.540126588394453e-07, "loss": 0.221, "step": 13370 }, { "epoch": 0.6460356573416437, "grad_norm": 6.082737922668457, "learning_rate": 3.539643426583563e-07, "loss": 0.3004, "step": 13371 }, { "epoch": 0.6460839735227327, "grad_norm": 2.853933572769165, "learning_rate": 3.539160264772672e-07, "loss": 0.3273, "step": 13372 }, { "epoch": 0.6461322897038219, "grad_norm": 3.093104839324951, "learning_rate": 3.538677102961782e-07, "loss": 0.2576, "step": 13373 }, { "epoch": 0.6461806058849109, "grad_norm": 3.4171247482299805, "learning_rate": 3.5381939411508917e-07, "loss": 0.3742, "step": 13374 }, { "epoch": 0.6462289220659999, "grad_norm": 1.9108729362487793, "learning_rate": 3.5377107793400006e-07, "loss": 0.2006, "step": 13375 }, { "epoch": 0.6462772382470889, "grad_norm": 3.1522629261016846, "learning_rate": 3.5372276175291105e-07, "loss": 0.3156, "step": 13376 }, { "epoch": 0.646325554428178, "grad_norm": 23.2396183013916, "learning_rate": 3.53674445571822e-07, "loss": 0.2331, "step": 13377 }, { "epoch": 0.6463738706092671, "grad_norm": 4.7815093994140625, "learning_rate": 3.536261293907329e-07, "loss": 0.2443, "step": 13378 }, { "epoch": 0.6464221867903561, "grad_norm": 2.722113609313965, "learning_rate": 3.535778132096439e-07, "loss": 0.3674, "step": 13379 }, { "epoch": 0.6464705029714451, "grad_norm": 3.8364975452423096, "learning_rate": 3.5352949702855485e-07, "loss": 0.3268, "step": 13380 }, { "epoch": 0.6465188191525342, "grad_norm": 2.7377524375915527, "learning_rate": 3.534811808474658e-07, "loss": 0.2757, "step": 13381 }, { "epoch": 0.6465671353336232, "grad_norm": 2.2264204025268555, "learning_rate": 3.534328646663767e-07, "loss": 0.2792, "step": 13382 }, { "epoch": 0.6466154515147122, "grad_norm": 4.366396427154541, "learning_rate": 3.533845484852877e-07, "loss": 0.2992, "step": 13383 }, { "epoch": 0.6466637676958014, "grad_norm": 1.6632713079452515, "learning_rate": 3.533362323041987e-07, "loss": 0.1452, "step": 13384 }, { "epoch": 0.6467120838768904, "grad_norm": 2.4688339233398438, "learning_rate": 3.532879161231096e-07, "loss": 0.3158, "step": 13385 }, { "epoch": 0.6467604000579794, "grad_norm": 2.553621768951416, "learning_rate": 3.532395999420206e-07, "loss": 0.2278, "step": 13386 }, { "epoch": 0.6468087162390684, "grad_norm": 3.0874009132385254, "learning_rate": 3.5319128376093157e-07, "loss": 0.3207, "step": 13387 }, { "epoch": 0.6468570324201575, "grad_norm": 2.9138894081115723, "learning_rate": 3.5314296757984245e-07, "loss": 0.3175, "step": 13388 }, { "epoch": 0.6469053486012466, "grad_norm": 1.8638465404510498, "learning_rate": 3.5309465139875344e-07, "loss": 0.2227, "step": 13389 }, { "epoch": 0.6469536647823356, "grad_norm": 2.433441162109375, "learning_rate": 3.530463352176644e-07, "loss": 0.1534, "step": 13390 }, { "epoch": 0.6470019809634246, "grad_norm": 2.5457630157470703, "learning_rate": 3.529980190365753e-07, "loss": 0.2827, "step": 13391 }, { "epoch": 0.6470502971445137, "grad_norm": 2.9287166595458984, "learning_rate": 3.529497028554863e-07, "loss": 0.3499, "step": 13392 }, { "epoch": 0.6470986133256027, "grad_norm": 2.597264289855957, "learning_rate": 3.5290138667439725e-07, "loss": 0.2754, "step": 13393 }, { "epoch": 0.6471469295066918, "grad_norm": 9.994013786315918, "learning_rate": 3.528530704933082e-07, "loss": 0.4516, "step": 13394 }, { "epoch": 0.6471952456877809, "grad_norm": 4.544486045837402, "learning_rate": 3.528047543122191e-07, "loss": 0.3335, "step": 13395 }, { "epoch": 0.6472435618688699, "grad_norm": 2.9771909713745117, "learning_rate": 3.527564381311301e-07, "loss": 0.3862, "step": 13396 }, { "epoch": 0.6472918780499589, "grad_norm": 3.4279062747955322, "learning_rate": 3.5270812195004105e-07, "loss": 0.2555, "step": 13397 }, { "epoch": 0.6473401942310479, "grad_norm": 2.516505002975464, "learning_rate": 3.52659805768952e-07, "loss": 0.2083, "step": 13398 }, { "epoch": 0.6473885104121371, "grad_norm": 2.2675840854644775, "learning_rate": 3.52611489587863e-07, "loss": 0.2376, "step": 13399 }, { "epoch": 0.6474368265932261, "grad_norm": 3.6304662227630615, "learning_rate": 3.5256317340677396e-07, "loss": 0.3587, "step": 13400 }, { "epoch": 0.6474851427743151, "grad_norm": 140.04217529296875, "learning_rate": 3.5251485722568485e-07, "loss": 0.286, "step": 13401 }, { "epoch": 0.6475334589554042, "grad_norm": 2.9199295043945312, "learning_rate": 3.5246654104459584e-07, "loss": 0.3848, "step": 13402 }, { "epoch": 0.6475817751364932, "grad_norm": 3.170801877975464, "learning_rate": 3.524182248635068e-07, "loss": 0.2174, "step": 13403 }, { "epoch": 0.6476300913175823, "grad_norm": 3.041273355484009, "learning_rate": 3.523699086824177e-07, "loss": 0.4131, "step": 13404 }, { "epoch": 0.6476784074986713, "grad_norm": 2.1728363037109375, "learning_rate": 3.523215925013287e-07, "loss": 0.2357, "step": 13405 }, { "epoch": 0.6477267236797604, "grad_norm": 3.725395917892456, "learning_rate": 3.5227327632023964e-07, "loss": 0.4129, "step": 13406 }, { "epoch": 0.6477750398608494, "grad_norm": 2.4185869693756104, "learning_rate": 3.522249601391506e-07, "loss": 0.2782, "step": 13407 }, { "epoch": 0.6478233560419384, "grad_norm": 1.8901522159576416, "learning_rate": 3.521766439580615e-07, "loss": 0.2057, "step": 13408 }, { "epoch": 0.6478716722230274, "grad_norm": 3.4422953128814697, "learning_rate": 3.521283277769725e-07, "loss": 0.3542, "step": 13409 }, { "epoch": 0.6479199884041166, "grad_norm": 4.2079548835754395, "learning_rate": 3.5208001159588344e-07, "loss": 0.453, "step": 13410 }, { "epoch": 0.6479683045852056, "grad_norm": 2.498013734817505, "learning_rate": 3.520316954147944e-07, "loss": 0.3044, "step": 13411 }, { "epoch": 0.6480166207662946, "grad_norm": 2.0449957847595215, "learning_rate": 3.5198337923370537e-07, "loss": 0.2198, "step": 13412 }, { "epoch": 0.6480649369473837, "grad_norm": 2.4199724197387695, "learning_rate": 3.519350630526163e-07, "loss": 0.275, "step": 13413 }, { "epoch": 0.6481132531284727, "grad_norm": 2.5811469554901123, "learning_rate": 3.5188674687152725e-07, "loss": 0.1942, "step": 13414 }, { "epoch": 0.6481615693095618, "grad_norm": 2.6016695499420166, "learning_rate": 3.5183843069043824e-07, "loss": 0.241, "step": 13415 }, { "epoch": 0.6482098854906508, "grad_norm": 2.5459232330322266, "learning_rate": 3.517901145093491e-07, "loss": 0.32, "step": 13416 }, { "epoch": 0.6482582016717399, "grad_norm": 3.612858295440674, "learning_rate": 3.517417983282601e-07, "loss": 0.299, "step": 13417 }, { "epoch": 0.6483065178528289, "grad_norm": 2.491251230239868, "learning_rate": 3.516934821471711e-07, "loss": 0.2388, "step": 13418 }, { "epoch": 0.6483548340339179, "grad_norm": 4.106495380401611, "learning_rate": 3.5164516596608204e-07, "loss": 0.2177, "step": 13419 }, { "epoch": 0.648403150215007, "grad_norm": 2.917928457260132, "learning_rate": 3.51596849784993e-07, "loss": 0.3098, "step": 13420 }, { "epoch": 0.6484514663960961, "grad_norm": 3.2590441703796387, "learning_rate": 3.515485336039039e-07, "loss": 0.4242, "step": 13421 }, { "epoch": 0.6484997825771851, "grad_norm": 2.2445859909057617, "learning_rate": 3.515002174228149e-07, "loss": 0.1964, "step": 13422 }, { "epoch": 0.6485480987582741, "grad_norm": 8.356114387512207, "learning_rate": 3.5145190124172584e-07, "loss": 0.4205, "step": 13423 }, { "epoch": 0.6485964149393632, "grad_norm": 2.4220614433288574, "learning_rate": 3.514035850606368e-07, "loss": 0.2611, "step": 13424 }, { "epoch": 0.6486447311204523, "grad_norm": 2.6985526084899902, "learning_rate": 3.5135526887954777e-07, "loss": 0.3205, "step": 13425 }, { "epoch": 0.6486930473015413, "grad_norm": 2.8624420166015625, "learning_rate": 3.513069526984587e-07, "loss": 0.3446, "step": 13426 }, { "epoch": 0.6487413634826303, "grad_norm": 3.9049112796783447, "learning_rate": 3.5125863651736964e-07, "loss": 0.2919, "step": 13427 }, { "epoch": 0.6487896796637194, "grad_norm": 1.667473316192627, "learning_rate": 3.5121032033628063e-07, "loss": 0.1764, "step": 13428 }, { "epoch": 0.6488379958448084, "grad_norm": 2.649864673614502, "learning_rate": 3.511620041551915e-07, "loss": 0.3067, "step": 13429 }, { "epoch": 0.6488863120258975, "grad_norm": 2.278221845626831, "learning_rate": 3.511136879741025e-07, "loss": 0.2851, "step": 13430 }, { "epoch": 0.6489346282069866, "grad_norm": 7.801959037780762, "learning_rate": 3.510653717930135e-07, "loss": 0.2898, "step": 13431 }, { "epoch": 0.6489829443880756, "grad_norm": 3.1612908840179443, "learning_rate": 3.510170556119244e-07, "loss": 0.3956, "step": 13432 }, { "epoch": 0.6490312605691646, "grad_norm": 2.2344071865081787, "learning_rate": 3.5096873943083537e-07, "loss": 0.1803, "step": 13433 }, { "epoch": 0.6490795767502536, "grad_norm": 2.446720838546753, "learning_rate": 3.509204232497463e-07, "loss": 0.3247, "step": 13434 }, { "epoch": 0.6491278929313427, "grad_norm": 3.998731851577759, "learning_rate": 3.508721070686573e-07, "loss": 0.253, "step": 13435 }, { "epoch": 0.6491762091124318, "grad_norm": 3.870600938796997, "learning_rate": 3.5082379088756824e-07, "loss": 0.3159, "step": 13436 }, { "epoch": 0.6492245252935208, "grad_norm": 7.220836639404297, "learning_rate": 3.507754747064792e-07, "loss": 0.2841, "step": 13437 }, { "epoch": 0.6492728414746098, "grad_norm": 2.410087823867798, "learning_rate": 3.5072715852539016e-07, "loss": 0.3003, "step": 13438 }, { "epoch": 0.6493211576556989, "grad_norm": 2.643465518951416, "learning_rate": 3.506788423443011e-07, "loss": 0.3496, "step": 13439 }, { "epoch": 0.6493694738367879, "grad_norm": 1.961619257926941, "learning_rate": 3.5063052616321204e-07, "loss": 0.2084, "step": 13440 }, { "epoch": 0.649417790017877, "grad_norm": 5.7667741775512695, "learning_rate": 3.5058220998212303e-07, "loss": 0.275, "step": 13441 }, { "epoch": 0.649466106198966, "grad_norm": 1.8862330913543701, "learning_rate": 3.505338938010339e-07, "loss": 0.2562, "step": 13442 }, { "epoch": 0.6495144223800551, "grad_norm": 2.358098268508911, "learning_rate": 3.504855776199449e-07, "loss": 0.2596, "step": 13443 }, { "epoch": 0.6495627385611441, "grad_norm": 2.002716064453125, "learning_rate": 3.504372614388559e-07, "loss": 0.2472, "step": 13444 }, { "epoch": 0.6496110547422331, "grad_norm": 2.407573699951172, "learning_rate": 3.503889452577668e-07, "loss": 0.2725, "step": 13445 }, { "epoch": 0.6496593709233223, "grad_norm": 3.2374467849731445, "learning_rate": 3.5034062907667777e-07, "loss": 0.192, "step": 13446 }, { "epoch": 0.6497076871044113, "grad_norm": 2.216550588607788, "learning_rate": 3.502923128955887e-07, "loss": 0.2065, "step": 13447 }, { "epoch": 0.6497560032855003, "grad_norm": 4.059552192687988, "learning_rate": 3.5024399671449964e-07, "loss": 0.2557, "step": 13448 }, { "epoch": 0.6498043194665893, "grad_norm": 5.183124542236328, "learning_rate": 3.5019568053341063e-07, "loss": 0.3159, "step": 13449 }, { "epoch": 0.6498526356476784, "grad_norm": 3.0458457469940186, "learning_rate": 3.5014736435232157e-07, "loss": 0.2755, "step": 13450 }, { "epoch": 0.6499009518287675, "grad_norm": 3.3944733142852783, "learning_rate": 3.5009904817123256e-07, "loss": 0.3005, "step": 13451 }, { "epoch": 0.6499492680098565, "grad_norm": 2.161910057067871, "learning_rate": 3.500507319901435e-07, "loss": 0.25, "step": 13452 }, { "epoch": 0.6499975841909456, "grad_norm": 6.235024452209473, "learning_rate": 3.5000241580905443e-07, "loss": 0.3861, "step": 13453 }, { "epoch": 0.6500459003720346, "grad_norm": 2.598090171813965, "learning_rate": 3.499540996279654e-07, "loss": 0.2472, "step": 13454 }, { "epoch": 0.6500942165531236, "grad_norm": 2.7821600437164307, "learning_rate": 3.499057834468763e-07, "loss": 0.3591, "step": 13455 }, { "epoch": 0.6501425327342127, "grad_norm": 2.8779826164245605, "learning_rate": 3.498574672657873e-07, "loss": 0.4557, "step": 13456 }, { "epoch": 0.6501908489153018, "grad_norm": 2.963876962661743, "learning_rate": 3.498091510846983e-07, "loss": 0.2298, "step": 13457 }, { "epoch": 0.6502391650963908, "grad_norm": 2.0699689388275146, "learning_rate": 3.497608349036092e-07, "loss": 0.2045, "step": 13458 }, { "epoch": 0.6502874812774798, "grad_norm": 2.140627384185791, "learning_rate": 3.4971251872252016e-07, "loss": 0.2255, "step": 13459 }, { "epoch": 0.6503357974585688, "grad_norm": 3.219440221786499, "learning_rate": 3.496642025414311e-07, "loss": 0.2641, "step": 13460 }, { "epoch": 0.6503841136396579, "grad_norm": 2.1741955280303955, "learning_rate": 3.4961588636034204e-07, "loss": 0.1817, "step": 13461 }, { "epoch": 0.650432429820747, "grad_norm": 2.433286428451538, "learning_rate": 3.4956757017925303e-07, "loss": 0.2995, "step": 13462 }, { "epoch": 0.650480746001836, "grad_norm": 2.773128032684326, "learning_rate": 3.4951925399816397e-07, "loss": 0.3032, "step": 13463 }, { "epoch": 0.650529062182925, "grad_norm": 2.3351097106933594, "learning_rate": 3.494709378170749e-07, "loss": 0.3541, "step": 13464 }, { "epoch": 0.6505773783640141, "grad_norm": 5.906798839569092, "learning_rate": 3.494226216359859e-07, "loss": 0.2496, "step": 13465 }, { "epoch": 0.6506256945451031, "grad_norm": 4.003974914550781, "learning_rate": 3.4937430545489683e-07, "loss": 0.3157, "step": 13466 }, { "epoch": 0.6506740107261922, "grad_norm": 3.650677442550659, "learning_rate": 3.493259892738078e-07, "loss": 0.203, "step": 13467 }, { "epoch": 0.6507223269072813, "grad_norm": 115.4817123413086, "learning_rate": 3.492776730927187e-07, "loss": 0.4001, "step": 13468 }, { "epoch": 0.6507706430883703, "grad_norm": 2.77168869972229, "learning_rate": 3.492293569116297e-07, "loss": 0.3247, "step": 13469 }, { "epoch": 0.6508189592694593, "grad_norm": 3.882484197616577, "learning_rate": 3.491810407305407e-07, "loss": 0.307, "step": 13470 }, { "epoch": 0.6508672754505483, "grad_norm": 2.361358642578125, "learning_rate": 3.4913272454945157e-07, "loss": 0.3127, "step": 13471 }, { "epoch": 0.6509155916316375, "grad_norm": 2.5476441383361816, "learning_rate": 3.4908440836836256e-07, "loss": 0.2761, "step": 13472 }, { "epoch": 0.6509639078127265, "grad_norm": 2.6126739978790283, "learning_rate": 3.490360921872735e-07, "loss": 0.3484, "step": 13473 }, { "epoch": 0.6510122239938155, "grad_norm": 2.8537604808807373, "learning_rate": 3.4898777600618444e-07, "loss": 0.2624, "step": 13474 }, { "epoch": 0.6510605401749046, "grad_norm": 2.5823326110839844, "learning_rate": 3.489394598250954e-07, "loss": 0.3041, "step": 13475 }, { "epoch": 0.6511088563559936, "grad_norm": 2.5409371852874756, "learning_rate": 3.4889114364400636e-07, "loss": 0.3311, "step": 13476 }, { "epoch": 0.6511571725370827, "grad_norm": 2.4237492084503174, "learning_rate": 3.488428274629173e-07, "loss": 0.3099, "step": 13477 }, { "epoch": 0.6512054887181717, "grad_norm": 1.930039644241333, "learning_rate": 3.4879451128182824e-07, "loss": 0.2364, "step": 13478 }, { "epoch": 0.6512538048992608, "grad_norm": 2.4317665100097656, "learning_rate": 3.4874619510073923e-07, "loss": 0.2746, "step": 13479 }, { "epoch": 0.6513021210803498, "grad_norm": 2.9932937622070312, "learning_rate": 3.4869787891965016e-07, "loss": 0.3027, "step": 13480 }, { "epoch": 0.6513504372614388, "grad_norm": 3.2224011421203613, "learning_rate": 3.486495627385611e-07, "loss": 0.3332, "step": 13481 }, { "epoch": 0.651398753442528, "grad_norm": 3.2588775157928467, "learning_rate": 3.486012465574721e-07, "loss": 0.4397, "step": 13482 }, { "epoch": 0.651447069623617, "grad_norm": 1.9212440252304077, "learning_rate": 3.485529303763831e-07, "loss": 0.2678, "step": 13483 }, { "epoch": 0.651495385804706, "grad_norm": 2.0833306312561035, "learning_rate": 3.4850461419529397e-07, "loss": 0.2616, "step": 13484 }, { "epoch": 0.651543701985795, "grad_norm": 2.667564630508423, "learning_rate": 3.4845629801420496e-07, "loss": 0.3242, "step": 13485 }, { "epoch": 0.6515920181668841, "grad_norm": 3.0884149074554443, "learning_rate": 3.484079818331159e-07, "loss": 0.2632, "step": 13486 }, { "epoch": 0.6516403343479731, "grad_norm": 2.3739521503448486, "learning_rate": 3.4835966565202683e-07, "loss": 0.2548, "step": 13487 }, { "epoch": 0.6516886505290622, "grad_norm": 2.32060170173645, "learning_rate": 3.483113494709378e-07, "loss": 0.3194, "step": 13488 }, { "epoch": 0.6517369667101512, "grad_norm": 2.6449506282806396, "learning_rate": 3.4826303328984876e-07, "loss": 0.2169, "step": 13489 }, { "epoch": 0.6517852828912403, "grad_norm": 2.1937546730041504, "learning_rate": 3.482147171087597e-07, "loss": 0.2317, "step": 13490 }, { "epoch": 0.6518335990723293, "grad_norm": 3.6408815383911133, "learning_rate": 3.4816640092767063e-07, "loss": 0.2916, "step": 13491 }, { "epoch": 0.6518819152534183, "grad_norm": 7.136819362640381, "learning_rate": 3.481180847465816e-07, "loss": 0.2601, "step": 13492 }, { "epoch": 0.6519302314345075, "grad_norm": 6.409147262573242, "learning_rate": 3.4806976856549256e-07, "loss": 0.3185, "step": 13493 }, { "epoch": 0.6519785476155965, "grad_norm": 4.165590763092041, "learning_rate": 3.480214523844035e-07, "loss": 0.406, "step": 13494 }, { "epoch": 0.6520268637966855, "grad_norm": 2.849731922149658, "learning_rate": 3.479731362033145e-07, "loss": 0.3078, "step": 13495 }, { "epoch": 0.6520751799777745, "grad_norm": 2.622832775115967, "learning_rate": 3.479248200222254e-07, "loss": 0.2593, "step": 13496 }, { "epoch": 0.6521234961588636, "grad_norm": 1.843826174736023, "learning_rate": 3.4787650384113636e-07, "loss": 0.2485, "step": 13497 }, { "epoch": 0.6521718123399527, "grad_norm": 3.492393732070923, "learning_rate": 3.4782818766004735e-07, "loss": 0.316, "step": 13498 }, { "epoch": 0.6522201285210417, "grad_norm": 3.618377447128296, "learning_rate": 3.477798714789583e-07, "loss": 0.3285, "step": 13499 }, { "epoch": 0.6522684447021307, "grad_norm": 4.16300630569458, "learning_rate": 3.4773155529786923e-07, "loss": 0.3037, "step": 13500 }, { "epoch": 0.6523167608832198, "grad_norm": 3.569371223449707, "learning_rate": 3.476832391167802e-07, "loss": 0.3729, "step": 13501 }, { "epoch": 0.6523650770643088, "grad_norm": 7.573276519775391, "learning_rate": 3.4763492293569116e-07, "loss": 0.2536, "step": 13502 }, { "epoch": 0.6524133932453979, "grad_norm": 11.629569053649902, "learning_rate": 3.475866067546021e-07, "loss": 0.3736, "step": 13503 }, { "epoch": 0.652461709426487, "grad_norm": 1.9581222534179688, "learning_rate": 3.4753829057351303e-07, "loss": 0.2051, "step": 13504 }, { "epoch": 0.652510025607576, "grad_norm": 2.5997040271759033, "learning_rate": 3.47489974392424e-07, "loss": 0.2017, "step": 13505 }, { "epoch": 0.652558341788665, "grad_norm": 2.604130506515503, "learning_rate": 3.4744165821133496e-07, "loss": 0.2183, "step": 13506 }, { "epoch": 0.652606657969754, "grad_norm": 1.9284696578979492, "learning_rate": 3.473933420302459e-07, "loss": 0.1779, "step": 13507 }, { "epoch": 0.6526549741508432, "grad_norm": 3.6727170944213867, "learning_rate": 3.473450258491569e-07, "loss": 0.398, "step": 13508 }, { "epoch": 0.6527032903319322, "grad_norm": 9.10436725616455, "learning_rate": 3.472967096680678e-07, "loss": 0.2332, "step": 13509 }, { "epoch": 0.6527516065130212, "grad_norm": 2.823622941970825, "learning_rate": 3.4724839348697876e-07, "loss": 0.2757, "step": 13510 }, { "epoch": 0.6527999226941102, "grad_norm": 2.518648147583008, "learning_rate": 3.4720007730588975e-07, "loss": 0.3197, "step": 13511 }, { "epoch": 0.6528482388751993, "grad_norm": 1.7405271530151367, "learning_rate": 3.4715176112480063e-07, "loss": 0.1938, "step": 13512 }, { "epoch": 0.6528965550562883, "grad_norm": 2.2668843269348145, "learning_rate": 3.471034449437116e-07, "loss": 0.276, "step": 13513 }, { "epoch": 0.6529448712373774, "grad_norm": 14.234477043151855, "learning_rate": 3.470551287626226e-07, "loss": 0.3892, "step": 13514 }, { "epoch": 0.6529931874184665, "grad_norm": 2.958244562149048, "learning_rate": 3.4700681258153355e-07, "loss": 0.3401, "step": 13515 }, { "epoch": 0.6530415035995555, "grad_norm": 2.1700832843780518, "learning_rate": 3.469584964004445e-07, "loss": 0.2653, "step": 13516 }, { "epoch": 0.6530898197806445, "grad_norm": 3.0607919692993164, "learning_rate": 3.469101802193554e-07, "loss": 0.3199, "step": 13517 }, { "epoch": 0.6531381359617335, "grad_norm": 2.7553815841674805, "learning_rate": 3.468618640382664e-07, "loss": 0.3437, "step": 13518 }, { "epoch": 0.6531864521428227, "grad_norm": 1.6793674230575562, "learning_rate": 3.4681354785717735e-07, "loss": 0.1757, "step": 13519 }, { "epoch": 0.6532347683239117, "grad_norm": 3.0216338634490967, "learning_rate": 3.467652316760883e-07, "loss": 0.3145, "step": 13520 }, { "epoch": 0.6532830845050007, "grad_norm": 5.011268138885498, "learning_rate": 3.467169154949993e-07, "loss": 0.3625, "step": 13521 }, { "epoch": 0.6533314006860897, "grad_norm": 2.534412145614624, "learning_rate": 3.466685993139102e-07, "loss": 0.3171, "step": 13522 }, { "epoch": 0.6533797168671788, "grad_norm": 3.597153425216675, "learning_rate": 3.4662028313282116e-07, "loss": 0.3246, "step": 13523 }, { "epoch": 0.6534280330482679, "grad_norm": 2.9654600620269775, "learning_rate": 3.4657196695173215e-07, "loss": 0.306, "step": 13524 }, { "epoch": 0.6534763492293569, "grad_norm": 3.0093390941619873, "learning_rate": 3.4652365077064303e-07, "loss": 0.3888, "step": 13525 }, { "epoch": 0.653524665410446, "grad_norm": 12.669806480407715, "learning_rate": 3.46475334589554e-07, "loss": 0.2992, "step": 13526 }, { "epoch": 0.653572981591535, "grad_norm": 2.929946184158325, "learning_rate": 3.46427018408465e-07, "loss": 0.3347, "step": 13527 }, { "epoch": 0.653621297772624, "grad_norm": 2.8843190670013428, "learning_rate": 3.463787022273759e-07, "loss": 0.2431, "step": 13528 }, { "epoch": 0.6536696139537131, "grad_norm": 2.5089831352233887, "learning_rate": 3.463303860462869e-07, "loss": 0.2482, "step": 13529 }, { "epoch": 0.6537179301348022, "grad_norm": 2.5922486782073975, "learning_rate": 3.462820698651978e-07, "loss": 0.2984, "step": 13530 }, { "epoch": 0.6537662463158912, "grad_norm": 2.7456281185150146, "learning_rate": 3.462337536841088e-07, "loss": 0.3047, "step": 13531 }, { "epoch": 0.6538145624969802, "grad_norm": 2.7759692668914795, "learning_rate": 3.4618543750301975e-07, "loss": 0.3661, "step": 13532 }, { "epoch": 0.6538628786780692, "grad_norm": 5.036910533905029, "learning_rate": 3.461371213219307e-07, "loss": 0.3727, "step": 13533 }, { "epoch": 0.6539111948591584, "grad_norm": 7.649570941925049, "learning_rate": 3.460888051408417e-07, "loss": 0.2185, "step": 13534 }, { "epoch": 0.6539595110402474, "grad_norm": 2.0337305068969727, "learning_rate": 3.460404889597526e-07, "loss": 0.2235, "step": 13535 }, { "epoch": 0.6540078272213364, "grad_norm": 10.189959526062012, "learning_rate": 3.4599217277866355e-07, "loss": 0.3583, "step": 13536 }, { "epoch": 0.6540561434024255, "grad_norm": 2.714667320251465, "learning_rate": 3.4594385659757454e-07, "loss": 0.3547, "step": 13537 }, { "epoch": 0.6541044595835145, "grad_norm": 28.077457427978516, "learning_rate": 3.4589554041648543e-07, "loss": 0.3305, "step": 13538 }, { "epoch": 0.6541527757646035, "grad_norm": 3.0237631797790527, "learning_rate": 3.458472242353964e-07, "loss": 0.4199, "step": 13539 }, { "epoch": 0.6542010919456926, "grad_norm": 4.023507118225098, "learning_rate": 3.457989080543074e-07, "loss": 0.2877, "step": 13540 }, { "epoch": 0.6542494081267817, "grad_norm": 6.236562252044678, "learning_rate": 3.457505918732183e-07, "loss": 0.3267, "step": 13541 }, { "epoch": 0.6542977243078707, "grad_norm": 3.587989568710327, "learning_rate": 3.457022756921293e-07, "loss": 0.2644, "step": 13542 }, { "epoch": 0.6543460404889597, "grad_norm": 3.9996731281280518, "learning_rate": 3.456539595110402e-07, "loss": 0.223, "step": 13543 }, { "epoch": 0.6543943566700487, "grad_norm": 2.8622591495513916, "learning_rate": 3.4560564332995116e-07, "loss": 0.1805, "step": 13544 }, { "epoch": 0.6544426728511379, "grad_norm": 2.479609489440918, "learning_rate": 3.4555732714886215e-07, "loss": 0.2988, "step": 13545 }, { "epoch": 0.6544909890322269, "grad_norm": 2.075071334838867, "learning_rate": 3.455090109677731e-07, "loss": 0.1537, "step": 13546 }, { "epoch": 0.6545393052133159, "grad_norm": 4.142257213592529, "learning_rate": 3.454606947866841e-07, "loss": 0.4213, "step": 13547 }, { "epoch": 0.654587621394405, "grad_norm": 3.467949390411377, "learning_rate": 3.45412378605595e-07, "loss": 0.381, "step": 13548 }, { "epoch": 0.654635937575494, "grad_norm": 3.9226033687591553, "learning_rate": 3.4536406242450595e-07, "loss": 0.4073, "step": 13549 }, { "epoch": 0.6546842537565831, "grad_norm": 2.4395923614501953, "learning_rate": 3.4531574624341694e-07, "loss": 0.3123, "step": 13550 }, { "epoch": 0.6547325699376721, "grad_norm": 2.9126296043395996, "learning_rate": 3.452674300623278e-07, "loss": 0.3004, "step": 13551 }, { "epoch": 0.6547808861187612, "grad_norm": 4.515726566314697, "learning_rate": 3.452191138812388e-07, "loss": 0.2666, "step": 13552 }, { "epoch": 0.6548292022998502, "grad_norm": 2.7605457305908203, "learning_rate": 3.451707977001498e-07, "loss": 0.3605, "step": 13553 }, { "epoch": 0.6548775184809392, "grad_norm": 2.2881200313568115, "learning_rate": 3.451224815190607e-07, "loss": 0.1949, "step": 13554 }, { "epoch": 0.6549258346620284, "grad_norm": 2.1359570026397705, "learning_rate": 3.450741653379717e-07, "loss": 0.2573, "step": 13555 }, { "epoch": 0.6549741508431174, "grad_norm": 5.8863606452941895, "learning_rate": 3.450258491568826e-07, "loss": 0.3265, "step": 13556 }, { "epoch": 0.6550224670242064, "grad_norm": 2.5631368160247803, "learning_rate": 3.4497753297579355e-07, "loss": 0.3028, "step": 13557 }, { "epoch": 0.6550707832052954, "grad_norm": 2.421870231628418, "learning_rate": 3.4492921679470454e-07, "loss": 0.2846, "step": 13558 }, { "epoch": 0.6551190993863845, "grad_norm": 2.4798054695129395, "learning_rate": 3.448809006136155e-07, "loss": 0.3151, "step": 13559 }, { "epoch": 0.6551674155674736, "grad_norm": 2.289309024810791, "learning_rate": 3.448325844325264e-07, "loss": 0.2539, "step": 13560 }, { "epoch": 0.6552157317485626, "grad_norm": 2.9260354042053223, "learning_rate": 3.447842682514374e-07, "loss": 0.3762, "step": 13561 }, { "epoch": 0.6552640479296516, "grad_norm": 2.9840362071990967, "learning_rate": 3.4473595207034834e-07, "loss": 0.3671, "step": 13562 }, { "epoch": 0.6553123641107407, "grad_norm": 7.824717044830322, "learning_rate": 3.4468763588925933e-07, "loss": 0.3249, "step": 13563 }, { "epoch": 0.6553606802918297, "grad_norm": 3.0181832313537598, "learning_rate": 3.446393197081702e-07, "loss": 0.3403, "step": 13564 }, { "epoch": 0.6554089964729187, "grad_norm": 2.755370855331421, "learning_rate": 3.445910035270812e-07, "loss": 0.2895, "step": 13565 }, { "epoch": 0.6554573126540079, "grad_norm": 3.8729312419891357, "learning_rate": 3.445426873459922e-07, "loss": 0.2312, "step": 13566 }, { "epoch": 0.6555056288350969, "grad_norm": 2.618762254714966, "learning_rate": 3.444943711649031e-07, "loss": 0.314, "step": 13567 }, { "epoch": 0.6555539450161859, "grad_norm": 3.2893059253692627, "learning_rate": 3.444460549838141e-07, "loss": 0.3744, "step": 13568 }, { "epoch": 0.6556022611972749, "grad_norm": 5.261050224304199, "learning_rate": 3.44397738802725e-07, "loss": 0.29, "step": 13569 }, { "epoch": 0.655650577378364, "grad_norm": 7.026209354400635, "learning_rate": 3.4434942262163595e-07, "loss": 0.2965, "step": 13570 }, { "epoch": 0.6556988935594531, "grad_norm": 1.8699841499328613, "learning_rate": 3.4430110644054694e-07, "loss": 0.1948, "step": 13571 }, { "epoch": 0.6557472097405421, "grad_norm": 2.6917216777801514, "learning_rate": 3.442527902594579e-07, "loss": 0.3547, "step": 13572 }, { "epoch": 0.6557955259216312, "grad_norm": 2.8768038749694824, "learning_rate": 3.442044740783688e-07, "loss": 0.1538, "step": 13573 }, { "epoch": 0.6558438421027202, "grad_norm": 2.9065840244293213, "learning_rate": 3.441561578972798e-07, "loss": 0.3653, "step": 13574 }, { "epoch": 0.6558921582838092, "grad_norm": 3.145655632019043, "learning_rate": 3.4410784171619074e-07, "loss": 0.3665, "step": 13575 }, { "epoch": 0.6559404744648983, "grad_norm": 4.362661838531494, "learning_rate": 3.440595255351017e-07, "loss": 0.2934, "step": 13576 }, { "epoch": 0.6559887906459874, "grad_norm": 2.6708428859710693, "learning_rate": 3.440112093540126e-07, "loss": 0.3029, "step": 13577 }, { "epoch": 0.6560371068270764, "grad_norm": 8.578432083129883, "learning_rate": 3.439628931729236e-07, "loss": 0.2286, "step": 13578 }, { "epoch": 0.6560854230081654, "grad_norm": 4.182076930999756, "learning_rate": 3.439145769918346e-07, "loss": 0.2966, "step": 13579 }, { "epoch": 0.6561337391892544, "grad_norm": 3.588229179382324, "learning_rate": 3.438662608107455e-07, "loss": 0.321, "step": 13580 }, { "epoch": 0.6561820553703436, "grad_norm": 2.3724489212036133, "learning_rate": 3.4381794462965647e-07, "loss": 0.2654, "step": 13581 }, { "epoch": 0.6562303715514326, "grad_norm": 6.778928756713867, "learning_rate": 3.437696284485674e-07, "loss": 0.338, "step": 13582 }, { "epoch": 0.6562786877325216, "grad_norm": 2.8511364459991455, "learning_rate": 3.4372131226747835e-07, "loss": 0.2718, "step": 13583 }, { "epoch": 0.6563270039136107, "grad_norm": 2.88936710357666, "learning_rate": 3.4367299608638934e-07, "loss": 0.2755, "step": 13584 }, { "epoch": 0.6563753200946997, "grad_norm": 2.399402618408203, "learning_rate": 3.4362467990530027e-07, "loss": 0.2613, "step": 13585 }, { "epoch": 0.6564236362757888, "grad_norm": 2.3868610858917236, "learning_rate": 3.435763637242112e-07, "loss": 0.2575, "step": 13586 }, { "epoch": 0.6564719524568778, "grad_norm": 3.044879913330078, "learning_rate": 3.435280475431222e-07, "loss": 0.3569, "step": 13587 }, { "epoch": 0.6565202686379669, "grad_norm": 2.3763651847839355, "learning_rate": 3.4347973136203314e-07, "loss": 0.2777, "step": 13588 }, { "epoch": 0.6565685848190559, "grad_norm": 2.6030797958374023, "learning_rate": 3.434314151809441e-07, "loss": 0.2716, "step": 13589 }, { "epoch": 0.6566169010001449, "grad_norm": 2.4807794094085693, "learning_rate": 3.43383098999855e-07, "loss": 0.2992, "step": 13590 }, { "epoch": 0.6566652171812339, "grad_norm": 4.528662204742432, "learning_rate": 3.43334782818766e-07, "loss": 0.3122, "step": 13591 }, { "epoch": 0.6567135333623231, "grad_norm": 2.7491519451141357, "learning_rate": 3.4328646663767694e-07, "loss": 0.3614, "step": 13592 }, { "epoch": 0.6567618495434121, "grad_norm": 7.76798677444458, "learning_rate": 3.432381504565879e-07, "loss": 0.258, "step": 13593 }, { "epoch": 0.6568101657245011, "grad_norm": 2.8882596492767334, "learning_rate": 3.4318983427549887e-07, "loss": 0.2751, "step": 13594 }, { "epoch": 0.6568584819055902, "grad_norm": 2.1133151054382324, "learning_rate": 3.4314151809440975e-07, "loss": 0.2153, "step": 13595 }, { "epoch": 0.6569067980866792, "grad_norm": 2.079226016998291, "learning_rate": 3.4309320191332074e-07, "loss": 0.2481, "step": 13596 }, { "epoch": 0.6569551142677683, "grad_norm": 2.9976418018341064, "learning_rate": 3.4304488573223173e-07, "loss": 0.3999, "step": 13597 }, { "epoch": 0.6570034304488573, "grad_norm": 2.4874958992004395, "learning_rate": 3.4299656955114267e-07, "loss": 0.2417, "step": 13598 }, { "epoch": 0.6570517466299464, "grad_norm": 3.159785270690918, "learning_rate": 3.429482533700536e-07, "loss": 0.388, "step": 13599 }, { "epoch": 0.6571000628110354, "grad_norm": 3.201021909713745, "learning_rate": 3.428999371889646e-07, "loss": 0.3452, "step": 13600 }, { "epoch": 0.6571483789921244, "grad_norm": 5.313179969787598, "learning_rate": 3.4285162100787553e-07, "loss": 0.3523, "step": 13601 }, { "epoch": 0.6571966951732136, "grad_norm": 1.96038818359375, "learning_rate": 3.4280330482678647e-07, "loss": 0.2509, "step": 13602 }, { "epoch": 0.6572450113543026, "grad_norm": 2.3029236793518066, "learning_rate": 3.427549886456974e-07, "loss": 0.2048, "step": 13603 }, { "epoch": 0.6572933275353916, "grad_norm": 2.3442981243133545, "learning_rate": 3.427066724646084e-07, "loss": 0.2195, "step": 13604 }, { "epoch": 0.6573416437164806, "grad_norm": 27.47170639038086, "learning_rate": 3.4265835628351934e-07, "loss": 0.2766, "step": 13605 }, { "epoch": 0.6573899598975697, "grad_norm": 14.659449577331543, "learning_rate": 3.4261004010243027e-07, "loss": 0.351, "step": 13606 }, { "epoch": 0.6574382760786588, "grad_norm": 2.2932076454162598, "learning_rate": 3.4256172392134126e-07, "loss": 0.2276, "step": 13607 }, { "epoch": 0.6574865922597478, "grad_norm": 2.417617082595825, "learning_rate": 3.4251340774025215e-07, "loss": 0.2305, "step": 13608 }, { "epoch": 0.6575349084408368, "grad_norm": 4.922405242919922, "learning_rate": 3.4246509155916314e-07, "loss": 0.3125, "step": 13609 }, { "epoch": 0.6575832246219259, "grad_norm": 3.3425700664520264, "learning_rate": 3.4241677537807413e-07, "loss": 0.3295, "step": 13610 }, { "epoch": 0.6576315408030149, "grad_norm": 2.637942314147949, "learning_rate": 3.42368459196985e-07, "loss": 0.2847, "step": 13611 }, { "epoch": 0.657679856984104, "grad_norm": 2.7184524536132812, "learning_rate": 3.42320143015896e-07, "loss": 0.2968, "step": 13612 }, { "epoch": 0.657728173165193, "grad_norm": 2.9010205268859863, "learning_rate": 3.42271826834807e-07, "loss": 0.2626, "step": 13613 }, { "epoch": 0.6577764893462821, "grad_norm": 4.322474479675293, "learning_rate": 3.4222351065371793e-07, "loss": 0.2864, "step": 13614 }, { "epoch": 0.6578248055273711, "grad_norm": 2.725893974304199, "learning_rate": 3.4217519447262887e-07, "loss": 0.3551, "step": 13615 }, { "epoch": 0.6578731217084601, "grad_norm": 2.5724992752075195, "learning_rate": 3.421268782915398e-07, "loss": 0.2429, "step": 13616 }, { "epoch": 0.6579214378895493, "grad_norm": 2.7734224796295166, "learning_rate": 3.420785621104508e-07, "loss": 0.2882, "step": 13617 }, { "epoch": 0.6579697540706383, "grad_norm": 3.1143884658813477, "learning_rate": 3.4203024592936173e-07, "loss": 0.3167, "step": 13618 }, { "epoch": 0.6580180702517273, "grad_norm": 3.812021017074585, "learning_rate": 3.4198192974827267e-07, "loss": 0.4025, "step": 13619 }, { "epoch": 0.6580663864328163, "grad_norm": 1.7878170013427734, "learning_rate": 3.4193361356718366e-07, "loss": 0.1505, "step": 13620 }, { "epoch": 0.6581147026139054, "grad_norm": 2.27518892288208, "learning_rate": 3.4188529738609454e-07, "loss": 0.2035, "step": 13621 }, { "epoch": 0.6581630187949944, "grad_norm": 2.2127468585968018, "learning_rate": 3.4183698120500553e-07, "loss": 0.2393, "step": 13622 }, { "epoch": 0.6582113349760835, "grad_norm": 1.7035455703735352, "learning_rate": 3.417886650239165e-07, "loss": 0.1811, "step": 13623 }, { "epoch": 0.6582596511571726, "grad_norm": 3.019148349761963, "learning_rate": 3.417403488428274e-07, "loss": 0.4348, "step": 13624 }, { "epoch": 0.6583079673382616, "grad_norm": 2.8158578872680664, "learning_rate": 3.416920326617384e-07, "loss": 0.2564, "step": 13625 }, { "epoch": 0.6583562835193506, "grad_norm": 1.6547638177871704, "learning_rate": 3.416437164806494e-07, "loss": 0.2043, "step": 13626 }, { "epoch": 0.6584045997004396, "grad_norm": 2.5525975227355957, "learning_rate": 3.415954002995603e-07, "loss": 0.2803, "step": 13627 }, { "epoch": 0.6584529158815288, "grad_norm": 3.169743061065674, "learning_rate": 3.4154708411847126e-07, "loss": 0.3789, "step": 13628 }, { "epoch": 0.6585012320626178, "grad_norm": 2.8721139430999756, "learning_rate": 3.414987679373822e-07, "loss": 0.3536, "step": 13629 }, { "epoch": 0.6585495482437068, "grad_norm": 2.9732792377471924, "learning_rate": 3.414504517562932e-07, "loss": 0.2632, "step": 13630 }, { "epoch": 0.6585978644247958, "grad_norm": 2.9702298641204834, "learning_rate": 3.4140213557520413e-07, "loss": 0.3856, "step": 13631 }, { "epoch": 0.6586461806058849, "grad_norm": 3.2695438861846924, "learning_rate": 3.4135381939411507e-07, "loss": 0.2566, "step": 13632 }, { "epoch": 0.658694496786974, "grad_norm": 3.2500975131988525, "learning_rate": 3.4130550321302606e-07, "loss": 0.2881, "step": 13633 }, { "epoch": 0.658742812968063, "grad_norm": 6.091742515563965, "learning_rate": 3.4125718703193694e-07, "loss": 0.4344, "step": 13634 }, { "epoch": 0.658791129149152, "grad_norm": 4.982547283172607, "learning_rate": 3.4120887085084793e-07, "loss": 0.3765, "step": 13635 }, { "epoch": 0.6588394453302411, "grad_norm": 2.446460008621216, "learning_rate": 3.411605546697589e-07, "loss": 0.3503, "step": 13636 }, { "epoch": 0.6588877615113301, "grad_norm": 2.1352105140686035, "learning_rate": 3.411122384886698e-07, "loss": 0.2497, "step": 13637 }, { "epoch": 0.6589360776924192, "grad_norm": 2.805049180984497, "learning_rate": 3.410639223075808e-07, "loss": 0.3627, "step": 13638 }, { "epoch": 0.6589843938735083, "grad_norm": 2.404231548309326, "learning_rate": 3.410156061264918e-07, "loss": 0.2704, "step": 13639 }, { "epoch": 0.6590327100545973, "grad_norm": 4.017260551452637, "learning_rate": 3.4096728994540267e-07, "loss": 0.4955, "step": 13640 }, { "epoch": 0.6590810262356863, "grad_norm": 2.4345905780792236, "learning_rate": 3.4091897376431366e-07, "loss": 0.3528, "step": 13641 }, { "epoch": 0.6591293424167753, "grad_norm": 2.7304372787475586, "learning_rate": 3.408706575832246e-07, "loss": 0.2928, "step": 13642 }, { "epoch": 0.6591776585978645, "grad_norm": 2.735232353210449, "learning_rate": 3.4082234140213553e-07, "loss": 0.2917, "step": 13643 }, { "epoch": 0.6592259747789535, "grad_norm": 2.866866111755371, "learning_rate": 3.407740252210465e-07, "loss": 0.3458, "step": 13644 }, { "epoch": 0.6592742909600425, "grad_norm": 3.8047029972076416, "learning_rate": 3.4072570903995746e-07, "loss": 0.285, "step": 13645 }, { "epoch": 0.6593226071411316, "grad_norm": 2.6195929050445557, "learning_rate": 3.4067739285886845e-07, "loss": 0.3585, "step": 13646 }, { "epoch": 0.6593709233222206, "grad_norm": 2.287855863571167, "learning_rate": 3.4062907667777934e-07, "loss": 0.265, "step": 13647 }, { "epoch": 0.6594192395033096, "grad_norm": 2.3964643478393555, "learning_rate": 3.4058076049669033e-07, "loss": 0.3121, "step": 13648 }, { "epoch": 0.6594675556843987, "grad_norm": 1.8350507020950317, "learning_rate": 3.405324443156013e-07, "loss": 0.1722, "step": 13649 }, { "epoch": 0.6595158718654878, "grad_norm": 1.8314013481140137, "learning_rate": 3.404841281345122e-07, "loss": 0.2347, "step": 13650 }, { "epoch": 0.6595641880465768, "grad_norm": 4.451408386230469, "learning_rate": 3.404358119534232e-07, "loss": 0.3422, "step": 13651 }, { "epoch": 0.6596125042276658, "grad_norm": 3.209472417831421, "learning_rate": 3.403874957723342e-07, "loss": 0.2255, "step": 13652 }, { "epoch": 0.6596608204087548, "grad_norm": 2.4462058544158936, "learning_rate": 3.4033917959124507e-07, "loss": 0.3299, "step": 13653 }, { "epoch": 0.659709136589844, "grad_norm": 3.1954715251922607, "learning_rate": 3.4029086341015606e-07, "loss": 0.3092, "step": 13654 }, { "epoch": 0.659757452770933, "grad_norm": 2.6477015018463135, "learning_rate": 3.40242547229067e-07, "loss": 0.3331, "step": 13655 }, { "epoch": 0.659805768952022, "grad_norm": 2.9951746463775635, "learning_rate": 3.4019423104797793e-07, "loss": 0.3273, "step": 13656 }, { "epoch": 0.6598540851331111, "grad_norm": 5.169264316558838, "learning_rate": 3.401459148668889e-07, "loss": 0.2984, "step": 13657 }, { "epoch": 0.6599024013142001, "grad_norm": 3.4135537147521973, "learning_rate": 3.4009759868579986e-07, "loss": 0.3594, "step": 13658 }, { "epoch": 0.6599507174952892, "grad_norm": 4.1675705909729, "learning_rate": 3.400492825047108e-07, "loss": 0.4029, "step": 13659 }, { "epoch": 0.6599990336763782, "grad_norm": 3.9595229625701904, "learning_rate": 3.4000096632362173e-07, "loss": 0.3934, "step": 13660 }, { "epoch": 0.6600473498574673, "grad_norm": 2.6248421669006348, "learning_rate": 3.399526501425327e-07, "loss": 0.3488, "step": 13661 }, { "epoch": 0.6600956660385563, "grad_norm": 2.840092182159424, "learning_rate": 3.399043339614437e-07, "loss": 0.3165, "step": 13662 }, { "epoch": 0.6601439822196453, "grad_norm": 3.071629285812378, "learning_rate": 3.398560177803546e-07, "loss": 0.3015, "step": 13663 }, { "epoch": 0.6601922984007345, "grad_norm": 2.031136989593506, "learning_rate": 3.398077015992656e-07, "loss": 0.1399, "step": 13664 }, { "epoch": 0.6602406145818235, "grad_norm": 2.9076664447784424, "learning_rate": 3.397593854181766e-07, "loss": 0.2736, "step": 13665 }, { "epoch": 0.6602889307629125, "grad_norm": 3.034956932067871, "learning_rate": 3.3971106923708746e-07, "loss": 0.3055, "step": 13666 }, { "epoch": 0.6603372469440015, "grad_norm": 2.319589138031006, "learning_rate": 3.3966275305599845e-07, "loss": 0.2182, "step": 13667 }, { "epoch": 0.6603855631250906, "grad_norm": 2.480886697769165, "learning_rate": 3.396144368749094e-07, "loss": 0.2348, "step": 13668 }, { "epoch": 0.6604338793061797, "grad_norm": 2.279742956161499, "learning_rate": 3.3956612069382033e-07, "loss": 0.2713, "step": 13669 }, { "epoch": 0.6604821954872687, "grad_norm": 2.8824026584625244, "learning_rate": 3.395178045127313e-07, "loss": 0.3111, "step": 13670 }, { "epoch": 0.6605305116683577, "grad_norm": 1.8363795280456543, "learning_rate": 3.3946948833164225e-07, "loss": 0.2339, "step": 13671 }, { "epoch": 0.6605788278494468, "grad_norm": 1.738309383392334, "learning_rate": 3.394211721505532e-07, "loss": 0.1999, "step": 13672 }, { "epoch": 0.6606271440305358, "grad_norm": 54.697792053222656, "learning_rate": 3.3937285596946413e-07, "loss": 0.3948, "step": 13673 }, { "epoch": 0.6606754602116248, "grad_norm": 1.9141496419906616, "learning_rate": 3.393245397883751e-07, "loss": 0.1945, "step": 13674 }, { "epoch": 0.660723776392714, "grad_norm": 43.573326110839844, "learning_rate": 3.3927622360728606e-07, "loss": 0.3304, "step": 13675 }, { "epoch": 0.660772092573803, "grad_norm": 2.89095401763916, "learning_rate": 3.39227907426197e-07, "loss": 0.3914, "step": 13676 }, { "epoch": 0.660820408754892, "grad_norm": 2.8173792362213135, "learning_rate": 3.39179591245108e-07, "loss": 0.2958, "step": 13677 }, { "epoch": 0.660868724935981, "grad_norm": 2.4621047973632812, "learning_rate": 3.39131275064019e-07, "loss": 0.2906, "step": 13678 }, { "epoch": 0.6609170411170701, "grad_norm": 3.724196672439575, "learning_rate": 3.3908295888292986e-07, "loss": 0.4389, "step": 13679 }, { "epoch": 0.6609653572981592, "grad_norm": 2.3709042072296143, "learning_rate": 3.3903464270184085e-07, "loss": 0.3035, "step": 13680 }, { "epoch": 0.6610136734792482, "grad_norm": 3.4752461910247803, "learning_rate": 3.389863265207518e-07, "loss": 0.328, "step": 13681 }, { "epoch": 0.6610619896603372, "grad_norm": 2.2936112880706787, "learning_rate": 3.389380103396627e-07, "loss": 0.2542, "step": 13682 }, { "epoch": 0.6611103058414263, "grad_norm": 3.687479257583618, "learning_rate": 3.388896941585737e-07, "loss": 0.3795, "step": 13683 }, { "epoch": 0.6611586220225153, "grad_norm": 3.3588035106658936, "learning_rate": 3.3884137797748465e-07, "loss": 0.2914, "step": 13684 }, { "epoch": 0.6612069382036044, "grad_norm": 2.6657590866088867, "learning_rate": 3.387930617963956e-07, "loss": 0.4847, "step": 13685 }, { "epoch": 0.6612552543846935, "grad_norm": 2.2628495693206787, "learning_rate": 3.387447456153065e-07, "loss": 0.2643, "step": 13686 }, { "epoch": 0.6613035705657825, "grad_norm": 3.7939858436584473, "learning_rate": 3.386964294342175e-07, "loss": 0.3554, "step": 13687 }, { "epoch": 0.6613518867468715, "grad_norm": 3.256082057952881, "learning_rate": 3.3864811325312845e-07, "loss": 0.3009, "step": 13688 }, { "epoch": 0.6614002029279605, "grad_norm": 2.1364080905914307, "learning_rate": 3.385997970720394e-07, "loss": 0.2092, "step": 13689 }, { "epoch": 0.6614485191090497, "grad_norm": 3.0002355575561523, "learning_rate": 3.385514808909504e-07, "loss": 0.3151, "step": 13690 }, { "epoch": 0.6614968352901387, "grad_norm": 10.362404823303223, "learning_rate": 3.385031647098613e-07, "loss": 0.2201, "step": 13691 }, { "epoch": 0.6615451514712277, "grad_norm": 3.1989192962646484, "learning_rate": 3.3845484852877226e-07, "loss": 0.2901, "step": 13692 }, { "epoch": 0.6615934676523167, "grad_norm": 3.6710469722747803, "learning_rate": 3.3840653234768325e-07, "loss": 0.409, "step": 13693 }, { "epoch": 0.6616417838334058, "grad_norm": 3.534431219100952, "learning_rate": 3.383582161665942e-07, "loss": 0.418, "step": 13694 }, { "epoch": 0.6616901000144949, "grad_norm": 4.182202339172363, "learning_rate": 3.383098999855051e-07, "loss": 0.4206, "step": 13695 }, { "epoch": 0.6617384161955839, "grad_norm": 1.9725959300994873, "learning_rate": 3.382615838044161e-07, "loss": 0.1979, "step": 13696 }, { "epoch": 0.661786732376673, "grad_norm": 2.453984498977661, "learning_rate": 3.3821326762332705e-07, "loss": 0.2337, "step": 13697 }, { "epoch": 0.661835048557762, "grad_norm": 1.918609857559204, "learning_rate": 3.38164951442238e-07, "loss": 0.2433, "step": 13698 }, { "epoch": 0.661883364738851, "grad_norm": 3.313586950302124, "learning_rate": 3.381166352611489e-07, "loss": 0.3351, "step": 13699 }, { "epoch": 0.66193168091994, "grad_norm": 2.055166721343994, "learning_rate": 3.380683190800599e-07, "loss": 0.2352, "step": 13700 }, { "epoch": 0.6619799971010292, "grad_norm": 3.9762768745422363, "learning_rate": 3.3802000289897085e-07, "loss": 0.2507, "step": 13701 }, { "epoch": 0.6620283132821182, "grad_norm": 2.5931379795074463, "learning_rate": 3.379716867178818e-07, "loss": 0.2874, "step": 13702 }, { "epoch": 0.6620766294632072, "grad_norm": 1.883827805519104, "learning_rate": 3.379233705367928e-07, "loss": 0.1828, "step": 13703 }, { "epoch": 0.6621249456442962, "grad_norm": 2.562840700149536, "learning_rate": 3.378750543557037e-07, "loss": 0.1964, "step": 13704 }, { "epoch": 0.6621732618253853, "grad_norm": 3.4941232204437256, "learning_rate": 3.3782673817461465e-07, "loss": 0.3015, "step": 13705 }, { "epoch": 0.6622215780064744, "grad_norm": 3.4168953895568848, "learning_rate": 3.3777842199352564e-07, "loss": 0.4288, "step": 13706 }, { "epoch": 0.6622698941875634, "grad_norm": 2.7853972911834717, "learning_rate": 3.377301058124365e-07, "loss": 0.3068, "step": 13707 }, { "epoch": 0.6623182103686525, "grad_norm": 3.0914595127105713, "learning_rate": 3.376817896313475e-07, "loss": 0.2908, "step": 13708 }, { "epoch": 0.6623665265497415, "grad_norm": 2.454772710800171, "learning_rate": 3.376334734502585e-07, "loss": 0.2511, "step": 13709 }, { "epoch": 0.6624148427308305, "grad_norm": 1.994579553604126, "learning_rate": 3.3758515726916944e-07, "loss": 0.1959, "step": 13710 }, { "epoch": 0.6624631589119196, "grad_norm": 2.9821269512176514, "learning_rate": 3.375368410880804e-07, "loss": 0.3735, "step": 13711 }, { "epoch": 0.6625114750930087, "grad_norm": 3.2768731117248535, "learning_rate": 3.374885249069913e-07, "loss": 0.3548, "step": 13712 }, { "epoch": 0.6625597912740977, "grad_norm": 3.4299850463867188, "learning_rate": 3.374402087259023e-07, "loss": 0.3535, "step": 13713 }, { "epoch": 0.6626081074551867, "grad_norm": 1.2625056505203247, "learning_rate": 3.3739189254481325e-07, "loss": 0.1523, "step": 13714 }, { "epoch": 0.6626564236362757, "grad_norm": 1.6324310302734375, "learning_rate": 3.373435763637242e-07, "loss": 0.1542, "step": 13715 }, { "epoch": 0.6627047398173649, "grad_norm": 2.1717002391815186, "learning_rate": 3.3729526018263517e-07, "loss": 0.3169, "step": 13716 }, { "epoch": 0.6627530559984539, "grad_norm": 2.6524810791015625, "learning_rate": 3.372469440015461e-07, "loss": 0.3311, "step": 13717 }, { "epoch": 0.6628013721795429, "grad_norm": 2.777005434036255, "learning_rate": 3.3719862782045705e-07, "loss": 0.2598, "step": 13718 }, { "epoch": 0.662849688360632, "grad_norm": 5.918224334716797, "learning_rate": 3.3715031163936804e-07, "loss": 0.2992, "step": 13719 }, { "epoch": 0.662898004541721, "grad_norm": 3.124439001083374, "learning_rate": 3.371019954582789e-07, "loss": 0.4063, "step": 13720 }, { "epoch": 0.6629463207228101, "grad_norm": 2.812856912612915, "learning_rate": 3.370536792771899e-07, "loss": 0.4066, "step": 13721 }, { "epoch": 0.6629946369038991, "grad_norm": 2.7131662368774414, "learning_rate": 3.370053630961009e-07, "loss": 0.2771, "step": 13722 }, { "epoch": 0.6630429530849882, "grad_norm": 5.848856449127197, "learning_rate": 3.369570469150118e-07, "loss": 0.2658, "step": 13723 }, { "epoch": 0.6630912692660772, "grad_norm": 2.8953349590301514, "learning_rate": 3.369087307339228e-07, "loss": 0.3777, "step": 13724 }, { "epoch": 0.6631395854471662, "grad_norm": 3.1921303272247314, "learning_rate": 3.368604145528337e-07, "loss": 0.4024, "step": 13725 }, { "epoch": 0.6631879016282552, "grad_norm": 4.387075424194336, "learning_rate": 3.368120983717447e-07, "loss": 0.2896, "step": 13726 }, { "epoch": 0.6632362178093444, "grad_norm": 2.451446533203125, "learning_rate": 3.3676378219065564e-07, "loss": 0.2784, "step": 13727 }, { "epoch": 0.6632845339904334, "grad_norm": 2.5574543476104736, "learning_rate": 3.367154660095666e-07, "loss": 0.3062, "step": 13728 }, { "epoch": 0.6633328501715224, "grad_norm": 5.327968120574951, "learning_rate": 3.3666714982847757e-07, "loss": 0.2938, "step": 13729 }, { "epoch": 0.6633811663526115, "grad_norm": 2.563938856124878, "learning_rate": 3.366188336473885e-07, "loss": 0.2846, "step": 13730 }, { "epoch": 0.6634294825337005, "grad_norm": 9.49432373046875, "learning_rate": 3.3657051746629944e-07, "loss": 0.3931, "step": 13731 }, { "epoch": 0.6634777987147896, "grad_norm": 3.391190767288208, "learning_rate": 3.3652220128521043e-07, "loss": 0.3904, "step": 13732 }, { "epoch": 0.6635261148958786, "grad_norm": 2.664769172668457, "learning_rate": 3.364738851041213e-07, "loss": 0.3332, "step": 13733 }, { "epoch": 0.6635744310769677, "grad_norm": 2.259321928024292, "learning_rate": 3.364255689230323e-07, "loss": 0.1458, "step": 13734 }, { "epoch": 0.6636227472580567, "grad_norm": 1.9740009307861328, "learning_rate": 3.363772527419433e-07, "loss": 0.2196, "step": 13735 }, { "epoch": 0.6636710634391457, "grad_norm": 2.4318790435791016, "learning_rate": 3.363289365608542e-07, "loss": 0.2556, "step": 13736 }, { "epoch": 0.6637193796202349, "grad_norm": 11.533406257629395, "learning_rate": 3.362806203797652e-07, "loss": 0.3172, "step": 13737 }, { "epoch": 0.6637676958013239, "grad_norm": 6.9722113609313965, "learning_rate": 3.362323041986761e-07, "loss": 0.3454, "step": 13738 }, { "epoch": 0.6638160119824129, "grad_norm": 2.738246202468872, "learning_rate": 3.3618398801758705e-07, "loss": 0.3127, "step": 13739 }, { "epoch": 0.6638643281635019, "grad_norm": 2.552279472351074, "learning_rate": 3.3613567183649804e-07, "loss": 0.2592, "step": 13740 }, { "epoch": 0.663912644344591, "grad_norm": 2.1091997623443604, "learning_rate": 3.36087355655409e-07, "loss": 0.236, "step": 13741 }, { "epoch": 0.6639609605256801, "grad_norm": 7.620062351226807, "learning_rate": 3.3603903947431997e-07, "loss": 0.3026, "step": 13742 }, { "epoch": 0.6640092767067691, "grad_norm": 5.155736923217773, "learning_rate": 3.359907232932309e-07, "loss": 0.3425, "step": 13743 }, { "epoch": 0.6640575928878582, "grad_norm": 4.167778968811035, "learning_rate": 3.3594240711214184e-07, "loss": 0.1755, "step": 13744 }, { "epoch": 0.6641059090689472, "grad_norm": 3.066953659057617, "learning_rate": 3.3589409093105283e-07, "loss": 0.4404, "step": 13745 }, { "epoch": 0.6641542252500362, "grad_norm": 2.4326016902923584, "learning_rate": 3.358457747499637e-07, "loss": 0.2052, "step": 13746 }, { "epoch": 0.6642025414311253, "grad_norm": 2.8106679916381836, "learning_rate": 3.357974585688747e-07, "loss": 0.3324, "step": 13747 }, { "epoch": 0.6642508576122144, "grad_norm": 2.5997557640075684, "learning_rate": 3.357491423877857e-07, "loss": 0.2963, "step": 13748 }, { "epoch": 0.6642991737933034, "grad_norm": 2.5736987590789795, "learning_rate": 3.357008262066966e-07, "loss": 0.1506, "step": 13749 }, { "epoch": 0.6643474899743924, "grad_norm": 2.750905752182007, "learning_rate": 3.3565251002560757e-07, "loss": 0.2163, "step": 13750 }, { "epoch": 0.6643958061554814, "grad_norm": 4.564485549926758, "learning_rate": 3.356041938445185e-07, "loss": 0.3642, "step": 13751 }, { "epoch": 0.6644441223365705, "grad_norm": 2.719585657119751, "learning_rate": 3.3555587766342944e-07, "loss": 0.3513, "step": 13752 }, { "epoch": 0.6644924385176596, "grad_norm": 3.236980438232422, "learning_rate": 3.3550756148234043e-07, "loss": 0.348, "step": 13753 }, { "epoch": 0.6645407546987486, "grad_norm": 2.9629459381103516, "learning_rate": 3.3545924530125137e-07, "loss": 0.3132, "step": 13754 }, { "epoch": 0.6645890708798377, "grad_norm": 2.7019870281219482, "learning_rate": 3.354109291201623e-07, "loss": 0.3199, "step": 13755 }, { "epoch": 0.6646373870609267, "grad_norm": 1.7906056642532349, "learning_rate": 3.3536261293907325e-07, "loss": 0.1916, "step": 13756 }, { "epoch": 0.6646857032420157, "grad_norm": 4.507637977600098, "learning_rate": 3.3531429675798424e-07, "loss": 0.2764, "step": 13757 }, { "epoch": 0.6647340194231048, "grad_norm": 2.091716766357422, "learning_rate": 3.3526598057689523e-07, "loss": 0.2539, "step": 13758 }, { "epoch": 0.6647823356041939, "grad_norm": 2.8250625133514404, "learning_rate": 3.352176643958061e-07, "loss": 0.2901, "step": 13759 }, { "epoch": 0.6648306517852829, "grad_norm": 3.131869077682495, "learning_rate": 3.351693482147171e-07, "loss": 0.3102, "step": 13760 }, { "epoch": 0.6648789679663719, "grad_norm": 2.634117603302002, "learning_rate": 3.351210320336281e-07, "loss": 0.3344, "step": 13761 }, { "epoch": 0.6649272841474609, "grad_norm": 5.84067964553833, "learning_rate": 3.35072715852539e-07, "loss": 0.2854, "step": 13762 }, { "epoch": 0.6649756003285501, "grad_norm": 3.7236745357513428, "learning_rate": 3.3502439967144997e-07, "loss": 0.3236, "step": 13763 }, { "epoch": 0.6650239165096391, "grad_norm": 2.055219888687134, "learning_rate": 3.349760834903609e-07, "loss": 0.1984, "step": 13764 }, { "epoch": 0.6650722326907281, "grad_norm": 10.202606201171875, "learning_rate": 3.3492776730927184e-07, "loss": 0.4017, "step": 13765 }, { "epoch": 0.6651205488718172, "grad_norm": 2.5679757595062256, "learning_rate": 3.3487945112818283e-07, "loss": 0.3695, "step": 13766 }, { "epoch": 0.6651688650529062, "grad_norm": 3.0249252319335938, "learning_rate": 3.3483113494709377e-07, "loss": 0.3279, "step": 13767 }, { "epoch": 0.6652171812339953, "grad_norm": 2.9282212257385254, "learning_rate": 3.347828187660047e-07, "loss": 0.2289, "step": 13768 }, { "epoch": 0.6652654974150843, "grad_norm": 2.1280250549316406, "learning_rate": 3.3473450258491564e-07, "loss": 0.2696, "step": 13769 }, { "epoch": 0.6653138135961734, "grad_norm": 2.7108681201934814, "learning_rate": 3.3468618640382663e-07, "loss": 0.2484, "step": 13770 }, { "epoch": 0.6653621297772624, "grad_norm": 2.6319148540496826, "learning_rate": 3.3463787022273757e-07, "loss": 0.3547, "step": 13771 }, { "epoch": 0.6654104459583514, "grad_norm": 4.599151134490967, "learning_rate": 3.345895540416485e-07, "loss": 0.2826, "step": 13772 }, { "epoch": 0.6654587621394406, "grad_norm": 3.8975555896759033, "learning_rate": 3.345412378605595e-07, "loss": 0.4548, "step": 13773 }, { "epoch": 0.6655070783205296, "grad_norm": 2.628469467163086, "learning_rate": 3.344929216794705e-07, "loss": 0.3544, "step": 13774 }, { "epoch": 0.6655553945016186, "grad_norm": 1.606796383857727, "learning_rate": 3.3444460549838137e-07, "loss": 0.1554, "step": 13775 }, { "epoch": 0.6656037106827076, "grad_norm": 2.3545167446136475, "learning_rate": 3.3439628931729236e-07, "loss": 0.1549, "step": 13776 }, { "epoch": 0.6656520268637967, "grad_norm": 2.4956448078155518, "learning_rate": 3.343479731362033e-07, "loss": 0.2631, "step": 13777 }, { "epoch": 0.6657003430448857, "grad_norm": 2.8468005657196045, "learning_rate": 3.3429965695511424e-07, "loss": 0.3334, "step": 13778 }, { "epoch": 0.6657486592259748, "grad_norm": 3.3052470684051514, "learning_rate": 3.3425134077402523e-07, "loss": 0.3649, "step": 13779 }, { "epoch": 0.6657969754070638, "grad_norm": 53.962646484375, "learning_rate": 3.3420302459293616e-07, "loss": 0.162, "step": 13780 }, { "epoch": 0.6658452915881529, "grad_norm": 7.599466323852539, "learning_rate": 3.341547084118471e-07, "loss": 0.2443, "step": 13781 }, { "epoch": 0.6658936077692419, "grad_norm": 1.9938619136810303, "learning_rate": 3.3410639223075804e-07, "loss": 0.233, "step": 13782 }, { "epoch": 0.6659419239503309, "grad_norm": 4.881366729736328, "learning_rate": 3.3405807604966903e-07, "loss": 0.3255, "step": 13783 }, { "epoch": 0.66599024013142, "grad_norm": 2.7946648597717285, "learning_rate": 3.3400975986857997e-07, "loss": 0.3475, "step": 13784 }, { "epoch": 0.6660385563125091, "grad_norm": 4.068920612335205, "learning_rate": 3.339614436874909e-07, "loss": 0.2659, "step": 13785 }, { "epoch": 0.6660868724935981, "grad_norm": 2.848735809326172, "learning_rate": 3.339131275064019e-07, "loss": 0.3583, "step": 13786 }, { "epoch": 0.6661351886746871, "grad_norm": 2.6480491161346436, "learning_rate": 3.3386481132531283e-07, "loss": 0.323, "step": 13787 }, { "epoch": 0.6661835048557762, "grad_norm": 2.4044318199157715, "learning_rate": 3.3381649514422377e-07, "loss": 0.2411, "step": 13788 }, { "epoch": 0.6662318210368653, "grad_norm": 2.023831367492676, "learning_rate": 3.3376817896313476e-07, "loss": 0.2105, "step": 13789 }, { "epoch": 0.6662801372179543, "grad_norm": 2.8471083641052246, "learning_rate": 3.3371986278204564e-07, "loss": 0.3173, "step": 13790 }, { "epoch": 0.6663284533990433, "grad_norm": 2.6341137886047363, "learning_rate": 3.3367154660095663e-07, "loss": 0.2382, "step": 13791 }, { "epoch": 0.6663767695801324, "grad_norm": 4.533409595489502, "learning_rate": 3.336232304198676e-07, "loss": 0.3213, "step": 13792 }, { "epoch": 0.6664250857612214, "grad_norm": 2.2611684799194336, "learning_rate": 3.3357491423877856e-07, "loss": 0.3071, "step": 13793 }, { "epoch": 0.6664734019423105, "grad_norm": 2.6995737552642822, "learning_rate": 3.335265980576895e-07, "loss": 0.3367, "step": 13794 }, { "epoch": 0.6665217181233996, "grad_norm": 2.787905216217041, "learning_rate": 3.3347828187660044e-07, "loss": 0.2253, "step": 13795 }, { "epoch": 0.6665700343044886, "grad_norm": 12.48133373260498, "learning_rate": 3.334299656955114e-07, "loss": 0.2038, "step": 13796 }, { "epoch": 0.6666183504855776, "grad_norm": 2.9390370845794678, "learning_rate": 3.3338164951442236e-07, "loss": 0.3324, "step": 13797 }, { "epoch": 0.6666666666666666, "grad_norm": 2.044640302658081, "learning_rate": 3.333333333333333e-07, "loss": 0.2255, "step": 13798 }, { "epoch": 0.6667149828477558, "grad_norm": 2.7759933471679688, "learning_rate": 3.332850171522443e-07, "loss": 0.3187, "step": 13799 }, { "epoch": 0.6667632990288448, "grad_norm": 2.539820432662964, "learning_rate": 3.3323670097115523e-07, "loss": 0.2419, "step": 13800 }, { "epoch": 0.6668116152099338, "grad_norm": 2.874091625213623, "learning_rate": 3.3318838479006617e-07, "loss": 0.239, "step": 13801 }, { "epoch": 0.6668599313910228, "grad_norm": 1.934073567390442, "learning_rate": 3.3314006860897716e-07, "loss": 0.2703, "step": 13802 }, { "epoch": 0.6669082475721119, "grad_norm": 2.8078651428222656, "learning_rate": 3.3309175242788804e-07, "loss": 0.3318, "step": 13803 }, { "epoch": 0.6669565637532009, "grad_norm": 5.769325256347656, "learning_rate": 3.3304343624679903e-07, "loss": 0.2282, "step": 13804 }, { "epoch": 0.66700487993429, "grad_norm": 2.568704128265381, "learning_rate": 3.3299512006571e-07, "loss": 0.2424, "step": 13805 }, { "epoch": 0.667053196115379, "grad_norm": 2.935377836227417, "learning_rate": 3.329468038846209e-07, "loss": 0.3146, "step": 13806 }, { "epoch": 0.6671015122964681, "grad_norm": 2.586961269378662, "learning_rate": 3.328984877035319e-07, "loss": 0.2498, "step": 13807 }, { "epoch": 0.6671498284775571, "grad_norm": 4.350339412689209, "learning_rate": 3.3285017152244283e-07, "loss": 0.2979, "step": 13808 }, { "epoch": 0.6671981446586461, "grad_norm": 2.9912798404693604, "learning_rate": 3.328018553413538e-07, "loss": 0.2937, "step": 13809 }, { "epoch": 0.6672464608397353, "grad_norm": 1.9922410249710083, "learning_rate": 3.3275353916026476e-07, "loss": 0.1819, "step": 13810 }, { "epoch": 0.6672947770208243, "grad_norm": 1.7099655866622925, "learning_rate": 3.327052229791757e-07, "loss": 0.2166, "step": 13811 }, { "epoch": 0.6673430932019133, "grad_norm": 2.5320894718170166, "learning_rate": 3.326569067980867e-07, "loss": 0.2379, "step": 13812 }, { "epoch": 0.6673914093830023, "grad_norm": 2.8875484466552734, "learning_rate": 3.326085906169976e-07, "loss": 0.4349, "step": 13813 }, { "epoch": 0.6674397255640914, "grad_norm": 3.7478435039520264, "learning_rate": 3.3256027443590856e-07, "loss": 0.3816, "step": 13814 }, { "epoch": 0.6674880417451805, "grad_norm": 3.5081686973571777, "learning_rate": 3.3251195825481955e-07, "loss": 0.4363, "step": 13815 }, { "epoch": 0.6675363579262695, "grad_norm": 2.520350694656372, "learning_rate": 3.3246364207373044e-07, "loss": 0.3521, "step": 13816 }, { "epoch": 0.6675846741073586, "grad_norm": 3.325629711151123, "learning_rate": 3.3241532589264143e-07, "loss": 0.3119, "step": 13817 }, { "epoch": 0.6676329902884476, "grad_norm": 2.365131378173828, "learning_rate": 3.323670097115524e-07, "loss": 0.321, "step": 13818 }, { "epoch": 0.6676813064695366, "grad_norm": 1.7658271789550781, "learning_rate": 3.323186935304633e-07, "loss": 0.2357, "step": 13819 }, { "epoch": 0.6677296226506257, "grad_norm": 3.1485116481781006, "learning_rate": 3.322703773493743e-07, "loss": 0.2085, "step": 13820 }, { "epoch": 0.6677779388317148, "grad_norm": 2.657484769821167, "learning_rate": 3.3222206116828523e-07, "loss": 0.2861, "step": 13821 }, { "epoch": 0.6678262550128038, "grad_norm": 1.7003296613693237, "learning_rate": 3.3217374498719617e-07, "loss": 0.2141, "step": 13822 }, { "epoch": 0.6678745711938928, "grad_norm": 3.2869842052459717, "learning_rate": 3.3212542880610716e-07, "loss": 0.3958, "step": 13823 }, { "epoch": 0.6679228873749818, "grad_norm": 2.1209888458251953, "learning_rate": 3.320771126250181e-07, "loss": 0.2267, "step": 13824 }, { "epoch": 0.667971203556071, "grad_norm": 3.724397897720337, "learning_rate": 3.320287964439291e-07, "loss": 0.3648, "step": 13825 }, { "epoch": 0.66801951973716, "grad_norm": 2.1348769664764404, "learning_rate": 3.3198048026284e-07, "loss": 0.1961, "step": 13826 }, { "epoch": 0.668067835918249, "grad_norm": 3.5800795555114746, "learning_rate": 3.3193216408175096e-07, "loss": 0.4218, "step": 13827 }, { "epoch": 0.6681161520993381, "grad_norm": 6.684551239013672, "learning_rate": 3.3188384790066195e-07, "loss": 0.1822, "step": 13828 }, { "epoch": 0.6681644682804271, "grad_norm": 2.7907893657684326, "learning_rate": 3.3183553171957283e-07, "loss": 0.3113, "step": 13829 }, { "epoch": 0.6682127844615161, "grad_norm": 3.403104782104492, "learning_rate": 3.317872155384838e-07, "loss": 0.2229, "step": 13830 }, { "epoch": 0.6682611006426052, "grad_norm": 3.1448402404785156, "learning_rate": 3.317388993573948e-07, "loss": 0.3478, "step": 13831 }, { "epoch": 0.6683094168236943, "grad_norm": 2.8854215145111084, "learning_rate": 3.316905831763057e-07, "loss": 0.3564, "step": 13832 }, { "epoch": 0.6683577330047833, "grad_norm": 2.328193187713623, "learning_rate": 3.316422669952167e-07, "loss": 0.2868, "step": 13833 }, { "epoch": 0.6684060491858723, "grad_norm": 1.54973566532135, "learning_rate": 3.315939508141276e-07, "loss": 0.2219, "step": 13834 }, { "epoch": 0.6684543653669613, "grad_norm": 3.752598524093628, "learning_rate": 3.3154563463303856e-07, "loss": 0.3998, "step": 13835 }, { "epoch": 0.6685026815480505, "grad_norm": 4.851632595062256, "learning_rate": 3.3149731845194955e-07, "loss": 0.3203, "step": 13836 }, { "epoch": 0.6685509977291395, "grad_norm": 3.5802676677703857, "learning_rate": 3.314490022708605e-07, "loss": 0.3683, "step": 13837 }, { "epoch": 0.6685993139102285, "grad_norm": 2.340850830078125, "learning_rate": 3.3140068608977143e-07, "loss": 0.2601, "step": 13838 }, { "epoch": 0.6686476300913176, "grad_norm": 2.662951707839966, "learning_rate": 3.313523699086824e-07, "loss": 0.209, "step": 13839 }, { "epoch": 0.6686959462724066, "grad_norm": 1.9670921564102173, "learning_rate": 3.3130405372759335e-07, "loss": 0.1903, "step": 13840 }, { "epoch": 0.6687442624534957, "grad_norm": 2.6404919624328613, "learning_rate": 3.3125573754650434e-07, "loss": 0.38, "step": 13841 }, { "epoch": 0.6687925786345847, "grad_norm": 2.183736562728882, "learning_rate": 3.3120742136541523e-07, "loss": 0.1886, "step": 13842 }, { "epoch": 0.6688408948156738, "grad_norm": 2.33856463432312, "learning_rate": 3.311591051843262e-07, "loss": 0.2014, "step": 13843 }, { "epoch": 0.6688892109967628, "grad_norm": 2.470233917236328, "learning_rate": 3.311107890032372e-07, "loss": 0.3432, "step": 13844 }, { "epoch": 0.6689375271778518, "grad_norm": 2.155189037322998, "learning_rate": 3.310624728221481e-07, "loss": 0.2144, "step": 13845 }, { "epoch": 0.668985843358941, "grad_norm": 2.3783299922943115, "learning_rate": 3.310141566410591e-07, "loss": 0.3127, "step": 13846 }, { "epoch": 0.66903415954003, "grad_norm": 5.733931541442871, "learning_rate": 3.3096584045997e-07, "loss": 0.3924, "step": 13847 }, { "epoch": 0.669082475721119, "grad_norm": 4.427035331726074, "learning_rate": 3.3091752427888096e-07, "loss": 0.3513, "step": 13848 }, { "epoch": 0.669130791902208, "grad_norm": 2.9559237957000732, "learning_rate": 3.3086920809779195e-07, "loss": 0.2936, "step": 13849 }, { "epoch": 0.6691791080832971, "grad_norm": 3.16542911529541, "learning_rate": 3.308208919167029e-07, "loss": 0.3624, "step": 13850 }, { "epoch": 0.6692274242643862, "grad_norm": 12.338554382324219, "learning_rate": 3.307725757356138e-07, "loss": 0.2917, "step": 13851 }, { "epoch": 0.6692757404454752, "grad_norm": 3.038438320159912, "learning_rate": 3.307242595545248e-07, "loss": 0.2371, "step": 13852 }, { "epoch": 0.6693240566265642, "grad_norm": 1.8366796970367432, "learning_rate": 3.3067594337343575e-07, "loss": 0.194, "step": 13853 }, { "epoch": 0.6693723728076533, "grad_norm": 4.549978733062744, "learning_rate": 3.306276271923467e-07, "loss": 0.3554, "step": 13854 }, { "epoch": 0.6694206889887423, "grad_norm": 2.873581647872925, "learning_rate": 3.305793110112576e-07, "loss": 0.2949, "step": 13855 }, { "epoch": 0.6694690051698313, "grad_norm": 3.104214668273926, "learning_rate": 3.305309948301686e-07, "loss": 0.5186, "step": 13856 }, { "epoch": 0.6695173213509205, "grad_norm": 2.152825117111206, "learning_rate": 3.304826786490796e-07, "loss": 0.2157, "step": 13857 }, { "epoch": 0.6695656375320095, "grad_norm": 2.3892464637756348, "learning_rate": 3.304343624679905e-07, "loss": 0.2584, "step": 13858 }, { "epoch": 0.6696139537130985, "grad_norm": 3.4892585277557373, "learning_rate": 3.303860462869015e-07, "loss": 0.3322, "step": 13859 }, { "epoch": 0.6696622698941875, "grad_norm": 1.7945888042449951, "learning_rate": 3.303377301058124e-07, "loss": 0.1909, "step": 13860 }, { "epoch": 0.6697105860752766, "grad_norm": 5.557650566101074, "learning_rate": 3.3028941392472335e-07, "loss": 0.384, "step": 13861 }, { "epoch": 0.6697589022563657, "grad_norm": 1.8319554328918457, "learning_rate": 3.3024109774363435e-07, "loss": 0.168, "step": 13862 }, { "epoch": 0.6698072184374547, "grad_norm": 2.542695999145508, "learning_rate": 3.301927815625453e-07, "loss": 0.3287, "step": 13863 }, { "epoch": 0.6698555346185437, "grad_norm": 3.135986328125, "learning_rate": 3.301444653814562e-07, "loss": 0.3518, "step": 13864 }, { "epoch": 0.6699038507996328, "grad_norm": 2.5648601055145264, "learning_rate": 3.300961492003672e-07, "loss": 0.2894, "step": 13865 }, { "epoch": 0.6699521669807218, "grad_norm": 2.6409804821014404, "learning_rate": 3.3004783301927815e-07, "loss": 0.2606, "step": 13866 }, { "epoch": 0.6700004831618109, "grad_norm": 4.234293460845947, "learning_rate": 3.299995168381891e-07, "loss": 0.4128, "step": 13867 }, { "epoch": 0.6700487993429, "grad_norm": 3.34639048576355, "learning_rate": 3.299512006571e-07, "loss": 0.2813, "step": 13868 }, { "epoch": 0.670097115523989, "grad_norm": 3.2882556915283203, "learning_rate": 3.29902884476011e-07, "loss": 0.4331, "step": 13869 }, { "epoch": 0.670145431705078, "grad_norm": 2.852534770965576, "learning_rate": 3.2985456829492195e-07, "loss": 0.3008, "step": 13870 }, { "epoch": 0.670193747886167, "grad_norm": 4.44735050201416, "learning_rate": 3.298062521138329e-07, "loss": 0.3749, "step": 13871 }, { "epoch": 0.6702420640672562, "grad_norm": 3.1042122840881348, "learning_rate": 3.297579359327439e-07, "loss": 0.2226, "step": 13872 }, { "epoch": 0.6702903802483452, "grad_norm": 4.296836853027344, "learning_rate": 3.297096197516548e-07, "loss": 0.2785, "step": 13873 }, { "epoch": 0.6703386964294342, "grad_norm": 2.9287455081939697, "learning_rate": 3.2966130357056575e-07, "loss": 0.4071, "step": 13874 }, { "epoch": 0.6703870126105232, "grad_norm": 8.053643226623535, "learning_rate": 3.2961298738947674e-07, "loss": 0.371, "step": 13875 }, { "epoch": 0.6704353287916123, "grad_norm": 2.3399546146392822, "learning_rate": 3.295646712083877e-07, "loss": 0.2408, "step": 13876 }, { "epoch": 0.6704836449727014, "grad_norm": 3.376875638961792, "learning_rate": 3.295163550272986e-07, "loss": 0.261, "step": 13877 }, { "epoch": 0.6705319611537904, "grad_norm": 2.6672136783599854, "learning_rate": 3.294680388462096e-07, "loss": 0.1832, "step": 13878 }, { "epoch": 0.6705802773348795, "grad_norm": 2.2525501251220703, "learning_rate": 3.2941972266512054e-07, "loss": 0.276, "step": 13879 }, { "epoch": 0.6706285935159685, "grad_norm": 70.15713500976562, "learning_rate": 3.293714064840315e-07, "loss": 0.2154, "step": 13880 }, { "epoch": 0.6706769096970575, "grad_norm": 6.124290466308594, "learning_rate": 3.293230903029424e-07, "loss": 0.4485, "step": 13881 }, { "epoch": 0.6707252258781465, "grad_norm": 3.9432168006896973, "learning_rate": 3.292747741218534e-07, "loss": 0.3166, "step": 13882 }, { "epoch": 0.6707735420592357, "grad_norm": 107.82219696044922, "learning_rate": 3.2922645794076435e-07, "loss": 0.3042, "step": 13883 }, { "epoch": 0.6708218582403247, "grad_norm": 2.2760167121887207, "learning_rate": 3.291781417596753e-07, "loss": 0.232, "step": 13884 }, { "epoch": 0.6708701744214137, "grad_norm": 1.9579284191131592, "learning_rate": 3.2912982557858627e-07, "loss": 0.1716, "step": 13885 }, { "epoch": 0.6709184906025027, "grad_norm": 2.2501866817474365, "learning_rate": 3.2908150939749716e-07, "loss": 0.2687, "step": 13886 }, { "epoch": 0.6709668067835918, "grad_norm": 3.3708226680755615, "learning_rate": 3.2903319321640815e-07, "loss": 0.2681, "step": 13887 }, { "epoch": 0.6710151229646809, "grad_norm": 3.2469305992126465, "learning_rate": 3.2898487703531914e-07, "loss": 0.3344, "step": 13888 }, { "epoch": 0.6710634391457699, "grad_norm": 2.2897560596466064, "learning_rate": 3.289365608542301e-07, "loss": 0.2191, "step": 13889 }, { "epoch": 0.671111755326859, "grad_norm": 2.995084762573242, "learning_rate": 3.28888244673141e-07, "loss": 0.3883, "step": 13890 }, { "epoch": 0.671160071507948, "grad_norm": 3.659940004348755, "learning_rate": 3.28839928492052e-07, "loss": 0.2865, "step": 13891 }, { "epoch": 0.671208387689037, "grad_norm": 2.505769968032837, "learning_rate": 3.2879161231096294e-07, "loss": 0.2992, "step": 13892 }, { "epoch": 0.6712567038701261, "grad_norm": 3.2617390155792236, "learning_rate": 3.287432961298739e-07, "loss": 0.2964, "step": 13893 }, { "epoch": 0.6713050200512152, "grad_norm": 2.7684786319732666, "learning_rate": 3.286949799487848e-07, "loss": 0.3113, "step": 13894 }, { "epoch": 0.6713533362323042, "grad_norm": 3.1034178733825684, "learning_rate": 3.286466637676958e-07, "loss": 0.2541, "step": 13895 }, { "epoch": 0.6714016524133932, "grad_norm": 2.511267900466919, "learning_rate": 3.2859834758660674e-07, "loss": 0.1998, "step": 13896 }, { "epoch": 0.6714499685944822, "grad_norm": 16.293556213378906, "learning_rate": 3.285500314055177e-07, "loss": 0.2539, "step": 13897 }, { "epoch": 0.6714982847755714, "grad_norm": 2.2381443977355957, "learning_rate": 3.2850171522442867e-07, "loss": 0.2589, "step": 13898 }, { "epoch": 0.6715466009566604, "grad_norm": 2.1301848888397217, "learning_rate": 3.2845339904333955e-07, "loss": 0.2366, "step": 13899 }, { "epoch": 0.6715949171377494, "grad_norm": 5.105840682983398, "learning_rate": 3.2840508286225054e-07, "loss": 0.2776, "step": 13900 }, { "epoch": 0.6716432333188385, "grad_norm": 3.0426888465881348, "learning_rate": 3.2835676668116153e-07, "loss": 0.2855, "step": 13901 }, { "epoch": 0.6716915494999275, "grad_norm": 2.166926383972168, "learning_rate": 3.283084505000724e-07, "loss": 0.2214, "step": 13902 }, { "epoch": 0.6717398656810166, "grad_norm": 2.7865684032440186, "learning_rate": 3.282601343189834e-07, "loss": 0.3195, "step": 13903 }, { "epoch": 0.6717881818621056, "grad_norm": 3.0244925022125244, "learning_rate": 3.282118181378944e-07, "loss": 0.3368, "step": 13904 }, { "epoch": 0.6718364980431947, "grad_norm": 3.250462055206299, "learning_rate": 3.2816350195680534e-07, "loss": 0.309, "step": 13905 }, { "epoch": 0.6718848142242837, "grad_norm": 2.3557419776916504, "learning_rate": 3.2811518577571627e-07, "loss": 0.209, "step": 13906 }, { "epoch": 0.6719331304053727, "grad_norm": 2.7068817615509033, "learning_rate": 3.280668695946272e-07, "loss": 0.3471, "step": 13907 }, { "epoch": 0.6719814465864619, "grad_norm": 2.6103270053863525, "learning_rate": 3.280185534135382e-07, "loss": 0.3192, "step": 13908 }, { "epoch": 0.6720297627675509, "grad_norm": 2.380211353302002, "learning_rate": 3.2797023723244914e-07, "loss": 0.2864, "step": 13909 }, { "epoch": 0.6720780789486399, "grad_norm": 2.043290853500366, "learning_rate": 3.279219210513601e-07, "loss": 0.2699, "step": 13910 }, { "epoch": 0.6721263951297289, "grad_norm": 2.5101494789123535, "learning_rate": 3.2787360487027107e-07, "loss": 0.204, "step": 13911 }, { "epoch": 0.672174711310818, "grad_norm": 2.280808448791504, "learning_rate": 3.2782528868918195e-07, "loss": 0.162, "step": 13912 }, { "epoch": 0.672223027491907, "grad_norm": 2.6846611499786377, "learning_rate": 3.2777697250809294e-07, "loss": 0.322, "step": 13913 }, { "epoch": 0.6722713436729961, "grad_norm": 3.2277872562408447, "learning_rate": 3.2772865632700393e-07, "loss": 0.3464, "step": 13914 }, { "epoch": 0.6723196598540851, "grad_norm": 3.596130132675171, "learning_rate": 3.276803401459148e-07, "loss": 0.2927, "step": 13915 }, { "epoch": 0.6723679760351742, "grad_norm": 4.208130359649658, "learning_rate": 3.276320239648258e-07, "loss": 0.2198, "step": 13916 }, { "epoch": 0.6724162922162632, "grad_norm": 2.5457193851470947, "learning_rate": 3.275837077837368e-07, "loss": 0.3134, "step": 13917 }, { "epoch": 0.6724646083973522, "grad_norm": 2.635420083999634, "learning_rate": 3.275353916026477e-07, "loss": 0.2796, "step": 13918 }, { "epoch": 0.6725129245784414, "grad_norm": 2.1703946590423584, "learning_rate": 3.2748707542155867e-07, "loss": 0.2529, "step": 13919 }, { "epoch": 0.6725612407595304, "grad_norm": 3.973494291305542, "learning_rate": 3.274387592404696e-07, "loss": 0.2502, "step": 13920 }, { "epoch": 0.6726095569406194, "grad_norm": 1.7741636037826538, "learning_rate": 3.273904430593806e-07, "loss": 0.1767, "step": 13921 }, { "epoch": 0.6726578731217084, "grad_norm": 2.5034327507019043, "learning_rate": 3.2734212687829153e-07, "loss": 0.3205, "step": 13922 }, { "epoch": 0.6727061893027975, "grad_norm": 1.7595361471176147, "learning_rate": 3.2729381069720247e-07, "loss": 0.2418, "step": 13923 }, { "epoch": 0.6727545054838866, "grad_norm": 4.750727653503418, "learning_rate": 3.2724549451611346e-07, "loss": 0.3039, "step": 13924 }, { "epoch": 0.6728028216649756, "grad_norm": 3.6435627937316895, "learning_rate": 3.2719717833502435e-07, "loss": 0.1965, "step": 13925 }, { "epoch": 0.6728511378460647, "grad_norm": 1.711381435394287, "learning_rate": 3.2714886215393534e-07, "loss": 0.1562, "step": 13926 }, { "epoch": 0.6728994540271537, "grad_norm": 4.4467244148254395, "learning_rate": 3.2710054597284633e-07, "loss": 0.2963, "step": 13927 }, { "epoch": 0.6729477702082427, "grad_norm": 4.227607250213623, "learning_rate": 3.270522297917572e-07, "loss": 0.2163, "step": 13928 }, { "epoch": 0.6729960863893318, "grad_norm": 3.494621753692627, "learning_rate": 3.270039136106682e-07, "loss": 0.3067, "step": 13929 }, { "epoch": 0.6730444025704209, "grad_norm": 2.6335866451263428, "learning_rate": 3.269555974295792e-07, "loss": 0.3749, "step": 13930 }, { "epoch": 0.6730927187515099, "grad_norm": 1.7448197603225708, "learning_rate": 3.269072812484901e-07, "loss": 0.1806, "step": 13931 }, { "epoch": 0.6731410349325989, "grad_norm": 2.75844144821167, "learning_rate": 3.2685896506740107e-07, "loss": 0.3459, "step": 13932 }, { "epoch": 0.6731893511136879, "grad_norm": 2.2745614051818848, "learning_rate": 3.26810648886312e-07, "loss": 0.2685, "step": 13933 }, { "epoch": 0.6732376672947771, "grad_norm": 4.210329055786133, "learning_rate": 3.2676233270522294e-07, "loss": 0.3018, "step": 13934 }, { "epoch": 0.6732859834758661, "grad_norm": 6.777096271514893, "learning_rate": 3.2671401652413393e-07, "loss": 0.3346, "step": 13935 }, { "epoch": 0.6733342996569551, "grad_norm": 2.9257407188415527, "learning_rate": 3.2666570034304487e-07, "loss": 0.3699, "step": 13936 }, { "epoch": 0.6733826158380442, "grad_norm": 2.7951741218566895, "learning_rate": 3.2661738416195586e-07, "loss": 0.2421, "step": 13937 }, { "epoch": 0.6734309320191332, "grad_norm": 3.5310397148132324, "learning_rate": 3.2656906798086674e-07, "loss": 0.3321, "step": 13938 }, { "epoch": 0.6734792482002222, "grad_norm": 6.703618049621582, "learning_rate": 3.2652075179977773e-07, "loss": 0.2606, "step": 13939 }, { "epoch": 0.6735275643813113, "grad_norm": 2.146050453186035, "learning_rate": 3.264724356186887e-07, "loss": 0.1627, "step": 13940 }, { "epoch": 0.6735758805624004, "grad_norm": 2.7255823612213135, "learning_rate": 3.264241194375996e-07, "loss": 0.1019, "step": 13941 }, { "epoch": 0.6736241967434894, "grad_norm": 2.364259958267212, "learning_rate": 3.263758032565106e-07, "loss": 0.2849, "step": 13942 }, { "epoch": 0.6736725129245784, "grad_norm": 2.853954792022705, "learning_rate": 3.263274870754216e-07, "loss": 0.2374, "step": 13943 }, { "epoch": 0.6737208291056674, "grad_norm": 3.5345425605773926, "learning_rate": 3.2627917089433247e-07, "loss": 0.241, "step": 13944 }, { "epoch": 0.6737691452867566, "grad_norm": 2.7180533409118652, "learning_rate": 3.2623085471324346e-07, "loss": 0.3499, "step": 13945 }, { "epoch": 0.6738174614678456, "grad_norm": 7.720763683319092, "learning_rate": 3.261825385321544e-07, "loss": 0.2435, "step": 13946 }, { "epoch": 0.6738657776489346, "grad_norm": 2.119446277618408, "learning_rate": 3.2613422235106534e-07, "loss": 0.2213, "step": 13947 }, { "epoch": 0.6739140938300237, "grad_norm": 3.0661308765411377, "learning_rate": 3.2608590616997633e-07, "loss": 0.3713, "step": 13948 }, { "epoch": 0.6739624100111127, "grad_norm": 15.062132835388184, "learning_rate": 3.2603758998888726e-07, "loss": 0.2361, "step": 13949 }, { "epoch": 0.6740107261922018, "grad_norm": 9.971386909484863, "learning_rate": 3.259892738077982e-07, "loss": 0.3717, "step": 13950 }, { "epoch": 0.6740590423732908, "grad_norm": 1.90191650390625, "learning_rate": 3.2594095762670914e-07, "loss": 0.2259, "step": 13951 }, { "epoch": 0.6741073585543799, "grad_norm": 3.0475878715515137, "learning_rate": 3.2589264144562013e-07, "loss": 0.2427, "step": 13952 }, { "epoch": 0.6741556747354689, "grad_norm": 2.9316132068634033, "learning_rate": 3.258443252645311e-07, "loss": 0.2978, "step": 13953 }, { "epoch": 0.6742039909165579, "grad_norm": 7.811963081359863, "learning_rate": 3.25796009083442e-07, "loss": 0.3276, "step": 13954 }, { "epoch": 0.674252307097647, "grad_norm": 6.2429518699646, "learning_rate": 3.25747692902353e-07, "loss": 0.3758, "step": 13955 }, { "epoch": 0.6743006232787361, "grad_norm": 4.041101932525635, "learning_rate": 3.25699376721264e-07, "loss": 0.2725, "step": 13956 }, { "epoch": 0.6743489394598251, "grad_norm": 2.8878872394561768, "learning_rate": 3.2565106054017487e-07, "loss": 0.3292, "step": 13957 }, { "epoch": 0.6743972556409141, "grad_norm": 2.374695301055908, "learning_rate": 3.2560274435908586e-07, "loss": 0.2207, "step": 13958 }, { "epoch": 0.6744455718220032, "grad_norm": 2.318681240081787, "learning_rate": 3.255544281779968e-07, "loss": 0.2504, "step": 13959 }, { "epoch": 0.6744938880030923, "grad_norm": 2.439422845840454, "learning_rate": 3.2550611199690773e-07, "loss": 0.3379, "step": 13960 }, { "epoch": 0.6745422041841813, "grad_norm": 1.9550108909606934, "learning_rate": 3.254577958158187e-07, "loss": 0.2222, "step": 13961 }, { "epoch": 0.6745905203652703, "grad_norm": 1.9068892002105713, "learning_rate": 3.2540947963472966e-07, "loss": 0.1502, "step": 13962 }, { "epoch": 0.6746388365463594, "grad_norm": 2.1433639526367188, "learning_rate": 3.253611634536406e-07, "loss": 0.2312, "step": 13963 }, { "epoch": 0.6746871527274484, "grad_norm": 2.3084611892700195, "learning_rate": 3.2531284727255154e-07, "loss": 0.226, "step": 13964 }, { "epoch": 0.6747354689085374, "grad_norm": 2.59679913520813, "learning_rate": 3.252645310914625e-07, "loss": 0.2243, "step": 13965 }, { "epoch": 0.6747837850896266, "grad_norm": 2.7443063259124756, "learning_rate": 3.2521621491037346e-07, "loss": 0.2631, "step": 13966 }, { "epoch": 0.6748321012707156, "grad_norm": 15.815271377563477, "learning_rate": 3.251678987292844e-07, "loss": 0.4176, "step": 13967 }, { "epoch": 0.6748804174518046, "grad_norm": 1.98586905002594, "learning_rate": 3.251195825481954e-07, "loss": 0.1966, "step": 13968 }, { "epoch": 0.6749287336328936, "grad_norm": 2.0963733196258545, "learning_rate": 3.250712663671064e-07, "loss": 0.2182, "step": 13969 }, { "epoch": 0.6749770498139827, "grad_norm": 2.922004222869873, "learning_rate": 3.2502295018601726e-07, "loss": 0.3094, "step": 13970 }, { "epoch": 0.6750253659950718, "grad_norm": 2.429147958755493, "learning_rate": 3.2497463400492826e-07, "loss": 0.3196, "step": 13971 }, { "epoch": 0.6750736821761608, "grad_norm": 2.8546760082244873, "learning_rate": 3.249263178238392e-07, "loss": 0.2723, "step": 13972 }, { "epoch": 0.6751219983572498, "grad_norm": 2.420423746109009, "learning_rate": 3.2487800164275013e-07, "loss": 0.2653, "step": 13973 }, { "epoch": 0.6751703145383389, "grad_norm": 2.505686044692993, "learning_rate": 3.248296854616611e-07, "loss": 0.2793, "step": 13974 }, { "epoch": 0.6752186307194279, "grad_norm": 2.7521095275878906, "learning_rate": 3.2478136928057206e-07, "loss": 0.2011, "step": 13975 }, { "epoch": 0.675266946900517, "grad_norm": 2.274723529815674, "learning_rate": 3.24733053099483e-07, "loss": 0.2459, "step": 13976 }, { "epoch": 0.675315263081606, "grad_norm": 3.807589054107666, "learning_rate": 3.2468473691839393e-07, "loss": 0.3918, "step": 13977 }, { "epoch": 0.6753635792626951, "grad_norm": 2.656872510910034, "learning_rate": 3.246364207373049e-07, "loss": 0.2484, "step": 13978 }, { "epoch": 0.6754118954437841, "grad_norm": 3.0257134437561035, "learning_rate": 3.2458810455621586e-07, "loss": 0.2085, "step": 13979 }, { "epoch": 0.6754602116248731, "grad_norm": 6.310760974884033, "learning_rate": 3.245397883751268e-07, "loss": 0.4338, "step": 13980 }, { "epoch": 0.6755085278059623, "grad_norm": 2.853914737701416, "learning_rate": 3.244914721940378e-07, "loss": 0.2561, "step": 13981 }, { "epoch": 0.6755568439870513, "grad_norm": 4.406083106994629, "learning_rate": 3.244431560129487e-07, "loss": 0.1763, "step": 13982 }, { "epoch": 0.6756051601681403, "grad_norm": 3.426607131958008, "learning_rate": 3.2439483983185966e-07, "loss": 0.2276, "step": 13983 }, { "epoch": 0.6756534763492293, "grad_norm": 2.023293972015381, "learning_rate": 3.2434652365077065e-07, "loss": 0.1838, "step": 13984 }, { "epoch": 0.6757017925303184, "grad_norm": 5.521618843078613, "learning_rate": 3.2429820746968154e-07, "loss": 0.2739, "step": 13985 }, { "epoch": 0.6757501087114075, "grad_norm": 2.6294667720794678, "learning_rate": 3.242498912885925e-07, "loss": 0.316, "step": 13986 }, { "epoch": 0.6757984248924965, "grad_norm": 2.2973759174346924, "learning_rate": 3.242015751075035e-07, "loss": 0.1911, "step": 13987 }, { "epoch": 0.6758467410735856, "grad_norm": 2.6538243293762207, "learning_rate": 3.2415325892641445e-07, "loss": 0.2158, "step": 13988 }, { "epoch": 0.6758950572546746, "grad_norm": 2.4074971675872803, "learning_rate": 3.241049427453254e-07, "loss": 0.2815, "step": 13989 }, { "epoch": 0.6759433734357636, "grad_norm": 1.9458112716674805, "learning_rate": 3.2405662656423633e-07, "loss": 0.1673, "step": 13990 }, { "epoch": 0.6759916896168526, "grad_norm": 2.133686065673828, "learning_rate": 3.240083103831473e-07, "loss": 0.1773, "step": 13991 }, { "epoch": 0.6760400057979418, "grad_norm": 3.1876280307769775, "learning_rate": 3.2395999420205826e-07, "loss": 0.2111, "step": 13992 }, { "epoch": 0.6760883219790308, "grad_norm": 3.1488921642303467, "learning_rate": 3.239116780209692e-07, "loss": 0.2854, "step": 13993 }, { "epoch": 0.6761366381601198, "grad_norm": 3.052502155303955, "learning_rate": 3.238633618398802e-07, "loss": 0.1781, "step": 13994 }, { "epoch": 0.6761849543412088, "grad_norm": 2.271669626235962, "learning_rate": 3.238150456587911e-07, "loss": 0.2476, "step": 13995 }, { "epoch": 0.6762332705222979, "grad_norm": 3.5277974605560303, "learning_rate": 3.2376672947770206e-07, "loss": 0.3364, "step": 13996 }, { "epoch": 0.676281586703387, "grad_norm": 2.8104515075683594, "learning_rate": 3.2371841329661305e-07, "loss": 0.2477, "step": 13997 }, { "epoch": 0.676329902884476, "grad_norm": 2.5536792278289795, "learning_rate": 3.2367009711552393e-07, "loss": 0.2138, "step": 13998 }, { "epoch": 0.6763782190655651, "grad_norm": 2.3182616233825684, "learning_rate": 3.236217809344349e-07, "loss": 0.2764, "step": 13999 }, { "epoch": 0.6764265352466541, "grad_norm": 1.8054695129394531, "learning_rate": 3.235734647533459e-07, "loss": 0.1801, "step": 14000 }, { "epoch": 0.6764748514277431, "grad_norm": 1.8574658632278442, "learning_rate": 3.235251485722568e-07, "loss": 0.1704, "step": 14001 }, { "epoch": 0.6765231676088322, "grad_norm": 1.987406611442566, "learning_rate": 3.234768323911678e-07, "loss": 0.2352, "step": 14002 }, { "epoch": 0.6765714837899213, "grad_norm": 1.8715846538543701, "learning_rate": 3.234285162100787e-07, "loss": 0.1689, "step": 14003 }, { "epoch": 0.6766197999710103, "grad_norm": 2.749323844909668, "learning_rate": 3.233802000289897e-07, "loss": 0.3344, "step": 14004 }, { "epoch": 0.6766681161520993, "grad_norm": 2.581969976425171, "learning_rate": 3.2333188384790065e-07, "loss": 0.2887, "step": 14005 }, { "epoch": 0.6767164323331883, "grad_norm": 2.9599790573120117, "learning_rate": 3.232835676668116e-07, "loss": 0.2814, "step": 14006 }, { "epoch": 0.6767647485142775, "grad_norm": 2.251981258392334, "learning_rate": 3.232352514857226e-07, "loss": 0.2451, "step": 14007 }, { "epoch": 0.6768130646953665, "grad_norm": 11.171781539916992, "learning_rate": 3.231869353046335e-07, "loss": 0.2885, "step": 14008 }, { "epoch": 0.6768613808764555, "grad_norm": 2.2784225940704346, "learning_rate": 3.2313861912354445e-07, "loss": 0.213, "step": 14009 }, { "epoch": 0.6769096970575446, "grad_norm": 2.514585018157959, "learning_rate": 3.2309030294245544e-07, "loss": 0.3006, "step": 14010 }, { "epoch": 0.6769580132386336, "grad_norm": 2.8227975368499756, "learning_rate": 3.2304198676136633e-07, "loss": 0.2218, "step": 14011 }, { "epoch": 0.6770063294197227, "grad_norm": 4.285383701324463, "learning_rate": 3.229936705802773e-07, "loss": 0.5233, "step": 14012 }, { "epoch": 0.6770546456008117, "grad_norm": 4.124172210693359, "learning_rate": 3.229453543991883e-07, "loss": 0.3309, "step": 14013 }, { "epoch": 0.6771029617819008, "grad_norm": 2.4123106002807617, "learning_rate": 3.228970382180992e-07, "loss": 0.33, "step": 14014 }, { "epoch": 0.6771512779629898, "grad_norm": 2.9542181491851807, "learning_rate": 3.228487220370102e-07, "loss": 0.291, "step": 14015 }, { "epoch": 0.6771995941440788, "grad_norm": 2.541457176208496, "learning_rate": 3.228004058559211e-07, "loss": 0.2515, "step": 14016 }, { "epoch": 0.6772479103251678, "grad_norm": 2.3695380687713623, "learning_rate": 3.2275208967483206e-07, "loss": 0.3138, "step": 14017 }, { "epoch": 0.677296226506257, "grad_norm": 2.177481174468994, "learning_rate": 3.2270377349374305e-07, "loss": 0.2492, "step": 14018 }, { "epoch": 0.677344542687346, "grad_norm": 3.643772840499878, "learning_rate": 3.22655457312654e-07, "loss": 0.305, "step": 14019 }, { "epoch": 0.677392858868435, "grad_norm": 2.065272331237793, "learning_rate": 3.22607141131565e-07, "loss": 0.254, "step": 14020 }, { "epoch": 0.6774411750495241, "grad_norm": 3.0290775299072266, "learning_rate": 3.225588249504759e-07, "loss": 0.3461, "step": 14021 }, { "epoch": 0.6774894912306131, "grad_norm": 2.2694079875946045, "learning_rate": 3.2251050876938685e-07, "loss": 0.2583, "step": 14022 }, { "epoch": 0.6775378074117022, "grad_norm": 3.2328262329101562, "learning_rate": 3.2246219258829784e-07, "loss": 0.3615, "step": 14023 }, { "epoch": 0.6775861235927912, "grad_norm": 2.2356698513031006, "learning_rate": 3.224138764072087e-07, "loss": 0.3165, "step": 14024 }, { "epoch": 0.6776344397738803, "grad_norm": 3.03286075592041, "learning_rate": 3.223655602261197e-07, "loss": 0.3021, "step": 14025 }, { "epoch": 0.6776827559549693, "grad_norm": 2.739579916000366, "learning_rate": 3.223172440450307e-07, "loss": 0.2734, "step": 14026 }, { "epoch": 0.6777310721360583, "grad_norm": 2.3195550441741943, "learning_rate": 3.222689278639416e-07, "loss": 0.1678, "step": 14027 }, { "epoch": 0.6777793883171475, "grad_norm": 2.2766807079315186, "learning_rate": 3.222206116828526e-07, "loss": 0.23, "step": 14028 }, { "epoch": 0.6778277044982365, "grad_norm": 2.012972831726074, "learning_rate": 3.221722955017635e-07, "loss": 0.24, "step": 14029 }, { "epoch": 0.6778760206793255, "grad_norm": 3.415846824645996, "learning_rate": 3.2212397932067445e-07, "loss": 0.3706, "step": 14030 }, { "epoch": 0.6779243368604145, "grad_norm": 2.242299795150757, "learning_rate": 3.2207566313958544e-07, "loss": 0.258, "step": 14031 }, { "epoch": 0.6779726530415036, "grad_norm": 2.521120548248291, "learning_rate": 3.220273469584964e-07, "loss": 0.284, "step": 14032 }, { "epoch": 0.6780209692225927, "grad_norm": 2.3612921237945557, "learning_rate": 3.219790307774073e-07, "loss": 0.3123, "step": 14033 }, { "epoch": 0.6780692854036817, "grad_norm": 2.285508155822754, "learning_rate": 3.219307145963183e-07, "loss": 0.2408, "step": 14034 }, { "epoch": 0.6781176015847707, "grad_norm": 3.309451103210449, "learning_rate": 3.2188239841522925e-07, "loss": 0.2553, "step": 14035 }, { "epoch": 0.6781659177658598, "grad_norm": 3.355465888977051, "learning_rate": 3.2183408223414024e-07, "loss": 0.1988, "step": 14036 }, { "epoch": 0.6782142339469488, "grad_norm": 3.311447858810425, "learning_rate": 3.217857660530511e-07, "loss": 0.3006, "step": 14037 }, { "epoch": 0.6782625501280379, "grad_norm": 2.718034029006958, "learning_rate": 3.217374498719621e-07, "loss": 0.3865, "step": 14038 }, { "epoch": 0.678310866309127, "grad_norm": 2.659543514251709, "learning_rate": 3.216891336908731e-07, "loss": 0.3226, "step": 14039 }, { "epoch": 0.678359182490216, "grad_norm": 37.47212219238281, "learning_rate": 3.21640817509784e-07, "loss": 0.1843, "step": 14040 }, { "epoch": 0.678407498671305, "grad_norm": 2.266852378845215, "learning_rate": 3.21592501328695e-07, "loss": 0.2494, "step": 14041 }, { "epoch": 0.678455814852394, "grad_norm": 2.5405406951904297, "learning_rate": 3.215441851476059e-07, "loss": 0.31, "step": 14042 }, { "epoch": 0.6785041310334831, "grad_norm": 2.7576396465301514, "learning_rate": 3.2149586896651685e-07, "loss": 0.3032, "step": 14043 }, { "epoch": 0.6785524472145722, "grad_norm": 4.416896820068359, "learning_rate": 3.2144755278542784e-07, "loss": 0.4402, "step": 14044 }, { "epoch": 0.6786007633956612, "grad_norm": 2.7068800926208496, "learning_rate": 3.213992366043388e-07, "loss": 0.2255, "step": 14045 }, { "epoch": 0.6786490795767502, "grad_norm": 2.882030963897705, "learning_rate": 3.213509204232497e-07, "loss": 0.4054, "step": 14046 }, { "epoch": 0.6786973957578393, "grad_norm": 2.6696724891662598, "learning_rate": 3.2130260424216065e-07, "loss": 0.3237, "step": 14047 }, { "epoch": 0.6787457119389283, "grad_norm": 2.021296262741089, "learning_rate": 3.2125428806107164e-07, "loss": 0.2102, "step": 14048 }, { "epoch": 0.6787940281200174, "grad_norm": 1.3198777437210083, "learning_rate": 3.2120597187998263e-07, "loss": 0.1289, "step": 14049 }, { "epoch": 0.6788423443011065, "grad_norm": 12.835657119750977, "learning_rate": 3.211576556988935e-07, "loss": 0.3188, "step": 14050 }, { "epoch": 0.6788906604821955, "grad_norm": 2.896357536315918, "learning_rate": 3.211093395178045e-07, "loss": 0.2506, "step": 14051 }, { "epoch": 0.6789389766632845, "grad_norm": 2.202043294906616, "learning_rate": 3.210610233367155e-07, "loss": 0.2112, "step": 14052 }, { "epoch": 0.6789872928443735, "grad_norm": 2.3763155937194824, "learning_rate": 3.210127071556264e-07, "loss": 0.2232, "step": 14053 }, { "epoch": 0.6790356090254627, "grad_norm": 3.1849758625030518, "learning_rate": 3.2096439097453737e-07, "loss": 0.3023, "step": 14054 }, { "epoch": 0.6790839252065517, "grad_norm": 2.4603402614593506, "learning_rate": 3.209160747934483e-07, "loss": 0.2149, "step": 14055 }, { "epoch": 0.6791322413876407, "grad_norm": 4.3749589920043945, "learning_rate": 3.2086775861235925e-07, "loss": 0.3455, "step": 14056 }, { "epoch": 0.6791805575687297, "grad_norm": 3.1130809783935547, "learning_rate": 3.2081944243127024e-07, "loss": 0.2617, "step": 14057 }, { "epoch": 0.6792288737498188, "grad_norm": 2.0342178344726562, "learning_rate": 3.207711262501812e-07, "loss": 0.2226, "step": 14058 }, { "epoch": 0.6792771899309079, "grad_norm": 3.059523344039917, "learning_rate": 3.207228100690921e-07, "loss": 0.4754, "step": 14059 }, { "epoch": 0.6793255061119969, "grad_norm": 2.972806453704834, "learning_rate": 3.2067449388800305e-07, "loss": 0.33, "step": 14060 }, { "epoch": 0.679373822293086, "grad_norm": 3.0490615367889404, "learning_rate": 3.2062617770691404e-07, "loss": 0.3588, "step": 14061 }, { "epoch": 0.679422138474175, "grad_norm": 7.005829334259033, "learning_rate": 3.20577861525825e-07, "loss": 0.4037, "step": 14062 }, { "epoch": 0.679470454655264, "grad_norm": 5.3220415115356445, "learning_rate": 3.205295453447359e-07, "loss": 0.3617, "step": 14063 }, { "epoch": 0.6795187708363531, "grad_norm": 6.363746643066406, "learning_rate": 3.204812291636469e-07, "loss": 0.3302, "step": 14064 }, { "epoch": 0.6795670870174422, "grad_norm": 2.3667867183685303, "learning_rate": 3.204329129825579e-07, "loss": 0.2846, "step": 14065 }, { "epoch": 0.6796154031985312, "grad_norm": 2.7936208248138428, "learning_rate": 3.203845968014688e-07, "loss": 0.2967, "step": 14066 }, { "epoch": 0.6796637193796202, "grad_norm": 2.1348583698272705, "learning_rate": 3.2033628062037977e-07, "loss": 0.1976, "step": 14067 }, { "epoch": 0.6797120355607092, "grad_norm": 4.095012187957764, "learning_rate": 3.202879644392907e-07, "loss": 0.311, "step": 14068 }, { "epoch": 0.6797603517417983, "grad_norm": 4.064224720001221, "learning_rate": 3.2023964825820164e-07, "loss": 0.2622, "step": 14069 }, { "epoch": 0.6798086679228874, "grad_norm": 3.3535940647125244, "learning_rate": 3.2019133207711263e-07, "loss": 0.3291, "step": 14070 }, { "epoch": 0.6798569841039764, "grad_norm": 2.5747764110565186, "learning_rate": 3.2014301589602357e-07, "loss": 0.2793, "step": 14071 }, { "epoch": 0.6799053002850655, "grad_norm": 1.8369795083999634, "learning_rate": 3.200946997149345e-07, "loss": 0.2533, "step": 14072 }, { "epoch": 0.6799536164661545, "grad_norm": 2.7166080474853516, "learning_rate": 3.2004638353384545e-07, "loss": 0.276, "step": 14073 }, { "epoch": 0.6800019326472435, "grad_norm": 4.4493632316589355, "learning_rate": 3.1999806735275644e-07, "loss": 0.2599, "step": 14074 }, { "epoch": 0.6800502488283326, "grad_norm": 2.7973580360412598, "learning_rate": 3.1994975117166737e-07, "loss": 0.2863, "step": 14075 }, { "epoch": 0.6800985650094217, "grad_norm": 2.578469753265381, "learning_rate": 3.199014349905783e-07, "loss": 0.346, "step": 14076 }, { "epoch": 0.6801468811905107, "grad_norm": 2.473374605178833, "learning_rate": 3.198531188094893e-07, "loss": 0.2036, "step": 14077 }, { "epoch": 0.6801951973715997, "grad_norm": 3.031566858291626, "learning_rate": 3.1980480262840024e-07, "loss": 0.3981, "step": 14078 }, { "epoch": 0.6802435135526887, "grad_norm": 3.653350830078125, "learning_rate": 3.197564864473112e-07, "loss": 0.3373, "step": 14079 }, { "epoch": 0.6802918297337779, "grad_norm": 5.128104209899902, "learning_rate": 3.1970817026622217e-07, "loss": 0.2806, "step": 14080 }, { "epoch": 0.6803401459148669, "grad_norm": 3.6513609886169434, "learning_rate": 3.1965985408513305e-07, "loss": 0.3572, "step": 14081 }, { "epoch": 0.6803884620959559, "grad_norm": 5.260552406311035, "learning_rate": 3.1961153790404404e-07, "loss": 0.224, "step": 14082 }, { "epoch": 0.680436778277045, "grad_norm": 2.744112014770508, "learning_rate": 3.1956322172295503e-07, "loss": 0.3435, "step": 14083 }, { "epoch": 0.680485094458134, "grad_norm": 3.229003429412842, "learning_rate": 3.1951490554186597e-07, "loss": 0.33, "step": 14084 }, { "epoch": 0.6805334106392231, "grad_norm": 3.8811964988708496, "learning_rate": 3.194665893607769e-07, "loss": 0.3031, "step": 14085 }, { "epoch": 0.6805817268203121, "grad_norm": 2.3506505489349365, "learning_rate": 3.1941827317968784e-07, "loss": 0.2056, "step": 14086 }, { "epoch": 0.6806300430014012, "grad_norm": 2.0517022609710693, "learning_rate": 3.1936995699859883e-07, "loss": 0.2621, "step": 14087 }, { "epoch": 0.6806783591824902, "grad_norm": 4.176469802856445, "learning_rate": 3.1932164081750977e-07, "loss": 0.2561, "step": 14088 }, { "epoch": 0.6807266753635792, "grad_norm": 3.351949453353882, "learning_rate": 3.192733246364207e-07, "loss": 0.3328, "step": 14089 }, { "epoch": 0.6807749915446684, "grad_norm": 3.2075753211975098, "learning_rate": 3.192250084553317e-07, "loss": 0.4345, "step": 14090 }, { "epoch": 0.6808233077257574, "grad_norm": 5.310091018676758, "learning_rate": 3.1917669227424263e-07, "loss": 0.2354, "step": 14091 }, { "epoch": 0.6808716239068464, "grad_norm": 3.0178072452545166, "learning_rate": 3.1912837609315357e-07, "loss": 0.394, "step": 14092 }, { "epoch": 0.6809199400879354, "grad_norm": 2.6351852416992188, "learning_rate": 3.1908005991206456e-07, "loss": 0.3078, "step": 14093 }, { "epoch": 0.6809682562690245, "grad_norm": 4.210153579711914, "learning_rate": 3.1903174373097545e-07, "loss": 0.2092, "step": 14094 }, { "epoch": 0.6810165724501135, "grad_norm": 4.965404987335205, "learning_rate": 3.1898342754988644e-07, "loss": 0.307, "step": 14095 }, { "epoch": 0.6810648886312026, "grad_norm": 6.824380397796631, "learning_rate": 3.189351113687974e-07, "loss": 0.2722, "step": 14096 }, { "epoch": 0.6811132048122917, "grad_norm": 2.834564208984375, "learning_rate": 3.188867951877083e-07, "loss": 0.3152, "step": 14097 }, { "epoch": 0.6811615209933807, "grad_norm": 2.5697717666625977, "learning_rate": 3.188384790066193e-07, "loss": 0.3014, "step": 14098 }, { "epoch": 0.6812098371744697, "grad_norm": 2.025982618331909, "learning_rate": 3.1879016282553024e-07, "loss": 0.2652, "step": 14099 }, { "epoch": 0.6812581533555587, "grad_norm": 2.287332534790039, "learning_rate": 3.1874184664444123e-07, "loss": 0.2963, "step": 14100 }, { "epoch": 0.6813064695366479, "grad_norm": 1.9326626062393188, "learning_rate": 3.1869353046335217e-07, "loss": 0.1729, "step": 14101 }, { "epoch": 0.6813547857177369, "grad_norm": 4.35254430770874, "learning_rate": 3.186452142822631e-07, "loss": 0.3314, "step": 14102 }, { "epoch": 0.6814031018988259, "grad_norm": 6.768065452575684, "learning_rate": 3.185968981011741e-07, "loss": 0.1784, "step": 14103 }, { "epoch": 0.6814514180799149, "grad_norm": 3.8917758464813232, "learning_rate": 3.1854858192008503e-07, "loss": 0.3627, "step": 14104 }, { "epoch": 0.681499734261004, "grad_norm": 2.3979063034057617, "learning_rate": 3.1850026573899597e-07, "loss": 0.2206, "step": 14105 }, { "epoch": 0.6815480504420931, "grad_norm": 2.454390287399292, "learning_rate": 3.1845194955790696e-07, "loss": 0.4016, "step": 14106 }, { "epoch": 0.6815963666231821, "grad_norm": 2.834695816040039, "learning_rate": 3.1840363337681784e-07, "loss": 0.3588, "step": 14107 }, { "epoch": 0.6816446828042712, "grad_norm": 2.442359209060669, "learning_rate": 3.1835531719572883e-07, "loss": 0.3121, "step": 14108 }, { "epoch": 0.6816929989853602, "grad_norm": 4.887266635894775, "learning_rate": 3.183070010146398e-07, "loss": 0.4634, "step": 14109 }, { "epoch": 0.6817413151664492, "grad_norm": 3.7508537769317627, "learning_rate": 3.182586848335507e-07, "loss": 0.3818, "step": 14110 }, { "epoch": 0.6817896313475383, "grad_norm": 2.0238146781921387, "learning_rate": 3.182103686524617e-07, "loss": 0.1956, "step": 14111 }, { "epoch": 0.6818379475286274, "grad_norm": 3.7362401485443115, "learning_rate": 3.1816205247137263e-07, "loss": 0.2948, "step": 14112 }, { "epoch": 0.6818862637097164, "grad_norm": 3.5076041221618652, "learning_rate": 3.1811373629028357e-07, "loss": 0.308, "step": 14113 }, { "epoch": 0.6819345798908054, "grad_norm": 1.951533317565918, "learning_rate": 3.1806542010919456e-07, "loss": 0.2228, "step": 14114 }, { "epoch": 0.6819828960718944, "grad_norm": 4.345764636993408, "learning_rate": 3.180171039281055e-07, "loss": 0.2298, "step": 14115 }, { "epoch": 0.6820312122529836, "grad_norm": 2.8381574153900146, "learning_rate": 3.179687877470165e-07, "loss": 0.3665, "step": 14116 }, { "epoch": 0.6820795284340726, "grad_norm": 2.727532148361206, "learning_rate": 3.1792047156592743e-07, "loss": 0.2548, "step": 14117 }, { "epoch": 0.6821278446151616, "grad_norm": 3.1741394996643066, "learning_rate": 3.1787215538483836e-07, "loss": 0.303, "step": 14118 }, { "epoch": 0.6821761607962507, "grad_norm": 5.465235710144043, "learning_rate": 3.1782383920374935e-07, "loss": 0.3219, "step": 14119 }, { "epoch": 0.6822244769773397, "grad_norm": 1.942838191986084, "learning_rate": 3.1777552302266024e-07, "loss": 0.2689, "step": 14120 }, { "epoch": 0.6822727931584287, "grad_norm": 2.0190250873565674, "learning_rate": 3.1772720684157123e-07, "loss": 0.2417, "step": 14121 }, { "epoch": 0.6823211093395178, "grad_norm": 2.5407872200012207, "learning_rate": 3.176788906604822e-07, "loss": 0.303, "step": 14122 }, { "epoch": 0.6823694255206069, "grad_norm": 2.210289478302002, "learning_rate": 3.176305744793931e-07, "loss": 0.286, "step": 14123 }, { "epoch": 0.6824177417016959, "grad_norm": 2.9166815280914307, "learning_rate": 3.175822582983041e-07, "loss": 0.3514, "step": 14124 }, { "epoch": 0.6824660578827849, "grad_norm": 2.1955366134643555, "learning_rate": 3.1753394211721503e-07, "loss": 0.2524, "step": 14125 }, { "epoch": 0.6825143740638739, "grad_norm": 2.6645305156707764, "learning_rate": 3.1748562593612597e-07, "loss": 0.2961, "step": 14126 }, { "epoch": 0.6825626902449631, "grad_norm": 3.370277166366577, "learning_rate": 3.1743730975503696e-07, "loss": 0.2232, "step": 14127 }, { "epoch": 0.6826110064260521, "grad_norm": 2.6214537620544434, "learning_rate": 3.173889935739479e-07, "loss": 0.3413, "step": 14128 }, { "epoch": 0.6826593226071411, "grad_norm": 2.588153839111328, "learning_rate": 3.1734067739285883e-07, "loss": 0.3089, "step": 14129 }, { "epoch": 0.6827076387882302, "grad_norm": 2.6496686935424805, "learning_rate": 3.172923612117698e-07, "loss": 0.4137, "step": 14130 }, { "epoch": 0.6827559549693192, "grad_norm": 2.352295398712158, "learning_rate": 3.1724404503068076e-07, "loss": 0.1616, "step": 14131 }, { "epoch": 0.6828042711504083, "grad_norm": 2.9821372032165527, "learning_rate": 3.1719572884959175e-07, "loss": 0.3601, "step": 14132 }, { "epoch": 0.6828525873314973, "grad_norm": 1.6154463291168213, "learning_rate": 3.1714741266850263e-07, "loss": 0.19, "step": 14133 }, { "epoch": 0.6829009035125864, "grad_norm": 2.617262601852417, "learning_rate": 3.170990964874136e-07, "loss": 0.3381, "step": 14134 }, { "epoch": 0.6829492196936754, "grad_norm": 2.1713600158691406, "learning_rate": 3.170507803063246e-07, "loss": 0.2548, "step": 14135 }, { "epoch": 0.6829975358747644, "grad_norm": 2.479360818862915, "learning_rate": 3.170024641252355e-07, "loss": 0.3927, "step": 14136 }, { "epoch": 0.6830458520558536, "grad_norm": 2.841621160507202, "learning_rate": 3.169541479441465e-07, "loss": 0.2278, "step": 14137 }, { "epoch": 0.6830941682369426, "grad_norm": 3.1606295108795166, "learning_rate": 3.1690583176305743e-07, "loss": 0.2322, "step": 14138 }, { "epoch": 0.6831424844180316, "grad_norm": 2.5920748710632324, "learning_rate": 3.1685751558196836e-07, "loss": 0.3616, "step": 14139 }, { "epoch": 0.6831908005991206, "grad_norm": 2.3923723697662354, "learning_rate": 3.1680919940087935e-07, "loss": 0.2554, "step": 14140 }, { "epoch": 0.6832391167802097, "grad_norm": 3.1179122924804688, "learning_rate": 3.167608832197903e-07, "loss": 0.2371, "step": 14141 }, { "epoch": 0.6832874329612988, "grad_norm": 2.2178025245666504, "learning_rate": 3.1671256703870123e-07, "loss": 0.2835, "step": 14142 }, { "epoch": 0.6833357491423878, "grad_norm": 3.5328996181488037, "learning_rate": 3.166642508576122e-07, "loss": 0.3308, "step": 14143 }, { "epoch": 0.6833840653234768, "grad_norm": 2.8676040172576904, "learning_rate": 3.1661593467652316e-07, "loss": 0.3781, "step": 14144 }, { "epoch": 0.6834323815045659, "grad_norm": 8.477129936218262, "learning_rate": 3.165676184954341e-07, "loss": 0.3364, "step": 14145 }, { "epoch": 0.6834806976856549, "grad_norm": 2.1942708492279053, "learning_rate": 3.1651930231434503e-07, "loss": 0.2929, "step": 14146 }, { "epoch": 0.6835290138667439, "grad_norm": 1.7872239351272583, "learning_rate": 3.16470986133256e-07, "loss": 0.1811, "step": 14147 }, { "epoch": 0.683577330047833, "grad_norm": 3.5951342582702637, "learning_rate": 3.16422669952167e-07, "loss": 0.3023, "step": 14148 }, { "epoch": 0.6836256462289221, "grad_norm": 2.474699020385742, "learning_rate": 3.163743537710779e-07, "loss": 0.212, "step": 14149 }, { "epoch": 0.6836739624100111, "grad_norm": 4.156750202178955, "learning_rate": 3.163260375899889e-07, "loss": 0.3028, "step": 14150 }, { "epoch": 0.6837222785911001, "grad_norm": 1.6648046970367432, "learning_rate": 3.162777214088998e-07, "loss": 0.1781, "step": 14151 }, { "epoch": 0.6837705947721892, "grad_norm": 2.412749767303467, "learning_rate": 3.1622940522781076e-07, "loss": 0.2449, "step": 14152 }, { "epoch": 0.6838189109532783, "grad_norm": 2.7417595386505127, "learning_rate": 3.1618108904672175e-07, "loss": 0.2667, "step": 14153 }, { "epoch": 0.6838672271343673, "grad_norm": 2.5813000202178955, "learning_rate": 3.161327728656327e-07, "loss": 0.3827, "step": 14154 }, { "epoch": 0.6839155433154563, "grad_norm": 2.3191447257995605, "learning_rate": 3.160844566845436e-07, "loss": 0.2795, "step": 14155 }, { "epoch": 0.6839638594965454, "grad_norm": 1.9704232215881348, "learning_rate": 3.160361405034546e-07, "loss": 0.2351, "step": 14156 }, { "epoch": 0.6840121756776344, "grad_norm": 3.6991117000579834, "learning_rate": 3.1598782432236555e-07, "loss": 0.408, "step": 14157 }, { "epoch": 0.6840604918587235, "grad_norm": 2.635979413986206, "learning_rate": 3.159395081412765e-07, "loss": 0.2904, "step": 14158 }, { "epoch": 0.6841088080398126, "grad_norm": 5.948727130889893, "learning_rate": 3.1589119196018743e-07, "loss": 0.3472, "step": 14159 }, { "epoch": 0.6841571242209016, "grad_norm": 4.959198474884033, "learning_rate": 3.158428757790984e-07, "loss": 0.3532, "step": 14160 }, { "epoch": 0.6842054404019906, "grad_norm": 2.6482441425323486, "learning_rate": 3.1579455959800936e-07, "loss": 0.3053, "step": 14161 }, { "epoch": 0.6842537565830796, "grad_norm": 2.296741247177124, "learning_rate": 3.157462434169203e-07, "loss": 0.2379, "step": 14162 }, { "epoch": 0.6843020727641688, "grad_norm": 4.730119705200195, "learning_rate": 3.156979272358313e-07, "loss": 0.379, "step": 14163 }, { "epoch": 0.6843503889452578, "grad_norm": 3.023756265640259, "learning_rate": 3.1564961105474217e-07, "loss": 0.3259, "step": 14164 }, { "epoch": 0.6843987051263468, "grad_norm": 2.742953062057495, "learning_rate": 3.1560129487365316e-07, "loss": 0.3208, "step": 14165 }, { "epoch": 0.6844470213074358, "grad_norm": 2.0809409618377686, "learning_rate": 3.1555297869256415e-07, "loss": 0.2558, "step": 14166 }, { "epoch": 0.6844953374885249, "grad_norm": 9.124167442321777, "learning_rate": 3.155046625114751e-07, "loss": 0.4982, "step": 14167 }, { "epoch": 0.684543653669614, "grad_norm": 2.304405927658081, "learning_rate": 3.15456346330386e-07, "loss": 0.197, "step": 14168 }, { "epoch": 0.684591969850703, "grad_norm": 2.6921982765197754, "learning_rate": 3.15408030149297e-07, "loss": 0.2924, "step": 14169 }, { "epoch": 0.6846402860317921, "grad_norm": 2.8139781951904297, "learning_rate": 3.1535971396820795e-07, "loss": 0.2825, "step": 14170 }, { "epoch": 0.6846886022128811, "grad_norm": 2.8833553791046143, "learning_rate": 3.153113977871189e-07, "loss": 0.2746, "step": 14171 }, { "epoch": 0.6847369183939701, "grad_norm": 12.851605415344238, "learning_rate": 3.152630816060298e-07, "loss": 0.2944, "step": 14172 }, { "epoch": 0.6847852345750591, "grad_norm": 6.588525772094727, "learning_rate": 3.152147654249408e-07, "loss": 0.2168, "step": 14173 }, { "epoch": 0.6848335507561483, "grad_norm": 2.6088240146636963, "learning_rate": 3.1516644924385175e-07, "loss": 0.2431, "step": 14174 }, { "epoch": 0.6848818669372373, "grad_norm": 2.8244152069091797, "learning_rate": 3.151181330627627e-07, "loss": 0.3301, "step": 14175 }, { "epoch": 0.6849301831183263, "grad_norm": 2.1654765605926514, "learning_rate": 3.150698168816737e-07, "loss": 0.2326, "step": 14176 }, { "epoch": 0.6849784992994153, "grad_norm": 3.2905101776123047, "learning_rate": 3.1502150070058456e-07, "loss": 0.2706, "step": 14177 }, { "epoch": 0.6850268154805044, "grad_norm": 4.198575973510742, "learning_rate": 3.1497318451949555e-07, "loss": 0.3415, "step": 14178 }, { "epoch": 0.6850751316615935, "grad_norm": 2.1547610759735107, "learning_rate": 3.1492486833840654e-07, "loss": 0.215, "step": 14179 }, { "epoch": 0.6851234478426825, "grad_norm": 7.862307548522949, "learning_rate": 3.1487655215731743e-07, "loss": 0.3555, "step": 14180 }, { "epoch": 0.6851717640237716, "grad_norm": 2.5070645809173584, "learning_rate": 3.148282359762284e-07, "loss": 0.2169, "step": 14181 }, { "epoch": 0.6852200802048606, "grad_norm": 2.4776203632354736, "learning_rate": 3.147799197951394e-07, "loss": 0.3164, "step": 14182 }, { "epoch": 0.6852683963859496, "grad_norm": 2.5644986629486084, "learning_rate": 3.1473160361405035e-07, "loss": 0.296, "step": 14183 }, { "epoch": 0.6853167125670387, "grad_norm": 1.4324198961257935, "learning_rate": 3.146832874329613e-07, "loss": 0.1593, "step": 14184 }, { "epoch": 0.6853650287481278, "grad_norm": 9.445731163024902, "learning_rate": 3.146349712518722e-07, "loss": 0.2334, "step": 14185 }, { "epoch": 0.6854133449292168, "grad_norm": 2.970669746398926, "learning_rate": 3.145866550707832e-07, "loss": 0.446, "step": 14186 }, { "epoch": 0.6854616611103058, "grad_norm": 2.311537981033325, "learning_rate": 3.1453833888969415e-07, "loss": 0.2307, "step": 14187 }, { "epoch": 0.6855099772913948, "grad_norm": 3.315488338470459, "learning_rate": 3.144900227086051e-07, "loss": 0.2768, "step": 14188 }, { "epoch": 0.685558293472484, "grad_norm": 4.471951961517334, "learning_rate": 3.144417065275161e-07, "loss": 0.1532, "step": 14189 }, { "epoch": 0.685606609653573, "grad_norm": 3.148850440979004, "learning_rate": 3.1439339034642696e-07, "loss": 0.369, "step": 14190 }, { "epoch": 0.685654925834662, "grad_norm": 2.5844736099243164, "learning_rate": 3.1434507416533795e-07, "loss": 0.2069, "step": 14191 }, { "epoch": 0.6857032420157511, "grad_norm": 8.697294235229492, "learning_rate": 3.1429675798424894e-07, "loss": 0.1356, "step": 14192 }, { "epoch": 0.6857515581968401, "grad_norm": 5.776721954345703, "learning_rate": 3.142484418031598e-07, "loss": 0.3912, "step": 14193 }, { "epoch": 0.6857998743779292, "grad_norm": 2.3870913982391357, "learning_rate": 3.142001256220708e-07, "loss": 0.2133, "step": 14194 }, { "epoch": 0.6858481905590182, "grad_norm": 2.6218786239624023, "learning_rate": 3.141518094409818e-07, "loss": 0.3297, "step": 14195 }, { "epoch": 0.6858965067401073, "grad_norm": 4.754032135009766, "learning_rate": 3.141034932598927e-07, "loss": 0.2687, "step": 14196 }, { "epoch": 0.6859448229211963, "grad_norm": 23.07970428466797, "learning_rate": 3.140551770788037e-07, "loss": 0.2186, "step": 14197 }, { "epoch": 0.6859931391022853, "grad_norm": 6.944838523864746, "learning_rate": 3.140068608977146e-07, "loss": 0.3811, "step": 14198 }, { "epoch": 0.6860414552833745, "grad_norm": 4.220679759979248, "learning_rate": 3.139585447166256e-07, "loss": 0.395, "step": 14199 }, { "epoch": 0.6860897714644635, "grad_norm": 2.4569389820098877, "learning_rate": 3.1391022853553654e-07, "loss": 0.2843, "step": 14200 }, { "epoch": 0.6861380876455525, "grad_norm": 3.280308961868286, "learning_rate": 3.138619123544475e-07, "loss": 0.2277, "step": 14201 }, { "epoch": 0.6861864038266415, "grad_norm": 4.159555912017822, "learning_rate": 3.1381359617335847e-07, "loss": 0.3177, "step": 14202 }, { "epoch": 0.6862347200077306, "grad_norm": 2.2330849170684814, "learning_rate": 3.1376527999226936e-07, "loss": 0.2165, "step": 14203 }, { "epoch": 0.6862830361888196, "grad_norm": 2.7104737758636475, "learning_rate": 3.1371696381118035e-07, "loss": 0.2353, "step": 14204 }, { "epoch": 0.6863313523699087, "grad_norm": 2.3925864696502686, "learning_rate": 3.1366864763009134e-07, "loss": 0.2805, "step": 14205 }, { "epoch": 0.6863796685509977, "grad_norm": 2.135035991668701, "learning_rate": 3.136203314490022e-07, "loss": 0.2561, "step": 14206 }, { "epoch": 0.6864279847320868, "grad_norm": 2.7888736724853516, "learning_rate": 3.135720152679132e-07, "loss": 0.4037, "step": 14207 }, { "epoch": 0.6864763009131758, "grad_norm": 3.2206952571868896, "learning_rate": 3.135236990868242e-07, "loss": 0.4151, "step": 14208 }, { "epoch": 0.6865246170942648, "grad_norm": 2.6597461700439453, "learning_rate": 3.134753829057351e-07, "loss": 0.3663, "step": 14209 }, { "epoch": 0.686572933275354, "grad_norm": 2.4967663288116455, "learning_rate": 3.134270667246461e-07, "loss": 0.276, "step": 14210 }, { "epoch": 0.686621249456443, "grad_norm": 6.35844612121582, "learning_rate": 3.13378750543557e-07, "loss": 0.2212, "step": 14211 }, { "epoch": 0.686669565637532, "grad_norm": 2.6820337772369385, "learning_rate": 3.13330434362468e-07, "loss": 0.2939, "step": 14212 }, { "epoch": 0.686717881818621, "grad_norm": 3.0035500526428223, "learning_rate": 3.1328211818137894e-07, "loss": 0.1829, "step": 14213 }, { "epoch": 0.6867661979997101, "grad_norm": 2.531404495239258, "learning_rate": 3.132338020002899e-07, "loss": 0.2564, "step": 14214 }, { "epoch": 0.6868145141807992, "grad_norm": 2.7251429557800293, "learning_rate": 3.1318548581920087e-07, "loss": 0.2818, "step": 14215 }, { "epoch": 0.6868628303618882, "grad_norm": 2.8495781421661377, "learning_rate": 3.1313716963811175e-07, "loss": 0.3268, "step": 14216 }, { "epoch": 0.6869111465429772, "grad_norm": 2.4451959133148193, "learning_rate": 3.1308885345702274e-07, "loss": 0.2136, "step": 14217 }, { "epoch": 0.6869594627240663, "grad_norm": 3.30553936958313, "learning_rate": 3.1304053727593373e-07, "loss": 0.2815, "step": 14218 }, { "epoch": 0.6870077789051553, "grad_norm": 2.524935007095337, "learning_rate": 3.129922210948446e-07, "loss": 0.3131, "step": 14219 }, { "epoch": 0.6870560950862444, "grad_norm": 3.6299593448638916, "learning_rate": 3.129439049137556e-07, "loss": 0.2941, "step": 14220 }, { "epoch": 0.6871044112673335, "grad_norm": 5.7917160987854, "learning_rate": 3.128955887326666e-07, "loss": 0.4202, "step": 14221 }, { "epoch": 0.6871527274484225, "grad_norm": 3.1671252250671387, "learning_rate": 3.128472725515775e-07, "loss": 0.2884, "step": 14222 }, { "epoch": 0.6872010436295115, "grad_norm": 2.4484570026397705, "learning_rate": 3.1279895637048847e-07, "loss": 0.2737, "step": 14223 }, { "epoch": 0.6872493598106005, "grad_norm": 9.505075454711914, "learning_rate": 3.127506401893994e-07, "loss": 0.3882, "step": 14224 }, { "epoch": 0.6872976759916897, "grad_norm": 2.8610293865203857, "learning_rate": 3.1270232400831035e-07, "loss": 0.2988, "step": 14225 }, { "epoch": 0.6873459921727787, "grad_norm": 2.7038772106170654, "learning_rate": 3.1265400782722134e-07, "loss": 0.3555, "step": 14226 }, { "epoch": 0.6873943083538677, "grad_norm": 1.9337040185928345, "learning_rate": 3.126056916461323e-07, "loss": 0.1972, "step": 14227 }, { "epoch": 0.6874426245349567, "grad_norm": 3.7143826484680176, "learning_rate": 3.1255737546504326e-07, "loss": 0.3345, "step": 14228 }, { "epoch": 0.6874909407160458, "grad_norm": 2.3809759616851807, "learning_rate": 3.1250905928395415e-07, "loss": 0.2364, "step": 14229 }, { "epoch": 0.6875392568971348, "grad_norm": 2.729918956756592, "learning_rate": 3.1246074310286514e-07, "loss": 0.2748, "step": 14230 }, { "epoch": 0.6875875730782239, "grad_norm": 2.502113103866577, "learning_rate": 3.1241242692177613e-07, "loss": 0.2047, "step": 14231 }, { "epoch": 0.687635889259313, "grad_norm": 6.864233016967773, "learning_rate": 3.12364110740687e-07, "loss": 0.3454, "step": 14232 }, { "epoch": 0.687684205440402, "grad_norm": 3.1000678539276123, "learning_rate": 3.12315794559598e-07, "loss": 0.3562, "step": 14233 }, { "epoch": 0.687732521621491, "grad_norm": 2.915095567703247, "learning_rate": 3.12267478378509e-07, "loss": 0.3022, "step": 14234 }, { "epoch": 0.68778083780258, "grad_norm": 6.506309509277344, "learning_rate": 3.122191621974199e-07, "loss": 0.3531, "step": 14235 }, { "epoch": 0.6878291539836692, "grad_norm": 2.7779414653778076, "learning_rate": 3.1217084601633087e-07, "loss": 0.3243, "step": 14236 }, { "epoch": 0.6878774701647582, "grad_norm": 2.364258050918579, "learning_rate": 3.121225298352418e-07, "loss": 0.2744, "step": 14237 }, { "epoch": 0.6879257863458472, "grad_norm": 3.3046088218688965, "learning_rate": 3.1207421365415274e-07, "loss": 0.2906, "step": 14238 }, { "epoch": 0.6879741025269362, "grad_norm": 2.1365673542022705, "learning_rate": 3.1202589747306373e-07, "loss": 0.153, "step": 14239 }, { "epoch": 0.6880224187080253, "grad_norm": 2.186677932739258, "learning_rate": 3.1197758129197467e-07, "loss": 0.2874, "step": 14240 }, { "epoch": 0.6880707348891144, "grad_norm": 3.199509382247925, "learning_rate": 3.119292651108856e-07, "loss": 0.2638, "step": 14241 }, { "epoch": 0.6881190510702034, "grad_norm": 3.793372869491577, "learning_rate": 3.1188094892979655e-07, "loss": 0.3952, "step": 14242 }, { "epoch": 0.6881673672512925, "grad_norm": 4.450831413269043, "learning_rate": 3.1183263274870754e-07, "loss": 0.3541, "step": 14243 }, { "epoch": 0.6882156834323815, "grad_norm": 3.995464563369751, "learning_rate": 3.117843165676185e-07, "loss": 0.2499, "step": 14244 }, { "epoch": 0.6882639996134705, "grad_norm": 3.0174381732940674, "learning_rate": 3.117360003865294e-07, "loss": 0.3592, "step": 14245 }, { "epoch": 0.6883123157945596, "grad_norm": 1.873279094696045, "learning_rate": 3.116876842054404e-07, "loss": 0.3083, "step": 14246 }, { "epoch": 0.6883606319756487, "grad_norm": 2.676919937133789, "learning_rate": 3.116393680243514e-07, "loss": 0.277, "step": 14247 }, { "epoch": 0.6884089481567377, "grad_norm": 2.3220744132995605, "learning_rate": 3.115910518432623e-07, "loss": 0.2066, "step": 14248 }, { "epoch": 0.6884572643378267, "grad_norm": 1.711712121963501, "learning_rate": 3.1154273566217326e-07, "loss": 0.1874, "step": 14249 }, { "epoch": 0.6885055805189157, "grad_norm": 2.8203887939453125, "learning_rate": 3.114944194810842e-07, "loss": 0.3262, "step": 14250 }, { "epoch": 0.6885538967000049, "grad_norm": 2.4888830184936523, "learning_rate": 3.1144610329999514e-07, "loss": 0.2719, "step": 14251 }, { "epoch": 0.6886022128810939, "grad_norm": 2.8801965713500977, "learning_rate": 3.1139778711890613e-07, "loss": 0.259, "step": 14252 }, { "epoch": 0.6886505290621829, "grad_norm": 4.807213306427002, "learning_rate": 3.1134947093781707e-07, "loss": 0.2898, "step": 14253 }, { "epoch": 0.688698845243272, "grad_norm": 2.631561279296875, "learning_rate": 3.11301154756728e-07, "loss": 0.3145, "step": 14254 }, { "epoch": 0.688747161424361, "grad_norm": 3.114424467086792, "learning_rate": 3.1125283857563894e-07, "loss": 0.2182, "step": 14255 }, { "epoch": 0.68879547760545, "grad_norm": 2.2744925022125244, "learning_rate": 3.1120452239454993e-07, "loss": 0.1956, "step": 14256 }, { "epoch": 0.6888437937865391, "grad_norm": 4.594661712646484, "learning_rate": 3.1115620621346087e-07, "loss": 0.2808, "step": 14257 }, { "epoch": 0.6888921099676282, "grad_norm": 2.9323978424072266, "learning_rate": 3.111078900323718e-07, "loss": 0.4223, "step": 14258 }, { "epoch": 0.6889404261487172, "grad_norm": 4.242496490478516, "learning_rate": 3.110595738512828e-07, "loss": 0.3701, "step": 14259 }, { "epoch": 0.6889887423298062, "grad_norm": 4.036792278289795, "learning_rate": 3.110112576701938e-07, "loss": 0.4421, "step": 14260 }, { "epoch": 0.6890370585108953, "grad_norm": 2.617370843887329, "learning_rate": 3.1096294148910467e-07, "loss": 0.3631, "step": 14261 }, { "epoch": 0.6890853746919844, "grad_norm": 2.7750158309936523, "learning_rate": 3.1091462530801566e-07, "loss": 0.3597, "step": 14262 }, { "epoch": 0.6891336908730734, "grad_norm": 2.511998414993286, "learning_rate": 3.108663091269266e-07, "loss": 0.2601, "step": 14263 }, { "epoch": 0.6891820070541624, "grad_norm": 2.97538685798645, "learning_rate": 3.1081799294583754e-07, "loss": 0.3969, "step": 14264 }, { "epoch": 0.6892303232352515, "grad_norm": 3.061633825302124, "learning_rate": 3.107696767647485e-07, "loss": 0.4053, "step": 14265 }, { "epoch": 0.6892786394163405, "grad_norm": 2.7720282077789307, "learning_rate": 3.1072136058365946e-07, "loss": 0.2951, "step": 14266 }, { "epoch": 0.6893269555974296, "grad_norm": 3.2299935817718506, "learning_rate": 3.106730444025704e-07, "loss": 0.3396, "step": 14267 }, { "epoch": 0.6893752717785187, "grad_norm": 1.5178859233856201, "learning_rate": 3.1062472822148134e-07, "loss": 0.1517, "step": 14268 }, { "epoch": 0.6894235879596077, "grad_norm": 8.222966194152832, "learning_rate": 3.1057641204039233e-07, "loss": 0.1858, "step": 14269 }, { "epoch": 0.6894719041406967, "grad_norm": 3.4958529472351074, "learning_rate": 3.1052809585930327e-07, "loss": 0.2986, "step": 14270 }, { "epoch": 0.6895202203217857, "grad_norm": 2.147982358932495, "learning_rate": 3.104797796782142e-07, "loss": 0.284, "step": 14271 }, { "epoch": 0.6895685365028749, "grad_norm": 3.5708272457122803, "learning_rate": 3.104314634971252e-07, "loss": 0.1976, "step": 14272 }, { "epoch": 0.6896168526839639, "grad_norm": 4.791383266448975, "learning_rate": 3.1038314731603613e-07, "loss": 0.3771, "step": 14273 }, { "epoch": 0.6896651688650529, "grad_norm": 2.2555692195892334, "learning_rate": 3.1033483113494707e-07, "loss": 0.2685, "step": 14274 }, { "epoch": 0.6897134850461419, "grad_norm": 4.525725364685059, "learning_rate": 3.1028651495385806e-07, "loss": 0.262, "step": 14275 }, { "epoch": 0.689761801227231, "grad_norm": 2.4284701347351074, "learning_rate": 3.1023819877276894e-07, "loss": 0.3076, "step": 14276 }, { "epoch": 0.6898101174083201, "grad_norm": 2.5799481868743896, "learning_rate": 3.1018988259167993e-07, "loss": 0.3084, "step": 14277 }, { "epoch": 0.6898584335894091, "grad_norm": 5.597795486450195, "learning_rate": 3.101415664105909e-07, "loss": 0.4314, "step": 14278 }, { "epoch": 0.6899067497704982, "grad_norm": 2.9100003242492676, "learning_rate": 3.1009325022950186e-07, "loss": 0.4527, "step": 14279 }, { "epoch": 0.6899550659515872, "grad_norm": 4.784740447998047, "learning_rate": 3.100449340484128e-07, "loss": 0.37, "step": 14280 }, { "epoch": 0.6900033821326762, "grad_norm": 2.908043146133423, "learning_rate": 3.0999661786732373e-07, "loss": 0.3761, "step": 14281 }, { "epoch": 0.6900516983137652, "grad_norm": 3.4446773529052734, "learning_rate": 3.099483016862347e-07, "loss": 0.3632, "step": 14282 }, { "epoch": 0.6901000144948544, "grad_norm": 2.8323111534118652, "learning_rate": 3.0989998550514566e-07, "loss": 0.2437, "step": 14283 }, { "epoch": 0.6901483306759434, "grad_norm": 2.164433240890503, "learning_rate": 3.098516693240566e-07, "loss": 0.2575, "step": 14284 }, { "epoch": 0.6901966468570324, "grad_norm": 2.4159839153289795, "learning_rate": 3.098033531429676e-07, "loss": 0.2308, "step": 14285 }, { "epoch": 0.6902449630381214, "grad_norm": 2.9782581329345703, "learning_rate": 3.097550369618785e-07, "loss": 0.3238, "step": 14286 }, { "epoch": 0.6902932792192105, "grad_norm": 2.498806953430176, "learning_rate": 3.0970672078078946e-07, "loss": 0.2517, "step": 14287 }, { "epoch": 0.6903415954002996, "grad_norm": 2.9995510578155518, "learning_rate": 3.0965840459970045e-07, "loss": 0.3645, "step": 14288 }, { "epoch": 0.6903899115813886, "grad_norm": 2.983461618423462, "learning_rate": 3.0961008841861134e-07, "loss": 0.247, "step": 14289 }, { "epoch": 0.6904382277624777, "grad_norm": 2.6611905097961426, "learning_rate": 3.0956177223752233e-07, "loss": 0.2571, "step": 14290 }, { "epoch": 0.6904865439435667, "grad_norm": 2.319528579711914, "learning_rate": 3.095134560564333e-07, "loss": 0.2717, "step": 14291 }, { "epoch": 0.6905348601246557, "grad_norm": 4.74042272567749, "learning_rate": 3.094651398753442e-07, "loss": 0.3939, "step": 14292 }, { "epoch": 0.6905831763057448, "grad_norm": 3.063918113708496, "learning_rate": 3.094168236942552e-07, "loss": 0.3406, "step": 14293 }, { "epoch": 0.6906314924868339, "grad_norm": 4.272457122802734, "learning_rate": 3.0936850751316613e-07, "loss": 0.3971, "step": 14294 }, { "epoch": 0.6906798086679229, "grad_norm": 1.7936720848083496, "learning_rate": 3.093201913320771e-07, "loss": 0.2329, "step": 14295 }, { "epoch": 0.6907281248490119, "grad_norm": 2.599938154220581, "learning_rate": 3.0927187515098806e-07, "loss": 0.3563, "step": 14296 }, { "epoch": 0.6907764410301009, "grad_norm": 2.4476277828216553, "learning_rate": 3.09223558969899e-07, "loss": 0.2731, "step": 14297 }, { "epoch": 0.6908247572111901, "grad_norm": 3.063793897628784, "learning_rate": 3.0917524278881e-07, "loss": 0.4486, "step": 14298 }, { "epoch": 0.6908730733922791, "grad_norm": 2.741849660873413, "learning_rate": 3.091269266077209e-07, "loss": 0.3297, "step": 14299 }, { "epoch": 0.6909213895733681, "grad_norm": 2.170635938644409, "learning_rate": 3.0907861042663186e-07, "loss": 0.1771, "step": 14300 }, { "epoch": 0.6909697057544572, "grad_norm": 2.206968069076538, "learning_rate": 3.0903029424554285e-07, "loss": 0.2335, "step": 14301 }, { "epoch": 0.6910180219355462, "grad_norm": 3.5031988620758057, "learning_rate": 3.0898197806445373e-07, "loss": 0.2933, "step": 14302 }, { "epoch": 0.6910663381166353, "grad_norm": 3.0185835361480713, "learning_rate": 3.089336618833647e-07, "loss": 0.3839, "step": 14303 }, { "epoch": 0.6911146542977243, "grad_norm": 2.2742807865142822, "learning_rate": 3.088853457022757e-07, "loss": 0.2424, "step": 14304 }, { "epoch": 0.6911629704788134, "grad_norm": 2.987823963165283, "learning_rate": 3.088370295211866e-07, "loss": 0.2108, "step": 14305 }, { "epoch": 0.6912112866599024, "grad_norm": 2.7601890563964844, "learning_rate": 3.087887133400976e-07, "loss": 0.2647, "step": 14306 }, { "epoch": 0.6912596028409914, "grad_norm": 4.631529808044434, "learning_rate": 3.0874039715900853e-07, "loss": 0.361, "step": 14307 }, { "epoch": 0.6913079190220804, "grad_norm": 2.7819597721099854, "learning_rate": 3.0869208097791946e-07, "loss": 0.3307, "step": 14308 }, { "epoch": 0.6913562352031696, "grad_norm": 2.2780096530914307, "learning_rate": 3.0864376479683045e-07, "loss": 0.2119, "step": 14309 }, { "epoch": 0.6914045513842586, "grad_norm": 2.291584014892578, "learning_rate": 3.085954486157414e-07, "loss": 0.191, "step": 14310 }, { "epoch": 0.6914528675653476, "grad_norm": 6.0870041847229, "learning_rate": 3.085471324346524e-07, "loss": 0.3032, "step": 14311 }, { "epoch": 0.6915011837464367, "grad_norm": 4.411505222320557, "learning_rate": 3.084988162535633e-07, "loss": 0.3139, "step": 14312 }, { "epoch": 0.6915494999275257, "grad_norm": 2.3462419509887695, "learning_rate": 3.0845050007247426e-07, "loss": 0.2619, "step": 14313 }, { "epoch": 0.6915978161086148, "grad_norm": 3.2686095237731934, "learning_rate": 3.0840218389138525e-07, "loss": 0.3519, "step": 14314 }, { "epoch": 0.6916461322897038, "grad_norm": 2.3504116535186768, "learning_rate": 3.0835386771029613e-07, "loss": 0.2472, "step": 14315 }, { "epoch": 0.6916944484707929, "grad_norm": 2.342190980911255, "learning_rate": 3.083055515292071e-07, "loss": 0.2603, "step": 14316 }, { "epoch": 0.6917427646518819, "grad_norm": 2.849369525909424, "learning_rate": 3.082572353481181e-07, "loss": 0.3191, "step": 14317 }, { "epoch": 0.6917910808329709, "grad_norm": 2.222813129425049, "learning_rate": 3.08208919167029e-07, "loss": 0.2403, "step": 14318 }, { "epoch": 0.69183939701406, "grad_norm": 2.7835402488708496, "learning_rate": 3.0816060298594e-07, "loss": 0.2889, "step": 14319 }, { "epoch": 0.6918877131951491, "grad_norm": 2.316981792449951, "learning_rate": 3.081122868048509e-07, "loss": 0.2731, "step": 14320 }, { "epoch": 0.6919360293762381, "grad_norm": 2.8636350631713867, "learning_rate": 3.0806397062376186e-07, "loss": 0.3081, "step": 14321 }, { "epoch": 0.6919843455573271, "grad_norm": 4.148293972015381, "learning_rate": 3.0801565444267285e-07, "loss": 0.3265, "step": 14322 }, { "epoch": 0.6920326617384162, "grad_norm": 2.7427823543548584, "learning_rate": 3.079673382615838e-07, "loss": 0.2922, "step": 14323 }, { "epoch": 0.6920809779195053, "grad_norm": 2.301830768585205, "learning_rate": 3.079190220804947e-07, "loss": 0.2371, "step": 14324 }, { "epoch": 0.6921292941005943, "grad_norm": 3.1642911434173584, "learning_rate": 3.0787070589940566e-07, "loss": 0.1789, "step": 14325 }, { "epoch": 0.6921776102816833, "grad_norm": 4.140202045440674, "learning_rate": 3.0782238971831665e-07, "loss": 0.3139, "step": 14326 }, { "epoch": 0.6922259264627724, "grad_norm": 4.695125579833984, "learning_rate": 3.0777407353722764e-07, "loss": 0.2916, "step": 14327 }, { "epoch": 0.6922742426438614, "grad_norm": 2.3955259323120117, "learning_rate": 3.0772575735613853e-07, "loss": 0.3199, "step": 14328 }, { "epoch": 0.6923225588249505, "grad_norm": 3.709606409072876, "learning_rate": 3.076774411750495e-07, "loss": 0.2164, "step": 14329 }, { "epoch": 0.6923708750060396, "grad_norm": 2.565528392791748, "learning_rate": 3.076291249939605e-07, "loss": 0.3184, "step": 14330 }, { "epoch": 0.6924191911871286, "grad_norm": 1.8929123878479004, "learning_rate": 3.075808088128714e-07, "loss": 0.2265, "step": 14331 }, { "epoch": 0.6924675073682176, "grad_norm": 3.71012282371521, "learning_rate": 3.075324926317824e-07, "loss": 0.3262, "step": 14332 }, { "epoch": 0.6925158235493066, "grad_norm": 3.022958755493164, "learning_rate": 3.074841764506933e-07, "loss": 0.2188, "step": 14333 }, { "epoch": 0.6925641397303957, "grad_norm": 5.971786975860596, "learning_rate": 3.0743586026960426e-07, "loss": 0.3092, "step": 14334 }, { "epoch": 0.6926124559114848, "grad_norm": 2.5660483837127686, "learning_rate": 3.0738754408851525e-07, "loss": 0.2779, "step": 14335 }, { "epoch": 0.6926607720925738, "grad_norm": 1.9329785108566284, "learning_rate": 3.073392279074262e-07, "loss": 0.2413, "step": 14336 }, { "epoch": 0.6927090882736628, "grad_norm": 2.7649099826812744, "learning_rate": 3.072909117263371e-07, "loss": 0.1668, "step": 14337 }, { "epoch": 0.6927574044547519, "grad_norm": 2.13521671295166, "learning_rate": 3.0724259554524806e-07, "loss": 0.1806, "step": 14338 }, { "epoch": 0.6928057206358409, "grad_norm": 2.2151594161987305, "learning_rate": 3.0719427936415905e-07, "loss": 0.292, "step": 14339 }, { "epoch": 0.69285403681693, "grad_norm": 4.052314281463623, "learning_rate": 3.0714596318307e-07, "loss": 0.2608, "step": 14340 }, { "epoch": 0.692902352998019, "grad_norm": 3.3284196853637695, "learning_rate": 3.070976470019809e-07, "loss": 0.3801, "step": 14341 }, { "epoch": 0.6929506691791081, "grad_norm": 2.337721586227417, "learning_rate": 3.070493308208919e-07, "loss": 0.2733, "step": 14342 }, { "epoch": 0.6929989853601971, "grad_norm": 2.473466157913208, "learning_rate": 3.070010146398029e-07, "loss": 0.2943, "step": 14343 }, { "epoch": 0.6930473015412861, "grad_norm": 2.5972676277160645, "learning_rate": 3.069526984587138e-07, "loss": 0.2752, "step": 14344 }, { "epoch": 0.6930956177223753, "grad_norm": 2.416844129562378, "learning_rate": 3.069043822776248e-07, "loss": 0.3153, "step": 14345 }, { "epoch": 0.6931439339034643, "grad_norm": 3.0583317279815674, "learning_rate": 3.068560660965357e-07, "loss": 0.376, "step": 14346 }, { "epoch": 0.6931922500845533, "grad_norm": 1.9170368909835815, "learning_rate": 3.0680774991544665e-07, "loss": 0.218, "step": 14347 }, { "epoch": 0.6932405662656423, "grad_norm": 2.42230486869812, "learning_rate": 3.0675943373435764e-07, "loss": 0.3583, "step": 14348 }, { "epoch": 0.6932888824467314, "grad_norm": 2.0257298946380615, "learning_rate": 3.067111175532686e-07, "loss": 0.248, "step": 14349 }, { "epoch": 0.6933371986278205, "grad_norm": 5.529679298400879, "learning_rate": 3.066628013721795e-07, "loss": 0.2664, "step": 14350 }, { "epoch": 0.6933855148089095, "grad_norm": 2.4220333099365234, "learning_rate": 3.0661448519109046e-07, "loss": 0.27, "step": 14351 }, { "epoch": 0.6934338309899986, "grad_norm": 1.9685837030410767, "learning_rate": 3.0656616901000145e-07, "loss": 0.1328, "step": 14352 }, { "epoch": 0.6934821471710876, "grad_norm": 3.048708438873291, "learning_rate": 3.065178528289124e-07, "loss": 0.3687, "step": 14353 }, { "epoch": 0.6935304633521766, "grad_norm": 2.5179617404937744, "learning_rate": 3.064695366478233e-07, "loss": 0.3209, "step": 14354 }, { "epoch": 0.6935787795332657, "grad_norm": 1.9042558670043945, "learning_rate": 3.064212204667343e-07, "loss": 0.2293, "step": 14355 }, { "epoch": 0.6936270957143548, "grad_norm": 4.152710914611816, "learning_rate": 3.0637290428564525e-07, "loss": 0.3284, "step": 14356 }, { "epoch": 0.6936754118954438, "grad_norm": 3.02005934715271, "learning_rate": 3.063245881045562e-07, "loss": 0.4178, "step": 14357 }, { "epoch": 0.6937237280765328, "grad_norm": 1.912302851676941, "learning_rate": 3.062762719234672e-07, "loss": 0.2409, "step": 14358 }, { "epoch": 0.6937720442576218, "grad_norm": 3.5650267601013184, "learning_rate": 3.0622795574237806e-07, "loss": 0.3875, "step": 14359 }, { "epoch": 0.6938203604387109, "grad_norm": 3.5989863872528076, "learning_rate": 3.0617963956128905e-07, "loss": 0.2809, "step": 14360 }, { "epoch": 0.6938686766198, "grad_norm": 2.1102707386016846, "learning_rate": 3.0613132338020004e-07, "loss": 0.2785, "step": 14361 }, { "epoch": 0.693916992800889, "grad_norm": 2.507803201675415, "learning_rate": 3.06083007199111e-07, "loss": 0.3222, "step": 14362 }, { "epoch": 0.6939653089819781, "grad_norm": 3.1597580909729004, "learning_rate": 3.060346910180219e-07, "loss": 0.3161, "step": 14363 }, { "epoch": 0.6940136251630671, "grad_norm": 2.277900457382202, "learning_rate": 3.0598637483693285e-07, "loss": 0.2026, "step": 14364 }, { "epoch": 0.6940619413441561, "grad_norm": 2.0843756198883057, "learning_rate": 3.0593805865584384e-07, "loss": 0.2281, "step": 14365 }, { "epoch": 0.6941102575252452, "grad_norm": 2.2087366580963135, "learning_rate": 3.058897424747548e-07, "loss": 0.2821, "step": 14366 }, { "epoch": 0.6941585737063343, "grad_norm": 2.297264337539673, "learning_rate": 3.058414262936657e-07, "loss": 0.3101, "step": 14367 }, { "epoch": 0.6942068898874233, "grad_norm": 3.18520450592041, "learning_rate": 3.057931101125767e-07, "loss": 0.3004, "step": 14368 }, { "epoch": 0.6942552060685123, "grad_norm": 2.7231011390686035, "learning_rate": 3.0574479393148764e-07, "loss": 0.1948, "step": 14369 }, { "epoch": 0.6943035222496013, "grad_norm": 2.577840805053711, "learning_rate": 3.056964777503986e-07, "loss": 0.2869, "step": 14370 }, { "epoch": 0.6943518384306905, "grad_norm": 7.0019612312316895, "learning_rate": 3.0564816156930957e-07, "loss": 0.2854, "step": 14371 }, { "epoch": 0.6944001546117795, "grad_norm": 2.119464635848999, "learning_rate": 3.0559984538822046e-07, "loss": 0.1846, "step": 14372 }, { "epoch": 0.6944484707928685, "grad_norm": 2.4124274253845215, "learning_rate": 3.0555152920713145e-07, "loss": 0.2647, "step": 14373 }, { "epoch": 0.6944967869739576, "grad_norm": 2.998615026473999, "learning_rate": 3.0550321302604244e-07, "loss": 0.4238, "step": 14374 }, { "epoch": 0.6945451031550466, "grad_norm": 4.698163986206055, "learning_rate": 3.0545489684495337e-07, "loss": 0.1979, "step": 14375 }, { "epoch": 0.6945934193361357, "grad_norm": 1.7936748266220093, "learning_rate": 3.054065806638643e-07, "loss": 0.2361, "step": 14376 }, { "epoch": 0.6946417355172247, "grad_norm": 2.7376625537872314, "learning_rate": 3.0535826448277525e-07, "loss": 0.304, "step": 14377 }, { "epoch": 0.6946900516983138, "grad_norm": 6.195948600769043, "learning_rate": 3.0530994830168624e-07, "loss": 0.3954, "step": 14378 }, { "epoch": 0.6947383678794028, "grad_norm": 2.8434135913848877, "learning_rate": 3.052616321205972e-07, "loss": 0.3605, "step": 14379 }, { "epoch": 0.6947866840604918, "grad_norm": 2.9343132972717285, "learning_rate": 3.052133159395081e-07, "loss": 0.3577, "step": 14380 }, { "epoch": 0.694835000241581, "grad_norm": 2.548220157623291, "learning_rate": 3.051649997584191e-07, "loss": 0.3032, "step": 14381 }, { "epoch": 0.69488331642267, "grad_norm": 3.2236897945404053, "learning_rate": 3.0511668357733004e-07, "loss": 0.2442, "step": 14382 }, { "epoch": 0.694931632603759, "grad_norm": 2.8296618461608887, "learning_rate": 3.05068367396241e-07, "loss": 0.301, "step": 14383 }, { "epoch": 0.694979948784848, "grad_norm": 2.5756285190582275, "learning_rate": 3.0502005121515197e-07, "loss": 0.2309, "step": 14384 }, { "epoch": 0.6950282649659371, "grad_norm": 2.811052083969116, "learning_rate": 3.0497173503406285e-07, "loss": 0.3834, "step": 14385 }, { "epoch": 0.6950765811470261, "grad_norm": 11.692183494567871, "learning_rate": 3.0492341885297384e-07, "loss": 0.3782, "step": 14386 }, { "epoch": 0.6951248973281152, "grad_norm": 8.773444175720215, "learning_rate": 3.0487510267188483e-07, "loss": 0.2775, "step": 14387 }, { "epoch": 0.6951732135092042, "grad_norm": 1.6152044534683228, "learning_rate": 3.048267864907957e-07, "loss": 0.1394, "step": 14388 }, { "epoch": 0.6952215296902933, "grad_norm": 1.6037622690200806, "learning_rate": 3.047784703097067e-07, "loss": 0.1872, "step": 14389 }, { "epoch": 0.6952698458713823, "grad_norm": 2.0757558345794678, "learning_rate": 3.0473015412861764e-07, "loss": 0.2164, "step": 14390 }, { "epoch": 0.6953181620524713, "grad_norm": 2.8541815280914307, "learning_rate": 3.0468183794752863e-07, "loss": 0.3095, "step": 14391 }, { "epoch": 0.6953664782335605, "grad_norm": 2.0705111026763916, "learning_rate": 3.0463352176643957e-07, "loss": 0.1783, "step": 14392 }, { "epoch": 0.6954147944146495, "grad_norm": 11.679180145263672, "learning_rate": 3.045852055853505e-07, "loss": 0.2764, "step": 14393 }, { "epoch": 0.6954631105957385, "grad_norm": 2.23571515083313, "learning_rate": 3.045368894042615e-07, "loss": 0.3043, "step": 14394 }, { "epoch": 0.6955114267768275, "grad_norm": 2.9381062984466553, "learning_rate": 3.0448857322317244e-07, "loss": 0.3173, "step": 14395 }, { "epoch": 0.6955597429579166, "grad_norm": 3.567716121673584, "learning_rate": 3.044402570420834e-07, "loss": 0.389, "step": 14396 }, { "epoch": 0.6956080591390057, "grad_norm": 3.537376642227173, "learning_rate": 3.0439194086099436e-07, "loss": 0.3947, "step": 14397 }, { "epoch": 0.6956563753200947, "grad_norm": 5.711662292480469, "learning_rate": 3.0434362467990525e-07, "loss": 0.429, "step": 14398 }, { "epoch": 0.6957046915011837, "grad_norm": 3.1183605194091797, "learning_rate": 3.0429530849881624e-07, "loss": 0.329, "step": 14399 }, { "epoch": 0.6957530076822728, "grad_norm": 2.5612103939056396, "learning_rate": 3.0424699231772723e-07, "loss": 0.2767, "step": 14400 }, { "epoch": 0.6958013238633618, "grad_norm": 2.684946060180664, "learning_rate": 3.041986761366381e-07, "loss": 0.2986, "step": 14401 }, { "epoch": 0.6958496400444509, "grad_norm": 2.856732130050659, "learning_rate": 3.041503599555491e-07, "loss": 0.2942, "step": 14402 }, { "epoch": 0.69589795622554, "grad_norm": 1.3459928035736084, "learning_rate": 3.0410204377446004e-07, "loss": 0.1381, "step": 14403 }, { "epoch": 0.695946272406629, "grad_norm": 1.3232228755950928, "learning_rate": 3.04053727593371e-07, "loss": 0.151, "step": 14404 }, { "epoch": 0.695994588587718, "grad_norm": 4.22961950302124, "learning_rate": 3.0400541141228197e-07, "loss": 0.4032, "step": 14405 }, { "epoch": 0.696042904768807, "grad_norm": 5.550333023071289, "learning_rate": 3.039570952311929e-07, "loss": 0.2499, "step": 14406 }, { "epoch": 0.6960912209498962, "grad_norm": 3.363232374191284, "learning_rate": 3.039087790501039e-07, "loss": 0.2766, "step": 14407 }, { "epoch": 0.6961395371309852, "grad_norm": 14.304778099060059, "learning_rate": 3.0386046286901483e-07, "loss": 0.3413, "step": 14408 }, { "epoch": 0.6961878533120742, "grad_norm": 3.4467828273773193, "learning_rate": 3.0381214668792577e-07, "loss": 0.4037, "step": 14409 }, { "epoch": 0.6962361694931632, "grad_norm": 2.973405599594116, "learning_rate": 3.0376383050683676e-07, "loss": 0.3418, "step": 14410 }, { "epoch": 0.6962844856742523, "grad_norm": 2.5936548709869385, "learning_rate": 3.0371551432574764e-07, "loss": 0.3646, "step": 14411 }, { "epoch": 0.6963328018553413, "grad_norm": 3.359423875808716, "learning_rate": 3.0366719814465863e-07, "loss": 0.3926, "step": 14412 }, { "epoch": 0.6963811180364304, "grad_norm": 2.9937901496887207, "learning_rate": 3.036188819635696e-07, "loss": 0.3239, "step": 14413 }, { "epoch": 0.6964294342175195, "grad_norm": 1.598679542541504, "learning_rate": 3.035705657824805e-07, "loss": 0.1727, "step": 14414 }, { "epoch": 0.6964777503986085, "grad_norm": 2.0023350715637207, "learning_rate": 3.035222496013915e-07, "loss": 0.1963, "step": 14415 }, { "epoch": 0.6965260665796975, "grad_norm": 2.806154489517212, "learning_rate": 3.0347393342030244e-07, "loss": 0.2697, "step": 14416 }, { "epoch": 0.6965743827607865, "grad_norm": 4.889762878417969, "learning_rate": 3.034256172392134e-07, "loss": 0.3866, "step": 14417 }, { "epoch": 0.6966226989418757, "grad_norm": 3.5307435989379883, "learning_rate": 3.0337730105812436e-07, "loss": 0.3357, "step": 14418 }, { "epoch": 0.6966710151229647, "grad_norm": 4.217531204223633, "learning_rate": 3.033289848770353e-07, "loss": 0.291, "step": 14419 }, { "epoch": 0.6967193313040537, "grad_norm": 2.8168299198150635, "learning_rate": 3.0328066869594624e-07, "loss": 0.3281, "step": 14420 }, { "epoch": 0.6967676474851427, "grad_norm": 2.710247039794922, "learning_rate": 3.0323235251485723e-07, "loss": 0.3157, "step": 14421 }, { "epoch": 0.6968159636662318, "grad_norm": 4.111472129821777, "learning_rate": 3.0318403633376817e-07, "loss": 0.3235, "step": 14422 }, { "epoch": 0.6968642798473209, "grad_norm": 3.1024882793426514, "learning_rate": 3.0313572015267916e-07, "loss": 0.3366, "step": 14423 }, { "epoch": 0.6969125960284099, "grad_norm": 3.2131521701812744, "learning_rate": 3.0308740397159004e-07, "loss": 0.311, "step": 14424 }, { "epoch": 0.696960912209499, "grad_norm": 2.749915838241577, "learning_rate": 3.0303908779050103e-07, "loss": 0.2664, "step": 14425 }, { "epoch": 0.697009228390588, "grad_norm": 2.4299299716949463, "learning_rate": 3.02990771609412e-07, "loss": 0.2891, "step": 14426 }, { "epoch": 0.697057544571677, "grad_norm": 2.7623751163482666, "learning_rate": 3.029424554283229e-07, "loss": 0.2528, "step": 14427 }, { "epoch": 0.6971058607527661, "grad_norm": 4.4058146476745605, "learning_rate": 3.028941392472339e-07, "loss": 0.3149, "step": 14428 }, { "epoch": 0.6971541769338552, "grad_norm": 2.614009141921997, "learning_rate": 3.0284582306614483e-07, "loss": 0.3291, "step": 14429 }, { "epoch": 0.6972024931149442, "grad_norm": 5.382599830627441, "learning_rate": 3.0279750688505577e-07, "loss": 0.3364, "step": 14430 }, { "epoch": 0.6972508092960332, "grad_norm": 2.6556079387664795, "learning_rate": 3.0274919070396676e-07, "loss": 0.2012, "step": 14431 }, { "epoch": 0.6972991254771223, "grad_norm": 2.703498125076294, "learning_rate": 3.027008745228777e-07, "loss": 0.3002, "step": 14432 }, { "epoch": 0.6973474416582114, "grad_norm": 2.219780683517456, "learning_rate": 3.0265255834178864e-07, "loss": 0.1698, "step": 14433 }, { "epoch": 0.6973957578393004, "grad_norm": 3.455508232116699, "learning_rate": 3.026042421606996e-07, "loss": 0.2489, "step": 14434 }, { "epoch": 0.6974440740203894, "grad_norm": 2.114272356033325, "learning_rate": 3.0255592597961056e-07, "loss": 0.2583, "step": 14435 }, { "epoch": 0.6974923902014785, "grad_norm": 2.9294960498809814, "learning_rate": 3.025076097985215e-07, "loss": 0.4396, "step": 14436 }, { "epoch": 0.6975407063825675, "grad_norm": 2.5000557899475098, "learning_rate": 3.0245929361743244e-07, "loss": 0.2407, "step": 14437 }, { "epoch": 0.6975890225636565, "grad_norm": 2.9700982570648193, "learning_rate": 3.0241097743634343e-07, "loss": 0.473, "step": 14438 }, { "epoch": 0.6976373387447456, "grad_norm": 2.8490986824035645, "learning_rate": 3.023626612552544e-07, "loss": 0.3412, "step": 14439 }, { "epoch": 0.6976856549258347, "grad_norm": 2.6294684410095215, "learning_rate": 3.023143450741653e-07, "loss": 0.316, "step": 14440 }, { "epoch": 0.6977339711069237, "grad_norm": 3.0171425342559814, "learning_rate": 3.022660288930763e-07, "loss": 0.4001, "step": 14441 }, { "epoch": 0.6977822872880127, "grad_norm": 2.0217294692993164, "learning_rate": 3.0221771271198723e-07, "loss": 0.2492, "step": 14442 }, { "epoch": 0.6978306034691018, "grad_norm": 2.917896270751953, "learning_rate": 3.0216939653089817e-07, "loss": 0.3703, "step": 14443 }, { "epoch": 0.6978789196501909, "grad_norm": 4.752985000610352, "learning_rate": 3.0212108034980916e-07, "loss": 0.2253, "step": 14444 }, { "epoch": 0.6979272358312799, "grad_norm": 2.134870767593384, "learning_rate": 3.020727641687201e-07, "loss": 0.1924, "step": 14445 }, { "epoch": 0.6979755520123689, "grad_norm": 3.0554983615875244, "learning_rate": 3.0202444798763103e-07, "loss": 0.2016, "step": 14446 }, { "epoch": 0.698023868193458, "grad_norm": 2.8544669151306152, "learning_rate": 3.01976131806542e-07, "loss": 0.2054, "step": 14447 }, { "epoch": 0.698072184374547, "grad_norm": 2.751194477081299, "learning_rate": 3.0192781562545296e-07, "loss": 0.2625, "step": 14448 }, { "epoch": 0.6981205005556361, "grad_norm": 3.806591749191284, "learning_rate": 3.018794994443639e-07, "loss": 0.3434, "step": 14449 }, { "epoch": 0.6981688167367252, "grad_norm": 2.3583314418792725, "learning_rate": 3.0183118326327483e-07, "loss": 0.2777, "step": 14450 }, { "epoch": 0.6982171329178142, "grad_norm": 9.347705841064453, "learning_rate": 3.017828670821858e-07, "loss": 0.3119, "step": 14451 }, { "epoch": 0.6982654490989032, "grad_norm": 2.6670327186584473, "learning_rate": 3.0173455090109676e-07, "loss": 0.289, "step": 14452 }, { "epoch": 0.6983137652799922, "grad_norm": 2.982483148574829, "learning_rate": 3.016862347200077e-07, "loss": 0.2707, "step": 14453 }, { "epoch": 0.6983620814610814, "grad_norm": 2.5244359970092773, "learning_rate": 3.016379185389187e-07, "loss": 0.3145, "step": 14454 }, { "epoch": 0.6984103976421704, "grad_norm": 2.1455554962158203, "learning_rate": 3.0158960235782957e-07, "loss": 0.2392, "step": 14455 }, { "epoch": 0.6984587138232594, "grad_norm": 2.992223024368286, "learning_rate": 3.0154128617674056e-07, "loss": 0.3704, "step": 14456 }, { "epoch": 0.6985070300043484, "grad_norm": 2.4323503971099854, "learning_rate": 3.0149296999565155e-07, "loss": 0.2807, "step": 14457 }, { "epoch": 0.6985553461854375, "grad_norm": 3.2368369102478027, "learning_rate": 3.014446538145625e-07, "loss": 0.2977, "step": 14458 }, { "epoch": 0.6986036623665266, "grad_norm": 2.308722496032715, "learning_rate": 3.0139633763347343e-07, "loss": 0.3336, "step": 14459 }, { "epoch": 0.6986519785476156, "grad_norm": 3.690477132797241, "learning_rate": 3.013480214523844e-07, "loss": 0.4518, "step": 14460 }, { "epoch": 0.6987002947287047, "grad_norm": 2.858927011489868, "learning_rate": 3.0129970527129536e-07, "loss": 0.2773, "step": 14461 }, { "epoch": 0.6987486109097937, "grad_norm": 2.4249465465545654, "learning_rate": 3.012513890902063e-07, "loss": 0.2347, "step": 14462 }, { "epoch": 0.6987969270908827, "grad_norm": 2.1098148822784424, "learning_rate": 3.0120307290911723e-07, "loss": 0.3031, "step": 14463 }, { "epoch": 0.6988452432719718, "grad_norm": 2.870971202850342, "learning_rate": 3.011547567280282e-07, "loss": 0.3555, "step": 14464 }, { "epoch": 0.6988935594530609, "grad_norm": 3.128645896911621, "learning_rate": 3.0110644054693916e-07, "loss": 0.2674, "step": 14465 }, { "epoch": 0.6989418756341499, "grad_norm": 2.772542715072632, "learning_rate": 3.010581243658501e-07, "loss": 0.3535, "step": 14466 }, { "epoch": 0.6989901918152389, "grad_norm": 2.3381712436676025, "learning_rate": 3.010098081847611e-07, "loss": 0.2501, "step": 14467 }, { "epoch": 0.6990385079963279, "grad_norm": 2.85288405418396, "learning_rate": 3.0096149200367197e-07, "loss": 0.3741, "step": 14468 }, { "epoch": 0.699086824177417, "grad_norm": 2.8096120357513428, "learning_rate": 3.0091317582258296e-07, "loss": 0.4137, "step": 14469 }, { "epoch": 0.6991351403585061, "grad_norm": 2.8715646266937256, "learning_rate": 3.0086485964149395e-07, "loss": 0.2852, "step": 14470 }, { "epoch": 0.6991834565395951, "grad_norm": 2.6482348442077637, "learning_rate": 3.0081654346040483e-07, "loss": 0.4002, "step": 14471 }, { "epoch": 0.6992317727206842, "grad_norm": 2.7637252807617188, "learning_rate": 3.007682272793158e-07, "loss": 0.2843, "step": 14472 }, { "epoch": 0.6992800889017732, "grad_norm": 5.03282356262207, "learning_rate": 3.007199110982268e-07, "loss": 0.2417, "step": 14473 }, { "epoch": 0.6993284050828622, "grad_norm": 3.8378190994262695, "learning_rate": 3.0067159491713775e-07, "loss": 0.2529, "step": 14474 }, { "epoch": 0.6993767212639513, "grad_norm": 1750.029541015625, "learning_rate": 3.006232787360487e-07, "loss": 0.2405, "step": 14475 }, { "epoch": 0.6994250374450404, "grad_norm": 1.9197049140930176, "learning_rate": 3.005749625549596e-07, "loss": 0.1983, "step": 14476 }, { "epoch": 0.6994733536261294, "grad_norm": 4.448502540588379, "learning_rate": 3.005266463738706e-07, "loss": 0.3397, "step": 14477 }, { "epoch": 0.6995216698072184, "grad_norm": 2.580371618270874, "learning_rate": 3.0047833019278155e-07, "loss": 0.2126, "step": 14478 }, { "epoch": 0.6995699859883074, "grad_norm": 6.321473121643066, "learning_rate": 3.004300140116925e-07, "loss": 0.1685, "step": 14479 }, { "epoch": 0.6996183021693966, "grad_norm": 3.2230780124664307, "learning_rate": 3.003816978306035e-07, "loss": 0.3755, "step": 14480 }, { "epoch": 0.6996666183504856, "grad_norm": 4.368964195251465, "learning_rate": 3.0033338164951437e-07, "loss": 0.2785, "step": 14481 }, { "epoch": 0.6997149345315746, "grad_norm": 2.0895986557006836, "learning_rate": 3.0028506546842536e-07, "loss": 0.2477, "step": 14482 }, { "epoch": 0.6997632507126637, "grad_norm": 2.5939979553222656, "learning_rate": 3.0023674928733635e-07, "loss": 0.2653, "step": 14483 }, { "epoch": 0.6998115668937527, "grad_norm": 2.8808817863464355, "learning_rate": 3.0018843310624723e-07, "loss": 0.2421, "step": 14484 }, { "epoch": 0.6998598830748418, "grad_norm": 2.6316468715667725, "learning_rate": 3.001401169251582e-07, "loss": 0.3126, "step": 14485 }, { "epoch": 0.6999081992559308, "grad_norm": 4.881869792938232, "learning_rate": 3.000918007440692e-07, "loss": 0.3524, "step": 14486 }, { "epoch": 0.6999565154370199, "grad_norm": 1.7471563816070557, "learning_rate": 3.000434845629801e-07, "loss": 0.1466, "step": 14487 }, { "epoch": 0.7000048316181089, "grad_norm": 3.8995165824890137, "learning_rate": 2.999951683818911e-07, "loss": 0.3746, "step": 14488 }, { "epoch": 0.7000531477991979, "grad_norm": 3.0940098762512207, "learning_rate": 2.99946852200802e-07, "loss": 0.4639, "step": 14489 }, { "epoch": 0.700101463980287, "grad_norm": 5.265985488891602, "learning_rate": 2.99898536019713e-07, "loss": 0.2719, "step": 14490 }, { "epoch": 0.7001497801613761, "grad_norm": 4.155041694641113, "learning_rate": 2.9985021983862395e-07, "loss": 0.2789, "step": 14491 }, { "epoch": 0.7001980963424651, "grad_norm": 2.286688804626465, "learning_rate": 2.998019036575349e-07, "loss": 0.1478, "step": 14492 }, { "epoch": 0.7002464125235541, "grad_norm": 3.861509323120117, "learning_rate": 2.997535874764459e-07, "loss": 0.25, "step": 14493 }, { "epoch": 0.7002947287046432, "grad_norm": 3.0042617321014404, "learning_rate": 2.9970527129535676e-07, "loss": 0.4639, "step": 14494 }, { "epoch": 0.7003430448857322, "grad_norm": 2.6672229766845703, "learning_rate": 2.9965695511426775e-07, "loss": 0.3792, "step": 14495 }, { "epoch": 0.7003913610668213, "grad_norm": 2.1890106201171875, "learning_rate": 2.9960863893317874e-07, "loss": 0.2175, "step": 14496 }, { "epoch": 0.7004396772479103, "grad_norm": 2.7398717403411865, "learning_rate": 2.9956032275208963e-07, "loss": 0.2474, "step": 14497 }, { "epoch": 0.7004879934289994, "grad_norm": 2.2679085731506348, "learning_rate": 2.995120065710006e-07, "loss": 0.226, "step": 14498 }, { "epoch": 0.7005363096100884, "grad_norm": 1.7794404029846191, "learning_rate": 2.994636903899116e-07, "loss": 0.1828, "step": 14499 }, { "epoch": 0.7005846257911774, "grad_norm": 2.642076015472412, "learning_rate": 2.994153742088225e-07, "loss": 0.2774, "step": 14500 }, { "epoch": 0.7006329419722666, "grad_norm": 2.695925712585449, "learning_rate": 2.993670580277335e-07, "loss": 0.3125, "step": 14501 }, { "epoch": 0.7006812581533556, "grad_norm": 5.1459550857543945, "learning_rate": 2.993187418466444e-07, "loss": 0.2825, "step": 14502 }, { "epoch": 0.7007295743344446, "grad_norm": 2.9819254875183105, "learning_rate": 2.9927042566555536e-07, "loss": 0.3548, "step": 14503 }, { "epoch": 0.7007778905155336, "grad_norm": 1.8236851692199707, "learning_rate": 2.9922210948446635e-07, "loss": 0.2371, "step": 14504 }, { "epoch": 0.7008262066966227, "grad_norm": 3.737725257873535, "learning_rate": 2.991737933033773e-07, "loss": 0.3008, "step": 14505 }, { "epoch": 0.7008745228777118, "grad_norm": 3.203754186630249, "learning_rate": 2.991254771222883e-07, "loss": 0.4711, "step": 14506 }, { "epoch": 0.7009228390588008, "grad_norm": 2.1302261352539062, "learning_rate": 2.9907716094119916e-07, "loss": 0.1866, "step": 14507 }, { "epoch": 0.7009711552398898, "grad_norm": 1.6323974132537842, "learning_rate": 2.9902884476011015e-07, "loss": 0.1858, "step": 14508 }, { "epoch": 0.7010194714209789, "grad_norm": 2.9853029251098633, "learning_rate": 2.9898052857902114e-07, "loss": 0.3345, "step": 14509 }, { "epoch": 0.7010677876020679, "grad_norm": 2.754019021987915, "learning_rate": 2.98932212397932e-07, "loss": 0.2772, "step": 14510 }, { "epoch": 0.701116103783157, "grad_norm": 2.8948874473571777, "learning_rate": 2.98883896216843e-07, "loss": 0.2626, "step": 14511 }, { "epoch": 0.701164419964246, "grad_norm": 2.508528232574463, "learning_rate": 2.98835580035754e-07, "loss": 0.1914, "step": 14512 }, { "epoch": 0.7012127361453351, "grad_norm": 4.846154689788818, "learning_rate": 2.987872638546649e-07, "loss": 0.2424, "step": 14513 }, { "epoch": 0.7012610523264241, "grad_norm": 4.372528076171875, "learning_rate": 2.987389476735759e-07, "loss": 0.3337, "step": 14514 }, { "epoch": 0.7013093685075131, "grad_norm": 2.4705445766448975, "learning_rate": 2.986906314924868e-07, "loss": 0.2805, "step": 14515 }, { "epoch": 0.7013576846886023, "grad_norm": 2.4714043140411377, "learning_rate": 2.9864231531139775e-07, "loss": 0.269, "step": 14516 }, { "epoch": 0.7014060008696913, "grad_norm": 2.6088850498199463, "learning_rate": 2.9859399913030874e-07, "loss": 0.3585, "step": 14517 }, { "epoch": 0.7014543170507803, "grad_norm": 2.8546228408813477, "learning_rate": 2.985456829492197e-07, "loss": 0.289, "step": 14518 }, { "epoch": 0.7015026332318693, "grad_norm": 2.967829465866089, "learning_rate": 2.984973667681306e-07, "loss": 0.1763, "step": 14519 }, { "epoch": 0.7015509494129584, "grad_norm": 2.1976799964904785, "learning_rate": 2.9844905058704155e-07, "loss": 0.2285, "step": 14520 }, { "epoch": 0.7015992655940474, "grad_norm": 2.393211841583252, "learning_rate": 2.9840073440595254e-07, "loss": 0.2412, "step": 14521 }, { "epoch": 0.7016475817751365, "grad_norm": 2.2830851078033447, "learning_rate": 2.9835241822486354e-07, "loss": 0.246, "step": 14522 }, { "epoch": 0.7016958979562256, "grad_norm": 1.750683069229126, "learning_rate": 2.983041020437744e-07, "loss": 0.1584, "step": 14523 }, { "epoch": 0.7017442141373146, "grad_norm": 5.8044514656066895, "learning_rate": 2.982557858626854e-07, "loss": 0.3057, "step": 14524 }, { "epoch": 0.7017925303184036, "grad_norm": 2.5667006969451904, "learning_rate": 2.982074696815964e-07, "loss": 0.3122, "step": 14525 }, { "epoch": 0.7018408464994926, "grad_norm": 3.0904927253723145, "learning_rate": 2.981591535005073e-07, "loss": 0.3071, "step": 14526 }, { "epoch": 0.7018891626805818, "grad_norm": 3.5582282543182373, "learning_rate": 2.981108373194183e-07, "loss": 0.2285, "step": 14527 }, { "epoch": 0.7019374788616708, "grad_norm": 3.7603750228881836, "learning_rate": 2.980625211383292e-07, "loss": 0.3833, "step": 14528 }, { "epoch": 0.7019857950427598, "grad_norm": 2.075932025909424, "learning_rate": 2.9801420495724015e-07, "loss": 0.2495, "step": 14529 }, { "epoch": 0.7020341112238488, "grad_norm": 5.239256381988525, "learning_rate": 2.9796588877615114e-07, "loss": 0.2965, "step": 14530 }, { "epoch": 0.7020824274049379, "grad_norm": 4.192519187927246, "learning_rate": 2.979175725950621e-07, "loss": 0.2807, "step": 14531 }, { "epoch": 0.702130743586027, "grad_norm": 7.214315891265869, "learning_rate": 2.97869256413973e-07, "loss": 0.3109, "step": 14532 }, { "epoch": 0.702179059767116, "grad_norm": 4.316205024719238, "learning_rate": 2.9782094023288395e-07, "loss": 0.3737, "step": 14533 }, { "epoch": 0.7022273759482051, "grad_norm": 3.0785202980041504, "learning_rate": 2.9777262405179494e-07, "loss": 0.4309, "step": 14534 }, { "epoch": 0.7022756921292941, "grad_norm": 3.300630807876587, "learning_rate": 2.977243078707059e-07, "loss": 0.3912, "step": 14535 }, { "epoch": 0.7023240083103831, "grad_norm": 3.401566505432129, "learning_rate": 2.976759916896168e-07, "loss": 0.3327, "step": 14536 }, { "epoch": 0.7023723244914722, "grad_norm": 2.449862241744995, "learning_rate": 2.976276755085278e-07, "loss": 0.3827, "step": 14537 }, { "epoch": 0.7024206406725613, "grad_norm": 2.726752281188965, "learning_rate": 2.975793593274388e-07, "loss": 0.351, "step": 14538 }, { "epoch": 0.7024689568536503, "grad_norm": 2.2270374298095703, "learning_rate": 2.975310431463497e-07, "loss": 0.1447, "step": 14539 }, { "epoch": 0.7025172730347393, "grad_norm": 2.440573215484619, "learning_rate": 2.9748272696526067e-07, "loss": 0.2582, "step": 14540 }, { "epoch": 0.7025655892158283, "grad_norm": 2.992419958114624, "learning_rate": 2.974344107841716e-07, "loss": 0.3076, "step": 14541 }, { "epoch": 0.7026139053969175, "grad_norm": 3.555626630783081, "learning_rate": 2.9738609460308255e-07, "loss": 0.4408, "step": 14542 }, { "epoch": 0.7026622215780065, "grad_norm": 2.378336191177368, "learning_rate": 2.9733777842199354e-07, "loss": 0.2911, "step": 14543 }, { "epoch": 0.7027105377590955, "grad_norm": 3.2608542442321777, "learning_rate": 2.9728946224090447e-07, "loss": 0.3613, "step": 14544 }, { "epoch": 0.7027588539401846, "grad_norm": 4.274324417114258, "learning_rate": 2.972411460598154e-07, "loss": 0.3577, "step": 14545 }, { "epoch": 0.7028071701212736, "grad_norm": 2.675995349884033, "learning_rate": 2.9719282987872635e-07, "loss": 0.364, "step": 14546 }, { "epoch": 0.7028554863023626, "grad_norm": 3.396967649459839, "learning_rate": 2.9714451369763734e-07, "loss": 0.3648, "step": 14547 }, { "epoch": 0.7029038024834517, "grad_norm": 13.067376136779785, "learning_rate": 2.970961975165483e-07, "loss": 0.3232, "step": 14548 }, { "epoch": 0.7029521186645408, "grad_norm": 3.0545125007629395, "learning_rate": 2.970478813354592e-07, "loss": 0.2975, "step": 14549 }, { "epoch": 0.7030004348456298, "grad_norm": 2.0816328525543213, "learning_rate": 2.969995651543702e-07, "loss": 0.1834, "step": 14550 }, { "epoch": 0.7030487510267188, "grad_norm": 2.4878783226013184, "learning_rate": 2.9695124897328114e-07, "loss": 0.2871, "step": 14551 }, { "epoch": 0.7030970672078078, "grad_norm": 2.479724407196045, "learning_rate": 2.969029327921921e-07, "loss": 0.2922, "step": 14552 }, { "epoch": 0.703145383388897, "grad_norm": 2.9889631271362305, "learning_rate": 2.9685461661110307e-07, "loss": 0.3386, "step": 14553 }, { "epoch": 0.703193699569986, "grad_norm": 1.9942923784255981, "learning_rate": 2.96806300430014e-07, "loss": 0.2344, "step": 14554 }, { "epoch": 0.703242015751075, "grad_norm": 5.551800727844238, "learning_rate": 2.9675798424892494e-07, "loss": 0.2128, "step": 14555 }, { "epoch": 0.7032903319321641, "grad_norm": 1.329625129699707, "learning_rate": 2.9670966806783593e-07, "loss": 0.1467, "step": 14556 }, { "epoch": 0.7033386481132531, "grad_norm": 3.711444854736328, "learning_rate": 2.9666135188674687e-07, "loss": 0.2988, "step": 14557 }, { "epoch": 0.7033869642943422, "grad_norm": 3.587197780609131, "learning_rate": 2.966130357056578e-07, "loss": 0.4083, "step": 14558 }, { "epoch": 0.7034352804754312, "grad_norm": 2.273982524871826, "learning_rate": 2.9656471952456874e-07, "loss": 0.2322, "step": 14559 }, { "epoch": 0.7034835966565203, "grad_norm": 2.5927839279174805, "learning_rate": 2.9651640334347973e-07, "loss": 0.3422, "step": 14560 }, { "epoch": 0.7035319128376093, "grad_norm": 4.883986949920654, "learning_rate": 2.9646808716239067e-07, "loss": 0.3189, "step": 14561 }, { "epoch": 0.7035802290186983, "grad_norm": 2.188579797744751, "learning_rate": 2.964197709813016e-07, "loss": 0.1879, "step": 14562 }, { "epoch": 0.7036285451997875, "grad_norm": 20.49942398071289, "learning_rate": 2.963714548002126e-07, "loss": 0.2559, "step": 14563 }, { "epoch": 0.7036768613808765, "grad_norm": 2.9526073932647705, "learning_rate": 2.9632313861912354e-07, "loss": 0.3148, "step": 14564 }, { "epoch": 0.7037251775619655, "grad_norm": 2.6154892444610596, "learning_rate": 2.9627482243803447e-07, "loss": 0.2531, "step": 14565 }, { "epoch": 0.7037734937430545, "grad_norm": 3.058018207550049, "learning_rate": 2.9622650625694546e-07, "loss": 0.3168, "step": 14566 }, { "epoch": 0.7038218099241436, "grad_norm": 2.704468250274658, "learning_rate": 2.9617819007585635e-07, "loss": 0.2122, "step": 14567 }, { "epoch": 0.7038701261052327, "grad_norm": 2.1959986686706543, "learning_rate": 2.9612987389476734e-07, "loss": 0.2308, "step": 14568 }, { "epoch": 0.7039184422863217, "grad_norm": 2.9692130088806152, "learning_rate": 2.9608155771367833e-07, "loss": 0.4863, "step": 14569 }, { "epoch": 0.7039667584674107, "grad_norm": 2.146122455596924, "learning_rate": 2.9603324153258927e-07, "loss": 0.2055, "step": 14570 }, { "epoch": 0.7040150746484998, "grad_norm": 7.611958980560303, "learning_rate": 2.959849253515002e-07, "loss": 0.4515, "step": 14571 }, { "epoch": 0.7040633908295888, "grad_norm": 2.4577760696411133, "learning_rate": 2.9593660917041114e-07, "loss": 0.258, "step": 14572 }, { "epoch": 0.7041117070106778, "grad_norm": 3.1293439865112305, "learning_rate": 2.9588829298932213e-07, "loss": 0.4185, "step": 14573 }, { "epoch": 0.704160023191767, "grad_norm": 1.4136102199554443, "learning_rate": 2.9583997680823307e-07, "loss": 0.1567, "step": 14574 }, { "epoch": 0.704208339372856, "grad_norm": 2.6308956146240234, "learning_rate": 2.95791660627144e-07, "loss": 0.2914, "step": 14575 }, { "epoch": 0.704256655553945, "grad_norm": 5.39230489730835, "learning_rate": 2.95743344446055e-07, "loss": 0.3999, "step": 14576 }, { "epoch": 0.704304971735034, "grad_norm": 3.9820284843444824, "learning_rate": 2.9569502826496593e-07, "loss": 0.4366, "step": 14577 }, { "epoch": 0.7043532879161231, "grad_norm": 2.1926369667053223, "learning_rate": 2.9564671208387687e-07, "loss": 0.2885, "step": 14578 }, { "epoch": 0.7044016040972122, "grad_norm": 2.102140188217163, "learning_rate": 2.9559839590278786e-07, "loss": 0.2529, "step": 14579 }, { "epoch": 0.7044499202783012, "grad_norm": 4.955587387084961, "learning_rate": 2.9555007972169874e-07, "loss": 0.2216, "step": 14580 }, { "epoch": 0.7044982364593902, "grad_norm": 2.6013243198394775, "learning_rate": 2.9550176354060973e-07, "loss": 0.331, "step": 14581 }, { "epoch": 0.7045465526404793, "grad_norm": 2.2839138507843018, "learning_rate": 2.954534473595207e-07, "loss": 0.2737, "step": 14582 }, { "epoch": 0.7045948688215683, "grad_norm": 2.210735559463501, "learning_rate": 2.954051311784316e-07, "loss": 0.2413, "step": 14583 }, { "epoch": 0.7046431850026574, "grad_norm": 2.0013468265533447, "learning_rate": 2.953568149973426e-07, "loss": 0.2386, "step": 14584 }, { "epoch": 0.7046915011837465, "grad_norm": 2.7757887840270996, "learning_rate": 2.9530849881625354e-07, "loss": 0.3535, "step": 14585 }, { "epoch": 0.7047398173648355, "grad_norm": 2.577739953994751, "learning_rate": 2.9526018263516453e-07, "loss": 0.2972, "step": 14586 }, { "epoch": 0.7047881335459245, "grad_norm": 3.251063585281372, "learning_rate": 2.9521186645407546e-07, "loss": 0.284, "step": 14587 }, { "epoch": 0.7048364497270135, "grad_norm": 4.020509719848633, "learning_rate": 2.951635502729864e-07, "loss": 0.3882, "step": 14588 }, { "epoch": 0.7048847659081027, "grad_norm": 2.783255100250244, "learning_rate": 2.951152340918974e-07, "loss": 0.3871, "step": 14589 }, { "epoch": 0.7049330820891917, "grad_norm": 5.1149821281433105, "learning_rate": 2.9506691791080833e-07, "loss": 0.1494, "step": 14590 }, { "epoch": 0.7049813982702807, "grad_norm": 4.147392749786377, "learning_rate": 2.9501860172971927e-07, "loss": 0.4633, "step": 14591 }, { "epoch": 0.7050297144513697, "grad_norm": 2.1858391761779785, "learning_rate": 2.9497028554863026e-07, "loss": 0.2393, "step": 14592 }, { "epoch": 0.7050780306324588, "grad_norm": 12.815415382385254, "learning_rate": 2.9492196936754114e-07, "loss": 0.19, "step": 14593 }, { "epoch": 0.7051263468135479, "grad_norm": 8.396824836730957, "learning_rate": 2.9487365318645213e-07, "loss": 0.3571, "step": 14594 }, { "epoch": 0.7051746629946369, "grad_norm": 3.390904188156128, "learning_rate": 2.948253370053631e-07, "loss": 0.3895, "step": 14595 }, { "epoch": 0.705222979175726, "grad_norm": 2.0206918716430664, "learning_rate": 2.94777020824274e-07, "loss": 0.2204, "step": 14596 }, { "epoch": 0.705271295356815, "grad_norm": 3.7769668102264404, "learning_rate": 2.94728704643185e-07, "loss": 0.426, "step": 14597 }, { "epoch": 0.705319611537904, "grad_norm": 3.310659408569336, "learning_rate": 2.9468038846209593e-07, "loss": 0.3253, "step": 14598 }, { "epoch": 0.705367927718993, "grad_norm": 4.069805145263672, "learning_rate": 2.9463207228100687e-07, "loss": 0.3856, "step": 14599 }, { "epoch": 0.7054162439000822, "grad_norm": 27.92237091064453, "learning_rate": 2.9458375609991786e-07, "loss": 0.3327, "step": 14600 }, { "epoch": 0.7054645600811712, "grad_norm": 2.1725242137908936, "learning_rate": 2.945354399188288e-07, "loss": 0.2474, "step": 14601 }, { "epoch": 0.7055128762622602, "grad_norm": 1.7606608867645264, "learning_rate": 2.944871237377398e-07, "loss": 0.1404, "step": 14602 }, { "epoch": 0.7055611924433492, "grad_norm": 18.71426773071289, "learning_rate": 2.9443880755665067e-07, "loss": 0.2145, "step": 14603 }, { "epoch": 0.7056095086244383, "grad_norm": 1.7886792421340942, "learning_rate": 2.9439049137556166e-07, "loss": 0.1519, "step": 14604 }, { "epoch": 0.7056578248055274, "grad_norm": 2.1873250007629395, "learning_rate": 2.9434217519447265e-07, "loss": 0.2828, "step": 14605 }, { "epoch": 0.7057061409866164, "grad_norm": 2.094353199005127, "learning_rate": 2.9429385901338354e-07, "loss": 0.2843, "step": 14606 }, { "epoch": 0.7057544571677055, "grad_norm": 1.9464104175567627, "learning_rate": 2.9424554283229453e-07, "loss": 0.1862, "step": 14607 }, { "epoch": 0.7058027733487945, "grad_norm": 2.740304470062256, "learning_rate": 2.941972266512055e-07, "loss": 0.3311, "step": 14608 }, { "epoch": 0.7058510895298835, "grad_norm": 2.483447313308716, "learning_rate": 2.941489104701164e-07, "loss": 0.219, "step": 14609 }, { "epoch": 0.7058994057109726, "grad_norm": 2.117504119873047, "learning_rate": 2.941005942890274e-07, "loss": 0.2224, "step": 14610 }, { "epoch": 0.7059477218920617, "grad_norm": 2.566987991333008, "learning_rate": 2.9405227810793833e-07, "loss": 0.3063, "step": 14611 }, { "epoch": 0.7059960380731507, "grad_norm": 5.219844818115234, "learning_rate": 2.9400396192684927e-07, "loss": 0.2864, "step": 14612 }, { "epoch": 0.7060443542542397, "grad_norm": 3.362910509109497, "learning_rate": 2.9395564574576026e-07, "loss": 0.3465, "step": 14613 }, { "epoch": 0.7060926704353288, "grad_norm": 11.477890968322754, "learning_rate": 2.939073295646712e-07, "loss": 0.3503, "step": 14614 }, { "epoch": 0.7061409866164179, "grad_norm": 1.5479987859725952, "learning_rate": 2.9385901338358213e-07, "loss": 0.1569, "step": 14615 }, { "epoch": 0.7061893027975069, "grad_norm": 3.6623122692108154, "learning_rate": 2.9381069720249307e-07, "loss": 0.416, "step": 14616 }, { "epoch": 0.7062376189785959, "grad_norm": 2.649695634841919, "learning_rate": 2.9376238102140406e-07, "loss": 0.2531, "step": 14617 }, { "epoch": 0.706285935159685, "grad_norm": 4.247467517852783, "learning_rate": 2.9371406484031505e-07, "loss": 0.3913, "step": 14618 }, { "epoch": 0.706334251340774, "grad_norm": 1.8271129131317139, "learning_rate": 2.9366574865922593e-07, "loss": 0.1899, "step": 14619 }, { "epoch": 0.7063825675218631, "grad_norm": 3.4759020805358887, "learning_rate": 2.936174324781369e-07, "loss": 0.3111, "step": 14620 }, { "epoch": 0.7064308837029522, "grad_norm": 2.7885873317718506, "learning_rate": 2.935691162970479e-07, "loss": 0.3855, "step": 14621 }, { "epoch": 0.7064791998840412, "grad_norm": 2.6338765621185303, "learning_rate": 2.935208001159588e-07, "loss": 0.2893, "step": 14622 }, { "epoch": 0.7065275160651302, "grad_norm": 3.5413625240325928, "learning_rate": 2.934724839348698e-07, "loss": 0.339, "step": 14623 }, { "epoch": 0.7065758322462192, "grad_norm": 1.969143033027649, "learning_rate": 2.934241677537807e-07, "loss": 0.2018, "step": 14624 }, { "epoch": 0.7066241484273083, "grad_norm": 7.831990718841553, "learning_rate": 2.9337585157269166e-07, "loss": 0.2221, "step": 14625 }, { "epoch": 0.7066724646083974, "grad_norm": 2.7736423015594482, "learning_rate": 2.9332753539160265e-07, "loss": 0.2675, "step": 14626 }, { "epoch": 0.7067207807894864, "grad_norm": 2.760923385620117, "learning_rate": 2.932792192105136e-07, "loss": 0.2163, "step": 14627 }, { "epoch": 0.7067690969705754, "grad_norm": 3.2025814056396484, "learning_rate": 2.9323090302942453e-07, "loss": 0.222, "step": 14628 }, { "epoch": 0.7068174131516645, "grad_norm": 2.551906108856201, "learning_rate": 2.9318258684833546e-07, "loss": 0.3031, "step": 14629 }, { "epoch": 0.7068657293327535, "grad_norm": 2.788895845413208, "learning_rate": 2.9313427066724646e-07, "loss": 0.1982, "step": 14630 }, { "epoch": 0.7069140455138426, "grad_norm": 2.9951183795928955, "learning_rate": 2.930859544861574e-07, "loss": 0.4282, "step": 14631 }, { "epoch": 0.7069623616949317, "grad_norm": 2.1517114639282227, "learning_rate": 2.9303763830506833e-07, "loss": 0.1984, "step": 14632 }, { "epoch": 0.7070106778760207, "grad_norm": 2.5536088943481445, "learning_rate": 2.929893221239793e-07, "loss": 0.2864, "step": 14633 }, { "epoch": 0.7070589940571097, "grad_norm": 1.9985891580581665, "learning_rate": 2.929410059428903e-07, "loss": 0.2454, "step": 14634 }, { "epoch": 0.7071073102381987, "grad_norm": 2.537888765335083, "learning_rate": 2.928926897618012e-07, "loss": 0.3278, "step": 14635 }, { "epoch": 0.7071556264192879, "grad_norm": 2.539090871810913, "learning_rate": 2.928443735807122e-07, "loss": 0.2819, "step": 14636 }, { "epoch": 0.7072039426003769, "grad_norm": 2.159546136856079, "learning_rate": 2.927960573996231e-07, "loss": 0.1978, "step": 14637 }, { "epoch": 0.7072522587814659, "grad_norm": 66.24227142333984, "learning_rate": 2.9274774121853406e-07, "loss": 0.4401, "step": 14638 }, { "epoch": 0.7073005749625549, "grad_norm": 3.5397236347198486, "learning_rate": 2.9269942503744505e-07, "loss": 0.3885, "step": 14639 }, { "epoch": 0.707348891143644, "grad_norm": 2.837364435195923, "learning_rate": 2.92651108856356e-07, "loss": 0.1759, "step": 14640 }, { "epoch": 0.7073972073247331, "grad_norm": 7.206136226654053, "learning_rate": 2.926027926752669e-07, "loss": 0.3123, "step": 14641 }, { "epoch": 0.7074455235058221, "grad_norm": 8.858813285827637, "learning_rate": 2.9255447649417786e-07, "loss": 0.2468, "step": 14642 }, { "epoch": 0.7074938396869112, "grad_norm": 2.2984817028045654, "learning_rate": 2.9250616031308885e-07, "loss": 0.2903, "step": 14643 }, { "epoch": 0.7075421558680002, "grad_norm": 2.750739336013794, "learning_rate": 2.924578441319998e-07, "loss": 0.3648, "step": 14644 }, { "epoch": 0.7075904720490892, "grad_norm": 6.914647102355957, "learning_rate": 2.924095279509107e-07, "loss": 0.3303, "step": 14645 }, { "epoch": 0.7076387882301783, "grad_norm": 2.3500559329986572, "learning_rate": 2.923612117698217e-07, "loss": 0.1936, "step": 14646 }, { "epoch": 0.7076871044112674, "grad_norm": 3.6427645683288574, "learning_rate": 2.9231289558873265e-07, "loss": 0.3145, "step": 14647 }, { "epoch": 0.7077354205923564, "grad_norm": 2.3137400150299072, "learning_rate": 2.922645794076436e-07, "loss": 0.295, "step": 14648 }, { "epoch": 0.7077837367734454, "grad_norm": 2.5836050510406494, "learning_rate": 2.922162632265546e-07, "loss": 0.3312, "step": 14649 }, { "epoch": 0.7078320529545344, "grad_norm": 2.611663818359375, "learning_rate": 2.9216794704546547e-07, "loss": 0.2556, "step": 14650 }, { "epoch": 0.7078803691356235, "grad_norm": 9.818159103393555, "learning_rate": 2.9211963086437646e-07, "loss": 0.4201, "step": 14651 }, { "epoch": 0.7079286853167126, "grad_norm": 1.4297306537628174, "learning_rate": 2.9207131468328745e-07, "loss": 0.1593, "step": 14652 }, { "epoch": 0.7079770014978016, "grad_norm": 2.666755437850952, "learning_rate": 2.920229985021984e-07, "loss": 0.2665, "step": 14653 }, { "epoch": 0.7080253176788907, "grad_norm": 2.0834009647369385, "learning_rate": 2.919746823211093e-07, "loss": 0.2492, "step": 14654 }, { "epoch": 0.7080736338599797, "grad_norm": 2.537425994873047, "learning_rate": 2.9192636614002026e-07, "loss": 0.1978, "step": 14655 }, { "epoch": 0.7081219500410687, "grad_norm": 1.8516353368759155, "learning_rate": 2.9187804995893125e-07, "loss": 0.2362, "step": 14656 }, { "epoch": 0.7081702662221578, "grad_norm": 3.9357235431671143, "learning_rate": 2.918297337778422e-07, "loss": 0.3253, "step": 14657 }, { "epoch": 0.7082185824032469, "grad_norm": 5.22075891494751, "learning_rate": 2.917814175967531e-07, "loss": 0.2933, "step": 14658 }, { "epoch": 0.7082668985843359, "grad_norm": 4.312933921813965, "learning_rate": 2.917331014156641e-07, "loss": 0.4383, "step": 14659 }, { "epoch": 0.7083152147654249, "grad_norm": 3.2156264781951904, "learning_rate": 2.9168478523457505e-07, "loss": 0.3664, "step": 14660 }, { "epoch": 0.7083635309465139, "grad_norm": 2.797335147857666, "learning_rate": 2.91636469053486e-07, "loss": 0.3027, "step": 14661 }, { "epoch": 0.7084118471276031, "grad_norm": 4.366804122924805, "learning_rate": 2.91588152872397e-07, "loss": 0.3929, "step": 14662 }, { "epoch": 0.7084601633086921, "grad_norm": 3.3088605403900146, "learning_rate": 2.9153983669130786e-07, "loss": 0.3035, "step": 14663 }, { "epoch": 0.7085084794897811, "grad_norm": 2.853804349899292, "learning_rate": 2.9149152051021885e-07, "loss": 0.2355, "step": 14664 }, { "epoch": 0.7085567956708702, "grad_norm": 2.3792929649353027, "learning_rate": 2.9144320432912984e-07, "loss": 0.2856, "step": 14665 }, { "epoch": 0.7086051118519592, "grad_norm": 3.1648218631744385, "learning_rate": 2.913948881480407e-07, "loss": 0.3308, "step": 14666 }, { "epoch": 0.7086534280330483, "grad_norm": 3.1347312927246094, "learning_rate": 2.913465719669517e-07, "loss": 0.3407, "step": 14667 }, { "epoch": 0.7087017442141373, "grad_norm": 3.336209297180176, "learning_rate": 2.9129825578586265e-07, "loss": 0.316, "step": 14668 }, { "epoch": 0.7087500603952264, "grad_norm": 9.067872047424316, "learning_rate": 2.9124993960477364e-07, "loss": 0.3345, "step": 14669 }, { "epoch": 0.7087983765763154, "grad_norm": 3.0105478763580322, "learning_rate": 2.912016234236846e-07, "loss": 0.3413, "step": 14670 }, { "epoch": 0.7088466927574044, "grad_norm": 2.8406124114990234, "learning_rate": 2.911533072425955e-07, "loss": 0.2006, "step": 14671 }, { "epoch": 0.7088950089384936, "grad_norm": 3.632384777069092, "learning_rate": 2.911049910615065e-07, "loss": 0.2328, "step": 14672 }, { "epoch": 0.7089433251195826, "grad_norm": 2.632486581802368, "learning_rate": 2.9105667488041745e-07, "loss": 0.1913, "step": 14673 }, { "epoch": 0.7089916413006716, "grad_norm": 3.5951406955718994, "learning_rate": 2.910083586993284e-07, "loss": 0.1849, "step": 14674 }, { "epoch": 0.7090399574817606, "grad_norm": 2.9366559982299805, "learning_rate": 2.909600425182394e-07, "loss": 0.281, "step": 14675 }, { "epoch": 0.7090882736628497, "grad_norm": 2.549893617630005, "learning_rate": 2.9091172633715026e-07, "loss": 0.2906, "step": 14676 }, { "epoch": 0.7091365898439387, "grad_norm": 2.081407070159912, "learning_rate": 2.9086341015606125e-07, "loss": 0.2147, "step": 14677 }, { "epoch": 0.7091849060250278, "grad_norm": 3.7066149711608887, "learning_rate": 2.9081509397497224e-07, "loss": 0.3364, "step": 14678 }, { "epoch": 0.7092332222061168, "grad_norm": 2.5263922214508057, "learning_rate": 2.907667777938831e-07, "loss": 0.2692, "step": 14679 }, { "epoch": 0.7092815383872059, "grad_norm": 2.201975107192993, "learning_rate": 2.907184616127941e-07, "loss": 0.263, "step": 14680 }, { "epoch": 0.7093298545682949, "grad_norm": 4.0858025550842285, "learning_rate": 2.9067014543170505e-07, "loss": 0.2694, "step": 14681 }, { "epoch": 0.7093781707493839, "grad_norm": 2.2179160118103027, "learning_rate": 2.90621829250616e-07, "loss": 0.2456, "step": 14682 }, { "epoch": 0.709426486930473, "grad_norm": 1.6124368906021118, "learning_rate": 2.90573513069527e-07, "loss": 0.1462, "step": 14683 }, { "epoch": 0.7094748031115621, "grad_norm": 2.5328078269958496, "learning_rate": 2.905251968884379e-07, "loss": 0.2252, "step": 14684 }, { "epoch": 0.7095231192926511, "grad_norm": 2.321155548095703, "learning_rate": 2.904768807073489e-07, "loss": 0.2485, "step": 14685 }, { "epoch": 0.7095714354737401, "grad_norm": 2.8061413764953613, "learning_rate": 2.9042856452625984e-07, "loss": 0.2438, "step": 14686 }, { "epoch": 0.7096197516548292, "grad_norm": 1.8833274841308594, "learning_rate": 2.903802483451708e-07, "loss": 0.2124, "step": 14687 }, { "epoch": 0.7096680678359183, "grad_norm": 2.8165011405944824, "learning_rate": 2.9033193216408177e-07, "loss": 0.2753, "step": 14688 }, { "epoch": 0.7097163840170073, "grad_norm": 1.8482227325439453, "learning_rate": 2.9028361598299265e-07, "loss": 0.1823, "step": 14689 }, { "epoch": 0.7097647001980963, "grad_norm": 2.602121353149414, "learning_rate": 2.9023529980190364e-07, "loss": 0.2887, "step": 14690 }, { "epoch": 0.7098130163791854, "grad_norm": 4.099380970001221, "learning_rate": 2.9018698362081463e-07, "loss": 0.2265, "step": 14691 }, { "epoch": 0.7098613325602744, "grad_norm": 2.690873861312866, "learning_rate": 2.901386674397255e-07, "loss": 0.2693, "step": 14692 }, { "epoch": 0.7099096487413635, "grad_norm": 3.284806489944458, "learning_rate": 2.900903512586365e-07, "loss": 0.2949, "step": 14693 }, { "epoch": 0.7099579649224526, "grad_norm": 3.1036839485168457, "learning_rate": 2.9004203507754745e-07, "loss": 0.2872, "step": 14694 }, { "epoch": 0.7100062811035416, "grad_norm": 2.6255555152893066, "learning_rate": 2.899937188964584e-07, "loss": 0.2757, "step": 14695 }, { "epoch": 0.7100545972846306, "grad_norm": 2.717787504196167, "learning_rate": 2.899454027153694e-07, "loss": 0.2577, "step": 14696 }, { "epoch": 0.7101029134657196, "grad_norm": 3.0945420265197754, "learning_rate": 2.898970865342803e-07, "loss": 0.3366, "step": 14697 }, { "epoch": 0.7101512296468088, "grad_norm": 6.216279983520508, "learning_rate": 2.8984877035319125e-07, "loss": 0.465, "step": 14698 }, { "epoch": 0.7101995458278978, "grad_norm": 2.860485553741455, "learning_rate": 2.8980045417210224e-07, "loss": 0.264, "step": 14699 }, { "epoch": 0.7102478620089868, "grad_norm": 4.968689918518066, "learning_rate": 2.897521379910132e-07, "loss": 0.3707, "step": 14700 }, { "epoch": 0.7102961781900758, "grad_norm": 7.37460994720459, "learning_rate": 2.8970382180992417e-07, "loss": 0.1978, "step": 14701 }, { "epoch": 0.7103444943711649, "grad_norm": 3.4304075241088867, "learning_rate": 2.8965550562883505e-07, "loss": 0.2106, "step": 14702 }, { "epoch": 0.7103928105522539, "grad_norm": 22.840782165527344, "learning_rate": 2.8960718944774604e-07, "loss": 0.1759, "step": 14703 }, { "epoch": 0.710441126733343, "grad_norm": 3.3016302585601807, "learning_rate": 2.8955887326665703e-07, "loss": 0.2128, "step": 14704 }, { "epoch": 0.7104894429144321, "grad_norm": 2.956138849258423, "learning_rate": 2.895105570855679e-07, "loss": 0.3236, "step": 14705 }, { "epoch": 0.7105377590955211, "grad_norm": 4.727108478546143, "learning_rate": 2.894622409044789e-07, "loss": 0.2979, "step": 14706 }, { "epoch": 0.7105860752766101, "grad_norm": 3.081983804702759, "learning_rate": 2.8941392472338984e-07, "loss": 0.3743, "step": 14707 }, { "epoch": 0.7106343914576991, "grad_norm": 1.7474393844604492, "learning_rate": 2.893656085423008e-07, "loss": 0.1902, "step": 14708 }, { "epoch": 0.7106827076387883, "grad_norm": 2.781575918197632, "learning_rate": 2.8931729236121177e-07, "loss": 0.2831, "step": 14709 }, { "epoch": 0.7107310238198773, "grad_norm": 4.851097106933594, "learning_rate": 2.892689761801227e-07, "loss": 0.4594, "step": 14710 }, { "epoch": 0.7107793400009663, "grad_norm": 1.7970538139343262, "learning_rate": 2.8922065999903364e-07, "loss": 0.2272, "step": 14711 }, { "epoch": 0.7108276561820553, "grad_norm": 2.536558151245117, "learning_rate": 2.8917234381794464e-07, "loss": 0.3294, "step": 14712 }, { "epoch": 0.7108759723631444, "grad_norm": 2.5921616554260254, "learning_rate": 2.8912402763685557e-07, "loss": 0.3651, "step": 14713 }, { "epoch": 0.7109242885442335, "grad_norm": 2.4011735916137695, "learning_rate": 2.890757114557665e-07, "loss": 0.2579, "step": 14714 }, { "epoch": 0.7109726047253225, "grad_norm": 1.9441148042678833, "learning_rate": 2.8902739527467745e-07, "loss": 0.1585, "step": 14715 }, { "epoch": 0.7110209209064116, "grad_norm": 3.8237416744232178, "learning_rate": 2.8897907909358844e-07, "loss": 0.3106, "step": 14716 }, { "epoch": 0.7110692370875006, "grad_norm": 4.029210567474365, "learning_rate": 2.8893076291249943e-07, "loss": 0.3515, "step": 14717 }, { "epoch": 0.7111175532685896, "grad_norm": 3.3411362171173096, "learning_rate": 2.888824467314103e-07, "loss": 0.3131, "step": 14718 }, { "epoch": 0.7111658694496787, "grad_norm": 2.0473203659057617, "learning_rate": 2.888341305503213e-07, "loss": 0.2445, "step": 14719 }, { "epoch": 0.7112141856307678, "grad_norm": 8.862505912780762, "learning_rate": 2.8878581436923224e-07, "loss": 0.2721, "step": 14720 }, { "epoch": 0.7112625018118568, "grad_norm": 5.432314872741699, "learning_rate": 2.887374981881432e-07, "loss": 0.3747, "step": 14721 }, { "epoch": 0.7113108179929458, "grad_norm": 3.0859134197235107, "learning_rate": 2.8868918200705417e-07, "loss": 0.3841, "step": 14722 }, { "epoch": 0.7113591341740348, "grad_norm": 2.6367266178131104, "learning_rate": 2.886408658259651e-07, "loss": 0.3123, "step": 14723 }, { "epoch": 0.711407450355124, "grad_norm": 2.2870986461639404, "learning_rate": 2.8859254964487604e-07, "loss": 0.229, "step": 14724 }, { "epoch": 0.711455766536213, "grad_norm": 3.527780771255493, "learning_rate": 2.8854423346378703e-07, "loss": 0.2331, "step": 14725 }, { "epoch": 0.711504082717302, "grad_norm": 2.6917238235473633, "learning_rate": 2.8849591728269797e-07, "loss": 0.2734, "step": 14726 }, { "epoch": 0.7115523988983911, "grad_norm": 2.472327709197998, "learning_rate": 2.884476011016089e-07, "loss": 0.2495, "step": 14727 }, { "epoch": 0.7116007150794801, "grad_norm": 1.9587273597717285, "learning_rate": 2.8839928492051984e-07, "loss": 0.2468, "step": 14728 }, { "epoch": 0.7116490312605691, "grad_norm": 2.4888062477111816, "learning_rate": 2.8835096873943083e-07, "loss": 0.2581, "step": 14729 }, { "epoch": 0.7116973474416582, "grad_norm": 3.7569735050201416, "learning_rate": 2.8830265255834177e-07, "loss": 0.3707, "step": 14730 }, { "epoch": 0.7117456636227473, "grad_norm": 3.688096761703491, "learning_rate": 2.882543363772527e-07, "loss": 0.3137, "step": 14731 }, { "epoch": 0.7117939798038363, "grad_norm": 2.266170024871826, "learning_rate": 2.882060201961637e-07, "loss": 0.2337, "step": 14732 }, { "epoch": 0.7118422959849253, "grad_norm": 4.659561634063721, "learning_rate": 2.8815770401507464e-07, "loss": 0.3443, "step": 14733 }, { "epoch": 0.7118906121660143, "grad_norm": 3.0050387382507324, "learning_rate": 2.8810938783398557e-07, "loss": 0.2393, "step": 14734 }, { "epoch": 0.7119389283471035, "grad_norm": 3.541045904159546, "learning_rate": 2.8806107165289656e-07, "loss": 0.2442, "step": 14735 }, { "epoch": 0.7119872445281925, "grad_norm": 2.090022087097168, "learning_rate": 2.880127554718075e-07, "loss": 0.2212, "step": 14736 }, { "epoch": 0.7120355607092815, "grad_norm": 3.7246334552764893, "learning_rate": 2.8796443929071844e-07, "loss": 0.2469, "step": 14737 }, { "epoch": 0.7120838768903706, "grad_norm": 2.414736747741699, "learning_rate": 2.8791612310962943e-07, "loss": 0.2461, "step": 14738 }, { "epoch": 0.7121321930714596, "grad_norm": 2.5055503845214844, "learning_rate": 2.8786780692854037e-07, "loss": 0.1988, "step": 14739 }, { "epoch": 0.7121805092525487, "grad_norm": 2.7881433963775635, "learning_rate": 2.878194907474513e-07, "loss": 0.3611, "step": 14740 }, { "epoch": 0.7122288254336377, "grad_norm": 1.5645378828048706, "learning_rate": 2.8777117456636224e-07, "loss": 0.168, "step": 14741 }, { "epoch": 0.7122771416147268, "grad_norm": 2.4001951217651367, "learning_rate": 2.8772285838527323e-07, "loss": 0.2241, "step": 14742 }, { "epoch": 0.7123254577958158, "grad_norm": 2.871401071548462, "learning_rate": 2.8767454220418417e-07, "loss": 0.3395, "step": 14743 }, { "epoch": 0.7123737739769048, "grad_norm": 2.9724552631378174, "learning_rate": 2.876262260230951e-07, "loss": 0.2992, "step": 14744 }, { "epoch": 0.712422090157994, "grad_norm": 3.797184705734253, "learning_rate": 2.875779098420061e-07, "loss": 0.2485, "step": 14745 }, { "epoch": 0.712470406339083, "grad_norm": 3.847266912460327, "learning_rate": 2.87529593660917e-07, "loss": 0.3497, "step": 14746 }, { "epoch": 0.712518722520172, "grad_norm": 3.1201906204223633, "learning_rate": 2.8748127747982797e-07, "loss": 0.2113, "step": 14747 }, { "epoch": 0.712567038701261, "grad_norm": 3.078645706176758, "learning_rate": 2.8743296129873896e-07, "loss": 0.2894, "step": 14748 }, { "epoch": 0.7126153548823501, "grad_norm": 3.219071388244629, "learning_rate": 2.873846451176499e-07, "loss": 0.3095, "step": 14749 }, { "epoch": 0.7126636710634392, "grad_norm": 2.6284279823303223, "learning_rate": 2.8733632893656083e-07, "loss": 0.2375, "step": 14750 }, { "epoch": 0.7127119872445282, "grad_norm": 2.974315643310547, "learning_rate": 2.872880127554718e-07, "loss": 0.3295, "step": 14751 }, { "epoch": 0.7127603034256172, "grad_norm": 3.1322097778320312, "learning_rate": 2.8723969657438276e-07, "loss": 0.3346, "step": 14752 }, { "epoch": 0.7128086196067063, "grad_norm": 2.8606760501861572, "learning_rate": 2.871913803932937e-07, "loss": 0.3804, "step": 14753 }, { "epoch": 0.7128569357877953, "grad_norm": 3.3773562908172607, "learning_rate": 2.8714306421220464e-07, "loss": 0.2807, "step": 14754 }, { "epoch": 0.7129052519688844, "grad_norm": 2.617328405380249, "learning_rate": 2.870947480311156e-07, "loss": 0.338, "step": 14755 }, { "epoch": 0.7129535681499735, "grad_norm": 3.9524149894714355, "learning_rate": 2.8704643185002656e-07, "loss": 0.3757, "step": 14756 }, { "epoch": 0.7130018843310625, "grad_norm": 4.886935710906982, "learning_rate": 2.869981156689375e-07, "loss": 0.2977, "step": 14757 }, { "epoch": 0.7130502005121515, "grad_norm": 4.713967323303223, "learning_rate": 2.869497994878485e-07, "loss": 0.3501, "step": 14758 }, { "epoch": 0.7130985166932405, "grad_norm": 2.503267765045166, "learning_rate": 2.869014833067594e-07, "loss": 0.2566, "step": 14759 }, { "epoch": 0.7131468328743296, "grad_norm": 3.2998392581939697, "learning_rate": 2.8685316712567037e-07, "loss": 0.3321, "step": 14760 }, { "epoch": 0.7131951490554187, "grad_norm": 2.364137649536133, "learning_rate": 2.8680485094458136e-07, "loss": 0.234, "step": 14761 }, { "epoch": 0.7132434652365077, "grad_norm": 2.6763222217559814, "learning_rate": 2.8675653476349224e-07, "loss": 0.2296, "step": 14762 }, { "epoch": 0.7132917814175967, "grad_norm": 2.4545722007751465, "learning_rate": 2.8670821858240323e-07, "loss": 0.3406, "step": 14763 }, { "epoch": 0.7133400975986858, "grad_norm": 5.514986038208008, "learning_rate": 2.866599024013142e-07, "loss": 0.3326, "step": 14764 }, { "epoch": 0.7133884137797748, "grad_norm": 2.7513296604156494, "learning_rate": 2.8661158622022516e-07, "loss": 0.2917, "step": 14765 }, { "epoch": 0.7134367299608639, "grad_norm": 1.7986228466033936, "learning_rate": 2.865632700391361e-07, "loss": 0.1651, "step": 14766 }, { "epoch": 0.713485046141953, "grad_norm": 4.541754245758057, "learning_rate": 2.8651495385804703e-07, "loss": 0.2894, "step": 14767 }, { "epoch": 0.713533362323042, "grad_norm": 2.3705508708953857, "learning_rate": 2.86466637676958e-07, "loss": 0.2608, "step": 14768 }, { "epoch": 0.713581678504131, "grad_norm": 3.5088961124420166, "learning_rate": 2.8641832149586896e-07, "loss": 0.2694, "step": 14769 }, { "epoch": 0.71362999468522, "grad_norm": 4.02140474319458, "learning_rate": 2.863700053147799e-07, "loss": 0.3488, "step": 14770 }, { "epoch": 0.7136783108663092, "grad_norm": 2.3164737224578857, "learning_rate": 2.863216891336909e-07, "loss": 0.2503, "step": 14771 }, { "epoch": 0.7137266270473982, "grad_norm": 3.5230445861816406, "learning_rate": 2.8627337295260177e-07, "loss": 0.2564, "step": 14772 }, { "epoch": 0.7137749432284872, "grad_norm": 3.69739031791687, "learning_rate": 2.8622505677151276e-07, "loss": 0.243, "step": 14773 }, { "epoch": 0.7138232594095762, "grad_norm": 3.295058250427246, "learning_rate": 2.8617674059042375e-07, "loss": 0.4244, "step": 14774 }, { "epoch": 0.7138715755906653, "grad_norm": 4.005565643310547, "learning_rate": 2.8612842440933464e-07, "loss": 0.2349, "step": 14775 }, { "epoch": 0.7139198917717544, "grad_norm": 2.9509809017181396, "learning_rate": 2.8608010822824563e-07, "loss": 0.3037, "step": 14776 }, { "epoch": 0.7139682079528434, "grad_norm": 2.3349318504333496, "learning_rate": 2.860317920471566e-07, "loss": 0.2687, "step": 14777 }, { "epoch": 0.7140165241339325, "grad_norm": 2.7459707260131836, "learning_rate": 2.859834758660675e-07, "loss": 0.3066, "step": 14778 }, { "epoch": 0.7140648403150215, "grad_norm": 8.954620361328125, "learning_rate": 2.859351596849785e-07, "loss": 0.3162, "step": 14779 }, { "epoch": 0.7141131564961105, "grad_norm": 2.3339970111846924, "learning_rate": 2.8588684350388943e-07, "loss": 0.2349, "step": 14780 }, { "epoch": 0.7141614726771996, "grad_norm": 1.9791269302368164, "learning_rate": 2.858385273228004e-07, "loss": 0.2146, "step": 14781 }, { "epoch": 0.7142097888582887, "grad_norm": 1.7958240509033203, "learning_rate": 2.8579021114171136e-07, "loss": 0.2292, "step": 14782 }, { "epoch": 0.7142581050393777, "grad_norm": 2.262125015258789, "learning_rate": 2.857418949606223e-07, "loss": 0.2307, "step": 14783 }, { "epoch": 0.7143064212204667, "grad_norm": 4.185639381408691, "learning_rate": 2.856935787795333e-07, "loss": 0.2765, "step": 14784 }, { "epoch": 0.7143547374015558, "grad_norm": 2.279277801513672, "learning_rate": 2.8564526259844417e-07, "loss": 0.2498, "step": 14785 }, { "epoch": 0.7144030535826448, "grad_norm": 61.039154052734375, "learning_rate": 2.8559694641735516e-07, "loss": 0.3619, "step": 14786 }, { "epoch": 0.7144513697637339, "grad_norm": 8.842674255371094, "learning_rate": 2.8554863023626615e-07, "loss": 0.23, "step": 14787 }, { "epoch": 0.7144996859448229, "grad_norm": 2.5391712188720703, "learning_rate": 2.8550031405517703e-07, "loss": 0.3179, "step": 14788 }, { "epoch": 0.714548002125912, "grad_norm": 2.595074415206909, "learning_rate": 2.85451997874088e-07, "loss": 0.3261, "step": 14789 }, { "epoch": 0.714596318307001, "grad_norm": 2.337411403656006, "learning_rate": 2.85403681692999e-07, "loss": 0.2397, "step": 14790 }, { "epoch": 0.71464463448809, "grad_norm": 2.0119874477386475, "learning_rate": 2.853553655119099e-07, "loss": 0.1746, "step": 14791 }, { "epoch": 0.7146929506691792, "grad_norm": 1.853271245956421, "learning_rate": 2.853070493308209e-07, "loss": 0.2557, "step": 14792 }, { "epoch": 0.7147412668502682, "grad_norm": 4.277182579040527, "learning_rate": 2.852587331497318e-07, "loss": 0.2725, "step": 14793 }, { "epoch": 0.7147895830313572, "grad_norm": 3.1052095890045166, "learning_rate": 2.8521041696864276e-07, "loss": 0.2617, "step": 14794 }, { "epoch": 0.7148378992124462, "grad_norm": 5.107645034790039, "learning_rate": 2.8516210078755375e-07, "loss": 0.3208, "step": 14795 }, { "epoch": 0.7148862153935353, "grad_norm": 4.0819501876831055, "learning_rate": 2.851137846064647e-07, "loss": 0.2363, "step": 14796 }, { "epoch": 0.7149345315746244, "grad_norm": 2.2394795417785645, "learning_rate": 2.850654684253757e-07, "loss": 0.1789, "step": 14797 }, { "epoch": 0.7149828477557134, "grad_norm": 2.3958652019500732, "learning_rate": 2.8501715224428656e-07, "loss": 0.2708, "step": 14798 }, { "epoch": 0.7150311639368024, "grad_norm": 2.448953628540039, "learning_rate": 2.8496883606319755e-07, "loss": 0.1839, "step": 14799 }, { "epoch": 0.7150794801178915, "grad_norm": 169.7017822265625, "learning_rate": 2.8492051988210854e-07, "loss": 0.2611, "step": 14800 }, { "epoch": 0.7151277962989805, "grad_norm": 2.7596821784973145, "learning_rate": 2.8487220370101943e-07, "loss": 0.2975, "step": 14801 }, { "epoch": 0.7151761124800696, "grad_norm": 3.2608799934387207, "learning_rate": 2.848238875199304e-07, "loss": 0.4801, "step": 14802 }, { "epoch": 0.7152244286611587, "grad_norm": 3.8432483673095703, "learning_rate": 2.847755713388414e-07, "loss": 0.4173, "step": 14803 }, { "epoch": 0.7152727448422477, "grad_norm": 2.5245792865753174, "learning_rate": 2.847272551577523e-07, "loss": 0.2702, "step": 14804 }, { "epoch": 0.7153210610233367, "grad_norm": 2.8610188961029053, "learning_rate": 2.846789389766633e-07, "loss": 0.2237, "step": 14805 }, { "epoch": 0.7153693772044257, "grad_norm": 2.37373423576355, "learning_rate": 2.846306227955742e-07, "loss": 0.2859, "step": 14806 }, { "epoch": 0.7154176933855149, "grad_norm": 5.140161991119385, "learning_rate": 2.8458230661448516e-07, "loss": 0.4006, "step": 14807 }, { "epoch": 0.7154660095666039, "grad_norm": 2.58520770072937, "learning_rate": 2.8453399043339615e-07, "loss": 0.3532, "step": 14808 }, { "epoch": 0.7155143257476929, "grad_norm": 4.8368940353393555, "learning_rate": 2.844856742523071e-07, "loss": 0.3046, "step": 14809 }, { "epoch": 0.7155626419287819, "grad_norm": 3.0200154781341553, "learning_rate": 2.84437358071218e-07, "loss": 0.336, "step": 14810 }, { "epoch": 0.715610958109871, "grad_norm": 4.078190326690674, "learning_rate": 2.8438904189012896e-07, "loss": 0.4511, "step": 14811 }, { "epoch": 0.71565927429096, "grad_norm": 2.2331085205078125, "learning_rate": 2.8434072570903995e-07, "loss": 0.2494, "step": 14812 }, { "epoch": 0.7157075904720491, "grad_norm": 2.1261417865753174, "learning_rate": 2.8429240952795094e-07, "loss": 0.2058, "step": 14813 }, { "epoch": 0.7157559066531382, "grad_norm": 3.0798988342285156, "learning_rate": 2.842440933468618e-07, "loss": 0.3618, "step": 14814 }, { "epoch": 0.7158042228342272, "grad_norm": 3.033505916595459, "learning_rate": 2.841957771657728e-07, "loss": 0.2252, "step": 14815 }, { "epoch": 0.7158525390153162, "grad_norm": 3.3426811695098877, "learning_rate": 2.841474609846838e-07, "loss": 0.3566, "step": 14816 }, { "epoch": 0.7159008551964052, "grad_norm": 2.531968355178833, "learning_rate": 2.840991448035947e-07, "loss": 0.2046, "step": 14817 }, { "epoch": 0.7159491713774944, "grad_norm": 3.0706851482391357, "learning_rate": 2.840508286225057e-07, "loss": 0.2034, "step": 14818 }, { "epoch": 0.7159974875585834, "grad_norm": 2.815648078918457, "learning_rate": 2.840025124414166e-07, "loss": 0.3042, "step": 14819 }, { "epoch": 0.7160458037396724, "grad_norm": 3.6763341426849365, "learning_rate": 2.8395419626032756e-07, "loss": 0.284, "step": 14820 }, { "epoch": 0.7160941199207614, "grad_norm": 2.2734875679016113, "learning_rate": 2.8390588007923855e-07, "loss": 0.2147, "step": 14821 }, { "epoch": 0.7161424361018505, "grad_norm": 3.052493095397949, "learning_rate": 2.838575638981495e-07, "loss": 0.3926, "step": 14822 }, { "epoch": 0.7161907522829396, "grad_norm": 4.470569133758545, "learning_rate": 2.838092477170604e-07, "loss": 0.3585, "step": 14823 }, { "epoch": 0.7162390684640286, "grad_norm": 3.421774387359619, "learning_rate": 2.8376093153597136e-07, "loss": 0.269, "step": 14824 }, { "epoch": 0.7162873846451177, "grad_norm": 3.851454019546509, "learning_rate": 2.8371261535488235e-07, "loss": 0.3314, "step": 14825 }, { "epoch": 0.7163357008262067, "grad_norm": 2.072495222091675, "learning_rate": 2.836642991737933e-07, "loss": 0.2083, "step": 14826 }, { "epoch": 0.7163840170072957, "grad_norm": 2.818929433822632, "learning_rate": 2.836159829927042e-07, "loss": 0.3918, "step": 14827 }, { "epoch": 0.7164323331883848, "grad_norm": 2.192068338394165, "learning_rate": 2.835676668116152e-07, "loss": 0.252, "step": 14828 }, { "epoch": 0.7164806493694739, "grad_norm": 2.9091694355010986, "learning_rate": 2.835193506305262e-07, "loss": 0.3555, "step": 14829 }, { "epoch": 0.7165289655505629, "grad_norm": 3.3358302116394043, "learning_rate": 2.834710344494371e-07, "loss": 0.3678, "step": 14830 }, { "epoch": 0.7165772817316519, "grad_norm": 2.2177772521972656, "learning_rate": 2.834227182683481e-07, "loss": 0.2014, "step": 14831 }, { "epoch": 0.7166255979127409, "grad_norm": 2.89827823638916, "learning_rate": 2.83374402087259e-07, "loss": 0.3594, "step": 14832 }, { "epoch": 0.7166739140938301, "grad_norm": 2.0292909145355225, "learning_rate": 2.8332608590616995e-07, "loss": 0.2517, "step": 14833 }, { "epoch": 0.7167222302749191, "grad_norm": 2.543375015258789, "learning_rate": 2.8327776972508094e-07, "loss": 0.3554, "step": 14834 }, { "epoch": 0.7167705464560081, "grad_norm": 3.3944382667541504, "learning_rate": 2.832294535439919e-07, "loss": 0.2081, "step": 14835 }, { "epoch": 0.7168188626370972, "grad_norm": 3.762319326400757, "learning_rate": 2.831811373629028e-07, "loss": 0.2695, "step": 14836 }, { "epoch": 0.7168671788181862, "grad_norm": 2.3613367080688477, "learning_rate": 2.8313282118181375e-07, "loss": 0.2863, "step": 14837 }, { "epoch": 0.7169154949992752, "grad_norm": 2.9734857082366943, "learning_rate": 2.8308450500072474e-07, "loss": 0.3114, "step": 14838 }, { "epoch": 0.7169638111803643, "grad_norm": 10.120391845703125, "learning_rate": 2.830361888196357e-07, "loss": 0.2888, "step": 14839 }, { "epoch": 0.7170121273614534, "grad_norm": 2.707308053970337, "learning_rate": 2.829878726385466e-07, "loss": 0.2066, "step": 14840 }, { "epoch": 0.7170604435425424, "grad_norm": 4.696478366851807, "learning_rate": 2.829395564574576e-07, "loss": 0.3321, "step": 14841 }, { "epoch": 0.7171087597236314, "grad_norm": 6.742214679718018, "learning_rate": 2.8289124027636855e-07, "loss": 0.327, "step": 14842 }, { "epoch": 0.7171570759047204, "grad_norm": 2.928274154663086, "learning_rate": 2.828429240952795e-07, "loss": 0.234, "step": 14843 }, { "epoch": 0.7172053920858096, "grad_norm": 4.600396156311035, "learning_rate": 2.8279460791419047e-07, "loss": 0.2136, "step": 14844 }, { "epoch": 0.7172537082668986, "grad_norm": 4.153478622436523, "learning_rate": 2.8274629173310136e-07, "loss": 0.33, "step": 14845 }, { "epoch": 0.7173020244479876, "grad_norm": 3.367854118347168, "learning_rate": 2.8269797555201235e-07, "loss": 0.3287, "step": 14846 }, { "epoch": 0.7173503406290767, "grad_norm": 6.518134593963623, "learning_rate": 2.8264965937092334e-07, "loss": 0.3151, "step": 14847 }, { "epoch": 0.7173986568101657, "grad_norm": 2.2777535915374756, "learning_rate": 2.826013431898343e-07, "loss": 0.2843, "step": 14848 }, { "epoch": 0.7174469729912548, "grad_norm": 3.072739362716675, "learning_rate": 2.825530270087452e-07, "loss": 0.4162, "step": 14849 }, { "epoch": 0.7174952891723438, "grad_norm": 3.147308111190796, "learning_rate": 2.8250471082765615e-07, "loss": 0.3492, "step": 14850 }, { "epoch": 0.7175436053534329, "grad_norm": 2.315861940383911, "learning_rate": 2.8245639464656714e-07, "loss": 0.2819, "step": 14851 }, { "epoch": 0.7175919215345219, "grad_norm": 1.7475500106811523, "learning_rate": 2.824080784654781e-07, "loss": 0.1425, "step": 14852 }, { "epoch": 0.7176402377156109, "grad_norm": 9.674383163452148, "learning_rate": 2.82359762284389e-07, "loss": 0.3342, "step": 14853 }, { "epoch": 0.7176885538967, "grad_norm": 3.274158239364624, "learning_rate": 2.823114461033e-07, "loss": 0.2451, "step": 14854 }, { "epoch": 0.7177368700777891, "grad_norm": 3.1013858318328857, "learning_rate": 2.8226312992221094e-07, "loss": 0.176, "step": 14855 }, { "epoch": 0.7177851862588781, "grad_norm": 3.194758176803589, "learning_rate": 2.822148137411219e-07, "loss": 0.317, "step": 14856 }, { "epoch": 0.7178335024399671, "grad_norm": 2.1851553916931152, "learning_rate": 2.8216649756003287e-07, "loss": 0.2239, "step": 14857 }, { "epoch": 0.7178818186210562, "grad_norm": 7.931330680847168, "learning_rate": 2.8211818137894375e-07, "loss": 0.4463, "step": 14858 }, { "epoch": 0.7179301348021453, "grad_norm": 2.6624057292938232, "learning_rate": 2.8206986519785474e-07, "loss": 0.3725, "step": 14859 }, { "epoch": 0.7179784509832343, "grad_norm": 2.830676794052124, "learning_rate": 2.8202154901676573e-07, "loss": 0.3363, "step": 14860 }, { "epoch": 0.7180267671643233, "grad_norm": 3.2397754192352295, "learning_rate": 2.819732328356766e-07, "loss": 0.4747, "step": 14861 }, { "epoch": 0.7180750833454124, "grad_norm": 2.58333683013916, "learning_rate": 2.819249166545876e-07, "loss": 0.257, "step": 14862 }, { "epoch": 0.7181233995265014, "grad_norm": 2.817225456237793, "learning_rate": 2.8187660047349855e-07, "loss": 0.3357, "step": 14863 }, { "epoch": 0.7181717157075904, "grad_norm": 2.8674206733703613, "learning_rate": 2.8182828429240954e-07, "loss": 0.3284, "step": 14864 }, { "epoch": 0.7182200318886796, "grad_norm": 2.589914560317993, "learning_rate": 2.817799681113205e-07, "loss": 0.3487, "step": 14865 }, { "epoch": 0.7182683480697686, "grad_norm": 2.229686737060547, "learning_rate": 2.817316519302314e-07, "loss": 0.146, "step": 14866 }, { "epoch": 0.7183166642508576, "grad_norm": 3.069059133529663, "learning_rate": 2.816833357491424e-07, "loss": 0.3772, "step": 14867 }, { "epoch": 0.7183649804319466, "grad_norm": 2.2236666679382324, "learning_rate": 2.8163501956805334e-07, "loss": 0.3166, "step": 14868 }, { "epoch": 0.7184132966130357, "grad_norm": 2.7663607597351074, "learning_rate": 2.815867033869643e-07, "loss": 0.4146, "step": 14869 }, { "epoch": 0.7184616127941248, "grad_norm": 5.503988265991211, "learning_rate": 2.8153838720587527e-07, "loss": 0.3288, "step": 14870 }, { "epoch": 0.7185099289752138, "grad_norm": 2.4150373935699463, "learning_rate": 2.8149007102478615e-07, "loss": 0.2779, "step": 14871 }, { "epoch": 0.7185582451563028, "grad_norm": 2.702326774597168, "learning_rate": 2.8144175484369714e-07, "loss": 0.3514, "step": 14872 }, { "epoch": 0.7186065613373919, "grad_norm": 2.7237284183502197, "learning_rate": 2.8139343866260813e-07, "loss": 0.2923, "step": 14873 }, { "epoch": 0.7186548775184809, "grad_norm": 3.0892815589904785, "learning_rate": 2.81345122481519e-07, "loss": 0.419, "step": 14874 }, { "epoch": 0.71870319369957, "grad_norm": 3.56016206741333, "learning_rate": 2.8129680630043e-07, "loss": 0.2185, "step": 14875 }, { "epoch": 0.7187515098806591, "grad_norm": 2.7054591178894043, "learning_rate": 2.8124849011934094e-07, "loss": 0.3455, "step": 14876 }, { "epoch": 0.7187998260617481, "grad_norm": 2.5518295764923096, "learning_rate": 2.812001739382519e-07, "loss": 0.2153, "step": 14877 }, { "epoch": 0.7188481422428371, "grad_norm": 3.382894515991211, "learning_rate": 2.8115185775716287e-07, "loss": 0.4317, "step": 14878 }, { "epoch": 0.7188964584239261, "grad_norm": 5.572364807128906, "learning_rate": 2.811035415760738e-07, "loss": 0.3336, "step": 14879 }, { "epoch": 0.7189447746050153, "grad_norm": 2.159337043762207, "learning_rate": 2.810552253949848e-07, "loss": 0.2147, "step": 14880 }, { "epoch": 0.7189930907861043, "grad_norm": 2.8973188400268555, "learning_rate": 2.8100690921389573e-07, "loss": 0.3331, "step": 14881 }, { "epoch": 0.7190414069671933, "grad_norm": 8.977433204650879, "learning_rate": 2.8095859303280667e-07, "loss": 0.3894, "step": 14882 }, { "epoch": 0.7190897231482823, "grad_norm": 2.649998664855957, "learning_rate": 2.8091027685171766e-07, "loss": 0.28, "step": 14883 }, { "epoch": 0.7191380393293714, "grad_norm": 2.726811647415161, "learning_rate": 2.8086196067062855e-07, "loss": 0.2507, "step": 14884 }, { "epoch": 0.7191863555104605, "grad_norm": 3.09909725189209, "learning_rate": 2.8081364448953954e-07, "loss": 0.3856, "step": 14885 }, { "epoch": 0.7192346716915495, "grad_norm": 2.7035157680511475, "learning_rate": 2.8076532830845053e-07, "loss": 0.2967, "step": 14886 }, { "epoch": 0.7192829878726386, "grad_norm": 5.5162811279296875, "learning_rate": 2.807170121273614e-07, "loss": 0.2729, "step": 14887 }, { "epoch": 0.7193313040537276, "grad_norm": 2.8463099002838135, "learning_rate": 2.806686959462724e-07, "loss": 0.247, "step": 14888 }, { "epoch": 0.7193796202348166, "grad_norm": 3.667975664138794, "learning_rate": 2.8062037976518334e-07, "loss": 0.3342, "step": 14889 }, { "epoch": 0.7194279364159056, "grad_norm": 2.079022169113159, "learning_rate": 2.805720635840943e-07, "loss": 0.3257, "step": 14890 }, { "epoch": 0.7194762525969948, "grad_norm": 2.0897369384765625, "learning_rate": 2.8052374740300527e-07, "loss": 0.2116, "step": 14891 }, { "epoch": 0.7195245687780838, "grad_norm": 2.9689581394195557, "learning_rate": 2.804754312219162e-07, "loss": 0.2773, "step": 14892 }, { "epoch": 0.7195728849591728, "grad_norm": 3.592210292816162, "learning_rate": 2.8042711504082714e-07, "loss": 0.3159, "step": 14893 }, { "epoch": 0.7196212011402618, "grad_norm": 3.3945279121398926, "learning_rate": 2.803787988597381e-07, "loss": 0.3342, "step": 14894 }, { "epoch": 0.7196695173213509, "grad_norm": 3.0692086219787598, "learning_rate": 2.8033048267864907e-07, "loss": 0.2914, "step": 14895 }, { "epoch": 0.71971783350244, "grad_norm": 3.190213203430176, "learning_rate": 2.8028216649756006e-07, "loss": 0.2602, "step": 14896 }, { "epoch": 0.719766149683529, "grad_norm": 2.643800973892212, "learning_rate": 2.8023385031647094e-07, "loss": 0.2973, "step": 14897 }, { "epoch": 0.7198144658646181, "grad_norm": 3.1252403259277344, "learning_rate": 2.8018553413538193e-07, "loss": 0.2765, "step": 14898 }, { "epoch": 0.7198627820457071, "grad_norm": 1.8342573642730713, "learning_rate": 2.801372179542929e-07, "loss": 0.2312, "step": 14899 }, { "epoch": 0.7199110982267961, "grad_norm": 2.978492259979248, "learning_rate": 2.800889017732038e-07, "loss": 0.376, "step": 14900 }, { "epoch": 0.7199594144078852, "grad_norm": 2.9108452796936035, "learning_rate": 2.800405855921148e-07, "loss": 0.3927, "step": 14901 }, { "epoch": 0.7200077305889743, "grad_norm": 3.3212361335754395, "learning_rate": 2.7999226941102574e-07, "loss": 0.2461, "step": 14902 }, { "epoch": 0.7200560467700633, "grad_norm": 2.7071621417999268, "learning_rate": 2.7994395322993667e-07, "loss": 0.3132, "step": 14903 }, { "epoch": 0.7201043629511523, "grad_norm": 3.269296884536743, "learning_rate": 2.7989563704884766e-07, "loss": 0.3543, "step": 14904 }, { "epoch": 0.7201526791322413, "grad_norm": 2.7232165336608887, "learning_rate": 2.798473208677586e-07, "loss": 0.3335, "step": 14905 }, { "epoch": 0.7202009953133305, "grad_norm": 2.3181214332580566, "learning_rate": 2.7979900468666954e-07, "loss": 0.3468, "step": 14906 }, { "epoch": 0.7202493114944195, "grad_norm": 4.503687858581543, "learning_rate": 2.797506885055805e-07, "loss": 0.24, "step": 14907 }, { "epoch": 0.7202976276755085, "grad_norm": 1.6926058530807495, "learning_rate": 2.7970237232449146e-07, "loss": 0.206, "step": 14908 }, { "epoch": 0.7203459438565976, "grad_norm": 1.6930967569351196, "learning_rate": 2.796540561434024e-07, "loss": 0.1912, "step": 14909 }, { "epoch": 0.7203942600376866, "grad_norm": 7.278827667236328, "learning_rate": 2.7960573996231334e-07, "loss": 0.2654, "step": 14910 }, { "epoch": 0.7204425762187757, "grad_norm": 12.280723571777344, "learning_rate": 2.7955742378122433e-07, "loss": 0.2509, "step": 14911 }, { "epoch": 0.7204908923998647, "grad_norm": 1.6887856721878052, "learning_rate": 2.795091076001353e-07, "loss": 0.202, "step": 14912 }, { "epoch": 0.7205392085809538, "grad_norm": 2.7885119915008545, "learning_rate": 2.794607914190462e-07, "loss": 0.3521, "step": 14913 }, { "epoch": 0.7205875247620428, "grad_norm": 2.8791749477386475, "learning_rate": 2.794124752379572e-07, "loss": 0.415, "step": 14914 }, { "epoch": 0.7206358409431318, "grad_norm": 3.4780685901641846, "learning_rate": 2.7936415905686813e-07, "loss": 0.4446, "step": 14915 }, { "epoch": 0.7206841571242208, "grad_norm": 14.123406410217285, "learning_rate": 2.7931584287577907e-07, "loss": 0.3022, "step": 14916 }, { "epoch": 0.72073247330531, "grad_norm": 7.506750106811523, "learning_rate": 2.7926752669469006e-07, "loss": 0.2989, "step": 14917 }, { "epoch": 0.720780789486399, "grad_norm": 5.146058559417725, "learning_rate": 2.79219210513601e-07, "loss": 0.303, "step": 14918 }, { "epoch": 0.720829105667488, "grad_norm": 2.66009259223938, "learning_rate": 2.7917089433251193e-07, "loss": 0.3261, "step": 14919 }, { "epoch": 0.7208774218485771, "grad_norm": 3.3217546939849854, "learning_rate": 2.7912257815142287e-07, "loss": 0.3673, "step": 14920 }, { "epoch": 0.7209257380296661, "grad_norm": 3.636204242706299, "learning_rate": 2.7907426197033386e-07, "loss": 0.2585, "step": 14921 }, { "epoch": 0.7209740542107552, "grad_norm": 1.8255811929702759, "learning_rate": 2.790259457892448e-07, "loss": 0.1858, "step": 14922 }, { "epoch": 0.7210223703918442, "grad_norm": 2.543668746948242, "learning_rate": 2.7897762960815574e-07, "loss": 0.3045, "step": 14923 }, { "epoch": 0.7210706865729333, "grad_norm": 3.008723735809326, "learning_rate": 2.789293134270667e-07, "loss": 0.3436, "step": 14924 }, { "epoch": 0.7211190027540223, "grad_norm": 2.7030298709869385, "learning_rate": 2.7888099724597766e-07, "loss": 0.3005, "step": 14925 }, { "epoch": 0.7211673189351113, "grad_norm": 2.6147398948669434, "learning_rate": 2.788326810648886e-07, "loss": 0.3278, "step": 14926 }, { "epoch": 0.7212156351162005, "grad_norm": 4.1905388832092285, "learning_rate": 2.787843648837996e-07, "loss": 0.314, "step": 14927 }, { "epoch": 0.7212639512972895, "grad_norm": 2.358323335647583, "learning_rate": 2.7873604870271053e-07, "loss": 0.2781, "step": 14928 }, { "epoch": 0.7213122674783785, "grad_norm": 2.590115547180176, "learning_rate": 2.7868773252162147e-07, "loss": 0.2889, "step": 14929 }, { "epoch": 0.7213605836594675, "grad_norm": 2.7770040035247803, "learning_rate": 2.7863941634053246e-07, "loss": 0.2838, "step": 14930 }, { "epoch": 0.7214088998405566, "grad_norm": 2.3309412002563477, "learning_rate": 2.785911001594434e-07, "loss": 0.2495, "step": 14931 }, { "epoch": 0.7214572160216457, "grad_norm": 2.4400148391723633, "learning_rate": 2.7854278397835433e-07, "loss": 0.2797, "step": 14932 }, { "epoch": 0.7215055322027347, "grad_norm": 2.9962596893310547, "learning_rate": 2.7849446779726527e-07, "loss": 0.2534, "step": 14933 }, { "epoch": 0.7215538483838237, "grad_norm": 2.13861083984375, "learning_rate": 2.7844615161617626e-07, "loss": 0.2302, "step": 14934 }, { "epoch": 0.7216021645649128, "grad_norm": 2.5356411933898926, "learning_rate": 2.783978354350872e-07, "loss": 0.2399, "step": 14935 }, { "epoch": 0.7216504807460018, "grad_norm": 1.6855720281600952, "learning_rate": 2.7834951925399813e-07, "loss": 0.2423, "step": 14936 }, { "epoch": 0.7216987969270909, "grad_norm": 2.6174497604370117, "learning_rate": 2.783012030729091e-07, "loss": 0.2682, "step": 14937 }, { "epoch": 0.72174711310818, "grad_norm": 3.802778482437134, "learning_rate": 2.7825288689182006e-07, "loss": 0.2193, "step": 14938 }, { "epoch": 0.721795429289269, "grad_norm": 7.600852012634277, "learning_rate": 2.78204570710731e-07, "loss": 0.3656, "step": 14939 }, { "epoch": 0.721843745470358, "grad_norm": 4.910370826721191, "learning_rate": 2.78156254529642e-07, "loss": 0.4516, "step": 14940 }, { "epoch": 0.721892061651447, "grad_norm": 3.3050758838653564, "learning_rate": 2.7810793834855287e-07, "loss": 0.3873, "step": 14941 }, { "epoch": 0.7219403778325361, "grad_norm": 2.757319450378418, "learning_rate": 2.7805962216746386e-07, "loss": 0.3588, "step": 14942 }, { "epoch": 0.7219886940136252, "grad_norm": 10.31055736541748, "learning_rate": 2.7801130598637485e-07, "loss": 0.3757, "step": 14943 }, { "epoch": 0.7220370101947142, "grad_norm": 2.452960252761841, "learning_rate": 2.779629898052858e-07, "loss": 0.2743, "step": 14944 }, { "epoch": 0.7220853263758032, "grad_norm": 2.2005510330200195, "learning_rate": 2.779146736241967e-07, "loss": 0.2245, "step": 14945 }, { "epoch": 0.7221336425568923, "grad_norm": 2.12814998626709, "learning_rate": 2.7786635744310766e-07, "loss": 0.2121, "step": 14946 }, { "epoch": 0.7221819587379813, "grad_norm": 2.6560580730438232, "learning_rate": 2.7781804126201865e-07, "loss": 0.3064, "step": 14947 }, { "epoch": 0.7222302749190704, "grad_norm": 2.6100871562957764, "learning_rate": 2.777697250809296e-07, "loss": 0.3179, "step": 14948 }, { "epoch": 0.7222785911001595, "grad_norm": 3.0108656883239746, "learning_rate": 2.7772140889984053e-07, "loss": 0.2658, "step": 14949 }, { "epoch": 0.7223269072812485, "grad_norm": 2.8225021362304688, "learning_rate": 2.776730927187515e-07, "loss": 0.3483, "step": 14950 }, { "epoch": 0.7223752234623375, "grad_norm": 2.0253472328186035, "learning_rate": 2.7762477653766246e-07, "loss": 0.2519, "step": 14951 }, { "epoch": 0.7224235396434265, "grad_norm": 2.567197561264038, "learning_rate": 2.775764603565734e-07, "loss": 0.2231, "step": 14952 }, { "epoch": 0.7224718558245157, "grad_norm": 5.357185363769531, "learning_rate": 2.775281441754844e-07, "loss": 0.3505, "step": 14953 }, { "epoch": 0.7225201720056047, "grad_norm": 2.5734031200408936, "learning_rate": 2.7747982799439527e-07, "loss": 0.3321, "step": 14954 }, { "epoch": 0.7225684881866937, "grad_norm": 10.306317329406738, "learning_rate": 2.7743151181330626e-07, "loss": 0.2437, "step": 14955 }, { "epoch": 0.7226168043677828, "grad_norm": 12.316831588745117, "learning_rate": 2.7738319563221725e-07, "loss": 0.2572, "step": 14956 }, { "epoch": 0.7226651205488718, "grad_norm": 3.7351129055023193, "learning_rate": 2.7733487945112813e-07, "loss": 0.3166, "step": 14957 }, { "epoch": 0.7227134367299609, "grad_norm": 2.856517791748047, "learning_rate": 2.772865632700391e-07, "loss": 0.3946, "step": 14958 }, { "epoch": 0.7227617529110499, "grad_norm": 5.47666072845459, "learning_rate": 2.7723824708895006e-07, "loss": 0.4903, "step": 14959 }, { "epoch": 0.722810069092139, "grad_norm": 2.1972224712371826, "learning_rate": 2.7718993090786105e-07, "loss": 0.2574, "step": 14960 }, { "epoch": 0.722858385273228, "grad_norm": 20.09540557861328, "learning_rate": 2.77141614726772e-07, "loss": 0.2031, "step": 14961 }, { "epoch": 0.722906701454317, "grad_norm": 7.734371662139893, "learning_rate": 2.770932985456829e-07, "loss": 0.2445, "step": 14962 }, { "epoch": 0.7229550176354061, "grad_norm": 4.636730194091797, "learning_rate": 2.770449823645939e-07, "loss": 0.2893, "step": 14963 }, { "epoch": 0.7230033338164952, "grad_norm": 2.3532869815826416, "learning_rate": 2.7699666618350485e-07, "loss": 0.1875, "step": 14964 }, { "epoch": 0.7230516499975842, "grad_norm": 2.415532350540161, "learning_rate": 2.769483500024158e-07, "loss": 0.1973, "step": 14965 }, { "epoch": 0.7230999661786732, "grad_norm": 3.5942840576171875, "learning_rate": 2.769000338213268e-07, "loss": 0.4114, "step": 14966 }, { "epoch": 0.7231482823597623, "grad_norm": 3.7664356231689453, "learning_rate": 2.7685171764023766e-07, "loss": 0.2378, "step": 14967 }, { "epoch": 0.7231965985408513, "grad_norm": 5.679832935333252, "learning_rate": 2.7680340145914865e-07, "loss": 0.2951, "step": 14968 }, { "epoch": 0.7232449147219404, "grad_norm": 3.0447299480438232, "learning_rate": 2.7675508527805964e-07, "loss": 0.1733, "step": 14969 }, { "epoch": 0.7232932309030294, "grad_norm": 2.5555572509765625, "learning_rate": 2.7670676909697053e-07, "loss": 0.3082, "step": 14970 }, { "epoch": 0.7233415470841185, "grad_norm": 7.263103008270264, "learning_rate": 2.766584529158815e-07, "loss": 0.3321, "step": 14971 }, { "epoch": 0.7233898632652075, "grad_norm": 3.2307534217834473, "learning_rate": 2.7661013673479246e-07, "loss": 0.3581, "step": 14972 }, { "epoch": 0.7234381794462965, "grad_norm": 5.388193130493164, "learning_rate": 2.765618205537034e-07, "loss": 0.3298, "step": 14973 }, { "epoch": 0.7234864956273857, "grad_norm": 2.165963888168335, "learning_rate": 2.765135043726144e-07, "loss": 0.2465, "step": 14974 }, { "epoch": 0.7235348118084747, "grad_norm": 2.4707865715026855, "learning_rate": 2.764651881915253e-07, "loss": 0.2802, "step": 14975 }, { "epoch": 0.7235831279895637, "grad_norm": 2.942409038543701, "learning_rate": 2.764168720104363e-07, "loss": 0.3595, "step": 14976 }, { "epoch": 0.7236314441706527, "grad_norm": 2.6126394271850586, "learning_rate": 2.7636855582934725e-07, "loss": 0.2512, "step": 14977 }, { "epoch": 0.7236797603517418, "grad_norm": 2.093445301055908, "learning_rate": 2.763202396482582e-07, "loss": 0.1829, "step": 14978 }, { "epoch": 0.7237280765328309, "grad_norm": 4.2584404945373535, "learning_rate": 2.762719234671692e-07, "loss": 0.2658, "step": 14979 }, { "epoch": 0.7237763927139199, "grad_norm": 3.362581729888916, "learning_rate": 2.7622360728608006e-07, "loss": 0.3662, "step": 14980 }, { "epoch": 0.7238247088950089, "grad_norm": 2.3827872276306152, "learning_rate": 2.7617529110499105e-07, "loss": 0.227, "step": 14981 }, { "epoch": 0.723873025076098, "grad_norm": 2.7644424438476562, "learning_rate": 2.7612697492390204e-07, "loss": 0.3504, "step": 14982 }, { "epoch": 0.723921341257187, "grad_norm": 2.433931827545166, "learning_rate": 2.760786587428129e-07, "loss": 0.2801, "step": 14983 }, { "epoch": 0.7239696574382761, "grad_norm": 3.030843734741211, "learning_rate": 2.760303425617239e-07, "loss": 0.3161, "step": 14984 }, { "epoch": 0.7240179736193652, "grad_norm": 2.8887252807617188, "learning_rate": 2.7598202638063485e-07, "loss": 0.1918, "step": 14985 }, { "epoch": 0.7240662898004542, "grad_norm": 5.508941650390625, "learning_rate": 2.759337101995458e-07, "loss": 0.2295, "step": 14986 }, { "epoch": 0.7241146059815432, "grad_norm": 2.7465155124664307, "learning_rate": 2.758853940184568e-07, "loss": 0.3166, "step": 14987 }, { "epoch": 0.7241629221626322, "grad_norm": 4.825692653656006, "learning_rate": 2.758370778373677e-07, "loss": 0.3188, "step": 14988 }, { "epoch": 0.7242112383437214, "grad_norm": 2.589846611022949, "learning_rate": 2.7578876165627865e-07, "loss": 0.3183, "step": 14989 }, { "epoch": 0.7242595545248104, "grad_norm": 2.553861141204834, "learning_rate": 2.7574044547518964e-07, "loss": 0.27, "step": 14990 }, { "epoch": 0.7243078707058994, "grad_norm": 5.155829906463623, "learning_rate": 2.756921292941006e-07, "loss": 0.3273, "step": 14991 }, { "epoch": 0.7243561868869884, "grad_norm": 2.1879196166992188, "learning_rate": 2.7564381311301157e-07, "loss": 0.1998, "step": 14992 }, { "epoch": 0.7244045030680775, "grad_norm": 4.774379253387451, "learning_rate": 2.7559549693192246e-07, "loss": 0.2857, "step": 14993 }, { "epoch": 0.7244528192491665, "grad_norm": 2.181673765182495, "learning_rate": 2.7554718075083345e-07, "loss": 0.254, "step": 14994 }, { "epoch": 0.7245011354302556, "grad_norm": 2.5645265579223633, "learning_rate": 2.7549886456974444e-07, "loss": 0.202, "step": 14995 }, { "epoch": 0.7245494516113447, "grad_norm": 2.463224411010742, "learning_rate": 2.754505483886553e-07, "loss": 0.2592, "step": 14996 }, { "epoch": 0.7245977677924337, "grad_norm": 3.0445053577423096, "learning_rate": 2.754022322075663e-07, "loss": 0.3093, "step": 14997 }, { "epoch": 0.7246460839735227, "grad_norm": 3.6755483150482178, "learning_rate": 2.7535391602647725e-07, "loss": 0.3619, "step": 14998 }, { "epoch": 0.7246944001546117, "grad_norm": 2.237382650375366, "learning_rate": 2.753055998453882e-07, "loss": 0.216, "step": 14999 }, { "epoch": 0.7247427163357009, "grad_norm": 1.6427409648895264, "learning_rate": 2.752572836642992e-07, "loss": 0.1716, "step": 15000 }, { "epoch": 0.7247910325167899, "grad_norm": 2.3465681076049805, "learning_rate": 2.752089674832101e-07, "loss": 0.2794, "step": 15001 }, { "epoch": 0.7248393486978789, "grad_norm": 2.8439176082611084, "learning_rate": 2.7516065130212105e-07, "loss": 0.282, "step": 15002 }, { "epoch": 0.7248876648789679, "grad_norm": 2.21662974357605, "learning_rate": 2.7511233512103204e-07, "loss": 0.2986, "step": 15003 }, { "epoch": 0.724935981060057, "grad_norm": 2.7207818031311035, "learning_rate": 2.75064018939943e-07, "loss": 0.2624, "step": 15004 }, { "epoch": 0.7249842972411461, "grad_norm": 1.5552860498428345, "learning_rate": 2.750157027588539e-07, "loss": 0.1576, "step": 15005 }, { "epoch": 0.7250326134222351, "grad_norm": 7.7449951171875, "learning_rate": 2.7496738657776485e-07, "loss": 0.4087, "step": 15006 }, { "epoch": 0.7250809296033242, "grad_norm": 3.228217601776123, "learning_rate": 2.7491907039667584e-07, "loss": 0.338, "step": 15007 }, { "epoch": 0.7251292457844132, "grad_norm": 4.14520788192749, "learning_rate": 2.7487075421558683e-07, "loss": 0.3173, "step": 15008 }, { "epoch": 0.7251775619655022, "grad_norm": 2.7380268573760986, "learning_rate": 2.748224380344977e-07, "loss": 0.396, "step": 15009 }, { "epoch": 0.7252258781465913, "grad_norm": 4.1659016609191895, "learning_rate": 2.747741218534087e-07, "loss": 0.2902, "step": 15010 }, { "epoch": 0.7252741943276804, "grad_norm": 4.439956188201904, "learning_rate": 2.7472580567231965e-07, "loss": 0.266, "step": 15011 }, { "epoch": 0.7253225105087694, "grad_norm": 2.5059750080108643, "learning_rate": 2.746774894912306e-07, "loss": 0.3186, "step": 15012 }, { "epoch": 0.7253708266898584, "grad_norm": 2.518432855606079, "learning_rate": 2.7462917331014157e-07, "loss": 0.2863, "step": 15013 }, { "epoch": 0.7254191428709474, "grad_norm": 4.834775924682617, "learning_rate": 2.745808571290525e-07, "loss": 0.3415, "step": 15014 }, { "epoch": 0.7254674590520366, "grad_norm": 6.01342248916626, "learning_rate": 2.7453254094796345e-07, "loss": 0.2563, "step": 15015 }, { "epoch": 0.7255157752331256, "grad_norm": 4.005560398101807, "learning_rate": 2.7448422476687444e-07, "loss": 0.3945, "step": 15016 }, { "epoch": 0.7255640914142146, "grad_norm": 2.0728366374969482, "learning_rate": 2.744359085857854e-07, "loss": 0.2871, "step": 15017 }, { "epoch": 0.7256124075953037, "grad_norm": 8.354874610900879, "learning_rate": 2.743875924046963e-07, "loss": 0.3535, "step": 15018 }, { "epoch": 0.7256607237763927, "grad_norm": 4.060719966888428, "learning_rate": 2.7433927622360725e-07, "loss": 0.3044, "step": 15019 }, { "epoch": 0.7257090399574817, "grad_norm": 2.6357839107513428, "learning_rate": 2.7429096004251824e-07, "loss": 0.3355, "step": 15020 }, { "epoch": 0.7257573561385708, "grad_norm": 1.6323148012161255, "learning_rate": 2.742426438614292e-07, "loss": 0.1923, "step": 15021 }, { "epoch": 0.7258056723196599, "grad_norm": 2.8458714485168457, "learning_rate": 2.741943276803401e-07, "loss": 0.2752, "step": 15022 }, { "epoch": 0.7258539885007489, "grad_norm": 2.5480964183807373, "learning_rate": 2.741460114992511e-07, "loss": 0.26, "step": 15023 }, { "epoch": 0.7259023046818379, "grad_norm": 2.143833637237549, "learning_rate": 2.74097695318162e-07, "loss": 0.2249, "step": 15024 }, { "epoch": 0.7259506208629269, "grad_norm": 6.700104713439941, "learning_rate": 2.74049379137073e-07, "loss": 0.3956, "step": 15025 }, { "epoch": 0.7259989370440161, "grad_norm": 3.23404860496521, "learning_rate": 2.7400106295598397e-07, "loss": 0.3397, "step": 15026 }, { "epoch": 0.7260472532251051, "grad_norm": 2.5793282985687256, "learning_rate": 2.739527467748949e-07, "loss": 0.2439, "step": 15027 }, { "epoch": 0.7260955694061941, "grad_norm": 1.7859761714935303, "learning_rate": 2.7390443059380584e-07, "loss": 0.2061, "step": 15028 }, { "epoch": 0.7261438855872832, "grad_norm": 3.129573106765747, "learning_rate": 2.7385611441271683e-07, "loss": 0.3029, "step": 15029 }, { "epoch": 0.7261922017683722, "grad_norm": 2.5585780143737793, "learning_rate": 2.7380779823162777e-07, "loss": 0.2808, "step": 15030 }, { "epoch": 0.7262405179494613, "grad_norm": 2.895313262939453, "learning_rate": 2.737594820505387e-07, "loss": 0.251, "step": 15031 }, { "epoch": 0.7262888341305503, "grad_norm": 2.603701591491699, "learning_rate": 2.7371116586944965e-07, "loss": 0.2009, "step": 15032 }, { "epoch": 0.7263371503116394, "grad_norm": 2.8115108013153076, "learning_rate": 2.7366284968836064e-07, "loss": 0.3048, "step": 15033 }, { "epoch": 0.7263854664927284, "grad_norm": 5.728520393371582, "learning_rate": 2.7361453350727157e-07, "loss": 0.4714, "step": 15034 }, { "epoch": 0.7264337826738174, "grad_norm": 2.334674835205078, "learning_rate": 2.735662173261825e-07, "loss": 0.1822, "step": 15035 }, { "epoch": 0.7264820988549066, "grad_norm": 2.2496707439422607, "learning_rate": 2.735179011450935e-07, "loss": 0.2529, "step": 15036 }, { "epoch": 0.7265304150359956, "grad_norm": 2.3490090370178223, "learning_rate": 2.734695849640044e-07, "loss": 0.2894, "step": 15037 }, { "epoch": 0.7265787312170846, "grad_norm": 10.04749584197998, "learning_rate": 2.734212687829154e-07, "loss": 0.3345, "step": 15038 }, { "epoch": 0.7266270473981736, "grad_norm": 1.9893555641174316, "learning_rate": 2.7337295260182637e-07, "loss": 0.2229, "step": 15039 }, { "epoch": 0.7266753635792627, "grad_norm": 25.074888229370117, "learning_rate": 2.7332463642073725e-07, "loss": 0.2635, "step": 15040 }, { "epoch": 0.7267236797603518, "grad_norm": 3.760011672973633, "learning_rate": 2.7327632023964824e-07, "loss": 0.3503, "step": 15041 }, { "epoch": 0.7267719959414408, "grad_norm": 3.7847557067871094, "learning_rate": 2.7322800405855923e-07, "loss": 0.3966, "step": 15042 }, { "epoch": 0.7268203121225298, "grad_norm": 3.923366069793701, "learning_rate": 2.7317968787747017e-07, "loss": 0.3525, "step": 15043 }, { "epoch": 0.7268686283036189, "grad_norm": 2.607800006866455, "learning_rate": 2.731313716963811e-07, "loss": 0.2981, "step": 15044 }, { "epoch": 0.7269169444847079, "grad_norm": 1.5109049081802368, "learning_rate": 2.7308305551529204e-07, "loss": 0.1561, "step": 15045 }, { "epoch": 0.726965260665797, "grad_norm": 3.026477575302124, "learning_rate": 2.7303473933420303e-07, "loss": 0.3602, "step": 15046 }, { "epoch": 0.7270135768468861, "grad_norm": 3.489168405532837, "learning_rate": 2.7298642315311397e-07, "loss": 0.3515, "step": 15047 }, { "epoch": 0.7270618930279751, "grad_norm": 3.0374085903167725, "learning_rate": 2.729381069720249e-07, "loss": 0.4902, "step": 15048 }, { "epoch": 0.7271102092090641, "grad_norm": 2.9523773193359375, "learning_rate": 2.728897907909359e-07, "loss": 0.3909, "step": 15049 }, { "epoch": 0.7271585253901531, "grad_norm": 3.6551425457000732, "learning_rate": 2.728414746098468e-07, "loss": 0.4594, "step": 15050 }, { "epoch": 0.7272068415712422, "grad_norm": 5.684018135070801, "learning_rate": 2.7279315842875777e-07, "loss": 0.3576, "step": 15051 }, { "epoch": 0.7272551577523313, "grad_norm": 1.86750066280365, "learning_rate": 2.7274484224766876e-07, "loss": 0.1916, "step": 15052 }, { "epoch": 0.7273034739334203, "grad_norm": 2.918259620666504, "learning_rate": 2.7269652606657965e-07, "loss": 0.3411, "step": 15053 }, { "epoch": 0.7273517901145093, "grad_norm": 1.6200116872787476, "learning_rate": 2.7264820988549064e-07, "loss": 0.1951, "step": 15054 }, { "epoch": 0.7274001062955984, "grad_norm": 2.4826505184173584, "learning_rate": 2.7259989370440163e-07, "loss": 0.3074, "step": 15055 }, { "epoch": 0.7274484224766874, "grad_norm": 3.0181453227996826, "learning_rate": 2.725515775233125e-07, "loss": 0.3461, "step": 15056 }, { "epoch": 0.7274967386577765, "grad_norm": 2.106487512588501, "learning_rate": 2.725032613422235e-07, "loss": 0.184, "step": 15057 }, { "epoch": 0.7275450548388656, "grad_norm": 2.706690788269043, "learning_rate": 2.7245494516113444e-07, "loss": 0.355, "step": 15058 }, { "epoch": 0.7275933710199546, "grad_norm": 2.988931894302368, "learning_rate": 2.7240662898004543e-07, "loss": 0.3082, "step": 15059 }, { "epoch": 0.7276416872010436, "grad_norm": 3.054541826248169, "learning_rate": 2.7235831279895637e-07, "loss": 0.3198, "step": 15060 }, { "epoch": 0.7276900033821326, "grad_norm": 2.7271981239318848, "learning_rate": 2.723099966178673e-07, "loss": 0.2537, "step": 15061 }, { "epoch": 0.7277383195632218, "grad_norm": 2.7455101013183594, "learning_rate": 2.722616804367783e-07, "loss": 0.2612, "step": 15062 }, { "epoch": 0.7277866357443108, "grad_norm": 2.4447829723358154, "learning_rate": 2.722133642556892e-07, "loss": 0.2814, "step": 15063 }, { "epoch": 0.7278349519253998, "grad_norm": 2.078263998031616, "learning_rate": 2.7216504807460017e-07, "loss": 0.2011, "step": 15064 }, { "epoch": 0.7278832681064888, "grad_norm": 5.233034133911133, "learning_rate": 2.7211673189351116e-07, "loss": 0.2779, "step": 15065 }, { "epoch": 0.7279315842875779, "grad_norm": 3.6287736892700195, "learning_rate": 2.7206841571242204e-07, "loss": 0.3496, "step": 15066 }, { "epoch": 0.727979900468667, "grad_norm": 3.4090218544006348, "learning_rate": 2.7202009953133303e-07, "loss": 0.2998, "step": 15067 }, { "epoch": 0.728028216649756, "grad_norm": 2.9744417667388916, "learning_rate": 2.71971783350244e-07, "loss": 0.3586, "step": 15068 }, { "epoch": 0.7280765328308451, "grad_norm": 2.9274041652679443, "learning_rate": 2.719234671691549e-07, "loss": 0.3989, "step": 15069 }, { "epoch": 0.7281248490119341, "grad_norm": 2.5996601581573486, "learning_rate": 2.718751509880659e-07, "loss": 0.3374, "step": 15070 }, { "epoch": 0.7281731651930231, "grad_norm": 2.1639564037323, "learning_rate": 2.7182683480697683e-07, "loss": 0.2744, "step": 15071 }, { "epoch": 0.7282214813741122, "grad_norm": 1.7750587463378906, "learning_rate": 2.7177851862588777e-07, "loss": 0.1747, "step": 15072 }, { "epoch": 0.7282697975552013, "grad_norm": 12.050492286682129, "learning_rate": 2.7173020244479876e-07, "loss": 0.2374, "step": 15073 }, { "epoch": 0.7283181137362903, "grad_norm": 2.189511775970459, "learning_rate": 2.716818862637097e-07, "loss": 0.2713, "step": 15074 }, { "epoch": 0.7283664299173793, "grad_norm": 2.2908835411071777, "learning_rate": 2.716335700826207e-07, "loss": 0.3324, "step": 15075 }, { "epoch": 0.7284147460984683, "grad_norm": 2.994852304458618, "learning_rate": 2.715852539015316e-07, "loss": 0.3125, "step": 15076 }, { "epoch": 0.7284630622795574, "grad_norm": 3.341381549835205, "learning_rate": 2.7153693772044256e-07, "loss": 0.3642, "step": 15077 }, { "epoch": 0.7285113784606465, "grad_norm": 2.453526020050049, "learning_rate": 2.7148862153935355e-07, "loss": 0.2329, "step": 15078 }, { "epoch": 0.7285596946417355, "grad_norm": 2.2713358402252197, "learning_rate": 2.7144030535826444e-07, "loss": 0.2526, "step": 15079 }, { "epoch": 0.7286080108228246, "grad_norm": 2.1617138385772705, "learning_rate": 2.7139198917717543e-07, "loss": 0.1811, "step": 15080 }, { "epoch": 0.7286563270039136, "grad_norm": 2.0281107425689697, "learning_rate": 2.713436729960864e-07, "loss": 0.1786, "step": 15081 }, { "epoch": 0.7287046431850026, "grad_norm": 2.06154727935791, "learning_rate": 2.712953568149973e-07, "loss": 0.2288, "step": 15082 }, { "epoch": 0.7287529593660917, "grad_norm": 2.26932692527771, "learning_rate": 2.712470406339083e-07, "loss": 0.2651, "step": 15083 }, { "epoch": 0.7288012755471808, "grad_norm": 2.9666411876678467, "learning_rate": 2.7119872445281923e-07, "loss": 0.3043, "step": 15084 }, { "epoch": 0.7288495917282698, "grad_norm": 1.7103707790374756, "learning_rate": 2.7115040827173017e-07, "loss": 0.1748, "step": 15085 }, { "epoch": 0.7288979079093588, "grad_norm": 3.8244926929473877, "learning_rate": 2.7110209209064116e-07, "loss": 0.254, "step": 15086 }, { "epoch": 0.7289462240904478, "grad_norm": 2.8310117721557617, "learning_rate": 2.710537759095521e-07, "loss": 0.3023, "step": 15087 }, { "epoch": 0.728994540271537, "grad_norm": 2.043184518814087, "learning_rate": 2.7100545972846303e-07, "loss": 0.2319, "step": 15088 }, { "epoch": 0.729042856452626, "grad_norm": 2.697016954421997, "learning_rate": 2.7095714354737397e-07, "loss": 0.2771, "step": 15089 }, { "epoch": 0.729091172633715, "grad_norm": 3.094381332397461, "learning_rate": 2.7090882736628496e-07, "loss": 0.4036, "step": 15090 }, { "epoch": 0.7291394888148041, "grad_norm": 3.3175442218780518, "learning_rate": 2.7086051118519595e-07, "loss": 0.3531, "step": 15091 }, { "epoch": 0.7291878049958931, "grad_norm": 21.580249786376953, "learning_rate": 2.7081219500410684e-07, "loss": 0.3562, "step": 15092 }, { "epoch": 0.7292361211769822, "grad_norm": 2.2240869998931885, "learning_rate": 2.707638788230178e-07, "loss": 0.2448, "step": 15093 }, { "epoch": 0.7292844373580712, "grad_norm": 2.352818012237549, "learning_rate": 2.707155626419288e-07, "loss": 0.2274, "step": 15094 }, { "epoch": 0.7293327535391603, "grad_norm": 2.8509135246276855, "learning_rate": 2.706672464608397e-07, "loss": 0.43, "step": 15095 }, { "epoch": 0.7293810697202493, "grad_norm": 2.8871774673461914, "learning_rate": 2.706189302797507e-07, "loss": 0.2506, "step": 15096 }, { "epoch": 0.7294293859013383, "grad_norm": 2.954406499862671, "learning_rate": 2.7057061409866163e-07, "loss": 0.1793, "step": 15097 }, { "epoch": 0.7294777020824275, "grad_norm": 3.433774948120117, "learning_rate": 2.7052229791757256e-07, "loss": 0.2575, "step": 15098 }, { "epoch": 0.7295260182635165, "grad_norm": 4.049907684326172, "learning_rate": 2.7047398173648355e-07, "loss": 0.283, "step": 15099 }, { "epoch": 0.7295743344446055, "grad_norm": 3.0720062255859375, "learning_rate": 2.704256655553945e-07, "loss": 0.391, "step": 15100 }, { "epoch": 0.7296226506256945, "grad_norm": 2.275702714920044, "learning_rate": 2.7037734937430543e-07, "loss": 0.2342, "step": 15101 }, { "epoch": 0.7296709668067836, "grad_norm": 3.859767436981201, "learning_rate": 2.7032903319321637e-07, "loss": 0.3237, "step": 15102 }, { "epoch": 0.7297192829878726, "grad_norm": 2.2045018672943115, "learning_rate": 2.7028071701212736e-07, "loss": 0.2195, "step": 15103 }, { "epoch": 0.7297675991689617, "grad_norm": 2.878255844116211, "learning_rate": 2.7023240083103835e-07, "loss": 0.3317, "step": 15104 }, { "epoch": 0.7298159153500507, "grad_norm": 2.1050779819488525, "learning_rate": 2.7018408464994923e-07, "loss": 0.2083, "step": 15105 }, { "epoch": 0.7298642315311398, "grad_norm": 2.5489304065704346, "learning_rate": 2.701357684688602e-07, "loss": 0.3347, "step": 15106 }, { "epoch": 0.7299125477122288, "grad_norm": 4.373250484466553, "learning_rate": 2.700874522877712e-07, "loss": 0.3718, "step": 15107 }, { "epoch": 0.7299608638933178, "grad_norm": 5.56074333190918, "learning_rate": 2.700391361066821e-07, "loss": 0.4846, "step": 15108 }, { "epoch": 0.730009180074407, "grad_norm": 3.317134141921997, "learning_rate": 2.699908199255931e-07, "loss": 0.4117, "step": 15109 }, { "epoch": 0.730057496255496, "grad_norm": 3.463909149169922, "learning_rate": 2.69942503744504e-07, "loss": 0.3485, "step": 15110 }, { "epoch": 0.730105812436585, "grad_norm": 1.9385536909103394, "learning_rate": 2.6989418756341496e-07, "loss": 0.2098, "step": 15111 }, { "epoch": 0.730154128617674, "grad_norm": 1.8582971096038818, "learning_rate": 2.6984587138232595e-07, "loss": 0.2132, "step": 15112 }, { "epoch": 0.7302024447987631, "grad_norm": 2.8657562732696533, "learning_rate": 2.697975552012369e-07, "loss": 0.2795, "step": 15113 }, { "epoch": 0.7302507609798522, "grad_norm": 2.6166398525238037, "learning_rate": 2.697492390201478e-07, "loss": 0.2497, "step": 15114 }, { "epoch": 0.7302990771609412, "grad_norm": 2.502241373062134, "learning_rate": 2.6970092283905876e-07, "loss": 0.2643, "step": 15115 }, { "epoch": 0.7303473933420302, "grad_norm": 2.402555465698242, "learning_rate": 2.6965260665796975e-07, "loss": 0.3484, "step": 15116 }, { "epoch": 0.7303957095231193, "grad_norm": 2.8102264404296875, "learning_rate": 2.696042904768807e-07, "loss": 0.2637, "step": 15117 }, { "epoch": 0.7304440257042083, "grad_norm": 5.5481767654418945, "learning_rate": 2.6955597429579163e-07, "loss": 0.3099, "step": 15118 }, { "epoch": 0.7304923418852974, "grad_norm": 2.7560791969299316, "learning_rate": 2.695076581147026e-07, "loss": 0.2834, "step": 15119 }, { "epoch": 0.7305406580663865, "grad_norm": 4.435047626495361, "learning_rate": 2.694593419336136e-07, "loss": 0.2131, "step": 15120 }, { "epoch": 0.7305889742474755, "grad_norm": 2.7391767501831055, "learning_rate": 2.694110257525245e-07, "loss": 0.3011, "step": 15121 }, { "epoch": 0.7306372904285645, "grad_norm": 2.7612392902374268, "learning_rate": 2.693627095714355e-07, "loss": 0.2263, "step": 15122 }, { "epoch": 0.7306856066096535, "grad_norm": 2.293290376663208, "learning_rate": 2.693143933903464e-07, "loss": 0.2713, "step": 15123 }, { "epoch": 0.7307339227907427, "grad_norm": 2.7054014205932617, "learning_rate": 2.6926607720925736e-07, "loss": 0.3232, "step": 15124 }, { "epoch": 0.7307822389718317, "grad_norm": 3.0197863578796387, "learning_rate": 2.6921776102816835e-07, "loss": 0.3153, "step": 15125 }, { "epoch": 0.7308305551529207, "grad_norm": 2.85764741897583, "learning_rate": 2.691694448470793e-07, "loss": 0.3349, "step": 15126 }, { "epoch": 0.7308788713340097, "grad_norm": 1.6303457021713257, "learning_rate": 2.691211286659902e-07, "loss": 0.178, "step": 15127 }, { "epoch": 0.7309271875150988, "grad_norm": 4.097344875335693, "learning_rate": 2.6907281248490116e-07, "loss": 0.2247, "step": 15128 }, { "epoch": 0.7309755036961878, "grad_norm": 8.50837516784668, "learning_rate": 2.6902449630381215e-07, "loss": 0.2868, "step": 15129 }, { "epoch": 0.7310238198772769, "grad_norm": 3.815542221069336, "learning_rate": 2.689761801227231e-07, "loss": 0.2757, "step": 15130 }, { "epoch": 0.731072136058366, "grad_norm": 3.313181161880493, "learning_rate": 2.68927863941634e-07, "loss": 0.4391, "step": 15131 }, { "epoch": 0.731120452239455, "grad_norm": 4.348818302154541, "learning_rate": 2.68879547760545e-07, "loss": 0.3434, "step": 15132 }, { "epoch": 0.731168768420544, "grad_norm": 5.003868579864502, "learning_rate": 2.6883123157945595e-07, "loss": 0.3212, "step": 15133 }, { "epoch": 0.731217084601633, "grad_norm": 3.153951406478882, "learning_rate": 2.687829153983669e-07, "loss": 0.2408, "step": 15134 }, { "epoch": 0.7312654007827222, "grad_norm": 1.770892858505249, "learning_rate": 2.687345992172779e-07, "loss": 0.1789, "step": 15135 }, { "epoch": 0.7313137169638112, "grad_norm": 3.129810094833374, "learning_rate": 2.6868628303618876e-07, "loss": 0.2749, "step": 15136 }, { "epoch": 0.7313620331449002, "grad_norm": 31.75381088256836, "learning_rate": 2.6863796685509975e-07, "loss": 0.3815, "step": 15137 }, { "epoch": 0.7314103493259893, "grad_norm": 5.021953105926514, "learning_rate": 2.6858965067401074e-07, "loss": 0.3437, "step": 15138 }, { "epoch": 0.7314586655070783, "grad_norm": 4.258963584899902, "learning_rate": 2.685413344929217e-07, "loss": 0.4927, "step": 15139 }, { "epoch": 0.7315069816881674, "grad_norm": 2.719043016433716, "learning_rate": 2.684930183118326e-07, "loss": 0.3156, "step": 15140 }, { "epoch": 0.7315552978692564, "grad_norm": 2.8311378955841064, "learning_rate": 2.6844470213074356e-07, "loss": 0.2425, "step": 15141 }, { "epoch": 0.7316036140503455, "grad_norm": 2.317410707473755, "learning_rate": 2.6839638594965455e-07, "loss": 0.2741, "step": 15142 }, { "epoch": 0.7316519302314345, "grad_norm": 3.2574105262756348, "learning_rate": 2.683480697685655e-07, "loss": 0.3028, "step": 15143 }, { "epoch": 0.7317002464125235, "grad_norm": 5.4486985206604, "learning_rate": 2.682997535874764e-07, "loss": 0.2398, "step": 15144 }, { "epoch": 0.7317485625936127, "grad_norm": 2.1312618255615234, "learning_rate": 2.682514374063874e-07, "loss": 0.2392, "step": 15145 }, { "epoch": 0.7317968787747017, "grad_norm": 4.953431606292725, "learning_rate": 2.6820312122529835e-07, "loss": 0.2789, "step": 15146 }, { "epoch": 0.7318451949557907, "grad_norm": 14.010725975036621, "learning_rate": 2.681548050442093e-07, "loss": 0.278, "step": 15147 }, { "epoch": 0.7318935111368797, "grad_norm": 6.728566646575928, "learning_rate": 2.681064888631203e-07, "loss": 0.332, "step": 15148 }, { "epoch": 0.7319418273179688, "grad_norm": 1.9196109771728516, "learning_rate": 2.6805817268203116e-07, "loss": 0.2611, "step": 15149 }, { "epoch": 0.7319901434990579, "grad_norm": 2.6417813301086426, "learning_rate": 2.6800985650094215e-07, "loss": 0.3026, "step": 15150 }, { "epoch": 0.7320384596801469, "grad_norm": 2.482783079147339, "learning_rate": 2.6796154031985314e-07, "loss": 0.3472, "step": 15151 }, { "epoch": 0.7320867758612359, "grad_norm": 8.031020164489746, "learning_rate": 2.67913224138764e-07, "loss": 0.295, "step": 15152 }, { "epoch": 0.732135092042325, "grad_norm": 2.9677698612213135, "learning_rate": 2.67864907957675e-07, "loss": 0.2878, "step": 15153 }, { "epoch": 0.732183408223414, "grad_norm": 3.3720321655273438, "learning_rate": 2.6781659177658595e-07, "loss": 0.3076, "step": 15154 }, { "epoch": 0.732231724404503, "grad_norm": 2.8016018867492676, "learning_rate": 2.6776827559549694e-07, "loss": 0.2524, "step": 15155 }, { "epoch": 0.7322800405855922, "grad_norm": 2.485062837600708, "learning_rate": 2.677199594144079e-07, "loss": 0.2844, "step": 15156 }, { "epoch": 0.7323283567666812, "grad_norm": 2.561718225479126, "learning_rate": 2.676716432333188e-07, "loss": 0.2642, "step": 15157 }, { "epoch": 0.7323766729477702, "grad_norm": 2.6342194080352783, "learning_rate": 2.676233270522298e-07, "loss": 0.2061, "step": 15158 }, { "epoch": 0.7324249891288592, "grad_norm": 2.531912326812744, "learning_rate": 2.6757501087114074e-07, "loss": 0.2822, "step": 15159 }, { "epoch": 0.7324733053099483, "grad_norm": 3.7418885231018066, "learning_rate": 2.675266946900517e-07, "loss": 0.2781, "step": 15160 }, { "epoch": 0.7325216214910374, "grad_norm": 2.119305372238159, "learning_rate": 2.6747837850896267e-07, "loss": 0.1987, "step": 15161 }, { "epoch": 0.7325699376721264, "grad_norm": 2.26763653755188, "learning_rate": 2.6743006232787356e-07, "loss": 0.2352, "step": 15162 }, { "epoch": 0.7326182538532154, "grad_norm": 4.405850410461426, "learning_rate": 2.6738174614678455e-07, "loss": 0.2652, "step": 15163 }, { "epoch": 0.7326665700343045, "grad_norm": 2.6101691722869873, "learning_rate": 2.6733342996569554e-07, "loss": 0.2974, "step": 15164 }, { "epoch": 0.7327148862153935, "grad_norm": 5.6596784591674805, "learning_rate": 2.672851137846064e-07, "loss": 0.456, "step": 15165 }, { "epoch": 0.7327632023964826, "grad_norm": 2.4141242504119873, "learning_rate": 2.672367976035174e-07, "loss": 0.3028, "step": 15166 }, { "epoch": 0.7328115185775717, "grad_norm": 3.415517568588257, "learning_rate": 2.6718848142242835e-07, "loss": 0.2856, "step": 15167 }, { "epoch": 0.7328598347586607, "grad_norm": 2.1404366493225098, "learning_rate": 2.671401652413393e-07, "loss": 0.2709, "step": 15168 }, { "epoch": 0.7329081509397497, "grad_norm": 2.361919403076172, "learning_rate": 2.670918490602503e-07, "loss": 0.3173, "step": 15169 }, { "epoch": 0.7329564671208387, "grad_norm": 2.5503110885620117, "learning_rate": 2.670435328791612e-07, "loss": 0.2904, "step": 15170 }, { "epoch": 0.7330047833019279, "grad_norm": 2.6121954917907715, "learning_rate": 2.669952166980722e-07, "loss": 0.2182, "step": 15171 }, { "epoch": 0.7330530994830169, "grad_norm": 3.507478952407837, "learning_rate": 2.669469005169831e-07, "loss": 0.4271, "step": 15172 }, { "epoch": 0.7331014156641059, "grad_norm": 21.075706481933594, "learning_rate": 2.668985843358941e-07, "loss": 0.3396, "step": 15173 }, { "epoch": 0.7331497318451949, "grad_norm": 3.241459846496582, "learning_rate": 2.6685026815480507e-07, "loss": 0.3886, "step": 15174 }, { "epoch": 0.733198048026284, "grad_norm": 2.781376838684082, "learning_rate": 2.6680195197371595e-07, "loss": 0.306, "step": 15175 }, { "epoch": 0.7332463642073731, "grad_norm": 2.618480920791626, "learning_rate": 2.6675363579262694e-07, "loss": 0.348, "step": 15176 }, { "epoch": 0.7332946803884621, "grad_norm": 2.490065574645996, "learning_rate": 2.6670531961153793e-07, "loss": 0.3556, "step": 15177 }, { "epoch": 0.7333429965695512, "grad_norm": 2.7019221782684326, "learning_rate": 2.666570034304488e-07, "loss": 0.3296, "step": 15178 }, { "epoch": 0.7333913127506402, "grad_norm": 18.58231544494629, "learning_rate": 2.666086872493598e-07, "loss": 0.2493, "step": 15179 }, { "epoch": 0.7334396289317292, "grad_norm": 2.2373764514923096, "learning_rate": 2.6656037106827074e-07, "loss": 0.2419, "step": 15180 }, { "epoch": 0.7334879451128182, "grad_norm": 2.5572853088378906, "learning_rate": 2.665120548871817e-07, "loss": 0.3158, "step": 15181 }, { "epoch": 0.7335362612939074, "grad_norm": 3.057974100112915, "learning_rate": 2.6646373870609267e-07, "loss": 0.298, "step": 15182 }, { "epoch": 0.7335845774749964, "grad_norm": 3.4866340160369873, "learning_rate": 2.664154225250036e-07, "loss": 0.3145, "step": 15183 }, { "epoch": 0.7336328936560854, "grad_norm": 3.4269959926605225, "learning_rate": 2.6636710634391455e-07, "loss": 0.2845, "step": 15184 }, { "epoch": 0.7336812098371744, "grad_norm": 2.372357130050659, "learning_rate": 2.663187901628255e-07, "loss": 0.2452, "step": 15185 }, { "epoch": 0.7337295260182635, "grad_norm": 2.2498438358306885, "learning_rate": 2.662704739817365e-07, "loss": 0.2924, "step": 15186 }, { "epoch": 0.7337778421993526, "grad_norm": 2.112597703933716, "learning_rate": 2.6622215780064746e-07, "loss": 0.2312, "step": 15187 }, { "epoch": 0.7338261583804416, "grad_norm": 1.912428379058838, "learning_rate": 2.6617384161955835e-07, "loss": 0.2012, "step": 15188 }, { "epoch": 0.7338744745615307, "grad_norm": 2.1137583255767822, "learning_rate": 2.6612552543846934e-07, "loss": 0.2512, "step": 15189 }, { "epoch": 0.7339227907426197, "grad_norm": 2.2716445922851562, "learning_rate": 2.6607720925738033e-07, "loss": 0.2585, "step": 15190 }, { "epoch": 0.7339711069237087, "grad_norm": 2.410386800765991, "learning_rate": 2.660288930762912e-07, "loss": 0.2598, "step": 15191 }, { "epoch": 0.7340194231047978, "grad_norm": 3.7436020374298096, "learning_rate": 2.659805768952022e-07, "loss": 0.3239, "step": 15192 }, { "epoch": 0.7340677392858869, "grad_norm": 2.0903406143188477, "learning_rate": 2.6593226071411314e-07, "loss": 0.2472, "step": 15193 }, { "epoch": 0.7341160554669759, "grad_norm": 5.688063144683838, "learning_rate": 2.658839445330241e-07, "loss": 0.3596, "step": 15194 }, { "epoch": 0.7341643716480649, "grad_norm": 2.964510917663574, "learning_rate": 2.6583562835193507e-07, "loss": 0.2563, "step": 15195 }, { "epoch": 0.7342126878291539, "grad_norm": 2.2492260932922363, "learning_rate": 2.65787312170846e-07, "loss": 0.2164, "step": 15196 }, { "epoch": 0.7342610040102431, "grad_norm": 3.689152956008911, "learning_rate": 2.6573899598975694e-07, "loss": 0.163, "step": 15197 }, { "epoch": 0.7343093201913321, "grad_norm": 2.499439001083374, "learning_rate": 2.656906798086679e-07, "loss": 0.2835, "step": 15198 }, { "epoch": 0.7343576363724211, "grad_norm": 3.3024024963378906, "learning_rate": 2.6564236362757887e-07, "loss": 0.4586, "step": 15199 }, { "epoch": 0.7344059525535102, "grad_norm": 17.594274520874023, "learning_rate": 2.655940474464898e-07, "loss": 0.4014, "step": 15200 }, { "epoch": 0.7344542687345992, "grad_norm": 2.409238576889038, "learning_rate": 2.6554573126540075e-07, "loss": 0.2992, "step": 15201 }, { "epoch": 0.7345025849156883, "grad_norm": 10.404243469238281, "learning_rate": 2.6549741508431174e-07, "loss": 0.4697, "step": 15202 }, { "epoch": 0.7345509010967773, "grad_norm": 2.520657777786255, "learning_rate": 2.654490989032227e-07, "loss": 0.3121, "step": 15203 }, { "epoch": 0.7345992172778664, "grad_norm": 1.811382532119751, "learning_rate": 2.654007827221336e-07, "loss": 0.1962, "step": 15204 }, { "epoch": 0.7346475334589554, "grad_norm": 1.5666450262069702, "learning_rate": 2.653524665410446e-07, "loss": 0.1942, "step": 15205 }, { "epoch": 0.7346958496400444, "grad_norm": 3.6544148921966553, "learning_rate": 2.6530415035995554e-07, "loss": 0.186, "step": 15206 }, { "epoch": 0.7347441658211334, "grad_norm": 3.659914255142212, "learning_rate": 2.652558341788665e-07, "loss": 0.2571, "step": 15207 }, { "epoch": 0.7347924820022226, "grad_norm": 1.754332423210144, "learning_rate": 2.6520751799777747e-07, "loss": 0.1981, "step": 15208 }, { "epoch": 0.7348407981833116, "grad_norm": 1.7063179016113281, "learning_rate": 2.651592018166884e-07, "loss": 0.1794, "step": 15209 }, { "epoch": 0.7348891143644006, "grad_norm": 4.242270469665527, "learning_rate": 2.6511088563559934e-07, "loss": 0.3021, "step": 15210 }, { "epoch": 0.7349374305454897, "grad_norm": 2.6666152477264404, "learning_rate": 2.650625694545103e-07, "loss": 0.3051, "step": 15211 }, { "epoch": 0.7349857467265787, "grad_norm": 1.9858800172805786, "learning_rate": 2.6501425327342127e-07, "loss": 0.195, "step": 15212 }, { "epoch": 0.7350340629076678, "grad_norm": 1.9270939826965332, "learning_rate": 2.649659370923322e-07, "loss": 0.2458, "step": 15213 }, { "epoch": 0.7350823790887568, "grad_norm": 3.283891201019287, "learning_rate": 2.6491762091124314e-07, "loss": 0.3655, "step": 15214 }, { "epoch": 0.7351306952698459, "grad_norm": 3.8090288639068604, "learning_rate": 2.6486930473015413e-07, "loss": 0.3952, "step": 15215 }, { "epoch": 0.7351790114509349, "grad_norm": 4.42142915725708, "learning_rate": 2.6482098854906507e-07, "loss": 0.3571, "step": 15216 }, { "epoch": 0.7352273276320239, "grad_norm": 3.6917049884796143, "learning_rate": 2.64772672367976e-07, "loss": 0.3162, "step": 15217 }, { "epoch": 0.7352756438131131, "grad_norm": 3.376389503479004, "learning_rate": 2.64724356186887e-07, "loss": 0.1459, "step": 15218 }, { "epoch": 0.7353239599942021, "grad_norm": 8.501046180725098, "learning_rate": 2.646760400057979e-07, "loss": 0.167, "step": 15219 }, { "epoch": 0.7353722761752911, "grad_norm": 3.253997325897217, "learning_rate": 2.6462772382470887e-07, "loss": 0.4126, "step": 15220 }, { "epoch": 0.7354205923563801, "grad_norm": 2.965595006942749, "learning_rate": 2.6457940764361986e-07, "loss": 0.3883, "step": 15221 }, { "epoch": 0.7354689085374692, "grad_norm": 4.190825462341309, "learning_rate": 2.645310914625308e-07, "loss": 0.298, "step": 15222 }, { "epoch": 0.7355172247185583, "grad_norm": 2.536815881729126, "learning_rate": 2.6448277528144174e-07, "loss": 0.3178, "step": 15223 }, { "epoch": 0.7355655408996473, "grad_norm": 2.2367591857910156, "learning_rate": 2.6443445910035267e-07, "loss": 0.2457, "step": 15224 }, { "epoch": 0.7356138570807363, "grad_norm": 3.9102065563201904, "learning_rate": 2.6438614291926366e-07, "loss": 0.2389, "step": 15225 }, { "epoch": 0.7356621732618254, "grad_norm": 2.4223525524139404, "learning_rate": 2.643378267381746e-07, "loss": 0.2825, "step": 15226 }, { "epoch": 0.7357104894429144, "grad_norm": 2.343574285507202, "learning_rate": 2.6428951055708554e-07, "loss": 0.3192, "step": 15227 }, { "epoch": 0.7357588056240035, "grad_norm": 2.0710885524749756, "learning_rate": 2.6424119437599653e-07, "loss": 0.2638, "step": 15228 }, { "epoch": 0.7358071218050926, "grad_norm": 2.364492177963257, "learning_rate": 2.6419287819490747e-07, "loss": 0.2842, "step": 15229 }, { "epoch": 0.7358554379861816, "grad_norm": 6.741630554199219, "learning_rate": 2.641445620138184e-07, "loss": 0.2537, "step": 15230 }, { "epoch": 0.7359037541672706, "grad_norm": 2.265138864517212, "learning_rate": 2.640962458327294e-07, "loss": 0.273, "step": 15231 }, { "epoch": 0.7359520703483596, "grad_norm": 1.7342199087142944, "learning_rate": 2.640479296516403e-07, "loss": 0.1887, "step": 15232 }, { "epoch": 0.7360003865294487, "grad_norm": 7.261979103088379, "learning_rate": 2.6399961347055127e-07, "loss": 0.2431, "step": 15233 }, { "epoch": 0.7360487027105378, "grad_norm": 2.4409172534942627, "learning_rate": 2.6395129728946226e-07, "loss": 0.2826, "step": 15234 }, { "epoch": 0.7360970188916268, "grad_norm": 4.313488483428955, "learning_rate": 2.6390298110837314e-07, "loss": 0.324, "step": 15235 }, { "epoch": 0.7361453350727158, "grad_norm": 2.5580878257751465, "learning_rate": 2.6385466492728413e-07, "loss": 0.3704, "step": 15236 }, { "epoch": 0.7361936512538049, "grad_norm": 3.654181718826294, "learning_rate": 2.6380634874619507e-07, "loss": 0.3143, "step": 15237 }, { "epoch": 0.7362419674348939, "grad_norm": 2.9461910724639893, "learning_rate": 2.6375803256510606e-07, "loss": 0.4679, "step": 15238 }, { "epoch": 0.736290283615983, "grad_norm": 2.834545135498047, "learning_rate": 2.63709716384017e-07, "loss": 0.2578, "step": 15239 }, { "epoch": 0.7363385997970721, "grad_norm": 2.865029811859131, "learning_rate": 2.6366140020292793e-07, "loss": 0.3162, "step": 15240 }, { "epoch": 0.7363869159781611, "grad_norm": 2.112785577774048, "learning_rate": 2.636130840218389e-07, "loss": 0.1949, "step": 15241 }, { "epoch": 0.7364352321592501, "grad_norm": 2.8230278491973877, "learning_rate": 2.6356476784074986e-07, "loss": 0.309, "step": 15242 }, { "epoch": 0.7364835483403391, "grad_norm": 2.3976433277130127, "learning_rate": 2.635164516596608e-07, "loss": 0.2884, "step": 15243 }, { "epoch": 0.7365318645214283, "grad_norm": 2.4397025108337402, "learning_rate": 2.634681354785718e-07, "loss": 0.3067, "step": 15244 }, { "epoch": 0.7365801807025173, "grad_norm": 2.1346309185028076, "learning_rate": 2.634198192974827e-07, "loss": 0.3148, "step": 15245 }, { "epoch": 0.7366284968836063, "grad_norm": 1.9117807149887085, "learning_rate": 2.6337150311639366e-07, "loss": 0.1426, "step": 15246 }, { "epoch": 0.7366768130646953, "grad_norm": 5.642893314361572, "learning_rate": 2.6332318693530465e-07, "loss": 0.279, "step": 15247 }, { "epoch": 0.7367251292457844, "grad_norm": 2.108811616897583, "learning_rate": 2.6327487075421554e-07, "loss": 0.1826, "step": 15248 }, { "epoch": 0.7367734454268735, "grad_norm": 7.6597208976745605, "learning_rate": 2.6322655457312653e-07, "loss": 0.2767, "step": 15249 }, { "epoch": 0.7368217616079625, "grad_norm": 2.3644204139709473, "learning_rate": 2.6317823839203747e-07, "loss": 0.3247, "step": 15250 }, { "epoch": 0.7368700777890516, "grad_norm": 3.1825616359710693, "learning_rate": 2.631299222109484e-07, "loss": 0.3043, "step": 15251 }, { "epoch": 0.7369183939701406, "grad_norm": 3.111931562423706, "learning_rate": 2.630816060298594e-07, "loss": 0.2567, "step": 15252 }, { "epoch": 0.7369667101512296, "grad_norm": 7.159991264343262, "learning_rate": 2.6303328984877033e-07, "loss": 0.3615, "step": 15253 }, { "epoch": 0.7370150263323187, "grad_norm": 4.8972697257995605, "learning_rate": 2.629849736676813e-07, "loss": 0.3569, "step": 15254 }, { "epoch": 0.7370633425134078, "grad_norm": 5.389959812164307, "learning_rate": 2.6293665748659226e-07, "loss": 0.2165, "step": 15255 }, { "epoch": 0.7371116586944968, "grad_norm": 2.088545799255371, "learning_rate": 2.628883413055032e-07, "loss": 0.221, "step": 15256 }, { "epoch": 0.7371599748755858, "grad_norm": 4.031036853790283, "learning_rate": 2.628400251244142e-07, "loss": 0.4086, "step": 15257 }, { "epoch": 0.7372082910566748, "grad_norm": 4.53577995300293, "learning_rate": 2.6279170894332507e-07, "loss": 0.3417, "step": 15258 }, { "epoch": 0.7372566072377639, "grad_norm": 11.184141159057617, "learning_rate": 2.6274339276223606e-07, "loss": 0.2019, "step": 15259 }, { "epoch": 0.737304923418853, "grad_norm": 4.827689170837402, "learning_rate": 2.6269507658114705e-07, "loss": 0.23, "step": 15260 }, { "epoch": 0.737353239599942, "grad_norm": 2.4849343299865723, "learning_rate": 2.6264676040005793e-07, "loss": 0.2835, "step": 15261 }, { "epoch": 0.7374015557810311, "grad_norm": 2.2856979370117188, "learning_rate": 2.625984442189689e-07, "loss": 0.3197, "step": 15262 }, { "epoch": 0.7374498719621201, "grad_norm": 2.7222862243652344, "learning_rate": 2.6255012803787986e-07, "loss": 0.2691, "step": 15263 }, { "epoch": 0.7374981881432091, "grad_norm": 2.662644624710083, "learning_rate": 2.625018118567908e-07, "loss": 0.2846, "step": 15264 }, { "epoch": 0.7375465043242982, "grad_norm": 3.803105115890503, "learning_rate": 2.624534956757018e-07, "loss": 0.2831, "step": 15265 }, { "epoch": 0.7375948205053873, "grad_norm": 3.59484601020813, "learning_rate": 2.6240517949461273e-07, "loss": 0.3027, "step": 15266 }, { "epoch": 0.7376431366864763, "grad_norm": 2.0249056816101074, "learning_rate": 2.623568633135237e-07, "loss": 0.2294, "step": 15267 }, { "epoch": 0.7376914528675653, "grad_norm": 5.509886264801025, "learning_rate": 2.6230854713243465e-07, "loss": 0.4446, "step": 15268 }, { "epoch": 0.7377397690486543, "grad_norm": 2.7763521671295166, "learning_rate": 2.622602309513456e-07, "loss": 0.3235, "step": 15269 }, { "epoch": 0.7377880852297435, "grad_norm": 2.644766092300415, "learning_rate": 2.622119147702566e-07, "loss": 0.3001, "step": 15270 }, { "epoch": 0.7378364014108325, "grad_norm": 46.40414047241211, "learning_rate": 2.6216359858916747e-07, "loss": 0.4894, "step": 15271 }, { "epoch": 0.7378847175919215, "grad_norm": 1.9926737546920776, "learning_rate": 2.6211528240807846e-07, "loss": 0.2501, "step": 15272 }, { "epoch": 0.7379330337730106, "grad_norm": 2.286130666732788, "learning_rate": 2.6206696622698945e-07, "loss": 0.2956, "step": 15273 }, { "epoch": 0.7379813499540996, "grad_norm": 12.95713996887207, "learning_rate": 2.6201865004590033e-07, "loss": 0.2558, "step": 15274 }, { "epoch": 0.7380296661351887, "grad_norm": 3.937030553817749, "learning_rate": 2.619703338648113e-07, "loss": 0.4293, "step": 15275 }, { "epoch": 0.7380779823162777, "grad_norm": 17.0506649017334, "learning_rate": 2.6192201768372226e-07, "loss": 0.2501, "step": 15276 }, { "epoch": 0.7381262984973668, "grad_norm": 2.3590118885040283, "learning_rate": 2.618737015026332e-07, "loss": 0.2757, "step": 15277 }, { "epoch": 0.7381746146784558, "grad_norm": 2.0051143169403076, "learning_rate": 2.618253853215442e-07, "loss": 0.2614, "step": 15278 }, { "epoch": 0.7382229308595448, "grad_norm": 26.833282470703125, "learning_rate": 2.617770691404551e-07, "loss": 0.2395, "step": 15279 }, { "epoch": 0.738271247040634, "grad_norm": 3.1717560291290283, "learning_rate": 2.6172875295936606e-07, "loss": 0.3397, "step": 15280 }, { "epoch": 0.738319563221723, "grad_norm": 28.836244583129883, "learning_rate": 2.6168043677827705e-07, "loss": 0.3346, "step": 15281 }, { "epoch": 0.738367879402812, "grad_norm": 2.2887344360351562, "learning_rate": 2.61632120597188e-07, "loss": 0.2572, "step": 15282 }, { "epoch": 0.738416195583901, "grad_norm": 3.145315647125244, "learning_rate": 2.61583804416099e-07, "loss": 0.3592, "step": 15283 }, { "epoch": 0.7384645117649901, "grad_norm": 4.513092517852783, "learning_rate": 2.6153548823500986e-07, "loss": 0.2183, "step": 15284 }, { "epoch": 0.7385128279460791, "grad_norm": 2.969714879989624, "learning_rate": 2.6148717205392085e-07, "loss": 0.2777, "step": 15285 }, { "epoch": 0.7385611441271682, "grad_norm": 2.5736944675445557, "learning_rate": 2.6143885587283184e-07, "loss": 0.194, "step": 15286 }, { "epoch": 0.7386094603082572, "grad_norm": 4.750220775604248, "learning_rate": 2.6139053969174273e-07, "loss": 0.3059, "step": 15287 }, { "epoch": 0.7386577764893463, "grad_norm": 3.873584032058716, "learning_rate": 2.613422235106537e-07, "loss": 0.2661, "step": 15288 }, { "epoch": 0.7387060926704353, "grad_norm": 4.7924957275390625, "learning_rate": 2.6129390732956465e-07, "loss": 0.4027, "step": 15289 }, { "epoch": 0.7387544088515243, "grad_norm": 3.665196657180786, "learning_rate": 2.612455911484756e-07, "loss": 0.3458, "step": 15290 }, { "epoch": 0.7388027250326135, "grad_norm": 3.543396234512329, "learning_rate": 2.611972749673866e-07, "loss": 0.1817, "step": 15291 }, { "epoch": 0.7388510412137025, "grad_norm": 2.922199010848999, "learning_rate": 2.611489587862975e-07, "loss": 0.3187, "step": 15292 }, { "epoch": 0.7388993573947915, "grad_norm": 2.6009674072265625, "learning_rate": 2.6110064260520846e-07, "loss": 0.3869, "step": 15293 }, { "epoch": 0.7389476735758805, "grad_norm": 2.5270748138427734, "learning_rate": 2.6105232642411945e-07, "loss": 0.2602, "step": 15294 }, { "epoch": 0.7389959897569696, "grad_norm": 3.77785587310791, "learning_rate": 2.610040102430304e-07, "loss": 0.2656, "step": 15295 }, { "epoch": 0.7390443059380587, "grad_norm": 5.680363178253174, "learning_rate": 2.609556940619413e-07, "loss": 0.2814, "step": 15296 }, { "epoch": 0.7390926221191477, "grad_norm": 3.457590341567993, "learning_rate": 2.6090737788085226e-07, "loss": 0.3032, "step": 15297 }, { "epoch": 0.7391409383002367, "grad_norm": 3.732386350631714, "learning_rate": 2.6085906169976325e-07, "loss": 0.2953, "step": 15298 }, { "epoch": 0.7391892544813258, "grad_norm": 2.567781686782837, "learning_rate": 2.6081074551867424e-07, "loss": 0.2492, "step": 15299 }, { "epoch": 0.7392375706624148, "grad_norm": 3.2872328758239746, "learning_rate": 2.607624293375851e-07, "loss": 0.3283, "step": 15300 }, { "epoch": 0.7392858868435039, "grad_norm": 6.001733303070068, "learning_rate": 2.607141131564961e-07, "loss": 0.2755, "step": 15301 }, { "epoch": 0.739334203024593, "grad_norm": 5.803103923797607, "learning_rate": 2.6066579697540705e-07, "loss": 0.4465, "step": 15302 }, { "epoch": 0.739382519205682, "grad_norm": 2.078212022781372, "learning_rate": 2.60617480794318e-07, "loss": 0.2341, "step": 15303 }, { "epoch": 0.739430835386771, "grad_norm": 2.235907793045044, "learning_rate": 2.60569164613229e-07, "loss": 0.2519, "step": 15304 }, { "epoch": 0.73947915156786, "grad_norm": 2.6982455253601074, "learning_rate": 2.605208484321399e-07, "loss": 0.2784, "step": 15305 }, { "epoch": 0.7395274677489492, "grad_norm": 2.160378932952881, "learning_rate": 2.6047253225105085e-07, "loss": 0.2691, "step": 15306 }, { "epoch": 0.7395757839300382, "grad_norm": 2.946890354156494, "learning_rate": 2.6042421606996184e-07, "loss": 0.2147, "step": 15307 }, { "epoch": 0.7396241001111272, "grad_norm": 3.306363821029663, "learning_rate": 2.603758998888728e-07, "loss": 0.3883, "step": 15308 }, { "epoch": 0.7396724162922163, "grad_norm": 2.295743227005005, "learning_rate": 2.603275837077837e-07, "loss": 0.3246, "step": 15309 }, { "epoch": 0.7397207324733053, "grad_norm": 3.9745049476623535, "learning_rate": 2.6027926752669466e-07, "loss": 0.372, "step": 15310 }, { "epoch": 0.7397690486543944, "grad_norm": 2.2909414768218994, "learning_rate": 2.6023095134560565e-07, "loss": 0.2515, "step": 15311 }, { "epoch": 0.7398173648354834, "grad_norm": 3.0852701663970947, "learning_rate": 2.601826351645166e-07, "loss": 0.405, "step": 15312 }, { "epoch": 0.7398656810165725, "grad_norm": 2.3606069087982178, "learning_rate": 2.601343189834275e-07, "loss": 0.2352, "step": 15313 }, { "epoch": 0.7399139971976615, "grad_norm": 3.5699806213378906, "learning_rate": 2.600860028023385e-07, "loss": 0.4402, "step": 15314 }, { "epoch": 0.7399623133787505, "grad_norm": 3.078252077102661, "learning_rate": 2.600376866212494e-07, "loss": 0.3053, "step": 15315 }, { "epoch": 0.7400106295598395, "grad_norm": 55.87147521972656, "learning_rate": 2.599893704401604e-07, "loss": 0.1946, "step": 15316 }, { "epoch": 0.7400589457409287, "grad_norm": 26.235368728637695, "learning_rate": 2.599410542590714e-07, "loss": 0.2962, "step": 15317 }, { "epoch": 0.7401072619220177, "grad_norm": 2.4393179416656494, "learning_rate": 2.598927380779823e-07, "loss": 0.1899, "step": 15318 }, { "epoch": 0.7401555781031067, "grad_norm": 3.520563840866089, "learning_rate": 2.5984442189689325e-07, "loss": 0.2175, "step": 15319 }, { "epoch": 0.7402038942841958, "grad_norm": 3.0040793418884277, "learning_rate": 2.5979610571580424e-07, "loss": 0.3798, "step": 15320 }, { "epoch": 0.7402522104652848, "grad_norm": 2.1365249156951904, "learning_rate": 2.597477895347152e-07, "loss": 0.2113, "step": 15321 }, { "epoch": 0.7403005266463739, "grad_norm": 2.5068788528442383, "learning_rate": 2.596994733536261e-07, "loss": 0.2591, "step": 15322 }, { "epoch": 0.7403488428274629, "grad_norm": 2.032313108444214, "learning_rate": 2.5965115717253705e-07, "loss": 0.1819, "step": 15323 }, { "epoch": 0.740397159008552, "grad_norm": 3.347512722015381, "learning_rate": 2.5960284099144804e-07, "loss": 0.1786, "step": 15324 }, { "epoch": 0.740445475189641, "grad_norm": 1.9976704120635986, "learning_rate": 2.59554524810359e-07, "loss": 0.2092, "step": 15325 }, { "epoch": 0.74049379137073, "grad_norm": 1.974493384361267, "learning_rate": 2.595062086292699e-07, "loss": 0.2016, "step": 15326 }, { "epoch": 0.7405421075518192, "grad_norm": 3.963634967803955, "learning_rate": 2.594578924481809e-07, "loss": 0.3793, "step": 15327 }, { "epoch": 0.7405904237329082, "grad_norm": 2.198496103286743, "learning_rate": 2.594095762670918e-07, "loss": 0.2743, "step": 15328 }, { "epoch": 0.7406387399139972, "grad_norm": 2.199633836746216, "learning_rate": 2.593612600860028e-07, "loss": 0.3118, "step": 15329 }, { "epoch": 0.7406870560950862, "grad_norm": 2.3792669773101807, "learning_rate": 2.5931294390491377e-07, "loss": 0.2913, "step": 15330 }, { "epoch": 0.7407353722761753, "grad_norm": 2.6670689582824707, "learning_rate": 2.5926462772382466e-07, "loss": 0.2586, "step": 15331 }, { "epoch": 0.7407836884572644, "grad_norm": 3.638932466506958, "learning_rate": 2.5921631154273565e-07, "loss": 0.34, "step": 15332 }, { "epoch": 0.7408320046383534, "grad_norm": 3.946309804916382, "learning_rate": 2.5916799536164664e-07, "loss": 0.279, "step": 15333 }, { "epoch": 0.7408803208194424, "grad_norm": 2.6306116580963135, "learning_rate": 2.5911967918055757e-07, "loss": 0.4122, "step": 15334 }, { "epoch": 0.7409286370005315, "grad_norm": 2.4213948249816895, "learning_rate": 2.590713629994685e-07, "loss": 0.3015, "step": 15335 }, { "epoch": 0.7409769531816205, "grad_norm": 2.6489784717559814, "learning_rate": 2.5902304681837945e-07, "loss": 0.2609, "step": 15336 }, { "epoch": 0.7410252693627096, "grad_norm": 2.686034917831421, "learning_rate": 2.5897473063729044e-07, "loss": 0.2746, "step": 15337 }, { "epoch": 0.7410735855437987, "grad_norm": 2.7240893840789795, "learning_rate": 2.589264144562014e-07, "loss": 0.2999, "step": 15338 }, { "epoch": 0.7411219017248877, "grad_norm": 2.0628182888031006, "learning_rate": 2.588780982751123e-07, "loss": 0.2294, "step": 15339 }, { "epoch": 0.7411702179059767, "grad_norm": 3.118821620941162, "learning_rate": 2.588297820940233e-07, "loss": 0.3286, "step": 15340 }, { "epoch": 0.7412185340870657, "grad_norm": 2.656581163406372, "learning_rate": 2.587814659129342e-07, "loss": 0.3743, "step": 15341 }, { "epoch": 0.7412668502681548, "grad_norm": 6.289766788482666, "learning_rate": 2.587331497318452e-07, "loss": 0.3262, "step": 15342 }, { "epoch": 0.7413151664492439, "grad_norm": 2.8666129112243652, "learning_rate": 2.5868483355075617e-07, "loss": 0.3207, "step": 15343 }, { "epoch": 0.7413634826303329, "grad_norm": 2.06514573097229, "learning_rate": 2.5863651736966705e-07, "loss": 0.188, "step": 15344 }, { "epoch": 0.7414117988114219, "grad_norm": 2.0414175987243652, "learning_rate": 2.5858820118857804e-07, "loss": 0.1687, "step": 15345 }, { "epoch": 0.741460114992511, "grad_norm": 4.7603325843811035, "learning_rate": 2.5853988500748903e-07, "loss": 0.2874, "step": 15346 }, { "epoch": 0.7415084311736, "grad_norm": 10.908400535583496, "learning_rate": 2.584915688263999e-07, "loss": 0.3786, "step": 15347 }, { "epoch": 0.7415567473546891, "grad_norm": 2.675895929336548, "learning_rate": 2.584432526453109e-07, "loss": 0.2749, "step": 15348 }, { "epoch": 0.7416050635357782, "grad_norm": 2.0107243061065674, "learning_rate": 2.5839493646422184e-07, "loss": 0.2236, "step": 15349 }, { "epoch": 0.7416533797168672, "grad_norm": 2.1199419498443604, "learning_rate": 2.5834662028313283e-07, "loss": 0.2587, "step": 15350 }, { "epoch": 0.7417016958979562, "grad_norm": 1.8142632246017456, "learning_rate": 2.5829830410204377e-07, "loss": 0.2128, "step": 15351 }, { "epoch": 0.7417500120790452, "grad_norm": 2.5158157348632812, "learning_rate": 2.582499879209547e-07, "loss": 0.2762, "step": 15352 }, { "epoch": 0.7417983282601344, "grad_norm": 1.9776360988616943, "learning_rate": 2.582016717398657e-07, "loss": 0.2061, "step": 15353 }, { "epoch": 0.7418466444412234, "grad_norm": 3.8464832305908203, "learning_rate": 2.581533555587766e-07, "loss": 0.2961, "step": 15354 }, { "epoch": 0.7418949606223124, "grad_norm": 2.4114725589752197, "learning_rate": 2.581050393776876e-07, "loss": 0.2796, "step": 15355 }, { "epoch": 0.7419432768034014, "grad_norm": 4.882997035980225, "learning_rate": 2.5805672319659856e-07, "loss": 0.2754, "step": 15356 }, { "epoch": 0.7419915929844905, "grad_norm": 2.882188558578491, "learning_rate": 2.5800840701550945e-07, "loss": 0.2739, "step": 15357 }, { "epoch": 0.7420399091655796, "grad_norm": 3.3589284420013428, "learning_rate": 2.5796009083442044e-07, "loss": 0.3482, "step": 15358 }, { "epoch": 0.7420882253466686, "grad_norm": 3.107308864593506, "learning_rate": 2.5791177465333143e-07, "loss": 0.3611, "step": 15359 }, { "epoch": 0.7421365415277577, "grad_norm": 3.542299747467041, "learning_rate": 2.578634584722423e-07, "loss": 0.3014, "step": 15360 }, { "epoch": 0.7421848577088467, "grad_norm": 2.763556718826294, "learning_rate": 2.578151422911533e-07, "loss": 0.3384, "step": 15361 }, { "epoch": 0.7422331738899357, "grad_norm": 1.4507319927215576, "learning_rate": 2.5776682611006424e-07, "loss": 0.1595, "step": 15362 }, { "epoch": 0.7422814900710248, "grad_norm": 2.112243890762329, "learning_rate": 2.577185099289752e-07, "loss": 0.2276, "step": 15363 }, { "epoch": 0.7423298062521139, "grad_norm": 2.156930685043335, "learning_rate": 2.5767019374788617e-07, "loss": 0.2009, "step": 15364 }, { "epoch": 0.7423781224332029, "grad_norm": 3.9696648120880127, "learning_rate": 2.576218775667971e-07, "loss": 0.4169, "step": 15365 }, { "epoch": 0.7424264386142919, "grad_norm": 4.457352161407471, "learning_rate": 2.575735613857081e-07, "loss": 0.2323, "step": 15366 }, { "epoch": 0.7424747547953809, "grad_norm": 1.6389249563217163, "learning_rate": 2.57525245204619e-07, "loss": 0.1617, "step": 15367 }, { "epoch": 0.74252307097647, "grad_norm": 2.1508662700653076, "learning_rate": 2.5747692902352997e-07, "loss": 0.2539, "step": 15368 }, { "epoch": 0.7425713871575591, "grad_norm": 12.3101167678833, "learning_rate": 2.5742861284244096e-07, "loss": 0.2451, "step": 15369 }, { "epoch": 0.7426197033386481, "grad_norm": 3.2353057861328125, "learning_rate": 2.5738029666135184e-07, "loss": 0.4575, "step": 15370 }, { "epoch": 0.7426680195197372, "grad_norm": 7.891012191772461, "learning_rate": 2.5733198048026284e-07, "loss": 0.2274, "step": 15371 }, { "epoch": 0.7427163357008262, "grad_norm": 2.654417037963867, "learning_rate": 2.572836642991738e-07, "loss": 0.3892, "step": 15372 }, { "epoch": 0.7427646518819152, "grad_norm": 2.758594274520874, "learning_rate": 2.572353481180847e-07, "loss": 0.2163, "step": 15373 }, { "epoch": 0.7428129680630043, "grad_norm": 4.089129447937012, "learning_rate": 2.571870319369957e-07, "loss": 0.2727, "step": 15374 }, { "epoch": 0.7428612842440934, "grad_norm": 3.3158223628997803, "learning_rate": 2.5713871575590664e-07, "loss": 0.3581, "step": 15375 }, { "epoch": 0.7429096004251824, "grad_norm": 3.006037473678589, "learning_rate": 2.570903995748176e-07, "loss": 0.3392, "step": 15376 }, { "epoch": 0.7429579166062714, "grad_norm": 2.878227949142456, "learning_rate": 2.5704208339372856e-07, "loss": 0.3744, "step": 15377 }, { "epoch": 0.7430062327873604, "grad_norm": 2.7413721084594727, "learning_rate": 2.569937672126395e-07, "loss": 0.264, "step": 15378 }, { "epoch": 0.7430545489684496, "grad_norm": 2.226067304611206, "learning_rate": 2.5694545103155044e-07, "loss": 0.2028, "step": 15379 }, { "epoch": 0.7431028651495386, "grad_norm": 2.8065621852874756, "learning_rate": 2.568971348504614e-07, "loss": 0.3584, "step": 15380 }, { "epoch": 0.7431511813306276, "grad_norm": 4.560223579406738, "learning_rate": 2.5684881866937237e-07, "loss": 0.2932, "step": 15381 }, { "epoch": 0.7431994975117167, "grad_norm": 2.564591407775879, "learning_rate": 2.5680050248828336e-07, "loss": 0.3284, "step": 15382 }, { "epoch": 0.7432478136928057, "grad_norm": 2.319861888885498, "learning_rate": 2.5675218630719424e-07, "loss": 0.2766, "step": 15383 }, { "epoch": 0.7432961298738948, "grad_norm": 3.4753575325012207, "learning_rate": 2.5670387012610523e-07, "loss": 0.3036, "step": 15384 }, { "epoch": 0.7433444460549838, "grad_norm": 3.066710948944092, "learning_rate": 2.566555539450162e-07, "loss": 0.4014, "step": 15385 }, { "epoch": 0.7433927622360729, "grad_norm": 2.8117964267730713, "learning_rate": 2.566072377639271e-07, "loss": 0.2972, "step": 15386 }, { "epoch": 0.7434410784171619, "grad_norm": 1.6976827383041382, "learning_rate": 2.565589215828381e-07, "loss": 0.1486, "step": 15387 }, { "epoch": 0.7434893945982509, "grad_norm": 2.596515655517578, "learning_rate": 2.5651060540174903e-07, "loss": 0.2883, "step": 15388 }, { "epoch": 0.74353771077934, "grad_norm": 3.3721072673797607, "learning_rate": 2.5646228922065997e-07, "loss": 0.3742, "step": 15389 }, { "epoch": 0.7435860269604291, "grad_norm": 1.6259255409240723, "learning_rate": 2.5641397303957096e-07, "loss": 0.1605, "step": 15390 }, { "epoch": 0.7436343431415181, "grad_norm": 2.5260016918182373, "learning_rate": 2.563656568584819e-07, "loss": 0.2799, "step": 15391 }, { "epoch": 0.7436826593226071, "grad_norm": 5.460970878601074, "learning_rate": 2.5631734067739284e-07, "loss": 0.3544, "step": 15392 }, { "epoch": 0.7437309755036962, "grad_norm": 2.495008945465088, "learning_rate": 2.5626902449630377e-07, "loss": 0.2793, "step": 15393 }, { "epoch": 0.7437792916847852, "grad_norm": 4.584275245666504, "learning_rate": 2.5622070831521476e-07, "loss": 0.4805, "step": 15394 }, { "epoch": 0.7438276078658743, "grad_norm": 2.2247276306152344, "learning_rate": 2.561723921341257e-07, "loss": 0.2911, "step": 15395 }, { "epoch": 0.7438759240469633, "grad_norm": 2.4704477787017822, "learning_rate": 2.5612407595303664e-07, "loss": 0.2213, "step": 15396 }, { "epoch": 0.7439242402280524, "grad_norm": 1.753861904144287, "learning_rate": 2.5607575977194763e-07, "loss": 0.2165, "step": 15397 }, { "epoch": 0.7439725564091414, "grad_norm": 2.2859015464782715, "learning_rate": 2.560274435908586e-07, "loss": 0.2149, "step": 15398 }, { "epoch": 0.7440208725902304, "grad_norm": 2.5392239093780518, "learning_rate": 2.559791274097695e-07, "loss": 0.2796, "step": 15399 }, { "epoch": 0.7440691887713196, "grad_norm": 2.235769510269165, "learning_rate": 2.559308112286805e-07, "loss": 0.2401, "step": 15400 }, { "epoch": 0.7441175049524086, "grad_norm": 3.1929996013641357, "learning_rate": 2.5588249504759143e-07, "loss": 0.2936, "step": 15401 }, { "epoch": 0.7441658211334976, "grad_norm": 2.8944685459136963, "learning_rate": 2.5583417886650237e-07, "loss": 0.3393, "step": 15402 }, { "epoch": 0.7442141373145866, "grad_norm": 3.4466614723205566, "learning_rate": 2.5578586268541336e-07, "loss": 0.2253, "step": 15403 }, { "epoch": 0.7442624534956757, "grad_norm": 2.2495017051696777, "learning_rate": 2.557375465043243e-07, "loss": 0.2365, "step": 15404 }, { "epoch": 0.7443107696767648, "grad_norm": 3.885591506958008, "learning_rate": 2.5568923032323523e-07, "loss": 0.3888, "step": 15405 }, { "epoch": 0.7443590858578538, "grad_norm": 2.425795555114746, "learning_rate": 2.5564091414214617e-07, "loss": 0.3179, "step": 15406 }, { "epoch": 0.7444074020389428, "grad_norm": 2.630650043487549, "learning_rate": 2.5559259796105716e-07, "loss": 0.354, "step": 15407 }, { "epoch": 0.7444557182200319, "grad_norm": 4.97291374206543, "learning_rate": 2.555442817799681e-07, "loss": 0.4471, "step": 15408 }, { "epoch": 0.7445040344011209, "grad_norm": 2.433140754699707, "learning_rate": 2.5549596559887903e-07, "loss": 0.4, "step": 15409 }, { "epoch": 0.74455235058221, "grad_norm": 3.5154850482940674, "learning_rate": 2.5544764941779e-07, "loss": 0.1825, "step": 15410 }, { "epoch": 0.7446006667632991, "grad_norm": 3.2357614040374756, "learning_rate": 2.5539933323670096e-07, "loss": 0.4031, "step": 15411 }, { "epoch": 0.7446489829443881, "grad_norm": 3.0800647735595703, "learning_rate": 2.553510170556119e-07, "loss": 0.241, "step": 15412 }, { "epoch": 0.7446972991254771, "grad_norm": 2.9736111164093018, "learning_rate": 2.553027008745229e-07, "loss": 0.2818, "step": 15413 }, { "epoch": 0.7447456153065661, "grad_norm": 2.1350438594818115, "learning_rate": 2.5525438469343377e-07, "loss": 0.2475, "step": 15414 }, { "epoch": 0.7447939314876553, "grad_norm": 3.130309581756592, "learning_rate": 2.5520606851234476e-07, "loss": 0.2886, "step": 15415 }, { "epoch": 0.7448422476687443, "grad_norm": 2.8911564350128174, "learning_rate": 2.5515775233125575e-07, "loss": 0.2957, "step": 15416 }, { "epoch": 0.7448905638498333, "grad_norm": 2.3140830993652344, "learning_rate": 2.551094361501667e-07, "loss": 0.1729, "step": 15417 }, { "epoch": 0.7449388800309223, "grad_norm": 2.473745346069336, "learning_rate": 2.5506111996907763e-07, "loss": 0.2611, "step": 15418 }, { "epoch": 0.7449871962120114, "grad_norm": 2.738760232925415, "learning_rate": 2.5501280378798857e-07, "loss": 0.3516, "step": 15419 }, { "epoch": 0.7450355123931004, "grad_norm": 2.0726499557495117, "learning_rate": 2.5496448760689956e-07, "loss": 0.2166, "step": 15420 }, { "epoch": 0.7450838285741895, "grad_norm": 12.784331321716309, "learning_rate": 2.549161714258105e-07, "loss": 0.2788, "step": 15421 }, { "epoch": 0.7451321447552786, "grad_norm": 3.0502090454101562, "learning_rate": 2.5486785524472143e-07, "loss": 0.3791, "step": 15422 }, { "epoch": 0.7451804609363676, "grad_norm": 2.677110195159912, "learning_rate": 2.548195390636324e-07, "loss": 0.3528, "step": 15423 }, { "epoch": 0.7452287771174566, "grad_norm": 3.6016175746917725, "learning_rate": 2.5477122288254336e-07, "loss": 0.2497, "step": 15424 }, { "epoch": 0.7452770932985456, "grad_norm": 2.6110565662384033, "learning_rate": 2.547229067014543e-07, "loss": 0.2735, "step": 15425 }, { "epoch": 0.7453254094796348, "grad_norm": 3.5635080337524414, "learning_rate": 2.546745905203653e-07, "loss": 0.4594, "step": 15426 }, { "epoch": 0.7453737256607238, "grad_norm": 6.842761993408203, "learning_rate": 2.5462627433927617e-07, "loss": 0.2378, "step": 15427 }, { "epoch": 0.7454220418418128, "grad_norm": 2.591215133666992, "learning_rate": 2.5457795815818716e-07, "loss": 0.2761, "step": 15428 }, { "epoch": 0.7454703580229018, "grad_norm": 6.268364906311035, "learning_rate": 2.5452964197709815e-07, "loss": 0.3769, "step": 15429 }, { "epoch": 0.7455186742039909, "grad_norm": 2.181600332260132, "learning_rate": 2.544813257960091e-07, "loss": 0.2018, "step": 15430 }, { "epoch": 0.74556699038508, "grad_norm": 3.2827744483947754, "learning_rate": 2.5443300961492e-07, "loss": 0.2732, "step": 15431 }, { "epoch": 0.745615306566169, "grad_norm": 2.4334614276885986, "learning_rate": 2.5438469343383096e-07, "loss": 0.2889, "step": 15432 }, { "epoch": 0.7456636227472581, "grad_norm": 3.4306156635284424, "learning_rate": 2.5433637725274195e-07, "loss": 0.3777, "step": 15433 }, { "epoch": 0.7457119389283471, "grad_norm": 3.0588877201080322, "learning_rate": 2.542880610716529e-07, "loss": 0.32, "step": 15434 }, { "epoch": 0.7457602551094361, "grad_norm": 3.6373565196990967, "learning_rate": 2.5423974489056383e-07, "loss": 0.3073, "step": 15435 }, { "epoch": 0.7458085712905252, "grad_norm": 2.9399187564849854, "learning_rate": 2.541914287094748e-07, "loss": 0.322, "step": 15436 }, { "epoch": 0.7458568874716143, "grad_norm": 1.892547607421875, "learning_rate": 2.5414311252838575e-07, "loss": 0.2288, "step": 15437 }, { "epoch": 0.7459052036527033, "grad_norm": 2.217794179916382, "learning_rate": 2.540947963472967e-07, "loss": 0.2675, "step": 15438 }, { "epoch": 0.7459535198337923, "grad_norm": 2.1432127952575684, "learning_rate": 2.540464801662077e-07, "loss": 0.2551, "step": 15439 }, { "epoch": 0.7460018360148813, "grad_norm": 2.7686877250671387, "learning_rate": 2.5399816398511857e-07, "loss": 0.3663, "step": 15440 }, { "epoch": 0.7460501521959705, "grad_norm": 2.6371209621429443, "learning_rate": 2.5394984780402956e-07, "loss": 0.2917, "step": 15441 }, { "epoch": 0.7460984683770595, "grad_norm": 3.338361978530884, "learning_rate": 2.5390153162294055e-07, "loss": 0.2483, "step": 15442 }, { "epoch": 0.7461467845581485, "grad_norm": 2.1565139293670654, "learning_rate": 2.5385321544185143e-07, "loss": 0.2083, "step": 15443 }, { "epoch": 0.7461951007392376, "grad_norm": 3.089263439178467, "learning_rate": 2.538048992607624e-07, "loss": 0.4024, "step": 15444 }, { "epoch": 0.7462434169203266, "grad_norm": 3.183631181716919, "learning_rate": 2.5375658307967336e-07, "loss": 0.2296, "step": 15445 }, { "epoch": 0.7462917331014156, "grad_norm": 1.6983648538589478, "learning_rate": 2.5370826689858435e-07, "loss": 0.1661, "step": 15446 }, { "epoch": 0.7463400492825047, "grad_norm": 3.815347671508789, "learning_rate": 2.536599507174953e-07, "loss": 0.3523, "step": 15447 }, { "epoch": 0.7463883654635938, "grad_norm": 4.039661884307861, "learning_rate": 2.536116345364062e-07, "loss": 0.3077, "step": 15448 }, { "epoch": 0.7464366816446828, "grad_norm": 8.670634269714355, "learning_rate": 2.535633183553172e-07, "loss": 0.3679, "step": 15449 }, { "epoch": 0.7464849978257718, "grad_norm": 3.3123035430908203, "learning_rate": 2.5351500217422815e-07, "loss": 0.3066, "step": 15450 }, { "epoch": 0.7465333140068608, "grad_norm": 2.644674301147461, "learning_rate": 2.534666859931391e-07, "loss": 0.285, "step": 15451 }, { "epoch": 0.74658163018795, "grad_norm": 3.1440389156341553, "learning_rate": 2.534183698120501e-07, "loss": 0.2885, "step": 15452 }, { "epoch": 0.746629946369039, "grad_norm": 2.4338698387145996, "learning_rate": 2.5337005363096096e-07, "loss": 0.3037, "step": 15453 }, { "epoch": 0.746678262550128, "grad_norm": 1.8823652267456055, "learning_rate": 2.5332173744987195e-07, "loss": 0.2146, "step": 15454 }, { "epoch": 0.7467265787312171, "grad_norm": 4.290771484375, "learning_rate": 2.5327342126878294e-07, "loss": 0.2808, "step": 15455 }, { "epoch": 0.7467748949123061, "grad_norm": 2.4624319076538086, "learning_rate": 2.5322510508769383e-07, "loss": 0.2971, "step": 15456 }, { "epoch": 0.7468232110933952, "grad_norm": 2.928082227706909, "learning_rate": 2.531767889066048e-07, "loss": 0.3146, "step": 15457 }, { "epoch": 0.7468715272744842, "grad_norm": 2.6027371883392334, "learning_rate": 2.5312847272551575e-07, "loss": 0.3111, "step": 15458 }, { "epoch": 0.7469198434555733, "grad_norm": 2.058812379837036, "learning_rate": 2.530801565444267e-07, "loss": 0.2365, "step": 15459 }, { "epoch": 0.7469681596366623, "grad_norm": 5.548631191253662, "learning_rate": 2.530318403633377e-07, "loss": 0.3501, "step": 15460 }, { "epoch": 0.7470164758177513, "grad_norm": 2.080303907394409, "learning_rate": 2.529835241822486e-07, "loss": 0.2634, "step": 15461 }, { "epoch": 0.7470647919988405, "grad_norm": 3.002565622329712, "learning_rate": 2.529352080011596e-07, "loss": 0.2756, "step": 15462 }, { "epoch": 0.7471131081799295, "grad_norm": 4.623694896697998, "learning_rate": 2.528868918200705e-07, "loss": 0.3165, "step": 15463 }, { "epoch": 0.7471614243610185, "grad_norm": 2.089357376098633, "learning_rate": 2.528385756389815e-07, "loss": 0.2481, "step": 15464 }, { "epoch": 0.7472097405421075, "grad_norm": 3.479398488998413, "learning_rate": 2.527902594578925e-07, "loss": 0.2668, "step": 15465 }, { "epoch": 0.7472580567231966, "grad_norm": 2.5812809467315674, "learning_rate": 2.5274194327680336e-07, "loss": 0.3015, "step": 15466 }, { "epoch": 0.7473063729042857, "grad_norm": 2.120055675506592, "learning_rate": 2.5269362709571435e-07, "loss": 0.223, "step": 15467 }, { "epoch": 0.7473546890853747, "grad_norm": 3.7054147720336914, "learning_rate": 2.5264531091462534e-07, "loss": 0.3741, "step": 15468 }, { "epoch": 0.7474030052664637, "grad_norm": 7.611909866333008, "learning_rate": 2.525969947335362e-07, "loss": 0.2373, "step": 15469 }, { "epoch": 0.7474513214475528, "grad_norm": 1.723870873451233, "learning_rate": 2.525486785524472e-07, "loss": 0.2261, "step": 15470 }, { "epoch": 0.7474996376286418, "grad_norm": 1.6677727699279785, "learning_rate": 2.5250036237135815e-07, "loss": 0.1846, "step": 15471 }, { "epoch": 0.7475479538097308, "grad_norm": 1.9810972213745117, "learning_rate": 2.524520461902691e-07, "loss": 0.2075, "step": 15472 }, { "epoch": 0.74759626999082, "grad_norm": 3.4745194911956787, "learning_rate": 2.524037300091801e-07, "loss": 0.3585, "step": 15473 }, { "epoch": 0.747644586171909, "grad_norm": 2.443138599395752, "learning_rate": 2.52355413828091e-07, "loss": 0.2802, "step": 15474 }, { "epoch": 0.747692902352998, "grad_norm": 11.382278442382812, "learning_rate": 2.5230709764700195e-07, "loss": 0.247, "step": 15475 }, { "epoch": 0.747741218534087, "grad_norm": 3.4532175064086914, "learning_rate": 2.522587814659129e-07, "loss": 0.3055, "step": 15476 }, { "epoch": 0.7477895347151761, "grad_norm": 2.9247512817382812, "learning_rate": 2.522104652848239e-07, "loss": 0.2093, "step": 15477 }, { "epoch": 0.7478378508962652, "grad_norm": 2.9931561946868896, "learning_rate": 2.5216214910373487e-07, "loss": 0.388, "step": 15478 }, { "epoch": 0.7478861670773542, "grad_norm": 3.031344413757324, "learning_rate": 2.5211383292264575e-07, "loss": 0.2364, "step": 15479 }, { "epoch": 0.7479344832584433, "grad_norm": 2.657721996307373, "learning_rate": 2.5206551674155675e-07, "loss": 0.3134, "step": 15480 }, { "epoch": 0.7479827994395323, "grad_norm": 2.4152109622955322, "learning_rate": 2.5201720056046774e-07, "loss": 0.2616, "step": 15481 }, { "epoch": 0.7480311156206213, "grad_norm": 15.100163459777832, "learning_rate": 2.519688843793786e-07, "loss": 0.2578, "step": 15482 }, { "epoch": 0.7480794318017104, "grad_norm": 15.822087287902832, "learning_rate": 2.519205681982896e-07, "loss": 0.4537, "step": 15483 }, { "epoch": 0.7481277479827995, "grad_norm": 1.597294807434082, "learning_rate": 2.5187225201720055e-07, "loss": 0.1486, "step": 15484 }, { "epoch": 0.7481760641638885, "grad_norm": 6.186117649078369, "learning_rate": 2.518239358361115e-07, "loss": 0.2226, "step": 15485 }, { "epoch": 0.7482243803449775, "grad_norm": 3.4457573890686035, "learning_rate": 2.517756196550225e-07, "loss": 0.3527, "step": 15486 }, { "epoch": 0.7482726965260665, "grad_norm": 2.056581735610962, "learning_rate": 2.517273034739334e-07, "loss": 0.2087, "step": 15487 }, { "epoch": 0.7483210127071557, "grad_norm": 2.9622738361358643, "learning_rate": 2.5167898729284435e-07, "loss": 0.3102, "step": 15488 }, { "epoch": 0.7483693288882447, "grad_norm": 2.9868392944335938, "learning_rate": 2.516306711117553e-07, "loss": 0.2323, "step": 15489 }, { "epoch": 0.7484176450693337, "grad_norm": 3.688068389892578, "learning_rate": 2.515823549306663e-07, "loss": 0.4219, "step": 15490 }, { "epoch": 0.7484659612504228, "grad_norm": 3.0491504669189453, "learning_rate": 2.515340387495772e-07, "loss": 0.1734, "step": 15491 }, { "epoch": 0.7485142774315118, "grad_norm": 2.94952130317688, "learning_rate": 2.5148572256848815e-07, "loss": 0.1528, "step": 15492 }, { "epoch": 0.7485625936126009, "grad_norm": 2.884666681289673, "learning_rate": 2.5143740638739914e-07, "loss": 0.3488, "step": 15493 }, { "epoch": 0.7486109097936899, "grad_norm": 2.7158355712890625, "learning_rate": 2.5138909020631013e-07, "loss": 0.2873, "step": 15494 }, { "epoch": 0.748659225974779, "grad_norm": 2.3221614360809326, "learning_rate": 2.51340774025221e-07, "loss": 0.3154, "step": 15495 }, { "epoch": 0.748707542155868, "grad_norm": 2.3888423442840576, "learning_rate": 2.51292457844132e-07, "loss": 0.3782, "step": 15496 }, { "epoch": 0.748755858336957, "grad_norm": 2.4880428314208984, "learning_rate": 2.5124414166304294e-07, "loss": 0.2625, "step": 15497 }, { "epoch": 0.748804174518046, "grad_norm": 2.67395281791687, "learning_rate": 2.511958254819539e-07, "loss": 0.2988, "step": 15498 }, { "epoch": 0.7488524906991352, "grad_norm": 4.333948135375977, "learning_rate": 2.5114750930086487e-07, "loss": 0.2642, "step": 15499 }, { "epoch": 0.7489008068802242, "grad_norm": 1.780728816986084, "learning_rate": 2.510991931197758e-07, "loss": 0.1862, "step": 15500 }, { "epoch": 0.7489491230613132, "grad_norm": 15.335783004760742, "learning_rate": 2.5105087693868675e-07, "loss": 0.3272, "step": 15501 }, { "epoch": 0.7489974392424023, "grad_norm": 1.6756378412246704, "learning_rate": 2.510025607575977e-07, "loss": 0.2279, "step": 15502 }, { "epoch": 0.7490457554234913, "grad_norm": 3.0369060039520264, "learning_rate": 2.5095424457650867e-07, "loss": 0.337, "step": 15503 }, { "epoch": 0.7490940716045804, "grad_norm": 3.6289844512939453, "learning_rate": 2.509059283954196e-07, "loss": 0.2221, "step": 15504 }, { "epoch": 0.7491423877856694, "grad_norm": 4.12240743637085, "learning_rate": 2.5085761221433055e-07, "loss": 0.4626, "step": 15505 }, { "epoch": 0.7491907039667585, "grad_norm": 2.2346057891845703, "learning_rate": 2.5080929603324154e-07, "loss": 0.2421, "step": 15506 }, { "epoch": 0.7492390201478475, "grad_norm": 2.854048252105713, "learning_rate": 2.507609798521525e-07, "loss": 0.3808, "step": 15507 }, { "epoch": 0.7492873363289365, "grad_norm": 1.7484736442565918, "learning_rate": 2.507126636710634e-07, "loss": 0.228, "step": 15508 }, { "epoch": 0.7493356525100257, "grad_norm": 3.6103932857513428, "learning_rate": 2.506643474899744e-07, "loss": 0.2221, "step": 15509 }, { "epoch": 0.7493839686911147, "grad_norm": 4.72802209854126, "learning_rate": 2.506160313088853e-07, "loss": 0.3187, "step": 15510 }, { "epoch": 0.7494322848722037, "grad_norm": 1.9313713312149048, "learning_rate": 2.505677151277963e-07, "loss": 0.2013, "step": 15511 }, { "epoch": 0.7494806010532927, "grad_norm": 2.727680206298828, "learning_rate": 2.5051939894670727e-07, "loss": 0.1821, "step": 15512 }, { "epoch": 0.7495289172343818, "grad_norm": 3.150599479675293, "learning_rate": 2.504710827656182e-07, "loss": 0.4654, "step": 15513 }, { "epoch": 0.7495772334154709, "grad_norm": 2.322542905807495, "learning_rate": 2.5042276658452914e-07, "loss": 0.272, "step": 15514 }, { "epoch": 0.7496255495965599, "grad_norm": 1.8194063901901245, "learning_rate": 2.503744504034401e-07, "loss": 0.2047, "step": 15515 }, { "epoch": 0.7496738657776489, "grad_norm": 2.174729585647583, "learning_rate": 2.5032613422235107e-07, "loss": 0.2209, "step": 15516 }, { "epoch": 0.749722181958738, "grad_norm": 2.8424770832061768, "learning_rate": 2.50277818041262e-07, "loss": 0.3368, "step": 15517 }, { "epoch": 0.749770498139827, "grad_norm": 3.665795087814331, "learning_rate": 2.5022950186017294e-07, "loss": 0.2165, "step": 15518 }, { "epoch": 0.7498188143209161, "grad_norm": 2.2110369205474854, "learning_rate": 2.5018118567908393e-07, "loss": 0.2988, "step": 15519 }, { "epoch": 0.7498671305020052, "grad_norm": 2.843095064163208, "learning_rate": 2.5013286949799487e-07, "loss": 0.1651, "step": 15520 }, { "epoch": 0.7499154466830942, "grad_norm": 11.238300323486328, "learning_rate": 2.500845533169058e-07, "loss": 0.4439, "step": 15521 }, { "epoch": 0.7499637628641832, "grad_norm": 2.226924180984497, "learning_rate": 2.500362371358168e-07, "loss": 0.2714, "step": 15522 }, { "epoch": 0.7500120790452722, "grad_norm": 2.380768299102783, "learning_rate": 2.4998792095472774e-07, "loss": 0.2478, "step": 15523 }, { "epoch": 0.7500603952263613, "grad_norm": 7.724602222442627, "learning_rate": 2.4993960477363867e-07, "loss": 0.2678, "step": 15524 }, { "epoch": 0.7501087114074504, "grad_norm": 3.0157697200775146, "learning_rate": 2.498912885925496e-07, "loss": 0.2286, "step": 15525 }, { "epoch": 0.7501570275885394, "grad_norm": 2.6329188346862793, "learning_rate": 2.4984297241146055e-07, "loss": 0.3794, "step": 15526 }, { "epoch": 0.7502053437696284, "grad_norm": 4.152597904205322, "learning_rate": 2.4979465623037154e-07, "loss": 0.3245, "step": 15527 }, { "epoch": 0.7502536599507175, "grad_norm": 3.770934581756592, "learning_rate": 2.497463400492825e-07, "loss": 0.2695, "step": 15528 }, { "epoch": 0.7503019761318065, "grad_norm": 2.730456590652466, "learning_rate": 2.4969802386819347e-07, "loss": 0.3192, "step": 15529 }, { "epoch": 0.7503502923128956, "grad_norm": 12.364694595336914, "learning_rate": 2.496497076871044e-07, "loss": 0.2937, "step": 15530 }, { "epoch": 0.7503986084939847, "grad_norm": 2.6667659282684326, "learning_rate": 2.4960139150601534e-07, "loss": 0.3113, "step": 15531 }, { "epoch": 0.7504469246750737, "grad_norm": 3.0542171001434326, "learning_rate": 2.4955307532492633e-07, "loss": 0.3319, "step": 15532 }, { "epoch": 0.7504952408561627, "grad_norm": 2.750791072845459, "learning_rate": 2.4950475914383727e-07, "loss": 0.2941, "step": 15533 }, { "epoch": 0.7505435570372517, "grad_norm": 3.0609097480773926, "learning_rate": 2.494564429627482e-07, "loss": 0.3308, "step": 15534 }, { "epoch": 0.7505918732183409, "grad_norm": 6.044008255004883, "learning_rate": 2.494081267816592e-07, "loss": 0.2587, "step": 15535 }, { "epoch": 0.7506401893994299, "grad_norm": 2.642690420150757, "learning_rate": 2.4935981060057013e-07, "loss": 0.3218, "step": 15536 }, { "epoch": 0.7506885055805189, "grad_norm": 4.206795692443848, "learning_rate": 2.4931149441948107e-07, "loss": 0.2859, "step": 15537 }, { "epoch": 0.7507368217616079, "grad_norm": 2.813185214996338, "learning_rate": 2.49263178238392e-07, "loss": 0.355, "step": 15538 }, { "epoch": 0.750785137942697, "grad_norm": 2.2800133228302, "learning_rate": 2.4921486205730294e-07, "loss": 0.1709, "step": 15539 }, { "epoch": 0.7508334541237861, "grad_norm": 3.18991756439209, "learning_rate": 2.4916654587621393e-07, "loss": 0.2777, "step": 15540 }, { "epoch": 0.7508817703048751, "grad_norm": 3.00148868560791, "learning_rate": 2.4911822969512487e-07, "loss": 0.2039, "step": 15541 }, { "epoch": 0.7509300864859642, "grad_norm": 2.569978952407837, "learning_rate": 2.490699135140358e-07, "loss": 0.2037, "step": 15542 }, { "epoch": 0.7509784026670532, "grad_norm": 4.900729656219482, "learning_rate": 2.490215973329468e-07, "loss": 0.3038, "step": 15543 }, { "epoch": 0.7510267188481422, "grad_norm": 2.948497772216797, "learning_rate": 2.4897328115185774e-07, "loss": 0.34, "step": 15544 }, { "epoch": 0.7510750350292313, "grad_norm": 2.19736385345459, "learning_rate": 2.4892496497076873e-07, "loss": 0.2371, "step": 15545 }, { "epoch": 0.7511233512103204, "grad_norm": 5.044500827789307, "learning_rate": 2.4887664878967966e-07, "loss": 0.1696, "step": 15546 }, { "epoch": 0.7511716673914094, "grad_norm": 2.318260669708252, "learning_rate": 2.488283326085906e-07, "loss": 0.3032, "step": 15547 }, { "epoch": 0.7512199835724984, "grad_norm": 2.067134380340576, "learning_rate": 2.487800164275016e-07, "loss": 0.2226, "step": 15548 }, { "epoch": 0.7512682997535874, "grad_norm": 2.134644031524658, "learning_rate": 2.4873170024641253e-07, "loss": 0.2375, "step": 15549 }, { "epoch": 0.7513166159346765, "grad_norm": 2.084975004196167, "learning_rate": 2.4868338406532347e-07, "loss": 0.2256, "step": 15550 }, { "epoch": 0.7513649321157656, "grad_norm": 3.1420841217041016, "learning_rate": 2.486350678842344e-07, "loss": 0.323, "step": 15551 }, { "epoch": 0.7514132482968546, "grad_norm": 2.7271387577056885, "learning_rate": 2.4858675170314534e-07, "loss": 0.1757, "step": 15552 }, { "epoch": 0.7514615644779437, "grad_norm": 3.141486644744873, "learning_rate": 2.4853843552205633e-07, "loss": 0.3503, "step": 15553 }, { "epoch": 0.7515098806590327, "grad_norm": 2.485976457595825, "learning_rate": 2.4849011934096727e-07, "loss": 0.2759, "step": 15554 }, { "epoch": 0.7515581968401217, "grad_norm": 2.86759090423584, "learning_rate": 2.484418031598782e-07, "loss": 0.272, "step": 15555 }, { "epoch": 0.7516065130212108, "grad_norm": 2.7951362133026123, "learning_rate": 2.483934869787892e-07, "loss": 0.3219, "step": 15556 }, { "epoch": 0.7516548292022999, "grad_norm": 15.117897033691406, "learning_rate": 2.4834517079770013e-07, "loss": 0.2143, "step": 15557 }, { "epoch": 0.7517031453833889, "grad_norm": 2.3033981323242188, "learning_rate": 2.4829685461661107e-07, "loss": 0.2491, "step": 15558 }, { "epoch": 0.7517514615644779, "grad_norm": 2.586010217666626, "learning_rate": 2.4824853843552206e-07, "loss": 0.3615, "step": 15559 }, { "epoch": 0.7517997777455669, "grad_norm": 3.1001780033111572, "learning_rate": 2.48200222254433e-07, "loss": 0.4394, "step": 15560 }, { "epoch": 0.7518480939266561, "grad_norm": 2.3174257278442383, "learning_rate": 2.48151906073344e-07, "loss": 0.2069, "step": 15561 }, { "epoch": 0.7518964101077451, "grad_norm": 1.712511658668518, "learning_rate": 2.481035898922549e-07, "loss": 0.2009, "step": 15562 }, { "epoch": 0.7519447262888341, "grad_norm": 3.2051661014556885, "learning_rate": 2.4805527371116586e-07, "loss": 0.3508, "step": 15563 }, { "epoch": 0.7519930424699232, "grad_norm": 3.877596139907837, "learning_rate": 2.480069575300768e-07, "loss": 0.2899, "step": 15564 }, { "epoch": 0.7520413586510122, "grad_norm": 4.534409523010254, "learning_rate": 2.4795864134898774e-07, "loss": 0.3215, "step": 15565 }, { "epoch": 0.7520896748321013, "grad_norm": 2.746150255203247, "learning_rate": 2.4791032516789873e-07, "loss": 0.2765, "step": 15566 }, { "epoch": 0.7521379910131903, "grad_norm": 3.3429672718048096, "learning_rate": 2.4786200898680966e-07, "loss": 0.5023, "step": 15567 }, { "epoch": 0.7521863071942794, "grad_norm": 2.45206618309021, "learning_rate": 2.478136928057206e-07, "loss": 0.3274, "step": 15568 }, { "epoch": 0.7522346233753684, "grad_norm": 3.041965961456299, "learning_rate": 2.477653766246316e-07, "loss": 0.3839, "step": 15569 }, { "epoch": 0.7522829395564574, "grad_norm": 2.68404483795166, "learning_rate": 2.4771706044354253e-07, "loss": 0.2775, "step": 15570 }, { "epoch": 0.7523312557375466, "grad_norm": 2.3992621898651123, "learning_rate": 2.4766874426245347e-07, "loss": 0.2167, "step": 15571 }, { "epoch": 0.7523795719186356, "grad_norm": 2.389420747756958, "learning_rate": 2.4762042808136446e-07, "loss": 0.2953, "step": 15572 }, { "epoch": 0.7524278880997246, "grad_norm": 2.7853634357452393, "learning_rate": 2.475721119002754e-07, "loss": 0.2665, "step": 15573 }, { "epoch": 0.7524762042808136, "grad_norm": 2.3476057052612305, "learning_rate": 2.4752379571918633e-07, "loss": 0.2482, "step": 15574 }, { "epoch": 0.7525245204619027, "grad_norm": 2.8016750812530518, "learning_rate": 2.474754795380973e-07, "loss": 0.3835, "step": 15575 }, { "epoch": 0.7525728366429917, "grad_norm": 3.812717914581299, "learning_rate": 2.4742716335700826e-07, "loss": 0.3451, "step": 15576 }, { "epoch": 0.7526211528240808, "grad_norm": 2.3903822898864746, "learning_rate": 2.473788471759192e-07, "loss": 0.2805, "step": 15577 }, { "epoch": 0.7526694690051698, "grad_norm": 1.9179680347442627, "learning_rate": 2.4733053099483013e-07, "loss": 0.2158, "step": 15578 }, { "epoch": 0.7527177851862589, "grad_norm": 2.3848555088043213, "learning_rate": 2.472822148137411e-07, "loss": 0.3012, "step": 15579 }, { "epoch": 0.7527661013673479, "grad_norm": 3.3539016246795654, "learning_rate": 2.4723389863265206e-07, "loss": 0.2959, "step": 15580 }, { "epoch": 0.7528144175484369, "grad_norm": 3.5717179775238037, "learning_rate": 2.47185582451563e-07, "loss": 0.4028, "step": 15581 }, { "epoch": 0.7528627337295261, "grad_norm": 3.1434481143951416, "learning_rate": 2.4713726627047394e-07, "loss": 0.2071, "step": 15582 }, { "epoch": 0.7529110499106151, "grad_norm": 3.2120206356048584, "learning_rate": 2.470889500893849e-07, "loss": 0.2883, "step": 15583 }, { "epoch": 0.7529593660917041, "grad_norm": 2.0684382915496826, "learning_rate": 2.4704063390829586e-07, "loss": 0.1951, "step": 15584 }, { "epoch": 0.7530076822727931, "grad_norm": 3.162090301513672, "learning_rate": 2.4699231772720685e-07, "loss": 0.429, "step": 15585 }, { "epoch": 0.7530559984538822, "grad_norm": 2.171095609664917, "learning_rate": 2.469440015461178e-07, "loss": 0.1982, "step": 15586 }, { "epoch": 0.7531043146349713, "grad_norm": 3.4047350883483887, "learning_rate": 2.4689568536502873e-07, "loss": 0.3704, "step": 15587 }, { "epoch": 0.7531526308160603, "grad_norm": 4.6331281661987305, "learning_rate": 2.468473691839397e-07, "loss": 0.4313, "step": 15588 }, { "epoch": 0.7532009469971493, "grad_norm": 2.793104887008667, "learning_rate": 2.4679905300285066e-07, "loss": 0.1708, "step": 15589 }, { "epoch": 0.7532492631782384, "grad_norm": 7.2459397315979, "learning_rate": 2.467507368217616e-07, "loss": 0.3383, "step": 15590 }, { "epoch": 0.7532975793593274, "grad_norm": 2.3612453937530518, "learning_rate": 2.4670242064067253e-07, "loss": 0.2348, "step": 15591 }, { "epoch": 0.7533458955404165, "grad_norm": 1.9544363021850586, "learning_rate": 2.466541044595835e-07, "loss": 0.2248, "step": 15592 }, { "epoch": 0.7533942117215056, "grad_norm": 2.9895763397216797, "learning_rate": 2.4660578827849446e-07, "loss": 0.3215, "step": 15593 }, { "epoch": 0.7534425279025946, "grad_norm": 2.429036855697632, "learning_rate": 2.465574720974054e-07, "loss": 0.2069, "step": 15594 }, { "epoch": 0.7534908440836836, "grad_norm": 2.594088315963745, "learning_rate": 2.4650915591631633e-07, "loss": 0.3431, "step": 15595 }, { "epoch": 0.7535391602647726, "grad_norm": 2.0934946537017822, "learning_rate": 2.464608397352273e-07, "loss": 0.2528, "step": 15596 }, { "epoch": 0.7535874764458618, "grad_norm": 4.122930526733398, "learning_rate": 2.4641252355413826e-07, "loss": 0.2548, "step": 15597 }, { "epoch": 0.7536357926269508, "grad_norm": 3.3825647830963135, "learning_rate": 2.463642073730492e-07, "loss": 0.2851, "step": 15598 }, { "epoch": 0.7536841088080398, "grad_norm": 2.767523765563965, "learning_rate": 2.463158911919602e-07, "loss": 0.3171, "step": 15599 }, { "epoch": 0.7537324249891288, "grad_norm": 2.5433902740478516, "learning_rate": 2.462675750108711e-07, "loss": 0.1549, "step": 15600 }, { "epoch": 0.7537807411702179, "grad_norm": 2.934197425842285, "learning_rate": 2.462192588297821e-07, "loss": 0.3418, "step": 15601 }, { "epoch": 0.753829057351307, "grad_norm": 4.7762651443481445, "learning_rate": 2.4617094264869305e-07, "loss": 0.2482, "step": 15602 }, { "epoch": 0.753877373532396, "grad_norm": 2.3789169788360596, "learning_rate": 2.46122626467604e-07, "loss": 0.1503, "step": 15603 }, { "epoch": 0.7539256897134851, "grad_norm": 3.2779881954193115, "learning_rate": 2.460743102865149e-07, "loss": 0.2491, "step": 15604 }, { "epoch": 0.7539740058945741, "grad_norm": 2.1705482006073, "learning_rate": 2.460259941054259e-07, "loss": 0.2745, "step": 15605 }, { "epoch": 0.7540223220756631, "grad_norm": 2.0624337196350098, "learning_rate": 2.4597767792433685e-07, "loss": 0.2103, "step": 15606 }, { "epoch": 0.7540706382567521, "grad_norm": 2.9225146770477295, "learning_rate": 2.459293617432478e-07, "loss": 0.336, "step": 15607 }, { "epoch": 0.7541189544378413, "grad_norm": 1.9866570234298706, "learning_rate": 2.4588104556215873e-07, "loss": 0.1768, "step": 15608 }, { "epoch": 0.7541672706189303, "grad_norm": 2.777559995651245, "learning_rate": 2.458327293810697e-07, "loss": 0.2708, "step": 15609 }, { "epoch": 0.7542155868000193, "grad_norm": 4.612911701202393, "learning_rate": 2.4578441319998066e-07, "loss": 0.2913, "step": 15610 }, { "epoch": 0.7542639029811083, "grad_norm": 2.2448434829711914, "learning_rate": 2.457360970188916e-07, "loss": 0.2207, "step": 15611 }, { "epoch": 0.7543122191621974, "grad_norm": 3.9719815254211426, "learning_rate": 2.456877808378026e-07, "loss": 0.2652, "step": 15612 }, { "epoch": 0.7543605353432865, "grad_norm": 2.515760660171509, "learning_rate": 2.456394646567135e-07, "loss": 0.3305, "step": 15613 }, { "epoch": 0.7544088515243755, "grad_norm": 3.267805576324463, "learning_rate": 2.4559114847562446e-07, "loss": 0.3577, "step": 15614 }, { "epoch": 0.7544571677054646, "grad_norm": 3.0314083099365234, "learning_rate": 2.4554283229453545e-07, "loss": 0.2906, "step": 15615 }, { "epoch": 0.7545054838865536, "grad_norm": 2.0923614501953125, "learning_rate": 2.454945161134464e-07, "loss": 0.2081, "step": 15616 }, { "epoch": 0.7545538000676426, "grad_norm": 5.5018630027771, "learning_rate": 2.454461999323573e-07, "loss": 0.3508, "step": 15617 }, { "epoch": 0.7546021162487317, "grad_norm": 1.7351549863815308, "learning_rate": 2.453978837512683e-07, "loss": 0.2041, "step": 15618 }, { "epoch": 0.7546504324298208, "grad_norm": 2.3845512866973877, "learning_rate": 2.4534956757017925e-07, "loss": 0.2254, "step": 15619 }, { "epoch": 0.7546987486109098, "grad_norm": 2.3344929218292236, "learning_rate": 2.453012513890902e-07, "loss": 0.3224, "step": 15620 }, { "epoch": 0.7547470647919988, "grad_norm": 2.949974298477173, "learning_rate": 2.452529352080011e-07, "loss": 0.3624, "step": 15621 }, { "epoch": 0.7547953809730878, "grad_norm": 3.832383394241333, "learning_rate": 2.452046190269121e-07, "loss": 0.2667, "step": 15622 }, { "epoch": 0.754843697154177, "grad_norm": 2.9204652309417725, "learning_rate": 2.4515630284582305e-07, "loss": 0.2452, "step": 15623 }, { "epoch": 0.754892013335266, "grad_norm": 81.229248046875, "learning_rate": 2.45107986664734e-07, "loss": 0.2576, "step": 15624 }, { "epoch": 0.754940329516355, "grad_norm": 2.262132167816162, "learning_rate": 2.45059670483645e-07, "loss": 0.2756, "step": 15625 }, { "epoch": 0.7549886456974441, "grad_norm": 5.582147121429443, "learning_rate": 2.450113543025559e-07, "loss": 0.4111, "step": 15626 }, { "epoch": 0.7550369618785331, "grad_norm": 4.945033073425293, "learning_rate": 2.4496303812146685e-07, "loss": 0.415, "step": 15627 }, { "epoch": 0.7550852780596222, "grad_norm": 2.2988877296447754, "learning_rate": 2.4491472194037784e-07, "loss": 0.2801, "step": 15628 }, { "epoch": 0.7551335942407112, "grad_norm": 2.6772027015686035, "learning_rate": 2.448664057592888e-07, "loss": 0.2872, "step": 15629 }, { "epoch": 0.7551819104218003, "grad_norm": 2.15573787689209, "learning_rate": 2.448180895781997e-07, "loss": 0.2167, "step": 15630 }, { "epoch": 0.7552302266028893, "grad_norm": 2.9006214141845703, "learning_rate": 2.447697733971107e-07, "loss": 0.2837, "step": 15631 }, { "epoch": 0.7552785427839783, "grad_norm": 3.0840258598327637, "learning_rate": 2.4472145721602165e-07, "loss": 0.4751, "step": 15632 }, { "epoch": 0.7553268589650673, "grad_norm": 3.770164966583252, "learning_rate": 2.446731410349326e-07, "loss": 0.4647, "step": 15633 }, { "epoch": 0.7553751751461565, "grad_norm": 2.1306605339050293, "learning_rate": 2.446248248538435e-07, "loss": 0.1963, "step": 15634 }, { "epoch": 0.7554234913272455, "grad_norm": 2.263723611831665, "learning_rate": 2.445765086727545e-07, "loss": 0.2256, "step": 15635 }, { "epoch": 0.7554718075083345, "grad_norm": 2.501373291015625, "learning_rate": 2.4452819249166545e-07, "loss": 0.3085, "step": 15636 }, { "epoch": 0.7555201236894236, "grad_norm": 2.8562324047088623, "learning_rate": 2.444798763105764e-07, "loss": 0.3031, "step": 15637 }, { "epoch": 0.7555684398705126, "grad_norm": 17.902132034301758, "learning_rate": 2.444315601294873e-07, "loss": 0.5064, "step": 15638 }, { "epoch": 0.7556167560516017, "grad_norm": 2.243633985519409, "learning_rate": 2.443832439483983e-07, "loss": 0.2433, "step": 15639 }, { "epoch": 0.7556650722326907, "grad_norm": 2.649064064025879, "learning_rate": 2.4433492776730925e-07, "loss": 0.3757, "step": 15640 }, { "epoch": 0.7557133884137798, "grad_norm": 2.775987148284912, "learning_rate": 2.4428661158622024e-07, "loss": 0.391, "step": 15641 }, { "epoch": 0.7557617045948688, "grad_norm": 2.4934146404266357, "learning_rate": 2.442382954051312e-07, "loss": 0.2941, "step": 15642 }, { "epoch": 0.7558100207759578, "grad_norm": 2.0368664264678955, "learning_rate": 2.441899792240421e-07, "loss": 0.2073, "step": 15643 }, { "epoch": 0.755858336957047, "grad_norm": 2.6937334537506104, "learning_rate": 2.441416630429531e-07, "loss": 0.2643, "step": 15644 }, { "epoch": 0.755906653138136, "grad_norm": 2.5522172451019287, "learning_rate": 2.4409334686186404e-07, "loss": 0.3326, "step": 15645 }, { "epoch": 0.755954969319225, "grad_norm": 2.944263458251953, "learning_rate": 2.44045030680775e-07, "loss": 0.2505, "step": 15646 }, { "epoch": 0.756003285500314, "grad_norm": 2.392017364501953, "learning_rate": 2.439967144996859e-07, "loss": 0.2841, "step": 15647 }, { "epoch": 0.7560516016814031, "grad_norm": 1.952552080154419, "learning_rate": 2.439483983185969e-07, "loss": 0.2492, "step": 15648 }, { "epoch": 0.7560999178624922, "grad_norm": 3.0595407485961914, "learning_rate": 2.4390008213750784e-07, "loss": 0.4555, "step": 15649 }, { "epoch": 0.7561482340435812, "grad_norm": 2.077522039413452, "learning_rate": 2.438517659564188e-07, "loss": 0.2244, "step": 15650 }, { "epoch": 0.7561965502246702, "grad_norm": 2.1676206588745117, "learning_rate": 2.438034497753297e-07, "loss": 0.2419, "step": 15651 }, { "epoch": 0.7562448664057593, "grad_norm": 3.255984306335449, "learning_rate": 2.437551335942407e-07, "loss": 0.238, "step": 15652 }, { "epoch": 0.7562931825868483, "grad_norm": 7.661678314208984, "learning_rate": 2.4370681741315165e-07, "loss": 0.2085, "step": 15653 }, { "epoch": 0.7563414987679374, "grad_norm": 1.760738730430603, "learning_rate": 2.436585012320626e-07, "loss": 0.1796, "step": 15654 }, { "epoch": 0.7563898149490265, "grad_norm": 7.147261142730713, "learning_rate": 2.436101850509736e-07, "loss": 0.2001, "step": 15655 }, { "epoch": 0.7564381311301155, "grad_norm": 11.577556610107422, "learning_rate": 2.435618688698845e-07, "loss": 0.3231, "step": 15656 }, { "epoch": 0.7564864473112045, "grad_norm": 1.9926984310150146, "learning_rate": 2.435135526887955e-07, "loss": 0.1444, "step": 15657 }, { "epoch": 0.7565347634922935, "grad_norm": 2.2592198848724365, "learning_rate": 2.4346523650770644e-07, "loss": 0.2173, "step": 15658 }, { "epoch": 0.7565830796733826, "grad_norm": 2.8361377716064453, "learning_rate": 2.434169203266174e-07, "loss": 0.383, "step": 15659 }, { "epoch": 0.7566313958544717, "grad_norm": 4.076650619506836, "learning_rate": 2.433686041455283e-07, "loss": 0.3297, "step": 15660 }, { "epoch": 0.7566797120355607, "grad_norm": 2.1141583919525146, "learning_rate": 2.433202879644393e-07, "loss": 0.2436, "step": 15661 }, { "epoch": 0.7567280282166498, "grad_norm": 2.4996211528778076, "learning_rate": 2.4327197178335024e-07, "loss": 0.2879, "step": 15662 }, { "epoch": 0.7567763443977388, "grad_norm": 2.7954044342041016, "learning_rate": 2.432236556022612e-07, "loss": 0.3023, "step": 15663 }, { "epoch": 0.7568246605788278, "grad_norm": 2.9187631607055664, "learning_rate": 2.431753394211721e-07, "loss": 0.3413, "step": 15664 }, { "epoch": 0.7568729767599169, "grad_norm": 2.2266085147857666, "learning_rate": 2.4312702324008305e-07, "loss": 0.2734, "step": 15665 }, { "epoch": 0.756921292941006, "grad_norm": 2.1917803287506104, "learning_rate": 2.4307870705899404e-07, "loss": 0.2717, "step": 15666 }, { "epoch": 0.756969609122095, "grad_norm": 3.9859554767608643, "learning_rate": 2.43030390877905e-07, "loss": 0.2264, "step": 15667 }, { "epoch": 0.757017925303184, "grad_norm": 2.1883444786071777, "learning_rate": 2.4298207469681597e-07, "loss": 0.2539, "step": 15668 }, { "epoch": 0.757066241484273, "grad_norm": 2.5560882091522217, "learning_rate": 2.429337585157269e-07, "loss": 0.3508, "step": 15669 }, { "epoch": 0.7571145576653622, "grad_norm": 4.468620300292969, "learning_rate": 2.4288544233463785e-07, "loss": 0.3223, "step": 15670 }, { "epoch": 0.7571628738464512, "grad_norm": 3.147071599960327, "learning_rate": 2.4283712615354884e-07, "loss": 0.3617, "step": 15671 }, { "epoch": 0.7572111900275402, "grad_norm": 4.93127965927124, "learning_rate": 2.4278880997245977e-07, "loss": 0.3105, "step": 15672 }, { "epoch": 0.7572595062086293, "grad_norm": 3.3073854446411133, "learning_rate": 2.427404937913707e-07, "loss": 0.3716, "step": 15673 }, { "epoch": 0.7573078223897183, "grad_norm": 13.091205596923828, "learning_rate": 2.426921776102817e-07, "loss": 0.263, "step": 15674 }, { "epoch": 0.7573561385708074, "grad_norm": 2.296743631362915, "learning_rate": 2.4264386142919264e-07, "loss": 0.2401, "step": 15675 }, { "epoch": 0.7574044547518964, "grad_norm": 2.2053158283233643, "learning_rate": 2.425955452481036e-07, "loss": 0.2205, "step": 15676 }, { "epoch": 0.7574527709329855, "grad_norm": 2.716578960418701, "learning_rate": 2.425472290670145e-07, "loss": 0.2383, "step": 15677 }, { "epoch": 0.7575010871140745, "grad_norm": 7.7921037673950195, "learning_rate": 2.4249891288592545e-07, "loss": 0.3976, "step": 15678 }, { "epoch": 0.7575494032951635, "grad_norm": 2.732358932495117, "learning_rate": 2.4245059670483644e-07, "loss": 0.344, "step": 15679 }, { "epoch": 0.7575977194762527, "grad_norm": 1.8148643970489502, "learning_rate": 2.424022805237474e-07, "loss": 0.1717, "step": 15680 }, { "epoch": 0.7576460356573417, "grad_norm": 2.1399009227752686, "learning_rate": 2.423539643426583e-07, "loss": 0.2987, "step": 15681 }, { "epoch": 0.7576943518384307, "grad_norm": 3.988849401473999, "learning_rate": 2.423056481615693e-07, "loss": 0.2415, "step": 15682 }, { "epoch": 0.7577426680195197, "grad_norm": 2.6502387523651123, "learning_rate": 2.4225733198048024e-07, "loss": 0.3131, "step": 15683 }, { "epoch": 0.7577909842006088, "grad_norm": 2.6974451541900635, "learning_rate": 2.4220901579939123e-07, "loss": 0.3384, "step": 15684 }, { "epoch": 0.7578393003816978, "grad_norm": 2.8953781127929688, "learning_rate": 2.4216069961830217e-07, "loss": 0.3823, "step": 15685 }, { "epoch": 0.7578876165627869, "grad_norm": 2.3700146675109863, "learning_rate": 2.421123834372131e-07, "loss": 0.2516, "step": 15686 }, { "epoch": 0.7579359327438759, "grad_norm": 2.439899206161499, "learning_rate": 2.420640672561241e-07, "loss": 0.2633, "step": 15687 }, { "epoch": 0.757984248924965, "grad_norm": 2.5990395545959473, "learning_rate": 2.4201575107503503e-07, "loss": 0.332, "step": 15688 }, { "epoch": 0.758032565106054, "grad_norm": 3.6832938194274902, "learning_rate": 2.4196743489394597e-07, "loss": 0.3348, "step": 15689 }, { "epoch": 0.758080881287143, "grad_norm": 2.2233662605285645, "learning_rate": 2.419191187128569e-07, "loss": 0.235, "step": 15690 }, { "epoch": 0.7581291974682322, "grad_norm": 3.782175302505493, "learning_rate": 2.4187080253176785e-07, "loss": 0.321, "step": 15691 }, { "epoch": 0.7581775136493212, "grad_norm": 2.2897191047668457, "learning_rate": 2.4182248635067884e-07, "loss": 0.2785, "step": 15692 }, { "epoch": 0.7582258298304102, "grad_norm": 2.51996111869812, "learning_rate": 2.4177417016958977e-07, "loss": 0.3117, "step": 15693 }, { "epoch": 0.7582741460114992, "grad_norm": 2.8112258911132812, "learning_rate": 2.417258539885007e-07, "loss": 0.2981, "step": 15694 }, { "epoch": 0.7583224621925883, "grad_norm": 12.590815544128418, "learning_rate": 2.416775378074117e-07, "loss": 0.3265, "step": 15695 }, { "epoch": 0.7583707783736774, "grad_norm": 2.448699474334717, "learning_rate": 2.4162922162632264e-07, "loss": 0.3024, "step": 15696 }, { "epoch": 0.7584190945547664, "grad_norm": 2.5103838443756104, "learning_rate": 2.415809054452336e-07, "loss": 0.3236, "step": 15697 }, { "epoch": 0.7584674107358554, "grad_norm": 2.9497551918029785, "learning_rate": 2.4153258926414457e-07, "loss": 0.508, "step": 15698 }, { "epoch": 0.7585157269169445, "grad_norm": 2.9244112968444824, "learning_rate": 2.414842730830555e-07, "loss": 0.2525, "step": 15699 }, { "epoch": 0.7585640430980335, "grad_norm": 7.221355438232422, "learning_rate": 2.414359569019665e-07, "loss": 0.3312, "step": 15700 }, { "epoch": 0.7586123592791226, "grad_norm": 1.8411426544189453, "learning_rate": 2.4138764072087743e-07, "loss": 0.2518, "step": 15701 }, { "epoch": 0.7586606754602117, "grad_norm": 2.979757785797119, "learning_rate": 2.4133932453978837e-07, "loss": 0.2579, "step": 15702 }, { "epoch": 0.7587089916413007, "grad_norm": 2.0605485439300537, "learning_rate": 2.412910083586993e-07, "loss": 0.2069, "step": 15703 }, { "epoch": 0.7587573078223897, "grad_norm": 1.9575296640396118, "learning_rate": 2.4124269217761024e-07, "loss": 0.1892, "step": 15704 }, { "epoch": 0.7588056240034787, "grad_norm": 6.455848217010498, "learning_rate": 2.4119437599652123e-07, "loss": 0.2812, "step": 15705 }, { "epoch": 0.7588539401845679, "grad_norm": 2.2233481407165527, "learning_rate": 2.4114605981543217e-07, "loss": 0.2339, "step": 15706 }, { "epoch": 0.7589022563656569, "grad_norm": 1.7594482898712158, "learning_rate": 2.410977436343431e-07, "loss": 0.206, "step": 15707 }, { "epoch": 0.7589505725467459, "grad_norm": 2.0921106338500977, "learning_rate": 2.410494274532541e-07, "loss": 0.2235, "step": 15708 }, { "epoch": 0.7589988887278349, "grad_norm": 3.541036605834961, "learning_rate": 2.4100111127216503e-07, "loss": 0.2563, "step": 15709 }, { "epoch": 0.759047204908924, "grad_norm": 2.1202402114868164, "learning_rate": 2.4095279509107597e-07, "loss": 0.2302, "step": 15710 }, { "epoch": 0.759095521090013, "grad_norm": 2.1357553005218506, "learning_rate": 2.4090447890998696e-07, "loss": 0.1672, "step": 15711 }, { "epoch": 0.7591438372711021, "grad_norm": 8.23405933380127, "learning_rate": 2.408561627288979e-07, "loss": 0.3764, "step": 15712 }, { "epoch": 0.7591921534521912, "grad_norm": 2.568009853363037, "learning_rate": 2.408078465478089e-07, "loss": 0.2446, "step": 15713 }, { "epoch": 0.7592404696332802, "grad_norm": 3.672689437866211, "learning_rate": 2.407595303667198e-07, "loss": 0.3333, "step": 15714 }, { "epoch": 0.7592887858143692, "grad_norm": 3.062556028366089, "learning_rate": 2.4071121418563076e-07, "loss": 0.3491, "step": 15715 }, { "epoch": 0.7593371019954582, "grad_norm": 2.9367878437042236, "learning_rate": 2.406628980045417e-07, "loss": 0.3677, "step": 15716 }, { "epoch": 0.7593854181765474, "grad_norm": 1.4489970207214355, "learning_rate": 2.4061458182345264e-07, "loss": 0.1536, "step": 15717 }, { "epoch": 0.7594337343576364, "grad_norm": 3.0704357624053955, "learning_rate": 2.4056626564236363e-07, "loss": 0.383, "step": 15718 }, { "epoch": 0.7594820505387254, "grad_norm": 3.013915538787842, "learning_rate": 2.4051794946127457e-07, "loss": 0.1603, "step": 15719 }, { "epoch": 0.7595303667198144, "grad_norm": 2.4215967655181885, "learning_rate": 2.404696332801855e-07, "loss": 0.2839, "step": 15720 }, { "epoch": 0.7595786829009035, "grad_norm": 2.720928192138672, "learning_rate": 2.4042131709909644e-07, "loss": 0.2693, "step": 15721 }, { "epoch": 0.7596269990819926, "grad_norm": 2.2018048763275146, "learning_rate": 2.4037300091800743e-07, "loss": 0.2377, "step": 15722 }, { "epoch": 0.7596753152630816, "grad_norm": 8.322991371154785, "learning_rate": 2.4032468473691837e-07, "loss": 0.4513, "step": 15723 }, { "epoch": 0.7597236314441707, "grad_norm": 1.680925965309143, "learning_rate": 2.4027636855582936e-07, "loss": 0.1628, "step": 15724 }, { "epoch": 0.7597719476252597, "grad_norm": 3.411843776702881, "learning_rate": 2.402280523747403e-07, "loss": 0.3184, "step": 15725 }, { "epoch": 0.7598202638063487, "grad_norm": 2.1897025108337402, "learning_rate": 2.4017973619365123e-07, "loss": 0.3178, "step": 15726 }, { "epoch": 0.7598685799874378, "grad_norm": 3.057061195373535, "learning_rate": 2.401314200125622e-07, "loss": 0.2907, "step": 15727 }, { "epoch": 0.7599168961685269, "grad_norm": 2.4202747344970703, "learning_rate": 2.4008310383147316e-07, "loss": 0.25, "step": 15728 }, { "epoch": 0.7599652123496159, "grad_norm": 4.004024982452393, "learning_rate": 2.400347876503841e-07, "loss": 0.3484, "step": 15729 }, { "epoch": 0.7600135285307049, "grad_norm": 2.7959144115448, "learning_rate": 2.3998647146929503e-07, "loss": 0.4403, "step": 15730 }, { "epoch": 0.7600618447117939, "grad_norm": 2.806215524673462, "learning_rate": 2.39938155288206e-07, "loss": 0.3485, "step": 15731 }, { "epoch": 0.7601101608928831, "grad_norm": 6.823509216308594, "learning_rate": 2.3988983910711696e-07, "loss": 0.4129, "step": 15732 }, { "epoch": 0.7601584770739721, "grad_norm": 2.5749645233154297, "learning_rate": 2.398415229260279e-07, "loss": 0.2929, "step": 15733 }, { "epoch": 0.7602067932550611, "grad_norm": 2.68282151222229, "learning_rate": 2.3979320674493884e-07, "loss": 0.289, "step": 15734 }, { "epoch": 0.7602551094361502, "grad_norm": 2.034043550491333, "learning_rate": 2.3974489056384983e-07, "loss": 0.2177, "step": 15735 }, { "epoch": 0.7603034256172392, "grad_norm": 2.9776384830474854, "learning_rate": 2.3969657438276076e-07, "loss": 0.2756, "step": 15736 }, { "epoch": 0.7603517417983282, "grad_norm": 3.2032523155212402, "learning_rate": 2.396482582016717e-07, "loss": 0.3804, "step": 15737 }, { "epoch": 0.7604000579794173, "grad_norm": 3.6325273513793945, "learning_rate": 2.395999420205827e-07, "loss": 0.3349, "step": 15738 }, { "epoch": 0.7604483741605064, "grad_norm": 2.513031482696533, "learning_rate": 2.3955162583949363e-07, "loss": 0.212, "step": 15739 }, { "epoch": 0.7604966903415954, "grad_norm": 2.5582146644592285, "learning_rate": 2.395033096584046e-07, "loss": 0.3151, "step": 15740 }, { "epoch": 0.7605450065226844, "grad_norm": 2.8609890937805176, "learning_rate": 2.3945499347731556e-07, "loss": 0.316, "step": 15741 }, { "epoch": 0.7605933227037734, "grad_norm": 3.4139153957366943, "learning_rate": 2.394066772962265e-07, "loss": 0.382, "step": 15742 }, { "epoch": 0.7606416388848626, "grad_norm": 4.519085884094238, "learning_rate": 2.3935836111513743e-07, "loss": 0.3459, "step": 15743 }, { "epoch": 0.7606899550659516, "grad_norm": 3.1970949172973633, "learning_rate": 2.393100449340484e-07, "loss": 0.3788, "step": 15744 }, { "epoch": 0.7607382712470406, "grad_norm": 3.512390613555908, "learning_rate": 2.3926172875295936e-07, "loss": 0.297, "step": 15745 }, { "epoch": 0.7607865874281297, "grad_norm": 2.4687657356262207, "learning_rate": 2.392134125718703e-07, "loss": 0.276, "step": 15746 }, { "epoch": 0.7608349036092187, "grad_norm": 1.8342607021331787, "learning_rate": 2.3916509639078123e-07, "loss": 0.1959, "step": 15747 }, { "epoch": 0.7608832197903078, "grad_norm": 2.9466490745544434, "learning_rate": 2.391167802096922e-07, "loss": 0.3608, "step": 15748 }, { "epoch": 0.7609315359713968, "grad_norm": 11.63241195678711, "learning_rate": 2.3906846402860316e-07, "loss": 0.22, "step": 15749 }, { "epoch": 0.7609798521524859, "grad_norm": 2.924675464630127, "learning_rate": 2.390201478475141e-07, "loss": 0.3498, "step": 15750 }, { "epoch": 0.7610281683335749, "grad_norm": 2.4503626823425293, "learning_rate": 2.389718316664251e-07, "loss": 0.2676, "step": 15751 }, { "epoch": 0.7610764845146639, "grad_norm": 5.3008012771606445, "learning_rate": 2.38923515485336e-07, "loss": 0.3492, "step": 15752 }, { "epoch": 0.7611248006957531, "grad_norm": 4.563323497772217, "learning_rate": 2.3887519930424696e-07, "loss": 0.3408, "step": 15753 }, { "epoch": 0.7611731168768421, "grad_norm": 1.6438695192337036, "learning_rate": 2.3882688312315795e-07, "loss": 0.1938, "step": 15754 }, { "epoch": 0.7612214330579311, "grad_norm": 2.630305290222168, "learning_rate": 2.387785669420689e-07, "loss": 0.3476, "step": 15755 }, { "epoch": 0.7612697492390201, "grad_norm": 3.3717291355133057, "learning_rate": 2.3873025076097983e-07, "loss": 0.1488, "step": 15756 }, { "epoch": 0.7613180654201092, "grad_norm": 2.954479694366455, "learning_rate": 2.386819345798908e-07, "loss": 0.2897, "step": 15757 }, { "epoch": 0.7613663816011983, "grad_norm": 2.0509121417999268, "learning_rate": 2.3863361839880175e-07, "loss": 0.2762, "step": 15758 }, { "epoch": 0.7614146977822873, "grad_norm": 1.8939274549484253, "learning_rate": 2.385853022177127e-07, "loss": 0.2111, "step": 15759 }, { "epoch": 0.7614630139633763, "grad_norm": 2.7995450496673584, "learning_rate": 2.3853698603662363e-07, "loss": 0.324, "step": 15760 }, { "epoch": 0.7615113301444654, "grad_norm": 1.6454696655273438, "learning_rate": 2.384886698555346e-07, "loss": 0.1758, "step": 15761 }, { "epoch": 0.7615596463255544, "grad_norm": 3.0640509128570557, "learning_rate": 2.3844035367444556e-07, "loss": 0.1562, "step": 15762 }, { "epoch": 0.7616079625066434, "grad_norm": 1.9696760177612305, "learning_rate": 2.3839203749335652e-07, "loss": 0.2055, "step": 15763 }, { "epoch": 0.7616562786877326, "grad_norm": 25.433082580566406, "learning_rate": 2.3834372131226746e-07, "loss": 0.2771, "step": 15764 }, { "epoch": 0.7617045948688216, "grad_norm": 2.8999595642089844, "learning_rate": 2.3829540513117842e-07, "loss": 0.419, "step": 15765 }, { "epoch": 0.7617529110499106, "grad_norm": 2.102633237838745, "learning_rate": 2.3824708895008939e-07, "loss": 0.2388, "step": 15766 }, { "epoch": 0.7618012272309996, "grad_norm": 2.512270212173462, "learning_rate": 2.3819877276900032e-07, "loss": 0.2789, "step": 15767 }, { "epoch": 0.7618495434120887, "grad_norm": 2.483736276626587, "learning_rate": 2.3815045658791129e-07, "loss": 0.2794, "step": 15768 }, { "epoch": 0.7618978595931778, "grad_norm": 3.863980531692505, "learning_rate": 2.3810214040682222e-07, "loss": 0.3621, "step": 15769 }, { "epoch": 0.7619461757742668, "grad_norm": 3.288280487060547, "learning_rate": 2.380538242257332e-07, "loss": 0.4194, "step": 15770 }, { "epoch": 0.7619944919553558, "grad_norm": 2.5407795906066895, "learning_rate": 2.3800550804464415e-07, "loss": 0.2654, "step": 15771 }, { "epoch": 0.7620428081364449, "grad_norm": 3.4297304153442383, "learning_rate": 2.379571918635551e-07, "loss": 0.3891, "step": 15772 }, { "epoch": 0.7620911243175339, "grad_norm": 2.2448794841766357, "learning_rate": 2.3790887568246603e-07, "loss": 0.2933, "step": 15773 }, { "epoch": 0.762139440498623, "grad_norm": 2.6159021854400635, "learning_rate": 2.3786055950137702e-07, "loss": 0.3411, "step": 15774 }, { "epoch": 0.7621877566797121, "grad_norm": 2.4177443981170654, "learning_rate": 2.3781224332028795e-07, "loss": 0.3059, "step": 15775 }, { "epoch": 0.7622360728608011, "grad_norm": 4.410743236541748, "learning_rate": 2.3776392713919892e-07, "loss": 0.2891, "step": 15776 }, { "epoch": 0.7622843890418901, "grad_norm": 3.20466947555542, "learning_rate": 2.3771561095810985e-07, "loss": 0.3532, "step": 15777 }, { "epoch": 0.7623327052229791, "grad_norm": 3.9013192653656006, "learning_rate": 2.3766729477702082e-07, "loss": 0.4049, "step": 15778 }, { "epoch": 0.7623810214040683, "grad_norm": 7.841189861297607, "learning_rate": 2.3761897859593178e-07, "loss": 0.3375, "step": 15779 }, { "epoch": 0.7624293375851573, "grad_norm": 2.0606801509857178, "learning_rate": 2.3757066241484272e-07, "loss": 0.2042, "step": 15780 }, { "epoch": 0.7624776537662463, "grad_norm": 2.1497795581817627, "learning_rate": 2.3752234623375366e-07, "loss": 0.1854, "step": 15781 }, { "epoch": 0.7625259699473353, "grad_norm": 3.006160020828247, "learning_rate": 2.3747403005266462e-07, "loss": 0.3306, "step": 15782 }, { "epoch": 0.7625742861284244, "grad_norm": 1.8707338571548462, "learning_rate": 2.3742571387157558e-07, "loss": 0.2606, "step": 15783 }, { "epoch": 0.7626226023095135, "grad_norm": 3.3074162006378174, "learning_rate": 2.3737739769048655e-07, "loss": 0.3391, "step": 15784 }, { "epoch": 0.7626709184906025, "grad_norm": 13.467732429504395, "learning_rate": 2.3732908150939748e-07, "loss": 0.2394, "step": 15785 }, { "epoch": 0.7627192346716916, "grad_norm": 4.609264850616455, "learning_rate": 2.3728076532830842e-07, "loss": 0.26, "step": 15786 }, { "epoch": 0.7627675508527806, "grad_norm": 4.134220123291016, "learning_rate": 2.372324491472194e-07, "loss": 0.2889, "step": 15787 }, { "epoch": 0.7628158670338696, "grad_norm": 2.3209643363952637, "learning_rate": 2.3718413296613035e-07, "loss": 0.1411, "step": 15788 }, { "epoch": 0.7628641832149586, "grad_norm": 3.2164204120635986, "learning_rate": 2.371358167850413e-07, "loss": 0.4163, "step": 15789 }, { "epoch": 0.7629124993960478, "grad_norm": 3.059816598892212, "learning_rate": 2.3708750060395225e-07, "loss": 0.2723, "step": 15790 }, { "epoch": 0.7629608155771368, "grad_norm": 2.9049508571624756, "learning_rate": 2.3703918442286321e-07, "loss": 0.4294, "step": 15791 }, { "epoch": 0.7630091317582258, "grad_norm": 3.54577898979187, "learning_rate": 2.3699086824177418e-07, "loss": 0.3633, "step": 15792 }, { "epoch": 0.7630574479393148, "grad_norm": 2.1852378845214844, "learning_rate": 2.3694255206068512e-07, "loss": 0.2125, "step": 15793 }, { "epoch": 0.7631057641204039, "grad_norm": 1.8139517307281494, "learning_rate": 2.3689423587959605e-07, "loss": 0.2258, "step": 15794 }, { "epoch": 0.763154080301493, "grad_norm": 2.4807186126708984, "learning_rate": 2.3684591969850702e-07, "loss": 0.2771, "step": 15795 }, { "epoch": 0.763202396482582, "grad_norm": 2.248588800430298, "learning_rate": 2.3679760351741798e-07, "loss": 0.338, "step": 15796 }, { "epoch": 0.7632507126636711, "grad_norm": 3.467982053756714, "learning_rate": 2.3674928733632892e-07, "loss": 0.2606, "step": 15797 }, { "epoch": 0.7632990288447601, "grad_norm": 2.508877992630005, "learning_rate": 2.3670097115523988e-07, "loss": 0.2818, "step": 15798 }, { "epoch": 0.7633473450258491, "grad_norm": 3.446845531463623, "learning_rate": 2.3665265497415082e-07, "loss": 0.2977, "step": 15799 }, { "epoch": 0.7633956612069382, "grad_norm": 1.7615185976028442, "learning_rate": 2.366043387930618e-07, "loss": 0.1874, "step": 15800 }, { "epoch": 0.7634439773880273, "grad_norm": 3.117100954055786, "learning_rate": 2.3655602261197275e-07, "loss": 0.3749, "step": 15801 }, { "epoch": 0.7634922935691163, "grad_norm": 3.322692632675171, "learning_rate": 2.3650770643088368e-07, "loss": 0.2476, "step": 15802 }, { "epoch": 0.7635406097502053, "grad_norm": 1.9901893138885498, "learning_rate": 2.3645939024979465e-07, "loss": 0.2149, "step": 15803 }, { "epoch": 0.7635889259312943, "grad_norm": 4.321579933166504, "learning_rate": 2.364110740687056e-07, "loss": 0.3556, "step": 15804 }, { "epoch": 0.7636372421123835, "grad_norm": 2.805558443069458, "learning_rate": 2.3636275788761655e-07, "loss": 0.3485, "step": 15805 }, { "epoch": 0.7636855582934725, "grad_norm": 6.304582595825195, "learning_rate": 2.363144417065275e-07, "loss": 0.2705, "step": 15806 }, { "epoch": 0.7637338744745615, "grad_norm": 2.4460513591766357, "learning_rate": 2.3626612552543845e-07, "loss": 0.2299, "step": 15807 }, { "epoch": 0.7637821906556506, "grad_norm": 3.0121378898620605, "learning_rate": 2.362178093443494e-07, "loss": 0.3839, "step": 15808 }, { "epoch": 0.7638305068367396, "grad_norm": 2.683297872543335, "learning_rate": 2.3616949316326038e-07, "loss": 0.361, "step": 15809 }, { "epoch": 0.7638788230178287, "grad_norm": 2.5123419761657715, "learning_rate": 2.3612117698217131e-07, "loss": 0.296, "step": 15810 }, { "epoch": 0.7639271391989177, "grad_norm": 2.676450729370117, "learning_rate": 2.3607286080108228e-07, "loss": 0.378, "step": 15811 }, { "epoch": 0.7639754553800068, "grad_norm": 8.009893417358398, "learning_rate": 2.3602454461999321e-07, "loss": 0.2601, "step": 15812 }, { "epoch": 0.7640237715610958, "grad_norm": 3.2970376014709473, "learning_rate": 2.3597622843890418e-07, "loss": 0.2647, "step": 15813 }, { "epoch": 0.7640720877421848, "grad_norm": 5.829756736755371, "learning_rate": 2.3592791225781514e-07, "loss": 0.488, "step": 15814 }, { "epoch": 0.7641204039232738, "grad_norm": 3.260716676712036, "learning_rate": 2.3587959607672608e-07, "loss": 0.3167, "step": 15815 }, { "epoch": 0.764168720104363, "grad_norm": 3.0879180431365967, "learning_rate": 2.3583127989563704e-07, "loss": 0.2585, "step": 15816 }, { "epoch": 0.764217036285452, "grad_norm": 4.12969446182251, "learning_rate": 2.3578296371454798e-07, "loss": 0.3036, "step": 15817 }, { "epoch": 0.764265352466541, "grad_norm": 5.921830654144287, "learning_rate": 2.3573464753345894e-07, "loss": 0.2278, "step": 15818 }, { "epoch": 0.7643136686476301, "grad_norm": 5.26270055770874, "learning_rate": 2.356863313523699e-07, "loss": 0.4074, "step": 15819 }, { "epoch": 0.7643619848287191, "grad_norm": 2.440584659576416, "learning_rate": 2.3563801517128085e-07, "loss": 0.2318, "step": 15820 }, { "epoch": 0.7644103010098082, "grad_norm": 2.452552556991577, "learning_rate": 2.3558969899019178e-07, "loss": 0.2953, "step": 15821 }, { "epoch": 0.7644586171908972, "grad_norm": 3.734673500061035, "learning_rate": 2.3554138280910277e-07, "loss": 0.2694, "step": 15822 }, { "epoch": 0.7645069333719863, "grad_norm": 2.956202507019043, "learning_rate": 2.354930666280137e-07, "loss": 0.3319, "step": 15823 }, { "epoch": 0.7645552495530753, "grad_norm": 1.565652847290039, "learning_rate": 2.3544475044692467e-07, "loss": 0.1696, "step": 15824 }, { "epoch": 0.7646035657341643, "grad_norm": 7.377847194671631, "learning_rate": 2.353964342658356e-07, "loss": 0.2627, "step": 15825 }, { "epoch": 0.7646518819152535, "grad_norm": 2.7153801918029785, "learning_rate": 2.3534811808474657e-07, "loss": 0.2524, "step": 15826 }, { "epoch": 0.7647001980963425, "grad_norm": 6.313071250915527, "learning_rate": 2.3529980190365754e-07, "loss": 0.3074, "step": 15827 }, { "epoch": 0.7647485142774315, "grad_norm": 3.1461000442504883, "learning_rate": 2.3525148572256848e-07, "loss": 0.2206, "step": 15828 }, { "epoch": 0.7647968304585205, "grad_norm": 2.3118956089019775, "learning_rate": 2.352031695414794e-07, "loss": 0.2452, "step": 15829 }, { "epoch": 0.7648451466396096, "grad_norm": 20.257667541503906, "learning_rate": 2.3515485336039038e-07, "loss": 0.1684, "step": 15830 }, { "epoch": 0.7648934628206987, "grad_norm": 3.011812686920166, "learning_rate": 2.3510653717930134e-07, "loss": 0.4383, "step": 15831 }, { "epoch": 0.7649417790017877, "grad_norm": 2.1585757732391357, "learning_rate": 2.350582209982123e-07, "loss": 0.2798, "step": 15832 }, { "epoch": 0.7649900951828768, "grad_norm": 2.647738456726074, "learning_rate": 2.3500990481712324e-07, "loss": 0.3036, "step": 15833 }, { "epoch": 0.7650384113639658, "grad_norm": 2.9069314002990723, "learning_rate": 2.3496158863603418e-07, "loss": 0.3521, "step": 15834 }, { "epoch": 0.7650867275450548, "grad_norm": 5.17030143737793, "learning_rate": 2.3491327245494517e-07, "loss": 0.2538, "step": 15835 }, { "epoch": 0.7651350437261439, "grad_norm": 2.1269712448120117, "learning_rate": 2.348649562738561e-07, "loss": 0.2708, "step": 15836 }, { "epoch": 0.765183359907233, "grad_norm": 2.8387224674224854, "learning_rate": 2.3481664009276704e-07, "loss": 0.2882, "step": 15837 }, { "epoch": 0.765231676088322, "grad_norm": 4.824037075042725, "learning_rate": 2.34768323911678e-07, "loss": 0.3729, "step": 15838 }, { "epoch": 0.765279992269411, "grad_norm": 2.19675350189209, "learning_rate": 2.3472000773058897e-07, "loss": 0.2428, "step": 15839 }, { "epoch": 0.7653283084505, "grad_norm": 2.134080171585083, "learning_rate": 2.3467169154949993e-07, "loss": 0.2451, "step": 15840 }, { "epoch": 0.7653766246315891, "grad_norm": 3.454838991165161, "learning_rate": 2.3462337536841087e-07, "loss": 0.5018, "step": 15841 }, { "epoch": 0.7654249408126782, "grad_norm": 3.5026869773864746, "learning_rate": 2.345750591873218e-07, "loss": 0.2973, "step": 15842 }, { "epoch": 0.7654732569937672, "grad_norm": 3.3946824073791504, "learning_rate": 2.3452674300623277e-07, "loss": 0.3795, "step": 15843 }, { "epoch": 0.7655215731748563, "grad_norm": 5.058707237243652, "learning_rate": 2.3447842682514374e-07, "loss": 0.3372, "step": 15844 }, { "epoch": 0.7655698893559453, "grad_norm": 2.4893300533294678, "learning_rate": 2.3443011064405467e-07, "loss": 0.2848, "step": 15845 }, { "epoch": 0.7656182055370343, "grad_norm": 2.343590497970581, "learning_rate": 2.3438179446296564e-07, "loss": 0.2628, "step": 15846 }, { "epoch": 0.7656665217181234, "grad_norm": 2.348623037338257, "learning_rate": 2.3433347828187658e-07, "loss": 0.288, "step": 15847 }, { "epoch": 0.7657148378992125, "grad_norm": 1.8661302328109741, "learning_rate": 2.3428516210078757e-07, "loss": 0.1876, "step": 15848 }, { "epoch": 0.7657631540803015, "grad_norm": 3.6545825004577637, "learning_rate": 2.342368459196985e-07, "loss": 0.391, "step": 15849 }, { "epoch": 0.7658114702613905, "grad_norm": 2.2031240463256836, "learning_rate": 2.3418852973860944e-07, "loss": 0.1934, "step": 15850 }, { "epoch": 0.7658597864424795, "grad_norm": 2.641416311264038, "learning_rate": 2.341402135575204e-07, "loss": 0.3641, "step": 15851 }, { "epoch": 0.7659081026235687, "grad_norm": 2.880333662033081, "learning_rate": 2.3409189737643137e-07, "loss": 0.2309, "step": 15852 }, { "epoch": 0.7659564188046577, "grad_norm": 2.5412302017211914, "learning_rate": 2.340435811953423e-07, "loss": 0.3667, "step": 15853 }, { "epoch": 0.7660047349857467, "grad_norm": 6.550662040710449, "learning_rate": 2.3399526501425327e-07, "loss": 0.3483, "step": 15854 }, { "epoch": 0.7660530511668358, "grad_norm": 3.3739383220672607, "learning_rate": 2.339469488331642e-07, "loss": 0.3091, "step": 15855 }, { "epoch": 0.7661013673479248, "grad_norm": 3.0810611248016357, "learning_rate": 2.3389863265207514e-07, "loss": 0.2378, "step": 15856 }, { "epoch": 0.7661496835290139, "grad_norm": 5.925337791442871, "learning_rate": 2.3385031647098613e-07, "loss": 0.258, "step": 15857 }, { "epoch": 0.7661979997101029, "grad_norm": 1.4895954132080078, "learning_rate": 2.3380200028989707e-07, "loss": 0.1477, "step": 15858 }, { "epoch": 0.766246315891192, "grad_norm": 1.9542524814605713, "learning_rate": 2.3375368410880803e-07, "loss": 0.2383, "step": 15859 }, { "epoch": 0.766294632072281, "grad_norm": 3.071026563644409, "learning_rate": 2.3370536792771897e-07, "loss": 0.3069, "step": 15860 }, { "epoch": 0.76634294825337, "grad_norm": 11.680179595947266, "learning_rate": 2.3365705174662994e-07, "loss": 0.2448, "step": 15861 }, { "epoch": 0.7663912644344592, "grad_norm": 3.4162983894348145, "learning_rate": 2.336087355655409e-07, "loss": 0.4145, "step": 15862 }, { "epoch": 0.7664395806155482, "grad_norm": 3.3371808528900146, "learning_rate": 2.3356041938445184e-07, "loss": 0.3489, "step": 15863 }, { "epoch": 0.7664878967966372, "grad_norm": 2.920546293258667, "learning_rate": 2.3351210320336277e-07, "loss": 0.2642, "step": 15864 }, { "epoch": 0.7665362129777262, "grad_norm": 2.0883636474609375, "learning_rate": 2.3346378702227376e-07, "loss": 0.2199, "step": 15865 }, { "epoch": 0.7665845291588153, "grad_norm": 4.687335014343262, "learning_rate": 2.334154708411847e-07, "loss": 0.294, "step": 15866 }, { "epoch": 0.7666328453399043, "grad_norm": 3.083583354949951, "learning_rate": 2.3336715466009566e-07, "loss": 0.2154, "step": 15867 }, { "epoch": 0.7666811615209934, "grad_norm": 2.47428560256958, "learning_rate": 2.333188384790066e-07, "loss": 0.1997, "step": 15868 }, { "epoch": 0.7667294777020824, "grad_norm": 2.4954166412353516, "learning_rate": 2.3327052229791754e-07, "loss": 0.2851, "step": 15869 }, { "epoch": 0.7667777938831715, "grad_norm": 2.5519001483917236, "learning_rate": 2.3322220611682853e-07, "loss": 0.3147, "step": 15870 }, { "epoch": 0.7668261100642605, "grad_norm": 3.856098175048828, "learning_rate": 2.3317388993573947e-07, "loss": 0.3787, "step": 15871 }, { "epoch": 0.7668744262453495, "grad_norm": 5.378329277038574, "learning_rate": 2.3312557375465043e-07, "loss": 0.2071, "step": 15872 }, { "epoch": 0.7669227424264387, "grad_norm": 4.884915351867676, "learning_rate": 2.3307725757356137e-07, "loss": 0.2803, "step": 15873 }, { "epoch": 0.7669710586075277, "grad_norm": 2.269223690032959, "learning_rate": 2.3302894139247233e-07, "loss": 0.2841, "step": 15874 }, { "epoch": 0.7670193747886167, "grad_norm": 3.0960540771484375, "learning_rate": 2.329806252113833e-07, "loss": 0.3044, "step": 15875 }, { "epoch": 0.7670676909697057, "grad_norm": 2.601047992706299, "learning_rate": 2.3293230903029423e-07, "loss": 0.3351, "step": 15876 }, { "epoch": 0.7671160071507948, "grad_norm": 3.6820898056030273, "learning_rate": 2.3288399284920517e-07, "loss": 0.3179, "step": 15877 }, { "epoch": 0.7671643233318839, "grad_norm": 3.839604377746582, "learning_rate": 2.3283567666811616e-07, "loss": 0.6016, "step": 15878 }, { "epoch": 0.7672126395129729, "grad_norm": 2.6183536052703857, "learning_rate": 2.327873604870271e-07, "loss": 0.2783, "step": 15879 }, { "epoch": 0.7672609556940619, "grad_norm": 2.225818157196045, "learning_rate": 2.3273904430593806e-07, "loss": 0.1949, "step": 15880 }, { "epoch": 0.767309271875151, "grad_norm": 2.342552661895752, "learning_rate": 2.32690728124849e-07, "loss": 0.2977, "step": 15881 }, { "epoch": 0.76735758805624, "grad_norm": 2.8443853855133057, "learning_rate": 2.3264241194375994e-07, "loss": 0.3514, "step": 15882 }, { "epoch": 0.7674059042373291, "grad_norm": 2.2288503646850586, "learning_rate": 2.3259409576267093e-07, "loss": 0.2061, "step": 15883 }, { "epoch": 0.7674542204184182, "grad_norm": 1.9018408060073853, "learning_rate": 2.3254577958158186e-07, "loss": 0.2686, "step": 15884 }, { "epoch": 0.7675025365995072, "grad_norm": 1.7250642776489258, "learning_rate": 2.324974634004928e-07, "loss": 0.1883, "step": 15885 }, { "epoch": 0.7675508527805962, "grad_norm": 4.264096736907959, "learning_rate": 2.3244914721940376e-07, "loss": 0.3664, "step": 15886 }, { "epoch": 0.7675991689616852, "grad_norm": 2.8545761108398438, "learning_rate": 2.3240083103831473e-07, "loss": 0.2502, "step": 15887 }, { "epoch": 0.7676474851427744, "grad_norm": 3.7495486736297607, "learning_rate": 2.323525148572257e-07, "loss": 0.3092, "step": 15888 }, { "epoch": 0.7676958013238634, "grad_norm": 2.2124881744384766, "learning_rate": 2.3230419867613663e-07, "loss": 0.2769, "step": 15889 }, { "epoch": 0.7677441175049524, "grad_norm": 6.766687393188477, "learning_rate": 2.3225588249504757e-07, "loss": 0.2463, "step": 15890 }, { "epoch": 0.7677924336860414, "grad_norm": 2.1324377059936523, "learning_rate": 2.3220756631395856e-07, "loss": 0.1713, "step": 15891 }, { "epoch": 0.7678407498671305, "grad_norm": 2.1635499000549316, "learning_rate": 2.321592501328695e-07, "loss": 0.2016, "step": 15892 }, { "epoch": 0.7678890660482196, "grad_norm": 2.4476897716522217, "learning_rate": 2.3211093395178043e-07, "loss": 0.2659, "step": 15893 }, { "epoch": 0.7679373822293086, "grad_norm": 4.154313564300537, "learning_rate": 2.320626177706914e-07, "loss": 0.4369, "step": 15894 }, { "epoch": 0.7679856984103977, "grad_norm": 3.4433674812316895, "learning_rate": 2.3201430158960233e-07, "loss": 0.3258, "step": 15895 }, { "epoch": 0.7680340145914867, "grad_norm": 4.061333656311035, "learning_rate": 2.3196598540851332e-07, "loss": 0.3567, "step": 15896 }, { "epoch": 0.7680823307725757, "grad_norm": 3.095787286758423, "learning_rate": 2.3191766922742426e-07, "loss": 0.2755, "step": 15897 }, { "epoch": 0.7681306469536647, "grad_norm": 6.612185001373291, "learning_rate": 2.318693530463352e-07, "loss": 0.2629, "step": 15898 }, { "epoch": 0.7681789631347539, "grad_norm": 4.362344741821289, "learning_rate": 2.3182103686524616e-07, "loss": 0.3557, "step": 15899 }, { "epoch": 0.7682272793158429, "grad_norm": 2.6497092247009277, "learning_rate": 2.3177272068415712e-07, "loss": 0.343, "step": 15900 }, { "epoch": 0.7682755954969319, "grad_norm": 2.3289828300476074, "learning_rate": 2.3172440450306806e-07, "loss": 0.2047, "step": 15901 }, { "epoch": 0.7683239116780209, "grad_norm": 2.9764347076416016, "learning_rate": 2.3167608832197903e-07, "loss": 0.3518, "step": 15902 }, { "epoch": 0.76837222785911, "grad_norm": 3.040734052658081, "learning_rate": 2.3162777214088996e-07, "loss": 0.3386, "step": 15903 }, { "epoch": 0.7684205440401991, "grad_norm": 2.2730422019958496, "learning_rate": 2.3157945595980095e-07, "loss": 0.1551, "step": 15904 }, { "epoch": 0.7684688602212881, "grad_norm": 3.360123872756958, "learning_rate": 2.315311397787119e-07, "loss": 0.3352, "step": 15905 }, { "epoch": 0.7685171764023772, "grad_norm": 2.715942621231079, "learning_rate": 2.3148282359762283e-07, "loss": 0.2865, "step": 15906 }, { "epoch": 0.7685654925834662, "grad_norm": 1.85373854637146, "learning_rate": 2.314345074165338e-07, "loss": 0.1995, "step": 15907 }, { "epoch": 0.7686138087645552, "grad_norm": 2.212141990661621, "learning_rate": 2.3138619123544473e-07, "loss": 0.2642, "step": 15908 }, { "epoch": 0.7686621249456443, "grad_norm": 6.718811511993408, "learning_rate": 2.313378750543557e-07, "loss": 0.4104, "step": 15909 }, { "epoch": 0.7687104411267334, "grad_norm": 2.3964297771453857, "learning_rate": 2.3128955887326666e-07, "loss": 0.2605, "step": 15910 }, { "epoch": 0.7687587573078224, "grad_norm": 6.731953144073486, "learning_rate": 2.312412426921776e-07, "loss": 0.2571, "step": 15911 }, { "epoch": 0.7688070734889114, "grad_norm": 4.494607925415039, "learning_rate": 2.3119292651108853e-07, "loss": 0.1672, "step": 15912 }, { "epoch": 0.7688553896700004, "grad_norm": 2.9419796466827393, "learning_rate": 2.3114461032999952e-07, "loss": 0.2942, "step": 15913 }, { "epoch": 0.7689037058510896, "grad_norm": 2.1560075283050537, "learning_rate": 2.3109629414891046e-07, "loss": 0.2304, "step": 15914 }, { "epoch": 0.7689520220321786, "grad_norm": 2.3480842113494873, "learning_rate": 2.3104797796782142e-07, "loss": 0.2918, "step": 15915 }, { "epoch": 0.7690003382132676, "grad_norm": 2.8014168739318848, "learning_rate": 2.3099966178673236e-07, "loss": 0.2333, "step": 15916 }, { "epoch": 0.7690486543943567, "grad_norm": 4.584288597106934, "learning_rate": 2.3095134560564332e-07, "loss": 0.3824, "step": 15917 }, { "epoch": 0.7690969705754457, "grad_norm": 2.371058464050293, "learning_rate": 2.3090302942455429e-07, "loss": 0.2297, "step": 15918 }, { "epoch": 0.7691452867565348, "grad_norm": 2.871504306793213, "learning_rate": 2.3085471324346522e-07, "loss": 0.2541, "step": 15919 }, { "epoch": 0.7691936029376238, "grad_norm": 3.134758234024048, "learning_rate": 2.3080639706237616e-07, "loss": 0.3348, "step": 15920 }, { "epoch": 0.7692419191187129, "grad_norm": 3.0193533897399902, "learning_rate": 2.3075808088128712e-07, "loss": 0.4331, "step": 15921 }, { "epoch": 0.7692902352998019, "grad_norm": 4.955324649810791, "learning_rate": 2.307097647001981e-07, "loss": 0.3686, "step": 15922 }, { "epoch": 0.7693385514808909, "grad_norm": 4.16577672958374, "learning_rate": 2.3066144851910905e-07, "loss": 0.3344, "step": 15923 }, { "epoch": 0.76938686766198, "grad_norm": 3.772719621658325, "learning_rate": 2.3061313233802e-07, "loss": 0.2438, "step": 15924 }, { "epoch": 0.7694351838430691, "grad_norm": 2.3589284420013428, "learning_rate": 2.3056481615693093e-07, "loss": 0.3129, "step": 15925 }, { "epoch": 0.7694835000241581, "grad_norm": 4.030847072601318, "learning_rate": 2.3051649997584192e-07, "loss": 0.5876, "step": 15926 }, { "epoch": 0.7695318162052471, "grad_norm": 2.586453437805176, "learning_rate": 2.3046818379475285e-07, "loss": 0.3273, "step": 15927 }, { "epoch": 0.7695801323863362, "grad_norm": 2.7664215564727783, "learning_rate": 2.304198676136638e-07, "loss": 0.325, "step": 15928 }, { "epoch": 0.7696284485674252, "grad_norm": 3.746647834777832, "learning_rate": 2.3037155143257476e-07, "loss": 0.4403, "step": 15929 }, { "epoch": 0.7696767647485143, "grad_norm": 2.6042568683624268, "learning_rate": 2.3032323525148572e-07, "loss": 0.3284, "step": 15930 }, { "epoch": 0.7697250809296033, "grad_norm": 2.9522924423217773, "learning_rate": 2.3027491907039668e-07, "loss": 0.4121, "step": 15931 }, { "epoch": 0.7697733971106924, "grad_norm": 2.2648909091949463, "learning_rate": 2.3022660288930762e-07, "loss": 0.3623, "step": 15932 }, { "epoch": 0.7698217132917814, "grad_norm": 2.5144283771514893, "learning_rate": 2.3017828670821856e-07, "loss": 0.1769, "step": 15933 }, { "epoch": 0.7698700294728704, "grad_norm": 2.893226385116577, "learning_rate": 2.3012997052712952e-07, "loss": 0.444, "step": 15934 }, { "epoch": 0.7699183456539596, "grad_norm": 2.7154572010040283, "learning_rate": 2.3008165434604048e-07, "loss": 0.4016, "step": 15935 }, { "epoch": 0.7699666618350486, "grad_norm": 3.4350030422210693, "learning_rate": 2.3003333816495142e-07, "loss": 0.2081, "step": 15936 }, { "epoch": 0.7700149780161376, "grad_norm": 5.918498516082764, "learning_rate": 2.2998502198386239e-07, "loss": 0.3168, "step": 15937 }, { "epoch": 0.7700632941972266, "grad_norm": 3.123244524002075, "learning_rate": 2.2993670580277332e-07, "loss": 0.3139, "step": 15938 }, { "epoch": 0.7701116103783157, "grad_norm": 4.29348611831665, "learning_rate": 2.2988838962168431e-07, "loss": 0.2916, "step": 15939 }, { "epoch": 0.7701599265594048, "grad_norm": 2.396411657333374, "learning_rate": 2.2984007344059525e-07, "loss": 0.3039, "step": 15940 }, { "epoch": 0.7702082427404938, "grad_norm": 3.2500131130218506, "learning_rate": 2.297917572595062e-07, "loss": 0.2893, "step": 15941 }, { "epoch": 0.7702565589215828, "grad_norm": 3.410205125808716, "learning_rate": 2.2974344107841715e-07, "loss": 0.5733, "step": 15942 }, { "epoch": 0.7703048751026719, "grad_norm": 2.12939715385437, "learning_rate": 2.2969512489732812e-07, "loss": 0.2333, "step": 15943 }, { "epoch": 0.7703531912837609, "grad_norm": 2.659883737564087, "learning_rate": 2.2964680871623905e-07, "loss": 0.3276, "step": 15944 }, { "epoch": 0.77040150746485, "grad_norm": 1.7002060413360596, "learning_rate": 2.2959849253515002e-07, "loss": 0.1872, "step": 15945 }, { "epoch": 0.7704498236459391, "grad_norm": 2.8336188793182373, "learning_rate": 2.2955017635406095e-07, "loss": 0.3606, "step": 15946 }, { "epoch": 0.7704981398270281, "grad_norm": 3.3714466094970703, "learning_rate": 2.2950186017297192e-07, "loss": 0.3628, "step": 15947 }, { "epoch": 0.7705464560081171, "grad_norm": 3.00041127204895, "learning_rate": 2.2945354399188288e-07, "loss": 0.4325, "step": 15948 }, { "epoch": 0.7705947721892061, "grad_norm": 2.575634002685547, "learning_rate": 2.2940522781079382e-07, "loss": 0.2372, "step": 15949 }, { "epoch": 0.7706430883702952, "grad_norm": 9.788674354553223, "learning_rate": 2.2935691162970478e-07, "loss": 0.2444, "step": 15950 }, { "epoch": 0.7706914045513843, "grad_norm": 1.5159153938293457, "learning_rate": 2.2930859544861572e-07, "loss": 0.1695, "step": 15951 }, { "epoch": 0.7707397207324733, "grad_norm": 2.6384479999542236, "learning_rate": 2.2926027926752668e-07, "loss": 0.3152, "step": 15952 }, { "epoch": 0.7707880369135623, "grad_norm": 2.42342209815979, "learning_rate": 2.2921196308643765e-07, "loss": 0.2162, "step": 15953 }, { "epoch": 0.7708363530946514, "grad_norm": 1.971896767616272, "learning_rate": 2.2916364690534858e-07, "loss": 0.2073, "step": 15954 }, { "epoch": 0.7708846692757404, "grad_norm": 2.065246343612671, "learning_rate": 2.2911533072425955e-07, "loss": 0.2428, "step": 15955 }, { "epoch": 0.7709329854568295, "grad_norm": 2.4758381843566895, "learning_rate": 2.2906701454317049e-07, "loss": 0.2296, "step": 15956 }, { "epoch": 0.7709813016379186, "grad_norm": 3.656583309173584, "learning_rate": 2.2901869836208145e-07, "loss": 0.2531, "step": 15957 }, { "epoch": 0.7710296178190076, "grad_norm": 2.601630210876465, "learning_rate": 2.289703821809924e-07, "loss": 0.2493, "step": 15958 }, { "epoch": 0.7710779340000966, "grad_norm": 3.0062544345855713, "learning_rate": 2.2892206599990335e-07, "loss": 0.351, "step": 15959 }, { "epoch": 0.7711262501811856, "grad_norm": 13.084491729736328, "learning_rate": 2.288737498188143e-07, "loss": 0.1829, "step": 15960 }, { "epoch": 0.7711745663622748, "grad_norm": 3.4768545627593994, "learning_rate": 2.2882543363772528e-07, "loss": 0.365, "step": 15961 }, { "epoch": 0.7712228825433638, "grad_norm": 2.409984827041626, "learning_rate": 2.2877711745663621e-07, "loss": 0.2837, "step": 15962 }, { "epoch": 0.7712711987244528, "grad_norm": 3.083935260772705, "learning_rate": 2.2872880127554718e-07, "loss": 0.3717, "step": 15963 }, { "epoch": 0.7713195149055418, "grad_norm": 2.2155394554138184, "learning_rate": 2.2868048509445812e-07, "loss": 0.2491, "step": 15964 }, { "epoch": 0.7713678310866309, "grad_norm": 2.5552480220794678, "learning_rate": 2.2863216891336908e-07, "loss": 0.2586, "step": 15965 }, { "epoch": 0.77141614726772, "grad_norm": 2.75065541267395, "learning_rate": 2.2858385273228004e-07, "loss": 0.3357, "step": 15966 }, { "epoch": 0.771464463448809, "grad_norm": 2.886370897293091, "learning_rate": 2.2853553655119098e-07, "loss": 0.3228, "step": 15967 }, { "epoch": 0.7715127796298981, "grad_norm": 3.058506965637207, "learning_rate": 2.2848722037010192e-07, "loss": 0.3848, "step": 15968 }, { "epoch": 0.7715610958109871, "grad_norm": 2.053358316421509, "learning_rate": 2.2843890418901288e-07, "loss": 0.2246, "step": 15969 }, { "epoch": 0.7716094119920761, "grad_norm": 2.3994390964508057, "learning_rate": 2.2839058800792385e-07, "loss": 0.2326, "step": 15970 }, { "epoch": 0.7716577281731652, "grad_norm": 4.895322799682617, "learning_rate": 2.283422718268348e-07, "loss": 0.3539, "step": 15971 }, { "epoch": 0.7717060443542543, "grad_norm": 3.206336498260498, "learning_rate": 2.2829395564574575e-07, "loss": 0.2862, "step": 15972 }, { "epoch": 0.7717543605353433, "grad_norm": 2.6411798000335693, "learning_rate": 2.2824563946465668e-07, "loss": 0.3418, "step": 15973 }, { "epoch": 0.7718026767164323, "grad_norm": 2.0954320430755615, "learning_rate": 2.2819732328356767e-07, "loss": 0.2093, "step": 15974 }, { "epoch": 0.7718509928975213, "grad_norm": 3.078498363494873, "learning_rate": 2.281490071024786e-07, "loss": 0.2823, "step": 15975 }, { "epoch": 0.7718993090786104, "grad_norm": 2.876811981201172, "learning_rate": 2.2810069092138955e-07, "loss": 0.3648, "step": 15976 }, { "epoch": 0.7719476252596995, "grad_norm": 3.2727606296539307, "learning_rate": 2.280523747403005e-07, "loss": 0.3288, "step": 15977 }, { "epoch": 0.7719959414407885, "grad_norm": 71.7200698852539, "learning_rate": 2.2800405855921148e-07, "loss": 0.3549, "step": 15978 }, { "epoch": 0.7720442576218776, "grad_norm": 2.8528225421905518, "learning_rate": 2.2795574237812244e-07, "loss": 0.407, "step": 15979 }, { "epoch": 0.7720925738029666, "grad_norm": 3.694575548171997, "learning_rate": 2.2790742619703338e-07, "loss": 0.3032, "step": 15980 }, { "epoch": 0.7721408899840556, "grad_norm": 4.591386795043945, "learning_rate": 2.2785911001594431e-07, "loss": 0.2415, "step": 15981 }, { "epoch": 0.7721892061651447, "grad_norm": 3.4223928451538086, "learning_rate": 2.2781079383485528e-07, "loss": 0.3466, "step": 15982 }, { "epoch": 0.7722375223462338, "grad_norm": 2.9732398986816406, "learning_rate": 2.2776247765376624e-07, "loss": 0.2312, "step": 15983 }, { "epoch": 0.7722858385273228, "grad_norm": 2.2938520908355713, "learning_rate": 2.2771416147267718e-07, "loss": 0.3445, "step": 15984 }, { "epoch": 0.7723341547084118, "grad_norm": 2.3259940147399902, "learning_rate": 2.2766584529158814e-07, "loss": 0.2355, "step": 15985 }, { "epoch": 0.7723824708895008, "grad_norm": 2.8074939250946045, "learning_rate": 2.2761752911049908e-07, "loss": 0.3666, "step": 15986 }, { "epoch": 0.77243078707059, "grad_norm": 1.7572739124298096, "learning_rate": 2.2756921292941007e-07, "loss": 0.1904, "step": 15987 }, { "epoch": 0.772479103251679, "grad_norm": 4.913679122924805, "learning_rate": 2.27520896748321e-07, "loss": 0.2104, "step": 15988 }, { "epoch": 0.772527419432768, "grad_norm": 2.6136562824249268, "learning_rate": 2.2747258056723194e-07, "loss": 0.262, "step": 15989 }, { "epoch": 0.7725757356138571, "grad_norm": 3.2444612979888916, "learning_rate": 2.274242643861429e-07, "loss": 0.2822, "step": 15990 }, { "epoch": 0.7726240517949461, "grad_norm": 3.5612032413482666, "learning_rate": 2.2737594820505387e-07, "loss": 0.3663, "step": 15991 }, { "epoch": 0.7726723679760352, "grad_norm": 3.645472764968872, "learning_rate": 2.273276320239648e-07, "loss": 0.1748, "step": 15992 }, { "epoch": 0.7727206841571242, "grad_norm": 4.026426315307617, "learning_rate": 2.2727931584287577e-07, "loss": 0.3422, "step": 15993 }, { "epoch": 0.7727690003382133, "grad_norm": 3.024852752685547, "learning_rate": 2.272309996617867e-07, "loss": 0.297, "step": 15994 }, { "epoch": 0.7728173165193023, "grad_norm": 1.9815140962600708, "learning_rate": 2.2718268348069767e-07, "loss": 0.1931, "step": 15995 }, { "epoch": 0.7728656327003913, "grad_norm": 1.9871914386749268, "learning_rate": 2.2713436729960864e-07, "loss": 0.2296, "step": 15996 }, { "epoch": 0.7729139488814805, "grad_norm": 2.4147534370422363, "learning_rate": 2.2708605111851958e-07, "loss": 0.2035, "step": 15997 }, { "epoch": 0.7729622650625695, "grad_norm": 2.423537254333496, "learning_rate": 2.2703773493743054e-07, "loss": 0.286, "step": 15998 }, { "epoch": 0.7730105812436585, "grad_norm": 2.409177541732788, "learning_rate": 2.2698941875634148e-07, "loss": 0.2958, "step": 15999 }, { "epoch": 0.7730588974247475, "grad_norm": 7.732159614562988, "learning_rate": 2.2694110257525244e-07, "loss": 0.3796, "step": 16000 }, { "epoch": 0.7731072136058366, "grad_norm": 2.5149319171905518, "learning_rate": 2.268927863941634e-07, "loss": 0.2709, "step": 16001 }, { "epoch": 0.7731555297869256, "grad_norm": 2.9380273818969727, "learning_rate": 2.2684447021307434e-07, "loss": 0.2799, "step": 16002 }, { "epoch": 0.7732038459680147, "grad_norm": 2.2990472316741943, "learning_rate": 2.267961540319853e-07, "loss": 0.2392, "step": 16003 }, { "epoch": 0.7732521621491038, "grad_norm": 2.2873377799987793, "learning_rate": 2.2674783785089627e-07, "loss": 0.3208, "step": 16004 }, { "epoch": 0.7733004783301928, "grad_norm": 3.2997124195098877, "learning_rate": 2.266995216698072e-07, "loss": 0.2545, "step": 16005 }, { "epoch": 0.7733487945112818, "grad_norm": 4.709676742553711, "learning_rate": 2.2665120548871817e-07, "loss": 0.4495, "step": 16006 }, { "epoch": 0.7733971106923708, "grad_norm": 6.357265949249268, "learning_rate": 2.266028893076291e-07, "loss": 0.3693, "step": 16007 }, { "epoch": 0.77344542687346, "grad_norm": 2.7748489379882812, "learning_rate": 2.2655457312654004e-07, "loss": 0.3419, "step": 16008 }, { "epoch": 0.773493743054549, "grad_norm": 2.181257724761963, "learning_rate": 2.2650625694545103e-07, "loss": 0.2441, "step": 16009 }, { "epoch": 0.773542059235638, "grad_norm": 2.5575759410858154, "learning_rate": 2.2645794076436197e-07, "loss": 0.2216, "step": 16010 }, { "epoch": 0.773590375416727, "grad_norm": 2.093519687652588, "learning_rate": 2.2640962458327294e-07, "loss": 0.2649, "step": 16011 }, { "epoch": 0.7736386915978161, "grad_norm": 4.109784126281738, "learning_rate": 2.2636130840218387e-07, "loss": 0.2016, "step": 16012 }, { "epoch": 0.7736870077789052, "grad_norm": 3.593977928161621, "learning_rate": 2.2631299222109484e-07, "loss": 0.1888, "step": 16013 }, { "epoch": 0.7737353239599942, "grad_norm": 3.3373043537139893, "learning_rate": 2.262646760400058e-07, "loss": 0.2748, "step": 16014 }, { "epoch": 0.7737836401410833, "grad_norm": 2.492339849472046, "learning_rate": 2.2621635985891674e-07, "loss": 0.2393, "step": 16015 }, { "epoch": 0.7738319563221723, "grad_norm": 2.5580272674560547, "learning_rate": 2.2616804367782767e-07, "loss": 0.2752, "step": 16016 }, { "epoch": 0.7738802725032613, "grad_norm": 2.671644926071167, "learning_rate": 2.2611972749673867e-07, "loss": 0.3706, "step": 16017 }, { "epoch": 0.7739285886843504, "grad_norm": 3.554248094558716, "learning_rate": 2.260714113156496e-07, "loss": 0.2815, "step": 16018 }, { "epoch": 0.7739769048654395, "grad_norm": 7.987009048461914, "learning_rate": 2.2602309513456057e-07, "loss": 0.4209, "step": 16019 }, { "epoch": 0.7740252210465285, "grad_norm": 2.2516303062438965, "learning_rate": 2.259747789534715e-07, "loss": 0.2625, "step": 16020 }, { "epoch": 0.7740735372276175, "grad_norm": 3.892049789428711, "learning_rate": 2.2592646277238244e-07, "loss": 0.23, "step": 16021 }, { "epoch": 0.7741218534087065, "grad_norm": 2.4406471252441406, "learning_rate": 2.2587814659129343e-07, "loss": 0.2476, "step": 16022 }, { "epoch": 0.7741701695897957, "grad_norm": 2.017571210861206, "learning_rate": 2.2582983041020437e-07, "loss": 0.2272, "step": 16023 }, { "epoch": 0.7742184857708847, "grad_norm": 2.156019687652588, "learning_rate": 2.257815142291153e-07, "loss": 0.1927, "step": 16024 }, { "epoch": 0.7742668019519737, "grad_norm": 3.0239336490631104, "learning_rate": 2.2573319804802627e-07, "loss": 0.3817, "step": 16025 }, { "epoch": 0.7743151181330628, "grad_norm": 2.6948468685150146, "learning_rate": 2.2568488186693723e-07, "loss": 0.3286, "step": 16026 }, { "epoch": 0.7743634343141518, "grad_norm": 3.128950834274292, "learning_rate": 2.256365656858482e-07, "loss": 0.3168, "step": 16027 }, { "epoch": 0.7744117504952408, "grad_norm": 4.105168342590332, "learning_rate": 2.2558824950475913e-07, "loss": 0.3377, "step": 16028 }, { "epoch": 0.7744600666763299, "grad_norm": 2.2063302993774414, "learning_rate": 2.2553993332367007e-07, "loss": 0.2348, "step": 16029 }, { "epoch": 0.774508382857419, "grad_norm": 5.209844589233398, "learning_rate": 2.2549161714258106e-07, "loss": 0.5275, "step": 16030 }, { "epoch": 0.774556699038508, "grad_norm": 2.0933964252471924, "learning_rate": 2.25443300961492e-07, "loss": 0.2122, "step": 16031 }, { "epoch": 0.774605015219597, "grad_norm": 3.4037411212921143, "learning_rate": 2.2539498478040294e-07, "loss": 0.4184, "step": 16032 }, { "epoch": 0.774653331400686, "grad_norm": 4.939587116241455, "learning_rate": 2.253466685993139e-07, "loss": 0.3386, "step": 16033 }, { "epoch": 0.7747016475817752, "grad_norm": 2.884908676147461, "learning_rate": 2.2529835241822484e-07, "loss": 0.2984, "step": 16034 }, { "epoch": 0.7747499637628642, "grad_norm": 4.297660827636719, "learning_rate": 2.2525003623713583e-07, "loss": 0.3563, "step": 16035 }, { "epoch": 0.7747982799439532, "grad_norm": 3.1574323177337646, "learning_rate": 2.2520172005604676e-07, "loss": 0.3382, "step": 16036 }, { "epoch": 0.7748465961250423, "grad_norm": 2.3119547367095947, "learning_rate": 2.251534038749577e-07, "loss": 0.2756, "step": 16037 }, { "epoch": 0.7748949123061313, "grad_norm": 2.8033552169799805, "learning_rate": 2.2510508769386867e-07, "loss": 0.3366, "step": 16038 }, { "epoch": 0.7749432284872204, "grad_norm": 2.2276508808135986, "learning_rate": 2.2505677151277963e-07, "loss": 0.2628, "step": 16039 }, { "epoch": 0.7749915446683094, "grad_norm": 3.2397677898406982, "learning_rate": 2.2500845533169057e-07, "loss": 0.4037, "step": 16040 }, { "epoch": 0.7750398608493985, "grad_norm": 2.0140910148620605, "learning_rate": 2.2496013915060153e-07, "loss": 0.2445, "step": 16041 }, { "epoch": 0.7750881770304875, "grad_norm": 2.03792667388916, "learning_rate": 2.2491182296951247e-07, "loss": 0.2361, "step": 16042 }, { "epoch": 0.7751364932115765, "grad_norm": 2.2460296154022217, "learning_rate": 2.2486350678842346e-07, "loss": 0.2784, "step": 16043 }, { "epoch": 0.7751848093926657, "grad_norm": 2.9451098442077637, "learning_rate": 2.248151906073344e-07, "loss": 0.3211, "step": 16044 }, { "epoch": 0.7752331255737547, "grad_norm": 2.8646750450134277, "learning_rate": 2.2476687442624533e-07, "loss": 0.2149, "step": 16045 }, { "epoch": 0.7752814417548437, "grad_norm": 3.032792568206787, "learning_rate": 2.247185582451563e-07, "loss": 0.3889, "step": 16046 }, { "epoch": 0.7753297579359327, "grad_norm": 3.454883098602295, "learning_rate": 2.2467024206406723e-07, "loss": 0.2307, "step": 16047 }, { "epoch": 0.7753780741170218, "grad_norm": 2.88950514793396, "learning_rate": 2.246219258829782e-07, "loss": 0.2214, "step": 16048 }, { "epoch": 0.7754263902981109, "grad_norm": 4.629883289337158, "learning_rate": 2.2457360970188916e-07, "loss": 0.3675, "step": 16049 }, { "epoch": 0.7754747064791999, "grad_norm": 2.294990301132202, "learning_rate": 2.245252935208001e-07, "loss": 0.2366, "step": 16050 }, { "epoch": 0.7755230226602889, "grad_norm": 2.634918451309204, "learning_rate": 2.2447697733971106e-07, "loss": 0.2249, "step": 16051 }, { "epoch": 0.775571338841378, "grad_norm": 2.3878564834594727, "learning_rate": 2.2442866115862203e-07, "loss": 0.2527, "step": 16052 }, { "epoch": 0.775619655022467, "grad_norm": 5.9008684158325195, "learning_rate": 2.2438034497753296e-07, "loss": 0.332, "step": 16053 }, { "epoch": 0.775667971203556, "grad_norm": 3.3022541999816895, "learning_rate": 2.2433202879644393e-07, "loss": 0.3182, "step": 16054 }, { "epoch": 0.7757162873846452, "grad_norm": 2.7158007621765137, "learning_rate": 2.2428371261535486e-07, "loss": 0.3176, "step": 16055 }, { "epoch": 0.7757646035657342, "grad_norm": 2.5693154335021973, "learning_rate": 2.2423539643426583e-07, "loss": 0.3301, "step": 16056 }, { "epoch": 0.7758129197468232, "grad_norm": 2.984984874725342, "learning_rate": 2.241870802531768e-07, "loss": 0.3406, "step": 16057 }, { "epoch": 0.7758612359279122, "grad_norm": 2.6402511596679688, "learning_rate": 2.2413876407208773e-07, "loss": 0.3175, "step": 16058 }, { "epoch": 0.7759095521090013, "grad_norm": 3.02778959274292, "learning_rate": 2.240904478909987e-07, "loss": 0.3897, "step": 16059 }, { "epoch": 0.7759578682900904, "grad_norm": 3.9855425357818604, "learning_rate": 2.2404213170990963e-07, "loss": 0.4602, "step": 16060 }, { "epoch": 0.7760061844711794, "grad_norm": 2.2334742546081543, "learning_rate": 2.239938155288206e-07, "loss": 0.2405, "step": 16061 }, { "epoch": 0.7760545006522684, "grad_norm": 2.9027979373931885, "learning_rate": 2.2394549934773156e-07, "loss": 0.301, "step": 16062 }, { "epoch": 0.7761028168333575, "grad_norm": 2.6522834300994873, "learning_rate": 2.238971831666425e-07, "loss": 0.257, "step": 16063 }, { "epoch": 0.7761511330144465, "grad_norm": 2.361192464828491, "learning_rate": 2.2384886698555343e-07, "loss": 0.2437, "step": 16064 }, { "epoch": 0.7761994491955356, "grad_norm": 3.1327245235443115, "learning_rate": 2.2380055080446442e-07, "loss": 0.3702, "step": 16065 }, { "epoch": 0.7762477653766247, "grad_norm": 3.3489863872528076, "learning_rate": 2.2375223462337536e-07, "loss": 0.3241, "step": 16066 }, { "epoch": 0.7762960815577137, "grad_norm": 1.6767436265945435, "learning_rate": 2.2370391844228632e-07, "loss": 0.1745, "step": 16067 }, { "epoch": 0.7763443977388027, "grad_norm": 4.631744384765625, "learning_rate": 2.2365560226119726e-07, "loss": 0.3388, "step": 16068 }, { "epoch": 0.7763927139198917, "grad_norm": 9.422725677490234, "learning_rate": 2.2360728608010822e-07, "loss": 0.3159, "step": 16069 }, { "epoch": 0.7764410301009809, "grad_norm": 2.497464418411255, "learning_rate": 2.235589698990192e-07, "loss": 0.2979, "step": 16070 }, { "epoch": 0.7764893462820699, "grad_norm": 4.856711387634277, "learning_rate": 2.2351065371793013e-07, "loss": 0.3515, "step": 16071 }, { "epoch": 0.7765376624631589, "grad_norm": 2.6777448654174805, "learning_rate": 2.2346233753684106e-07, "loss": 0.2793, "step": 16072 }, { "epoch": 0.7765859786442479, "grad_norm": 3.178162097930908, "learning_rate": 2.2341402135575203e-07, "loss": 0.2819, "step": 16073 }, { "epoch": 0.776634294825337, "grad_norm": 3.5536949634552, "learning_rate": 2.23365705174663e-07, "loss": 0.2804, "step": 16074 }, { "epoch": 0.7766826110064261, "grad_norm": 2.66986346244812, "learning_rate": 2.2331738899357395e-07, "loss": 0.2811, "step": 16075 }, { "epoch": 0.7767309271875151, "grad_norm": 2.421260118484497, "learning_rate": 2.232690728124849e-07, "loss": 0.2707, "step": 16076 }, { "epoch": 0.7767792433686042, "grad_norm": 3.8124372959136963, "learning_rate": 2.2322075663139583e-07, "loss": 0.2663, "step": 16077 }, { "epoch": 0.7768275595496932, "grad_norm": 7.497610569000244, "learning_rate": 2.2317244045030682e-07, "loss": 0.2996, "step": 16078 }, { "epoch": 0.7768758757307822, "grad_norm": 2.415407180786133, "learning_rate": 2.2312412426921776e-07, "loss": 0.23, "step": 16079 }, { "epoch": 0.7769241919118712, "grad_norm": 3.00457763671875, "learning_rate": 2.230758080881287e-07, "loss": 0.2137, "step": 16080 }, { "epoch": 0.7769725080929604, "grad_norm": 2.5858066082000732, "learning_rate": 2.2302749190703966e-07, "loss": 0.3056, "step": 16081 }, { "epoch": 0.7770208242740494, "grad_norm": 3.0947253704071045, "learning_rate": 2.2297917572595062e-07, "loss": 0.3741, "step": 16082 }, { "epoch": 0.7770691404551384, "grad_norm": 2.865849018096924, "learning_rate": 2.2293085954486158e-07, "loss": 0.2707, "step": 16083 }, { "epoch": 0.7771174566362274, "grad_norm": 1.9001502990722656, "learning_rate": 2.2288254336377252e-07, "loss": 0.2091, "step": 16084 }, { "epoch": 0.7771657728173165, "grad_norm": 2.8408830165863037, "learning_rate": 2.2283422718268346e-07, "loss": 0.2606, "step": 16085 }, { "epoch": 0.7772140889984056, "grad_norm": 4.700212478637695, "learning_rate": 2.2278591100159442e-07, "loss": 0.2914, "step": 16086 }, { "epoch": 0.7772624051794946, "grad_norm": 3.6988513469696045, "learning_rate": 2.2273759482050539e-07, "loss": 0.226, "step": 16087 }, { "epoch": 0.7773107213605837, "grad_norm": 2.0211336612701416, "learning_rate": 2.2268927863941632e-07, "loss": 0.2263, "step": 16088 }, { "epoch": 0.7773590375416727, "grad_norm": 9.272626876831055, "learning_rate": 2.226409624583273e-07, "loss": 0.2397, "step": 16089 }, { "epoch": 0.7774073537227617, "grad_norm": 2.4197065830230713, "learning_rate": 2.2259264627723822e-07, "loss": 0.2938, "step": 16090 }, { "epoch": 0.7774556699038508, "grad_norm": 2.5128376483917236, "learning_rate": 2.2254433009614921e-07, "loss": 0.2239, "step": 16091 }, { "epoch": 0.7775039860849399, "grad_norm": 2.9791016578674316, "learning_rate": 2.2249601391506015e-07, "loss": 0.2364, "step": 16092 }, { "epoch": 0.7775523022660289, "grad_norm": 2.8602185249328613, "learning_rate": 2.224476977339711e-07, "loss": 0.2574, "step": 16093 }, { "epoch": 0.7776006184471179, "grad_norm": 2.2826294898986816, "learning_rate": 2.2239938155288205e-07, "loss": 0.208, "step": 16094 }, { "epoch": 0.777648934628207, "grad_norm": 3.1653060913085938, "learning_rate": 2.22351065371793e-07, "loss": 0.3026, "step": 16095 }, { "epoch": 0.7776972508092961, "grad_norm": 5.199989318847656, "learning_rate": 2.2230274919070395e-07, "loss": 0.3031, "step": 16096 }, { "epoch": 0.7777455669903851, "grad_norm": 5.501478672027588, "learning_rate": 2.2225443300961492e-07, "loss": 0.1871, "step": 16097 }, { "epoch": 0.7777938831714741, "grad_norm": 3.221557378768921, "learning_rate": 2.2220611682852585e-07, "loss": 0.3483, "step": 16098 }, { "epoch": 0.7778421993525632, "grad_norm": 4.9332275390625, "learning_rate": 2.221578006474368e-07, "loss": 0.2754, "step": 16099 }, { "epoch": 0.7778905155336522, "grad_norm": 2.6827778816223145, "learning_rate": 2.2210948446634778e-07, "loss": 0.3415, "step": 16100 }, { "epoch": 0.7779388317147413, "grad_norm": 1.8523824214935303, "learning_rate": 2.2206116828525872e-07, "loss": 0.1447, "step": 16101 }, { "epoch": 0.7779871478958303, "grad_norm": 1.8078495264053345, "learning_rate": 2.2201285210416968e-07, "loss": 0.2274, "step": 16102 }, { "epoch": 0.7780354640769194, "grad_norm": 44.860557556152344, "learning_rate": 2.2196453592308062e-07, "loss": 0.3083, "step": 16103 }, { "epoch": 0.7780837802580084, "grad_norm": 1.6432814598083496, "learning_rate": 2.2191621974199158e-07, "loss": 0.1683, "step": 16104 }, { "epoch": 0.7781320964390974, "grad_norm": 2.0624866485595703, "learning_rate": 2.2186790356090255e-07, "loss": 0.2541, "step": 16105 }, { "epoch": 0.7781804126201864, "grad_norm": 4.81572151184082, "learning_rate": 2.2181958737981349e-07, "loss": 0.2377, "step": 16106 }, { "epoch": 0.7782287288012756, "grad_norm": 3.3260064125061035, "learning_rate": 2.2177127119872442e-07, "loss": 0.4117, "step": 16107 }, { "epoch": 0.7782770449823646, "grad_norm": 9.260732650756836, "learning_rate": 2.2172295501763539e-07, "loss": 0.3628, "step": 16108 }, { "epoch": 0.7783253611634536, "grad_norm": 2.333280563354492, "learning_rate": 2.2167463883654635e-07, "loss": 0.2963, "step": 16109 }, { "epoch": 0.7783736773445427, "grad_norm": 3.107053279876709, "learning_rate": 2.2162632265545731e-07, "loss": 0.3169, "step": 16110 }, { "epoch": 0.7784219935256317, "grad_norm": 3.3854594230651855, "learning_rate": 2.2157800647436825e-07, "loss": 0.3321, "step": 16111 }, { "epoch": 0.7784703097067208, "grad_norm": 8.030057907104492, "learning_rate": 2.215296902932792e-07, "loss": 0.4209, "step": 16112 }, { "epoch": 0.7785186258878098, "grad_norm": 1.3458452224731445, "learning_rate": 2.2148137411219018e-07, "loss": 0.163, "step": 16113 }, { "epoch": 0.7785669420688989, "grad_norm": 3.050116777420044, "learning_rate": 2.2143305793110112e-07, "loss": 0.3652, "step": 16114 }, { "epoch": 0.7786152582499879, "grad_norm": 1.9567699432373047, "learning_rate": 2.2138474175001205e-07, "loss": 0.2322, "step": 16115 }, { "epoch": 0.7786635744310769, "grad_norm": 3.007685899734497, "learning_rate": 2.2133642556892302e-07, "loss": 0.3428, "step": 16116 }, { "epoch": 0.7787118906121661, "grad_norm": 1.7556251287460327, "learning_rate": 2.2128810938783398e-07, "loss": 0.2006, "step": 16117 }, { "epoch": 0.7787602067932551, "grad_norm": 1.927876591682434, "learning_rate": 2.2123979320674494e-07, "loss": 0.1919, "step": 16118 }, { "epoch": 0.7788085229743441, "grad_norm": 3.2980728149414062, "learning_rate": 2.2119147702565588e-07, "loss": 0.2881, "step": 16119 }, { "epoch": 0.7788568391554331, "grad_norm": 1.7434210777282715, "learning_rate": 2.2114316084456682e-07, "loss": 0.1775, "step": 16120 }, { "epoch": 0.7789051553365222, "grad_norm": 2.4375882148742676, "learning_rate": 2.2109484466347778e-07, "loss": 0.299, "step": 16121 }, { "epoch": 0.7789534715176113, "grad_norm": 2.737454891204834, "learning_rate": 2.2104652848238875e-07, "loss": 0.3418, "step": 16122 }, { "epoch": 0.7790017876987003, "grad_norm": 2.7974352836608887, "learning_rate": 2.2099821230129968e-07, "loss": 0.2597, "step": 16123 }, { "epoch": 0.7790501038797893, "grad_norm": 3.546374559402466, "learning_rate": 2.2094989612021065e-07, "loss": 0.3175, "step": 16124 }, { "epoch": 0.7790984200608784, "grad_norm": 3.45105242729187, "learning_rate": 2.2090157993912158e-07, "loss": 0.2599, "step": 16125 }, { "epoch": 0.7791467362419674, "grad_norm": 4.599440574645996, "learning_rate": 2.2085326375803258e-07, "loss": 0.37, "step": 16126 }, { "epoch": 0.7791950524230565, "grad_norm": 2.7955565452575684, "learning_rate": 2.208049475769435e-07, "loss": 0.3267, "step": 16127 }, { "epoch": 0.7792433686041456, "grad_norm": 2.3952419757843018, "learning_rate": 2.2075663139585445e-07, "loss": 0.2836, "step": 16128 }, { "epoch": 0.7792916847852346, "grad_norm": 5.001273155212402, "learning_rate": 2.2070831521476541e-07, "loss": 0.4138, "step": 16129 }, { "epoch": 0.7793400009663236, "grad_norm": 1.5299450159072876, "learning_rate": 2.2065999903367638e-07, "loss": 0.1542, "step": 16130 }, { "epoch": 0.7793883171474126, "grad_norm": 3.1978728771209717, "learning_rate": 2.2061168285258731e-07, "loss": 0.3783, "step": 16131 }, { "epoch": 0.7794366333285017, "grad_norm": 2.6260478496551514, "learning_rate": 2.2056336667149828e-07, "loss": 0.3932, "step": 16132 }, { "epoch": 0.7794849495095908, "grad_norm": 2.2906131744384766, "learning_rate": 2.2051505049040922e-07, "loss": 0.2986, "step": 16133 }, { "epoch": 0.7795332656906798, "grad_norm": 2.807185173034668, "learning_rate": 2.2046673430932018e-07, "loss": 0.2494, "step": 16134 }, { "epoch": 0.7795815818717688, "grad_norm": 4.091216564178467, "learning_rate": 2.2041841812823114e-07, "loss": 0.2964, "step": 16135 }, { "epoch": 0.7796298980528579, "grad_norm": 3.051162004470825, "learning_rate": 2.2037010194714208e-07, "loss": 0.4626, "step": 16136 }, { "epoch": 0.7796782142339469, "grad_norm": 2.8034188747406006, "learning_rate": 2.2032178576605304e-07, "loss": 0.316, "step": 16137 }, { "epoch": 0.779726530415036, "grad_norm": 2.890793800354004, "learning_rate": 2.2027346958496398e-07, "loss": 0.2056, "step": 16138 }, { "epoch": 0.7797748465961251, "grad_norm": 3.392815113067627, "learning_rate": 2.2022515340387495e-07, "loss": 0.265, "step": 16139 }, { "epoch": 0.7798231627772141, "grad_norm": 2.4221911430358887, "learning_rate": 2.201768372227859e-07, "loss": 0.2588, "step": 16140 }, { "epoch": 0.7798714789583031, "grad_norm": 2.082737445831299, "learning_rate": 2.2012852104169685e-07, "loss": 0.2109, "step": 16141 }, { "epoch": 0.7799197951393921, "grad_norm": 2.4053730964660645, "learning_rate": 2.200802048606078e-07, "loss": 0.2886, "step": 16142 }, { "epoch": 0.7799681113204813, "grad_norm": 2.742203712463379, "learning_rate": 2.2003188867951877e-07, "loss": 0.4472, "step": 16143 }, { "epoch": 0.7800164275015703, "grad_norm": 2.879941701889038, "learning_rate": 2.199835724984297e-07, "loss": 0.3162, "step": 16144 }, { "epoch": 0.7800647436826593, "grad_norm": 6.282532691955566, "learning_rate": 2.1993525631734067e-07, "loss": 0.3332, "step": 16145 }, { "epoch": 0.7801130598637483, "grad_norm": 2.601337194442749, "learning_rate": 2.198869401362516e-07, "loss": 0.2471, "step": 16146 }, { "epoch": 0.7801613760448374, "grad_norm": 3.806705951690674, "learning_rate": 2.1983862395516255e-07, "loss": 0.3136, "step": 16147 }, { "epoch": 0.7802096922259265, "grad_norm": 2.914027214050293, "learning_rate": 2.1979030777407354e-07, "loss": 0.2982, "step": 16148 }, { "epoch": 0.7802580084070155, "grad_norm": 2.2563834190368652, "learning_rate": 2.1974199159298448e-07, "loss": 0.2429, "step": 16149 }, { "epoch": 0.7803063245881046, "grad_norm": 1.9525474309921265, "learning_rate": 2.1969367541189544e-07, "loss": 0.1719, "step": 16150 }, { "epoch": 0.7803546407691936, "grad_norm": 3.1214864253997803, "learning_rate": 2.1964535923080638e-07, "loss": 0.3849, "step": 16151 }, { "epoch": 0.7804029569502826, "grad_norm": 1.7936890125274658, "learning_rate": 2.1959704304971734e-07, "loss": 0.191, "step": 16152 }, { "epoch": 0.7804512731313717, "grad_norm": 3.630671739578247, "learning_rate": 2.195487268686283e-07, "loss": 0.1656, "step": 16153 }, { "epoch": 0.7804995893124608, "grad_norm": 1.7159103155136108, "learning_rate": 2.1950041068753924e-07, "loss": 0.2027, "step": 16154 }, { "epoch": 0.7805479054935498, "grad_norm": 2.430734872817993, "learning_rate": 2.1945209450645018e-07, "loss": 0.2203, "step": 16155 }, { "epoch": 0.7805962216746388, "grad_norm": 3.5477254390716553, "learning_rate": 2.1940377832536117e-07, "loss": 0.2675, "step": 16156 }, { "epoch": 0.7806445378557278, "grad_norm": 2.3922109603881836, "learning_rate": 2.193554621442721e-07, "loss": 0.2736, "step": 16157 }, { "epoch": 0.7806928540368169, "grad_norm": 4.996420860290527, "learning_rate": 2.1930714596318307e-07, "loss": 0.2911, "step": 16158 }, { "epoch": 0.780741170217906, "grad_norm": 2.341954469680786, "learning_rate": 2.19258829782094e-07, "loss": 0.2321, "step": 16159 }, { "epoch": 0.780789486398995, "grad_norm": 3.8646440505981445, "learning_rate": 2.1921051360100495e-07, "loss": 0.2464, "step": 16160 }, { "epoch": 0.7808378025800841, "grad_norm": 2.7572665214538574, "learning_rate": 2.1916219741991594e-07, "loss": 0.2931, "step": 16161 }, { "epoch": 0.7808861187611731, "grad_norm": 8.055898666381836, "learning_rate": 2.1911388123882687e-07, "loss": 0.2566, "step": 16162 }, { "epoch": 0.7809344349422621, "grad_norm": 1.6704497337341309, "learning_rate": 2.190655650577378e-07, "loss": 0.1798, "step": 16163 }, { "epoch": 0.7809827511233512, "grad_norm": 2.527280330657959, "learning_rate": 2.1901724887664877e-07, "loss": 0.3432, "step": 16164 }, { "epoch": 0.7810310673044403, "grad_norm": 1.6538817882537842, "learning_rate": 2.1896893269555974e-07, "loss": 0.1864, "step": 16165 }, { "epoch": 0.7810793834855293, "grad_norm": 3.079413652420044, "learning_rate": 2.189206165144707e-07, "loss": 0.2639, "step": 16166 }, { "epoch": 0.7811276996666183, "grad_norm": 2.281130790710449, "learning_rate": 2.1887230033338164e-07, "loss": 0.1966, "step": 16167 }, { "epoch": 0.7811760158477074, "grad_norm": 3.8029212951660156, "learning_rate": 2.1882398415229258e-07, "loss": 0.2973, "step": 16168 }, { "epoch": 0.7812243320287965, "grad_norm": 3.901806592941284, "learning_rate": 2.1877566797120357e-07, "loss": 0.4265, "step": 16169 }, { "epoch": 0.7812726482098855, "grad_norm": 2.4586517810821533, "learning_rate": 2.187273517901145e-07, "loss": 0.2666, "step": 16170 }, { "epoch": 0.7813209643909745, "grad_norm": 2.4794814586639404, "learning_rate": 2.1867903560902544e-07, "loss": 0.2019, "step": 16171 }, { "epoch": 0.7813692805720636, "grad_norm": 2.185415029525757, "learning_rate": 2.186307194279364e-07, "loss": 0.2744, "step": 16172 }, { "epoch": 0.7814175967531526, "grad_norm": 7.269125938415527, "learning_rate": 2.1858240324684734e-07, "loss": 0.3594, "step": 16173 }, { "epoch": 0.7814659129342417, "grad_norm": 2.8965370655059814, "learning_rate": 2.1853408706575833e-07, "loss": 0.3567, "step": 16174 }, { "epoch": 0.7815142291153307, "grad_norm": 1.527255892753601, "learning_rate": 2.1848577088466927e-07, "loss": 0.1317, "step": 16175 }, { "epoch": 0.7815625452964198, "grad_norm": 2.3410449028015137, "learning_rate": 2.184374547035802e-07, "loss": 0.2704, "step": 16176 }, { "epoch": 0.7816108614775088, "grad_norm": 3.9896202087402344, "learning_rate": 2.1838913852249117e-07, "loss": 0.2856, "step": 16177 }, { "epoch": 0.7816591776585978, "grad_norm": 3.583726167678833, "learning_rate": 2.1834082234140213e-07, "loss": 0.2403, "step": 16178 }, { "epoch": 0.781707493839687, "grad_norm": 2.4898581504821777, "learning_rate": 2.1829250616031307e-07, "loss": 0.2085, "step": 16179 }, { "epoch": 0.781755810020776, "grad_norm": 3.540245771408081, "learning_rate": 2.1824418997922404e-07, "loss": 0.3317, "step": 16180 }, { "epoch": 0.781804126201865, "grad_norm": 4.470473289489746, "learning_rate": 2.1819587379813497e-07, "loss": 0.3249, "step": 16181 }, { "epoch": 0.781852442382954, "grad_norm": 3.2930195331573486, "learning_rate": 2.1814755761704596e-07, "loss": 0.3454, "step": 16182 }, { "epoch": 0.7819007585640431, "grad_norm": 3.2278311252593994, "learning_rate": 2.180992414359569e-07, "loss": 0.3995, "step": 16183 }, { "epoch": 0.7819490747451322, "grad_norm": 3.7073984146118164, "learning_rate": 2.1805092525486784e-07, "loss": 0.2586, "step": 16184 }, { "epoch": 0.7819973909262212, "grad_norm": 2.1816797256469727, "learning_rate": 2.180026090737788e-07, "loss": 0.1838, "step": 16185 }, { "epoch": 0.7820457071073103, "grad_norm": 2.921755313873291, "learning_rate": 2.1795429289268974e-07, "loss": 0.4208, "step": 16186 }, { "epoch": 0.7820940232883993, "grad_norm": 2.2153186798095703, "learning_rate": 2.179059767116007e-07, "loss": 0.2484, "step": 16187 }, { "epoch": 0.7821423394694883, "grad_norm": 1.9289335012435913, "learning_rate": 2.1785766053051167e-07, "loss": 0.1565, "step": 16188 }, { "epoch": 0.7821906556505773, "grad_norm": 2.035243511199951, "learning_rate": 2.178093443494226e-07, "loss": 0.1957, "step": 16189 }, { "epoch": 0.7822389718316665, "grad_norm": 2.3362298011779785, "learning_rate": 2.1776102816833357e-07, "loss": 0.2805, "step": 16190 }, { "epoch": 0.7822872880127555, "grad_norm": 3.275178909301758, "learning_rate": 2.1771271198724453e-07, "loss": 0.368, "step": 16191 }, { "epoch": 0.7823356041938445, "grad_norm": 2.963937520980835, "learning_rate": 2.1766439580615547e-07, "loss": 0.1842, "step": 16192 }, { "epoch": 0.7823839203749335, "grad_norm": 2.618818521499634, "learning_rate": 2.1761607962506643e-07, "loss": 0.3764, "step": 16193 }, { "epoch": 0.7824322365560226, "grad_norm": 2.3413939476013184, "learning_rate": 2.1756776344397737e-07, "loss": 0.3142, "step": 16194 }, { "epoch": 0.7824805527371117, "grad_norm": 3.030869722366333, "learning_rate": 2.1751944726288833e-07, "loss": 0.2865, "step": 16195 }, { "epoch": 0.7825288689182007, "grad_norm": 2.107677459716797, "learning_rate": 2.174711310817993e-07, "loss": 0.2152, "step": 16196 }, { "epoch": 0.7825771850992898, "grad_norm": 16.458547592163086, "learning_rate": 2.1742281490071023e-07, "loss": 0.3098, "step": 16197 }, { "epoch": 0.7826255012803788, "grad_norm": 17.45450782775879, "learning_rate": 2.173744987196212e-07, "loss": 0.366, "step": 16198 }, { "epoch": 0.7826738174614678, "grad_norm": 2.126861095428467, "learning_rate": 2.1732618253853213e-07, "loss": 0.2464, "step": 16199 }, { "epoch": 0.7827221336425569, "grad_norm": 2.811391592025757, "learning_rate": 2.172778663574431e-07, "loss": 0.3635, "step": 16200 }, { "epoch": 0.782770449823646, "grad_norm": 2.4647128582000732, "learning_rate": 2.1722955017635406e-07, "loss": 0.2375, "step": 16201 }, { "epoch": 0.782818766004735, "grad_norm": 2.226651906967163, "learning_rate": 2.17181233995265e-07, "loss": 0.2596, "step": 16202 }, { "epoch": 0.782867082185824, "grad_norm": 2.2993412017822266, "learning_rate": 2.1713291781417594e-07, "loss": 0.2191, "step": 16203 }, { "epoch": 0.782915398366913, "grad_norm": 2.4132754802703857, "learning_rate": 2.1708460163308693e-07, "loss": 0.324, "step": 16204 }, { "epoch": 0.7829637145480022, "grad_norm": 2.7542762756347656, "learning_rate": 2.1703628545199786e-07, "loss": 0.2807, "step": 16205 }, { "epoch": 0.7830120307290912, "grad_norm": 1.677688479423523, "learning_rate": 2.1698796927090883e-07, "loss": 0.1691, "step": 16206 }, { "epoch": 0.7830603469101802, "grad_norm": 2.637005090713501, "learning_rate": 2.1693965308981977e-07, "loss": 0.3477, "step": 16207 }, { "epoch": 0.7831086630912693, "grad_norm": 2.5232772827148438, "learning_rate": 2.1689133690873073e-07, "loss": 0.3641, "step": 16208 }, { "epoch": 0.7831569792723583, "grad_norm": 2.904310703277588, "learning_rate": 2.168430207276417e-07, "loss": 0.221, "step": 16209 }, { "epoch": 0.7832052954534474, "grad_norm": 2.913604259490967, "learning_rate": 2.1679470454655263e-07, "loss": 0.1941, "step": 16210 }, { "epoch": 0.7832536116345364, "grad_norm": 2.4153921604156494, "learning_rate": 2.1674638836546357e-07, "loss": 0.2556, "step": 16211 }, { "epoch": 0.7833019278156255, "grad_norm": 1.9014140367507935, "learning_rate": 2.1669807218437453e-07, "loss": 0.148, "step": 16212 }, { "epoch": 0.7833502439967145, "grad_norm": 2.6119003295898438, "learning_rate": 2.166497560032855e-07, "loss": 0.3695, "step": 16213 }, { "epoch": 0.7833985601778035, "grad_norm": 2.28182315826416, "learning_rate": 2.1660143982219646e-07, "loss": 0.2042, "step": 16214 }, { "epoch": 0.7834468763588925, "grad_norm": 7.268991947174072, "learning_rate": 2.165531236411074e-07, "loss": 0.3326, "step": 16215 }, { "epoch": 0.7834951925399817, "grad_norm": 3.895798444747925, "learning_rate": 2.1650480746001833e-07, "loss": 0.3178, "step": 16216 }, { "epoch": 0.7835435087210707, "grad_norm": 2.854473352432251, "learning_rate": 2.1645649127892932e-07, "loss": 0.3887, "step": 16217 }, { "epoch": 0.7835918249021597, "grad_norm": 19.32883071899414, "learning_rate": 2.1640817509784026e-07, "loss": 0.341, "step": 16218 }, { "epoch": 0.7836401410832488, "grad_norm": 2.9802610874176025, "learning_rate": 2.163598589167512e-07, "loss": 0.2663, "step": 16219 }, { "epoch": 0.7836884572643378, "grad_norm": 7.8204755783081055, "learning_rate": 2.1631154273566216e-07, "loss": 0.4807, "step": 16220 }, { "epoch": 0.7837367734454269, "grad_norm": 2.4847421646118164, "learning_rate": 2.1626322655457313e-07, "loss": 0.2426, "step": 16221 }, { "epoch": 0.7837850896265159, "grad_norm": 4.082108497619629, "learning_rate": 2.162149103734841e-07, "loss": 0.2719, "step": 16222 }, { "epoch": 0.783833405807605, "grad_norm": 3.585367202758789, "learning_rate": 2.1616659419239503e-07, "loss": 0.4173, "step": 16223 }, { "epoch": 0.783881721988694, "grad_norm": 7.406703948974609, "learning_rate": 2.1611827801130596e-07, "loss": 0.4222, "step": 16224 }, { "epoch": 0.783930038169783, "grad_norm": 3.9447035789489746, "learning_rate": 2.1606996183021693e-07, "loss": 0.3957, "step": 16225 }, { "epoch": 0.7839783543508722, "grad_norm": 4.0598249435424805, "learning_rate": 2.160216456491279e-07, "loss": 0.2699, "step": 16226 }, { "epoch": 0.7840266705319612, "grad_norm": 5.757853031158447, "learning_rate": 2.1597332946803883e-07, "loss": 0.4175, "step": 16227 }, { "epoch": 0.7840749867130502, "grad_norm": 2.824711799621582, "learning_rate": 2.159250132869498e-07, "loss": 0.2863, "step": 16228 }, { "epoch": 0.7841233028941392, "grad_norm": 2.9561147689819336, "learning_rate": 2.1587669710586073e-07, "loss": 0.2754, "step": 16229 }, { "epoch": 0.7841716190752283, "grad_norm": 6.175940036773682, "learning_rate": 2.1582838092477172e-07, "loss": 0.292, "step": 16230 }, { "epoch": 0.7842199352563174, "grad_norm": 1.8178038597106934, "learning_rate": 2.1578006474368266e-07, "loss": 0.249, "step": 16231 }, { "epoch": 0.7842682514374064, "grad_norm": 2.493565320968628, "learning_rate": 2.157317485625936e-07, "loss": 0.279, "step": 16232 }, { "epoch": 0.7843165676184954, "grad_norm": 4.241443634033203, "learning_rate": 2.1568343238150456e-07, "loss": 0.255, "step": 16233 }, { "epoch": 0.7843648837995845, "grad_norm": 2.445211172103882, "learning_rate": 2.156351162004155e-07, "loss": 0.2047, "step": 16234 }, { "epoch": 0.7844131999806735, "grad_norm": 1.976144552230835, "learning_rate": 2.1558680001932646e-07, "loss": 0.2333, "step": 16235 }, { "epoch": 0.7844615161617626, "grad_norm": 3.451425552368164, "learning_rate": 2.1553848383823742e-07, "loss": 0.4287, "step": 16236 }, { "epoch": 0.7845098323428517, "grad_norm": 2.1218783855438232, "learning_rate": 2.1549016765714836e-07, "loss": 0.1817, "step": 16237 }, { "epoch": 0.7845581485239407, "grad_norm": 1.8618171215057373, "learning_rate": 2.1544185147605932e-07, "loss": 0.178, "step": 16238 }, { "epoch": 0.7846064647050297, "grad_norm": 3.3803277015686035, "learning_rate": 2.153935352949703e-07, "loss": 0.337, "step": 16239 }, { "epoch": 0.7846547808861187, "grad_norm": 1.9002145528793335, "learning_rate": 2.1534521911388122e-07, "loss": 0.2026, "step": 16240 }, { "epoch": 0.7847030970672078, "grad_norm": 2.5732951164245605, "learning_rate": 2.152969029327922e-07, "loss": 0.2665, "step": 16241 }, { "epoch": 0.7847514132482969, "grad_norm": 2.310974597930908, "learning_rate": 2.1524858675170313e-07, "loss": 0.169, "step": 16242 }, { "epoch": 0.7847997294293859, "grad_norm": 4.79680061340332, "learning_rate": 2.152002705706141e-07, "loss": 0.207, "step": 16243 }, { "epoch": 0.7848480456104749, "grad_norm": 2.5824804306030273, "learning_rate": 2.1515195438952505e-07, "loss": 0.2428, "step": 16244 }, { "epoch": 0.784896361791564, "grad_norm": 2.0093252658843994, "learning_rate": 2.15103638208436e-07, "loss": 0.2247, "step": 16245 }, { "epoch": 0.784944677972653, "grad_norm": 2.369596004486084, "learning_rate": 2.1505532202734695e-07, "loss": 0.2848, "step": 16246 }, { "epoch": 0.7849929941537421, "grad_norm": 2.167170524597168, "learning_rate": 2.150070058462579e-07, "loss": 0.2532, "step": 16247 }, { "epoch": 0.7850413103348312, "grad_norm": 3.246603488922119, "learning_rate": 2.1495868966516886e-07, "loss": 0.3018, "step": 16248 }, { "epoch": 0.7850896265159202, "grad_norm": 5.802046298980713, "learning_rate": 2.1491037348407982e-07, "loss": 0.241, "step": 16249 }, { "epoch": 0.7851379426970092, "grad_norm": 3.1314122676849365, "learning_rate": 2.1486205730299076e-07, "loss": 0.3836, "step": 16250 }, { "epoch": 0.7851862588780982, "grad_norm": 3.2611846923828125, "learning_rate": 2.148137411219017e-07, "loss": 0.2824, "step": 16251 }, { "epoch": 0.7852345750591874, "grad_norm": 2.05456805229187, "learning_rate": 2.1476542494081268e-07, "loss": 0.2197, "step": 16252 }, { "epoch": 0.7852828912402764, "grad_norm": 45.1804313659668, "learning_rate": 2.1471710875972362e-07, "loss": 0.2209, "step": 16253 }, { "epoch": 0.7853312074213654, "grad_norm": 3.3852698802948, "learning_rate": 2.1466879257863458e-07, "loss": 0.3863, "step": 16254 }, { "epoch": 0.7853795236024544, "grad_norm": 3.104902505874634, "learning_rate": 2.1462047639754552e-07, "loss": 0.3758, "step": 16255 }, { "epoch": 0.7854278397835435, "grad_norm": 3.2672812938690186, "learning_rate": 2.1457216021645649e-07, "loss": 0.3999, "step": 16256 }, { "epoch": 0.7854761559646326, "grad_norm": 3.486342191696167, "learning_rate": 2.1452384403536745e-07, "loss": 0.3473, "step": 16257 }, { "epoch": 0.7855244721457216, "grad_norm": 3.6858510971069336, "learning_rate": 2.1447552785427839e-07, "loss": 0.5183, "step": 16258 }, { "epoch": 0.7855727883268107, "grad_norm": 1.5326529741287231, "learning_rate": 2.1442721167318932e-07, "loss": 0.1753, "step": 16259 }, { "epoch": 0.7856211045078997, "grad_norm": 2.1823861598968506, "learning_rate": 2.143788954921003e-07, "loss": 0.2297, "step": 16260 }, { "epoch": 0.7856694206889887, "grad_norm": 2.6439313888549805, "learning_rate": 2.1433057931101125e-07, "loss": 0.3005, "step": 16261 }, { "epoch": 0.7857177368700778, "grad_norm": 2.4678537845611572, "learning_rate": 2.1428226312992222e-07, "loss": 0.3147, "step": 16262 }, { "epoch": 0.7857660530511669, "grad_norm": 2.157655954360962, "learning_rate": 2.1423394694883315e-07, "loss": 0.2693, "step": 16263 }, { "epoch": 0.7858143692322559, "grad_norm": 2.2054712772369385, "learning_rate": 2.141856307677441e-07, "loss": 0.2008, "step": 16264 }, { "epoch": 0.7858626854133449, "grad_norm": 4.837737083435059, "learning_rate": 2.1413731458665508e-07, "loss": 0.3833, "step": 16265 }, { "epoch": 0.7859110015944339, "grad_norm": 2.997997283935547, "learning_rate": 2.1408899840556602e-07, "loss": 0.4158, "step": 16266 }, { "epoch": 0.785959317775523, "grad_norm": 2.21333646774292, "learning_rate": 2.1404068222447695e-07, "loss": 0.2392, "step": 16267 }, { "epoch": 0.7860076339566121, "grad_norm": 3.206413745880127, "learning_rate": 2.1399236604338792e-07, "loss": 0.287, "step": 16268 }, { "epoch": 0.7860559501377011, "grad_norm": 2.8756792545318604, "learning_rate": 2.1394404986229888e-07, "loss": 0.3563, "step": 16269 }, { "epoch": 0.7861042663187902, "grad_norm": 6.060952186584473, "learning_rate": 2.1389573368120985e-07, "loss": 0.4109, "step": 16270 }, { "epoch": 0.7861525824998792, "grad_norm": 3.0452260971069336, "learning_rate": 2.1384741750012078e-07, "loss": 0.4385, "step": 16271 }, { "epoch": 0.7862008986809682, "grad_norm": 6.18097448348999, "learning_rate": 2.1379910131903172e-07, "loss": 0.3509, "step": 16272 }, { "epoch": 0.7862492148620573, "grad_norm": 2.1218338012695312, "learning_rate": 2.1375078513794268e-07, "loss": 0.1737, "step": 16273 }, { "epoch": 0.7862975310431464, "grad_norm": 4.568028926849365, "learning_rate": 2.1370246895685365e-07, "loss": 0.2358, "step": 16274 }, { "epoch": 0.7863458472242354, "grad_norm": 4.181802272796631, "learning_rate": 2.1365415277576459e-07, "loss": 0.3278, "step": 16275 }, { "epoch": 0.7863941634053244, "grad_norm": 1.4164754152297974, "learning_rate": 2.1360583659467555e-07, "loss": 0.1577, "step": 16276 }, { "epoch": 0.7864424795864134, "grad_norm": 2.837648630142212, "learning_rate": 2.1355752041358649e-07, "loss": 0.3262, "step": 16277 }, { "epoch": 0.7864907957675026, "grad_norm": 4.9689040184021, "learning_rate": 2.1350920423249748e-07, "loss": 0.3971, "step": 16278 }, { "epoch": 0.7865391119485916, "grad_norm": 2.777141809463501, "learning_rate": 2.1346088805140841e-07, "loss": 0.2735, "step": 16279 }, { "epoch": 0.7865874281296806, "grad_norm": 2.301119327545166, "learning_rate": 2.1341257187031935e-07, "loss": 0.3225, "step": 16280 }, { "epoch": 0.7866357443107697, "grad_norm": 16.477930068969727, "learning_rate": 2.1336425568923031e-07, "loss": 0.2422, "step": 16281 }, { "epoch": 0.7866840604918587, "grad_norm": 2.772670269012451, "learning_rate": 2.1331593950814128e-07, "loss": 0.33, "step": 16282 }, { "epoch": 0.7867323766729478, "grad_norm": 2.4033749103546143, "learning_rate": 2.1326762332705222e-07, "loss": 0.2363, "step": 16283 }, { "epoch": 0.7867806928540368, "grad_norm": 3.0907130241394043, "learning_rate": 2.1321930714596318e-07, "loss": 0.2537, "step": 16284 }, { "epoch": 0.7868290090351259, "grad_norm": 2.4946908950805664, "learning_rate": 2.1317099096487412e-07, "loss": 0.2976, "step": 16285 }, { "epoch": 0.7868773252162149, "grad_norm": 6.133704662322998, "learning_rate": 2.1312267478378505e-07, "loss": 0.2485, "step": 16286 }, { "epoch": 0.7869256413973039, "grad_norm": 5.15677547454834, "learning_rate": 2.1307435860269604e-07, "loss": 0.2931, "step": 16287 }, { "epoch": 0.7869739575783931, "grad_norm": 2.612915515899658, "learning_rate": 2.1302604242160698e-07, "loss": 0.3402, "step": 16288 }, { "epoch": 0.7870222737594821, "grad_norm": 1.955779790878296, "learning_rate": 2.1297772624051795e-07, "loss": 0.2308, "step": 16289 }, { "epoch": 0.7870705899405711, "grad_norm": 2.4344356060028076, "learning_rate": 2.1292941005942888e-07, "loss": 0.2217, "step": 16290 }, { "epoch": 0.7871189061216601, "grad_norm": 2.9579176902770996, "learning_rate": 2.1288109387833985e-07, "loss": 0.2929, "step": 16291 }, { "epoch": 0.7871672223027492, "grad_norm": 3.963547468185425, "learning_rate": 2.128327776972508e-07, "loss": 0.3778, "step": 16292 }, { "epoch": 0.7872155384838382, "grad_norm": 4.008639335632324, "learning_rate": 2.1278446151616175e-07, "loss": 0.2656, "step": 16293 }, { "epoch": 0.7872638546649273, "grad_norm": 6.117886066436768, "learning_rate": 2.1273614533507268e-07, "loss": 0.3667, "step": 16294 }, { "epoch": 0.7873121708460163, "grad_norm": 3.058363676071167, "learning_rate": 2.1268782915398367e-07, "loss": 0.3056, "step": 16295 }, { "epoch": 0.7873604870271054, "grad_norm": 7.980467319488525, "learning_rate": 2.126395129728946e-07, "loss": 0.3007, "step": 16296 }, { "epoch": 0.7874088032081944, "grad_norm": 2.878636360168457, "learning_rate": 2.1259119679180558e-07, "loss": 0.4566, "step": 16297 }, { "epoch": 0.7874571193892834, "grad_norm": 2.3160345554351807, "learning_rate": 2.125428806107165e-07, "loss": 0.1973, "step": 16298 }, { "epoch": 0.7875054355703726, "grad_norm": 2.7041890621185303, "learning_rate": 2.1249456442962745e-07, "loss": 0.3447, "step": 16299 }, { "epoch": 0.7875537517514616, "grad_norm": 2.0864100456237793, "learning_rate": 2.1244624824853844e-07, "loss": 0.2213, "step": 16300 }, { "epoch": 0.7876020679325506, "grad_norm": 2.8825180530548096, "learning_rate": 2.1239793206744938e-07, "loss": 0.3474, "step": 16301 }, { "epoch": 0.7876503841136396, "grad_norm": 2.009552478790283, "learning_rate": 2.1234961588636032e-07, "loss": 0.2054, "step": 16302 }, { "epoch": 0.7876987002947287, "grad_norm": 2.56581449508667, "learning_rate": 2.1230129970527128e-07, "loss": 0.3107, "step": 16303 }, { "epoch": 0.7877470164758178, "grad_norm": 1.9543124437332153, "learning_rate": 2.1225298352418224e-07, "loss": 0.1725, "step": 16304 }, { "epoch": 0.7877953326569068, "grad_norm": 4.073208332061768, "learning_rate": 2.122046673430932e-07, "loss": 0.2433, "step": 16305 }, { "epoch": 0.7878436488379958, "grad_norm": 2.456205368041992, "learning_rate": 2.1215635116200414e-07, "loss": 0.2832, "step": 16306 }, { "epoch": 0.7878919650190849, "grad_norm": 4.763174533843994, "learning_rate": 2.1210803498091508e-07, "loss": 0.2952, "step": 16307 }, { "epoch": 0.7879402812001739, "grad_norm": 1.8593863248825073, "learning_rate": 2.1205971879982607e-07, "loss": 0.2402, "step": 16308 }, { "epoch": 0.787988597381263, "grad_norm": 2.347269296646118, "learning_rate": 2.12011402618737e-07, "loss": 0.2141, "step": 16309 }, { "epoch": 0.7880369135623521, "grad_norm": 2.568816900253296, "learning_rate": 2.1196308643764795e-07, "loss": 0.241, "step": 16310 }, { "epoch": 0.7880852297434411, "grad_norm": 6.42742919921875, "learning_rate": 2.119147702565589e-07, "loss": 0.3115, "step": 16311 }, { "epoch": 0.7881335459245301, "grad_norm": 2.0802409648895264, "learning_rate": 2.1186645407546985e-07, "loss": 0.217, "step": 16312 }, { "epoch": 0.7881818621056191, "grad_norm": 4.41116189956665, "learning_rate": 2.1181813789438084e-07, "loss": 0.3533, "step": 16313 }, { "epoch": 0.7882301782867083, "grad_norm": 2.7286651134490967, "learning_rate": 2.1176982171329177e-07, "loss": 0.3361, "step": 16314 }, { "epoch": 0.7882784944677973, "grad_norm": 3.8449273109436035, "learning_rate": 2.117215055322027e-07, "loss": 0.2958, "step": 16315 }, { "epoch": 0.7883268106488863, "grad_norm": 2.6068122386932373, "learning_rate": 2.1167318935111368e-07, "loss": 0.2975, "step": 16316 }, { "epoch": 0.7883751268299753, "grad_norm": 2.5006134510040283, "learning_rate": 2.1162487317002464e-07, "loss": 0.329, "step": 16317 }, { "epoch": 0.7884234430110644, "grad_norm": 2.189389228820801, "learning_rate": 2.115765569889356e-07, "loss": 0.2432, "step": 16318 }, { "epoch": 0.7884717591921534, "grad_norm": 3.0071277618408203, "learning_rate": 2.1152824080784654e-07, "loss": 0.3286, "step": 16319 }, { "epoch": 0.7885200753732425, "grad_norm": 36.91047286987305, "learning_rate": 2.1147992462675748e-07, "loss": 0.415, "step": 16320 }, { "epoch": 0.7885683915543316, "grad_norm": 3.6032216548919678, "learning_rate": 2.1143160844566847e-07, "loss": 0.3529, "step": 16321 }, { "epoch": 0.7886167077354206, "grad_norm": 1.2493295669555664, "learning_rate": 2.113832922645794e-07, "loss": 0.1277, "step": 16322 }, { "epoch": 0.7886650239165096, "grad_norm": 2.311281681060791, "learning_rate": 2.1133497608349034e-07, "loss": 0.2422, "step": 16323 }, { "epoch": 0.7887133400975986, "grad_norm": 3.1071293354034424, "learning_rate": 2.112866599024013e-07, "loss": 0.4235, "step": 16324 }, { "epoch": 0.7887616562786878, "grad_norm": 5.0830230712890625, "learning_rate": 2.1123834372131224e-07, "loss": 0.3541, "step": 16325 }, { "epoch": 0.7888099724597768, "grad_norm": 15.957050323486328, "learning_rate": 2.1119002754022323e-07, "loss": 0.1446, "step": 16326 }, { "epoch": 0.7888582886408658, "grad_norm": 3.8941524028778076, "learning_rate": 2.1114171135913417e-07, "loss": 0.3765, "step": 16327 }, { "epoch": 0.7889066048219548, "grad_norm": 2.533055543899536, "learning_rate": 2.110933951780451e-07, "loss": 0.2388, "step": 16328 }, { "epoch": 0.7889549210030439, "grad_norm": 2.726580858230591, "learning_rate": 2.1104507899695607e-07, "loss": 0.2882, "step": 16329 }, { "epoch": 0.789003237184133, "grad_norm": 3.6473586559295654, "learning_rate": 2.1099676281586704e-07, "loss": 0.2559, "step": 16330 }, { "epoch": 0.789051553365222, "grad_norm": 2.8521711826324463, "learning_rate": 2.1094844663477797e-07, "loss": 0.2213, "step": 16331 }, { "epoch": 0.7890998695463111, "grad_norm": 2.3875129222869873, "learning_rate": 2.1090013045368894e-07, "loss": 0.2769, "step": 16332 }, { "epoch": 0.7891481857274001, "grad_norm": 3.1380934715270996, "learning_rate": 2.1085181427259987e-07, "loss": 0.3918, "step": 16333 }, { "epoch": 0.7891965019084891, "grad_norm": 1.6649059057235718, "learning_rate": 2.1080349809151086e-07, "loss": 0.1174, "step": 16334 }, { "epoch": 0.7892448180895782, "grad_norm": 2.469623327255249, "learning_rate": 2.107551819104218e-07, "loss": 0.239, "step": 16335 }, { "epoch": 0.7892931342706673, "grad_norm": 2.142021894454956, "learning_rate": 2.1070686572933274e-07, "loss": 0.2521, "step": 16336 }, { "epoch": 0.7893414504517563, "grad_norm": 5.297297954559326, "learning_rate": 2.106585495482437e-07, "loss": 0.2721, "step": 16337 }, { "epoch": 0.7893897666328453, "grad_norm": 2.584979295730591, "learning_rate": 2.1061023336715464e-07, "loss": 0.1721, "step": 16338 }, { "epoch": 0.7894380828139343, "grad_norm": 9.560835838317871, "learning_rate": 2.105619171860656e-07, "loss": 0.2531, "step": 16339 }, { "epoch": 0.7894863989950235, "grad_norm": 2.12894868850708, "learning_rate": 2.1051360100497657e-07, "loss": 0.2696, "step": 16340 }, { "epoch": 0.7895347151761125, "grad_norm": 2.2525229454040527, "learning_rate": 2.104652848238875e-07, "loss": 0.2328, "step": 16341 }, { "epoch": 0.7895830313572015, "grad_norm": 2.619884729385376, "learning_rate": 2.1041696864279844e-07, "loss": 0.2967, "step": 16342 }, { "epoch": 0.7896313475382906, "grad_norm": 3.219081401824951, "learning_rate": 2.1036865246170943e-07, "loss": 0.2368, "step": 16343 }, { "epoch": 0.7896796637193796, "grad_norm": 2.1227166652679443, "learning_rate": 2.1032033628062037e-07, "loss": 0.29, "step": 16344 }, { "epoch": 0.7897279799004686, "grad_norm": 30.76555061340332, "learning_rate": 2.1027202009953133e-07, "loss": 0.3901, "step": 16345 }, { "epoch": 0.7897762960815577, "grad_norm": 1.8217066526412964, "learning_rate": 2.1022370391844227e-07, "loss": 0.1947, "step": 16346 }, { "epoch": 0.7898246122626468, "grad_norm": 2.4744958877563477, "learning_rate": 2.1017538773735323e-07, "loss": 0.2789, "step": 16347 }, { "epoch": 0.7898729284437358, "grad_norm": 2.4351086616516113, "learning_rate": 2.101270715562642e-07, "loss": 0.2244, "step": 16348 }, { "epoch": 0.7899212446248248, "grad_norm": 2.706998348236084, "learning_rate": 2.1007875537517513e-07, "loss": 0.2761, "step": 16349 }, { "epoch": 0.7899695608059139, "grad_norm": 2.817516326904297, "learning_rate": 2.1003043919408607e-07, "loss": 0.1848, "step": 16350 }, { "epoch": 0.790017876987003, "grad_norm": 2.8061583042144775, "learning_rate": 2.0998212301299704e-07, "loss": 0.3713, "step": 16351 }, { "epoch": 0.790066193168092, "grad_norm": 3.009225845336914, "learning_rate": 2.09933806831908e-07, "loss": 0.1763, "step": 16352 }, { "epoch": 0.790114509349181, "grad_norm": 2.065581798553467, "learning_rate": 2.0988549065081896e-07, "loss": 0.2234, "step": 16353 }, { "epoch": 0.7901628255302701, "grad_norm": 5.499481201171875, "learning_rate": 2.098371744697299e-07, "loss": 0.304, "step": 16354 }, { "epoch": 0.7902111417113591, "grad_norm": 2.801670551300049, "learning_rate": 2.0978885828864084e-07, "loss": 0.3243, "step": 16355 }, { "epoch": 0.7902594578924482, "grad_norm": 3.4824295043945312, "learning_rate": 2.0974054210755183e-07, "loss": 0.2104, "step": 16356 }, { "epoch": 0.7903077740735373, "grad_norm": 3.0223395824432373, "learning_rate": 2.0969222592646277e-07, "loss": 0.3406, "step": 16357 }, { "epoch": 0.7903560902546263, "grad_norm": 2.661433458328247, "learning_rate": 2.096439097453737e-07, "loss": 0.2675, "step": 16358 }, { "epoch": 0.7904044064357153, "grad_norm": 3.077332019805908, "learning_rate": 2.0959559356428467e-07, "loss": 0.4444, "step": 16359 }, { "epoch": 0.7904527226168043, "grad_norm": 2.7999203205108643, "learning_rate": 2.0954727738319563e-07, "loss": 0.313, "step": 16360 }, { "epoch": 0.7905010387978935, "grad_norm": 1.928667664527893, "learning_rate": 2.094989612021066e-07, "loss": 0.215, "step": 16361 }, { "epoch": 0.7905493549789825, "grad_norm": 2.4153008460998535, "learning_rate": 2.0945064502101753e-07, "loss": 0.2846, "step": 16362 }, { "epoch": 0.7905976711600715, "grad_norm": 2.239379405975342, "learning_rate": 2.0940232883992847e-07, "loss": 0.2026, "step": 16363 }, { "epoch": 0.7906459873411605, "grad_norm": 2.9073500633239746, "learning_rate": 2.0935401265883943e-07, "loss": 0.3739, "step": 16364 }, { "epoch": 0.7906943035222496, "grad_norm": 2.902743101119995, "learning_rate": 2.093056964777504e-07, "loss": 0.348, "step": 16365 }, { "epoch": 0.7907426197033387, "grad_norm": 5.991757869720459, "learning_rate": 2.0925738029666133e-07, "loss": 0.4033, "step": 16366 }, { "epoch": 0.7907909358844277, "grad_norm": 6.889919757843018, "learning_rate": 2.092090641155723e-07, "loss": 0.5539, "step": 16367 }, { "epoch": 0.7908392520655168, "grad_norm": 2.6062216758728027, "learning_rate": 2.0916074793448323e-07, "loss": 0.3078, "step": 16368 }, { "epoch": 0.7908875682466058, "grad_norm": 2.5670342445373535, "learning_rate": 2.0911243175339422e-07, "loss": 0.3202, "step": 16369 }, { "epoch": 0.7909358844276948, "grad_norm": 2.4961771965026855, "learning_rate": 2.0906411557230516e-07, "loss": 0.1955, "step": 16370 }, { "epoch": 0.7909842006087838, "grad_norm": 2.3590550422668457, "learning_rate": 2.090157993912161e-07, "loss": 0.3342, "step": 16371 }, { "epoch": 0.791032516789873, "grad_norm": 2.88368821144104, "learning_rate": 2.0896748321012706e-07, "loss": 0.3722, "step": 16372 }, { "epoch": 0.791080832970962, "grad_norm": 2.6464145183563232, "learning_rate": 2.0891916702903803e-07, "loss": 0.3121, "step": 16373 }, { "epoch": 0.791129149152051, "grad_norm": 2.9198572635650635, "learning_rate": 2.0887085084794896e-07, "loss": 0.2173, "step": 16374 }, { "epoch": 0.79117746533314, "grad_norm": 2.6997125148773193, "learning_rate": 2.0882253466685993e-07, "loss": 0.2905, "step": 16375 }, { "epoch": 0.7912257815142291, "grad_norm": 2.4458940029144287, "learning_rate": 2.0877421848577086e-07, "loss": 0.2694, "step": 16376 }, { "epoch": 0.7912740976953182, "grad_norm": 11.966609001159668, "learning_rate": 2.0872590230468183e-07, "loss": 0.3296, "step": 16377 }, { "epoch": 0.7913224138764072, "grad_norm": 2.1231729984283447, "learning_rate": 2.086775861235928e-07, "loss": 0.2533, "step": 16378 }, { "epoch": 0.7913707300574963, "grad_norm": 2.8114559650421143, "learning_rate": 2.0862926994250373e-07, "loss": 0.3606, "step": 16379 }, { "epoch": 0.7914190462385853, "grad_norm": 7.4563307762146, "learning_rate": 2.085809537614147e-07, "loss": 0.3033, "step": 16380 }, { "epoch": 0.7914673624196743, "grad_norm": 3.7288146018981934, "learning_rate": 2.0853263758032563e-07, "loss": 0.2187, "step": 16381 }, { "epoch": 0.7915156786007634, "grad_norm": 2.486574172973633, "learning_rate": 2.084843213992366e-07, "loss": 0.2599, "step": 16382 }, { "epoch": 0.7915639947818525, "grad_norm": 2.557939052581787, "learning_rate": 2.0843600521814756e-07, "loss": 0.2764, "step": 16383 }, { "epoch": 0.7916123109629415, "grad_norm": 3.10675311088562, "learning_rate": 2.083876890370585e-07, "loss": 0.3498, "step": 16384 }, { "epoch": 0.7916606271440305, "grad_norm": 6.414631366729736, "learning_rate": 2.0833937285596946e-07, "loss": 0.2498, "step": 16385 }, { "epoch": 0.7917089433251195, "grad_norm": 2.569988489151001, "learning_rate": 2.082910566748804e-07, "loss": 0.2027, "step": 16386 }, { "epoch": 0.7917572595062087, "grad_norm": 9.787152290344238, "learning_rate": 2.0824274049379136e-07, "loss": 0.2484, "step": 16387 }, { "epoch": 0.7918055756872977, "grad_norm": 2.374368190765381, "learning_rate": 2.0819442431270232e-07, "loss": 0.2627, "step": 16388 }, { "epoch": 0.7918538918683867, "grad_norm": 1.650084137916565, "learning_rate": 2.0814610813161326e-07, "loss": 0.2092, "step": 16389 }, { "epoch": 0.7919022080494758, "grad_norm": 12.046812057495117, "learning_rate": 2.080977919505242e-07, "loss": 0.2696, "step": 16390 }, { "epoch": 0.7919505242305648, "grad_norm": 2.6890110969543457, "learning_rate": 2.080494757694352e-07, "loss": 0.2733, "step": 16391 }, { "epoch": 0.7919988404116539, "grad_norm": 1.542979121208191, "learning_rate": 2.0800115958834613e-07, "loss": 0.1592, "step": 16392 }, { "epoch": 0.7920471565927429, "grad_norm": 1.8152827024459839, "learning_rate": 2.079528434072571e-07, "loss": 0.1694, "step": 16393 }, { "epoch": 0.792095472773832, "grad_norm": 2.7269747257232666, "learning_rate": 2.0790452722616803e-07, "loss": 0.3028, "step": 16394 }, { "epoch": 0.792143788954921, "grad_norm": 3.8223538398742676, "learning_rate": 2.07856211045079e-07, "loss": 0.2108, "step": 16395 }, { "epoch": 0.79219210513601, "grad_norm": 2.860018730163574, "learning_rate": 2.0780789486398995e-07, "loss": 0.3074, "step": 16396 }, { "epoch": 0.792240421317099, "grad_norm": 2.6288278102874756, "learning_rate": 2.077595786829009e-07, "loss": 0.3604, "step": 16397 }, { "epoch": 0.7922887374981882, "grad_norm": 2.4296464920043945, "learning_rate": 2.0771126250181183e-07, "loss": 0.3011, "step": 16398 }, { "epoch": 0.7923370536792772, "grad_norm": 3.278632164001465, "learning_rate": 2.076629463207228e-07, "loss": 0.3956, "step": 16399 }, { "epoch": 0.7923853698603662, "grad_norm": 23.19488525390625, "learning_rate": 2.0761463013963376e-07, "loss": 0.2405, "step": 16400 }, { "epoch": 0.7924336860414553, "grad_norm": 2.2607498168945312, "learning_rate": 2.0756631395854472e-07, "loss": 0.2038, "step": 16401 }, { "epoch": 0.7924820022225443, "grad_norm": 2.631460666656494, "learning_rate": 2.0751799777745566e-07, "loss": 0.2725, "step": 16402 }, { "epoch": 0.7925303184036334, "grad_norm": 2.334965229034424, "learning_rate": 2.074696815963666e-07, "loss": 0.2057, "step": 16403 }, { "epoch": 0.7925786345847224, "grad_norm": 4.356825828552246, "learning_rate": 2.0742136541527758e-07, "loss": 0.3964, "step": 16404 }, { "epoch": 0.7926269507658115, "grad_norm": 2.5036327838897705, "learning_rate": 2.0737304923418852e-07, "loss": 0.2654, "step": 16405 }, { "epoch": 0.7926752669469005, "grad_norm": 3.8902814388275146, "learning_rate": 2.0732473305309946e-07, "loss": 0.2779, "step": 16406 }, { "epoch": 0.7927235831279895, "grad_norm": 2.463761806488037, "learning_rate": 2.0727641687201042e-07, "loss": 0.2865, "step": 16407 }, { "epoch": 0.7927718993090787, "grad_norm": 2.312629461288452, "learning_rate": 2.072281006909214e-07, "loss": 0.2568, "step": 16408 }, { "epoch": 0.7928202154901677, "grad_norm": 1.7453402280807495, "learning_rate": 2.0717978450983235e-07, "loss": 0.1763, "step": 16409 }, { "epoch": 0.7928685316712567, "grad_norm": 1.6547983884811401, "learning_rate": 2.071314683287433e-07, "loss": 0.1846, "step": 16410 }, { "epoch": 0.7929168478523457, "grad_norm": 4.971450328826904, "learning_rate": 2.0708315214765423e-07, "loss": 0.3394, "step": 16411 }, { "epoch": 0.7929651640334348, "grad_norm": 3.7087364196777344, "learning_rate": 2.070348359665652e-07, "loss": 0.3681, "step": 16412 }, { "epoch": 0.7930134802145239, "grad_norm": 2.5032341480255127, "learning_rate": 2.0698651978547615e-07, "loss": 0.327, "step": 16413 }, { "epoch": 0.7930617963956129, "grad_norm": 2.127614736557007, "learning_rate": 2.069382036043871e-07, "loss": 0.203, "step": 16414 }, { "epoch": 0.7931101125767019, "grad_norm": 4.936425685882568, "learning_rate": 2.0688988742329805e-07, "loss": 0.3171, "step": 16415 }, { "epoch": 0.793158428757791, "grad_norm": 2.6665091514587402, "learning_rate": 2.06841571242209e-07, "loss": 0.2645, "step": 16416 }, { "epoch": 0.79320674493888, "grad_norm": 2.732205629348755, "learning_rate": 2.0679325506111998e-07, "loss": 0.2362, "step": 16417 }, { "epoch": 0.7932550611199691, "grad_norm": 3.807201862335205, "learning_rate": 2.0674493888003092e-07, "loss": 0.3389, "step": 16418 }, { "epoch": 0.7933033773010582, "grad_norm": 2.0818769931793213, "learning_rate": 2.0669662269894186e-07, "loss": 0.2437, "step": 16419 }, { "epoch": 0.7933516934821472, "grad_norm": 3.1807239055633545, "learning_rate": 2.0664830651785282e-07, "loss": 0.3144, "step": 16420 }, { "epoch": 0.7934000096632362, "grad_norm": 2.4776394367218018, "learning_rate": 2.0659999033676378e-07, "loss": 0.2415, "step": 16421 }, { "epoch": 0.7934483258443252, "grad_norm": 2.804844856262207, "learning_rate": 2.0655167415567472e-07, "loss": 0.2665, "step": 16422 }, { "epoch": 0.7934966420254143, "grad_norm": 3.637458086013794, "learning_rate": 2.0650335797458568e-07, "loss": 0.2263, "step": 16423 }, { "epoch": 0.7935449582065034, "grad_norm": 2.7771098613739014, "learning_rate": 2.0645504179349662e-07, "loss": 0.2652, "step": 16424 }, { "epoch": 0.7935932743875924, "grad_norm": 2.5681240558624268, "learning_rate": 2.0640672561240759e-07, "loss": 0.3207, "step": 16425 }, { "epoch": 0.7936415905686814, "grad_norm": 3.4489974975585938, "learning_rate": 2.0635840943131855e-07, "loss": 0.3252, "step": 16426 }, { "epoch": 0.7936899067497705, "grad_norm": 2.716576337814331, "learning_rate": 2.0631009325022949e-07, "loss": 0.3933, "step": 16427 }, { "epoch": 0.7937382229308595, "grad_norm": 3.3087403774261475, "learning_rate": 2.0626177706914045e-07, "loss": 0.299, "step": 16428 }, { "epoch": 0.7937865391119486, "grad_norm": 2.713181257247925, "learning_rate": 2.062134608880514e-07, "loss": 0.2368, "step": 16429 }, { "epoch": 0.7938348552930377, "grad_norm": 2.184392213821411, "learning_rate": 2.0616514470696235e-07, "loss": 0.2295, "step": 16430 }, { "epoch": 0.7938831714741267, "grad_norm": 2.643756866455078, "learning_rate": 2.0611682852587331e-07, "loss": 0.2514, "step": 16431 }, { "epoch": 0.7939314876552157, "grad_norm": 2.39670729637146, "learning_rate": 2.0606851234478425e-07, "loss": 0.2804, "step": 16432 }, { "epoch": 0.7939798038363047, "grad_norm": 3.5276618003845215, "learning_rate": 2.0602019616369522e-07, "loss": 0.2378, "step": 16433 }, { "epoch": 0.7940281200173939, "grad_norm": 5.344168663024902, "learning_rate": 2.0597187998260618e-07, "loss": 0.233, "step": 16434 }, { "epoch": 0.7940764361984829, "grad_norm": 1.9247393608093262, "learning_rate": 2.0592356380151712e-07, "loss": 0.129, "step": 16435 }, { "epoch": 0.7941247523795719, "grad_norm": 1.901397705078125, "learning_rate": 2.0587524762042808e-07, "loss": 0.2321, "step": 16436 }, { "epoch": 0.7941730685606609, "grad_norm": 3.6764087677001953, "learning_rate": 2.0582693143933902e-07, "loss": 0.3855, "step": 16437 }, { "epoch": 0.79422138474175, "grad_norm": 5.529055118560791, "learning_rate": 2.0577861525824996e-07, "loss": 0.2841, "step": 16438 }, { "epoch": 0.7942697009228391, "grad_norm": 2.1924333572387695, "learning_rate": 2.0573029907716095e-07, "loss": 0.2992, "step": 16439 }, { "epoch": 0.7943180171039281, "grad_norm": 2.207979202270508, "learning_rate": 2.0568198289607188e-07, "loss": 0.2367, "step": 16440 }, { "epoch": 0.7943663332850172, "grad_norm": 3.061037540435791, "learning_rate": 2.0563366671498285e-07, "loss": 0.3658, "step": 16441 }, { "epoch": 0.7944146494661062, "grad_norm": 2.1915690898895264, "learning_rate": 2.0558535053389378e-07, "loss": 0.2713, "step": 16442 }, { "epoch": 0.7944629656471952, "grad_norm": 16.344966888427734, "learning_rate": 2.0553703435280475e-07, "loss": 0.2489, "step": 16443 }, { "epoch": 0.7945112818282843, "grad_norm": 2.9562747478485107, "learning_rate": 2.054887181717157e-07, "loss": 0.3125, "step": 16444 }, { "epoch": 0.7945595980093734, "grad_norm": 2.2285006046295166, "learning_rate": 2.0544040199062665e-07, "loss": 0.1606, "step": 16445 }, { "epoch": 0.7946079141904624, "grad_norm": 3.576526641845703, "learning_rate": 2.0539208580953759e-07, "loss": 0.4101, "step": 16446 }, { "epoch": 0.7946562303715514, "grad_norm": 2.4969687461853027, "learning_rate": 2.0534376962844858e-07, "loss": 0.3311, "step": 16447 }, { "epoch": 0.7947045465526404, "grad_norm": 2.295236110687256, "learning_rate": 2.052954534473595e-07, "loss": 0.2661, "step": 16448 }, { "epoch": 0.7947528627337296, "grad_norm": 1.9929828643798828, "learning_rate": 2.0524713726627048e-07, "loss": 0.1907, "step": 16449 }, { "epoch": 0.7948011789148186, "grad_norm": 2.588848114013672, "learning_rate": 2.0519882108518141e-07, "loss": 0.3062, "step": 16450 }, { "epoch": 0.7948494950959076, "grad_norm": 3.0141823291778564, "learning_rate": 2.0515050490409235e-07, "loss": 0.2688, "step": 16451 }, { "epoch": 0.7948978112769967, "grad_norm": 6.002682209014893, "learning_rate": 2.0510218872300334e-07, "loss": 0.3275, "step": 16452 }, { "epoch": 0.7949461274580857, "grad_norm": 5.922032356262207, "learning_rate": 2.0505387254191428e-07, "loss": 0.44, "step": 16453 }, { "epoch": 0.7949944436391747, "grad_norm": 31.926664352416992, "learning_rate": 2.0500555636082522e-07, "loss": 0.395, "step": 16454 }, { "epoch": 0.7950427598202638, "grad_norm": 2.585139036178589, "learning_rate": 2.0495724017973618e-07, "loss": 0.2704, "step": 16455 }, { "epoch": 0.7950910760013529, "grad_norm": 3.7099030017852783, "learning_rate": 2.0490892399864714e-07, "loss": 0.357, "step": 16456 }, { "epoch": 0.7951393921824419, "grad_norm": 2.754276990890503, "learning_rate": 2.048606078175581e-07, "loss": 0.3894, "step": 16457 }, { "epoch": 0.7951877083635309, "grad_norm": 3.0956673622131348, "learning_rate": 2.0481229163646904e-07, "loss": 0.3309, "step": 16458 }, { "epoch": 0.79523602454462, "grad_norm": 7.397951126098633, "learning_rate": 2.0476397545537998e-07, "loss": 0.3557, "step": 16459 }, { "epoch": 0.7952843407257091, "grad_norm": 3.872576951980591, "learning_rate": 2.0471565927429097e-07, "loss": 0.3334, "step": 16460 }, { "epoch": 0.7953326569067981, "grad_norm": 2.6281161308288574, "learning_rate": 2.046673430932019e-07, "loss": 0.3405, "step": 16461 }, { "epoch": 0.7953809730878871, "grad_norm": 1.9618421792984009, "learning_rate": 2.0461902691211285e-07, "loss": 0.2474, "step": 16462 }, { "epoch": 0.7954292892689762, "grad_norm": 1.9712985754013062, "learning_rate": 2.045707107310238e-07, "loss": 0.2589, "step": 16463 }, { "epoch": 0.7954776054500652, "grad_norm": 3.704601287841797, "learning_rate": 2.0452239454993475e-07, "loss": 0.3351, "step": 16464 }, { "epoch": 0.7955259216311543, "grad_norm": 4.609308242797852, "learning_rate": 2.0447407836884574e-07, "loss": 0.2155, "step": 16465 }, { "epoch": 0.7955742378122433, "grad_norm": 2.561950922012329, "learning_rate": 2.0442576218775668e-07, "loss": 0.2881, "step": 16466 }, { "epoch": 0.7956225539933324, "grad_norm": 2.3358888626098633, "learning_rate": 2.043774460066676e-07, "loss": 0.2069, "step": 16467 }, { "epoch": 0.7956708701744214, "grad_norm": 8.015254020690918, "learning_rate": 2.0432912982557858e-07, "loss": 0.3508, "step": 16468 }, { "epoch": 0.7957191863555104, "grad_norm": 2.1576199531555176, "learning_rate": 2.0428081364448954e-07, "loss": 0.271, "step": 16469 }, { "epoch": 0.7957675025365996, "grad_norm": 2.6485207080841064, "learning_rate": 2.0423249746340048e-07, "loss": 0.3098, "step": 16470 }, { "epoch": 0.7958158187176886, "grad_norm": 5.418323993682861, "learning_rate": 2.0418418128231144e-07, "loss": 0.3661, "step": 16471 }, { "epoch": 0.7958641348987776, "grad_norm": 2.1160671710968018, "learning_rate": 2.0413586510122238e-07, "loss": 0.2165, "step": 16472 }, { "epoch": 0.7959124510798666, "grad_norm": 3.289400815963745, "learning_rate": 2.0408754892013337e-07, "loss": 0.3667, "step": 16473 }, { "epoch": 0.7959607672609557, "grad_norm": 3.3758132457733154, "learning_rate": 2.040392327390443e-07, "loss": 0.4001, "step": 16474 }, { "epoch": 0.7960090834420448, "grad_norm": 4.18000602722168, "learning_rate": 2.0399091655795524e-07, "loss": 0.3089, "step": 16475 }, { "epoch": 0.7960573996231338, "grad_norm": 2.9311583042144775, "learning_rate": 2.039426003768662e-07, "loss": 0.4172, "step": 16476 }, { "epoch": 0.7961057158042228, "grad_norm": 3.1974387168884277, "learning_rate": 2.0389428419577714e-07, "loss": 0.3059, "step": 16477 }, { "epoch": 0.7961540319853119, "grad_norm": 6.784008502960205, "learning_rate": 2.038459680146881e-07, "loss": 0.2429, "step": 16478 }, { "epoch": 0.7962023481664009, "grad_norm": 4.334059715270996, "learning_rate": 2.0379765183359907e-07, "loss": 0.4072, "step": 16479 }, { "epoch": 0.7962506643474899, "grad_norm": 7.6020708084106445, "learning_rate": 2.0374933565251e-07, "loss": 0.2042, "step": 16480 }, { "epoch": 0.7962989805285791, "grad_norm": 3.0436716079711914, "learning_rate": 2.0370101947142097e-07, "loss": 0.2965, "step": 16481 }, { "epoch": 0.7963472967096681, "grad_norm": 2.5927634239196777, "learning_rate": 2.0365270329033194e-07, "loss": 0.3127, "step": 16482 }, { "epoch": 0.7963956128907571, "grad_norm": 2.1387791633605957, "learning_rate": 2.0360438710924287e-07, "loss": 0.2246, "step": 16483 }, { "epoch": 0.7964439290718461, "grad_norm": 3.068889856338501, "learning_rate": 2.0355607092815384e-07, "loss": 0.4342, "step": 16484 }, { "epoch": 0.7964922452529352, "grad_norm": 2.1960370540618896, "learning_rate": 2.0350775474706477e-07, "loss": 0.2158, "step": 16485 }, { "epoch": 0.7965405614340243, "grad_norm": 3.0122170448303223, "learning_rate": 2.0345943856597574e-07, "loss": 0.2659, "step": 16486 }, { "epoch": 0.7965888776151133, "grad_norm": 5.1434645652771, "learning_rate": 2.034111223848867e-07, "loss": 0.4754, "step": 16487 }, { "epoch": 0.7966371937962023, "grad_norm": 1.816206455230713, "learning_rate": 2.0336280620379764e-07, "loss": 0.1808, "step": 16488 }, { "epoch": 0.7966855099772914, "grad_norm": 1.7081201076507568, "learning_rate": 2.033144900227086e-07, "loss": 0.1721, "step": 16489 }, { "epoch": 0.7967338261583804, "grad_norm": 2.7122766971588135, "learning_rate": 2.0326617384161954e-07, "loss": 0.3505, "step": 16490 }, { "epoch": 0.7967821423394695, "grad_norm": 8.249682426452637, "learning_rate": 2.032178576605305e-07, "loss": 0.4786, "step": 16491 }, { "epoch": 0.7968304585205586, "grad_norm": 2.5719118118286133, "learning_rate": 2.0316954147944147e-07, "loss": 0.3452, "step": 16492 }, { "epoch": 0.7968787747016476, "grad_norm": 2.4784023761749268, "learning_rate": 2.031212252983524e-07, "loss": 0.2322, "step": 16493 }, { "epoch": 0.7969270908827366, "grad_norm": 4.774710178375244, "learning_rate": 2.0307290911726334e-07, "loss": 0.376, "step": 16494 }, { "epoch": 0.7969754070638256, "grad_norm": 1.8688231706619263, "learning_rate": 2.0302459293617433e-07, "loss": 0.1819, "step": 16495 }, { "epoch": 0.7970237232449148, "grad_norm": 2.4204728603363037, "learning_rate": 2.0297627675508527e-07, "loss": 0.2462, "step": 16496 }, { "epoch": 0.7970720394260038, "grad_norm": 2.6288673877716064, "learning_rate": 2.0292796057399623e-07, "loss": 0.3123, "step": 16497 }, { "epoch": 0.7971203556070928, "grad_norm": 1.518062710762024, "learning_rate": 2.0287964439290717e-07, "loss": 0.1683, "step": 16498 }, { "epoch": 0.7971686717881818, "grad_norm": 2.5829076766967773, "learning_rate": 2.0283132821181813e-07, "loss": 0.34, "step": 16499 }, { "epoch": 0.7972169879692709, "grad_norm": 3.0609331130981445, "learning_rate": 2.027830120307291e-07, "loss": 0.3294, "step": 16500 }, { "epoch": 0.79726530415036, "grad_norm": 1.7398836612701416, "learning_rate": 2.0273469584964004e-07, "loss": 0.1934, "step": 16501 }, { "epoch": 0.797313620331449, "grad_norm": 2.4917776584625244, "learning_rate": 2.0268637966855097e-07, "loss": 0.2843, "step": 16502 }, { "epoch": 0.7973619365125381, "grad_norm": 2.7606399059295654, "learning_rate": 2.0263806348746194e-07, "loss": 0.3064, "step": 16503 }, { "epoch": 0.7974102526936271, "grad_norm": 2.603428363800049, "learning_rate": 2.025897473063729e-07, "loss": 0.248, "step": 16504 }, { "epoch": 0.7974585688747161, "grad_norm": 1.9848672151565552, "learning_rate": 2.0254143112528386e-07, "loss": 0.2585, "step": 16505 }, { "epoch": 0.7975068850558051, "grad_norm": 3.9583182334899902, "learning_rate": 2.024931149441948e-07, "loss": 0.3189, "step": 16506 }, { "epoch": 0.7975552012368943, "grad_norm": 2.5671324729919434, "learning_rate": 2.0244479876310574e-07, "loss": 0.2427, "step": 16507 }, { "epoch": 0.7976035174179833, "grad_norm": 2.293452501296997, "learning_rate": 2.0239648258201673e-07, "loss": 0.2497, "step": 16508 }, { "epoch": 0.7976518335990723, "grad_norm": 2.9609665870666504, "learning_rate": 2.0234816640092767e-07, "loss": 0.2679, "step": 16509 }, { "epoch": 0.7977001497801613, "grad_norm": 2.95833420753479, "learning_rate": 2.022998502198386e-07, "loss": 0.3741, "step": 16510 }, { "epoch": 0.7977484659612504, "grad_norm": 2.440361738204956, "learning_rate": 2.0225153403874957e-07, "loss": 0.2546, "step": 16511 }, { "epoch": 0.7977967821423395, "grad_norm": 2.444267511367798, "learning_rate": 2.0220321785766053e-07, "loss": 0.2908, "step": 16512 }, { "epoch": 0.7978450983234285, "grad_norm": 3.108119487762451, "learning_rate": 2.021549016765715e-07, "loss": 0.2944, "step": 16513 }, { "epoch": 0.7978934145045176, "grad_norm": 2.6385021209716797, "learning_rate": 2.0210658549548243e-07, "loss": 0.2532, "step": 16514 }, { "epoch": 0.7979417306856066, "grad_norm": 2.813218355178833, "learning_rate": 2.0205826931439337e-07, "loss": 0.3784, "step": 16515 }, { "epoch": 0.7979900468666956, "grad_norm": 9.649933815002441, "learning_rate": 2.0200995313330433e-07, "loss": 0.3637, "step": 16516 }, { "epoch": 0.7980383630477847, "grad_norm": 2.1555449962615967, "learning_rate": 2.019616369522153e-07, "loss": 0.2151, "step": 16517 }, { "epoch": 0.7980866792288738, "grad_norm": 3.052684783935547, "learning_rate": 2.0191332077112623e-07, "loss": 0.2319, "step": 16518 }, { "epoch": 0.7981349954099628, "grad_norm": 3.217371940612793, "learning_rate": 2.018650045900372e-07, "loss": 0.323, "step": 16519 }, { "epoch": 0.7981833115910518, "grad_norm": 4.509215831756592, "learning_rate": 2.0181668840894814e-07, "loss": 0.2557, "step": 16520 }, { "epoch": 0.7982316277721409, "grad_norm": 3.529757499694824, "learning_rate": 2.0176837222785913e-07, "loss": 0.4111, "step": 16521 }, { "epoch": 0.79827994395323, "grad_norm": 2.591884136199951, "learning_rate": 2.0172005604677006e-07, "loss": 0.2874, "step": 16522 }, { "epoch": 0.798328260134319, "grad_norm": 4.311592102050781, "learning_rate": 2.01671739865681e-07, "loss": 0.2524, "step": 16523 }, { "epoch": 0.798376576315408, "grad_norm": 3.5328404903411865, "learning_rate": 2.0162342368459196e-07, "loss": 0.3558, "step": 16524 }, { "epoch": 0.7984248924964971, "grad_norm": 3.738241672515869, "learning_rate": 2.015751075035029e-07, "loss": 0.3993, "step": 16525 }, { "epoch": 0.7984732086775861, "grad_norm": 3.8023815155029297, "learning_rate": 2.0152679132241386e-07, "loss": 0.2666, "step": 16526 }, { "epoch": 0.7985215248586752, "grad_norm": 2.062983512878418, "learning_rate": 2.0147847514132483e-07, "loss": 0.2529, "step": 16527 }, { "epoch": 0.7985698410397642, "grad_norm": 4.561412334442139, "learning_rate": 2.0143015896023577e-07, "loss": 0.3393, "step": 16528 }, { "epoch": 0.7986181572208533, "grad_norm": 14.003016471862793, "learning_rate": 2.013818427791467e-07, "loss": 0.2886, "step": 16529 }, { "epoch": 0.7986664734019423, "grad_norm": 2.519451141357422, "learning_rate": 2.013335265980577e-07, "loss": 0.3176, "step": 16530 }, { "epoch": 0.7987147895830313, "grad_norm": 2.005500555038452, "learning_rate": 2.0128521041696863e-07, "loss": 0.2249, "step": 16531 }, { "epoch": 0.7987631057641204, "grad_norm": 6.650039196014404, "learning_rate": 2.012368942358796e-07, "loss": 0.3689, "step": 16532 }, { "epoch": 0.7988114219452095, "grad_norm": 3.9122061729431152, "learning_rate": 2.0118857805479053e-07, "loss": 0.3087, "step": 16533 }, { "epoch": 0.7988597381262985, "grad_norm": 2.75940203666687, "learning_rate": 2.011402618737015e-07, "loss": 0.3001, "step": 16534 }, { "epoch": 0.7989080543073875, "grad_norm": 6.639848709106445, "learning_rate": 2.0109194569261246e-07, "loss": 0.3585, "step": 16535 }, { "epoch": 0.7989563704884766, "grad_norm": 3.544847011566162, "learning_rate": 2.010436295115234e-07, "loss": 0.3981, "step": 16536 }, { "epoch": 0.7990046866695656, "grad_norm": 2.5516576766967773, "learning_rate": 2.0099531333043433e-07, "loss": 0.3415, "step": 16537 }, { "epoch": 0.7990530028506547, "grad_norm": 2.5843541622161865, "learning_rate": 2.009469971493453e-07, "loss": 0.3419, "step": 16538 }, { "epoch": 0.7991013190317438, "grad_norm": 2.343752145767212, "learning_rate": 2.0089868096825626e-07, "loss": 0.2203, "step": 16539 }, { "epoch": 0.7991496352128328, "grad_norm": 9.33665943145752, "learning_rate": 2.0085036478716722e-07, "loss": 0.2559, "step": 16540 }, { "epoch": 0.7991979513939218, "grad_norm": 2.935852289199829, "learning_rate": 2.0080204860607816e-07, "loss": 0.4522, "step": 16541 }, { "epoch": 0.7992462675750108, "grad_norm": 5.218754291534424, "learning_rate": 2.007537324249891e-07, "loss": 0.3721, "step": 16542 }, { "epoch": 0.7992945837561, "grad_norm": 3.7988674640655518, "learning_rate": 2.007054162439001e-07, "loss": 0.3725, "step": 16543 }, { "epoch": 0.799342899937189, "grad_norm": 8.342183113098145, "learning_rate": 2.0065710006281103e-07, "loss": 0.3344, "step": 16544 }, { "epoch": 0.799391216118278, "grad_norm": 2.271256446838379, "learning_rate": 2.0060878388172196e-07, "loss": 0.2207, "step": 16545 }, { "epoch": 0.799439532299367, "grad_norm": 2.594223976135254, "learning_rate": 2.0056046770063293e-07, "loss": 0.331, "step": 16546 }, { "epoch": 0.7994878484804561, "grad_norm": 3.79805326461792, "learning_rate": 2.005121515195439e-07, "loss": 0.3039, "step": 16547 }, { "epoch": 0.7995361646615452, "grad_norm": 1.6859132051467896, "learning_rate": 2.0046383533845486e-07, "loss": 0.1812, "step": 16548 }, { "epoch": 0.7995844808426342, "grad_norm": 2.666145086288452, "learning_rate": 2.004155191573658e-07, "loss": 0.401, "step": 16549 }, { "epoch": 0.7996327970237233, "grad_norm": 2.2737488746643066, "learning_rate": 2.0036720297627673e-07, "loss": 0.234, "step": 16550 }, { "epoch": 0.7996811132048123, "grad_norm": 2.590505361557007, "learning_rate": 2.003188867951877e-07, "loss": 0.2804, "step": 16551 }, { "epoch": 0.7997294293859013, "grad_norm": 2.436204433441162, "learning_rate": 2.0027057061409866e-07, "loss": 0.1948, "step": 16552 }, { "epoch": 0.7997777455669904, "grad_norm": 2.3607113361358643, "learning_rate": 2.002222544330096e-07, "loss": 0.2127, "step": 16553 }, { "epoch": 0.7998260617480795, "grad_norm": 2.1379077434539795, "learning_rate": 2.0017393825192056e-07, "loss": 0.2691, "step": 16554 }, { "epoch": 0.7998743779291685, "grad_norm": 2.118077516555786, "learning_rate": 2.001256220708315e-07, "loss": 0.2231, "step": 16555 }, { "epoch": 0.7999226941102575, "grad_norm": 2.379065752029419, "learning_rate": 2.0007730588974249e-07, "loss": 0.2686, "step": 16556 }, { "epoch": 0.7999710102913465, "grad_norm": 2.061816453933716, "learning_rate": 2.0002898970865342e-07, "loss": 0.2372, "step": 16557 }, { "epoch": 0.8000193264724356, "grad_norm": 3.350367784500122, "learning_rate": 1.9998067352756436e-07, "loss": 0.2104, "step": 16558 }, { "epoch": 0.8000676426535247, "grad_norm": 2.8013675212860107, "learning_rate": 1.9993235734647532e-07, "loss": 0.3187, "step": 16559 }, { "epoch": 0.8001159588346137, "grad_norm": 2.28269624710083, "learning_rate": 1.998840411653863e-07, "loss": 0.2671, "step": 16560 }, { "epoch": 0.8001642750157028, "grad_norm": 2.8170580863952637, "learning_rate": 1.9983572498429723e-07, "loss": 0.3213, "step": 16561 }, { "epoch": 0.8002125911967918, "grad_norm": 3.207282304763794, "learning_rate": 1.997874088032082e-07, "loss": 0.2063, "step": 16562 }, { "epoch": 0.8002609073778808, "grad_norm": 3.3361082077026367, "learning_rate": 1.9973909262211913e-07, "loss": 0.4175, "step": 16563 }, { "epoch": 0.8003092235589699, "grad_norm": 3.522792339324951, "learning_rate": 1.996907764410301e-07, "loss": 0.2852, "step": 16564 }, { "epoch": 0.800357539740059, "grad_norm": 6.682438373565674, "learning_rate": 1.9964246025994105e-07, "loss": 0.2794, "step": 16565 }, { "epoch": 0.800405855921148, "grad_norm": 2.8313138484954834, "learning_rate": 1.99594144078852e-07, "loss": 0.256, "step": 16566 }, { "epoch": 0.800454172102237, "grad_norm": 3.2447874546051025, "learning_rate": 1.9954582789776295e-07, "loss": 0.1881, "step": 16567 }, { "epoch": 0.800502488283326, "grad_norm": 2.2766427993774414, "learning_rate": 1.994975117166739e-07, "loss": 0.2375, "step": 16568 }, { "epoch": 0.8005508044644152, "grad_norm": 3.5003528594970703, "learning_rate": 1.9944919553558486e-07, "loss": 0.2945, "step": 16569 }, { "epoch": 0.8005991206455042, "grad_norm": 2.7720680236816406, "learning_rate": 1.9940087935449582e-07, "loss": 0.3174, "step": 16570 }, { "epoch": 0.8006474368265932, "grad_norm": 1.7088783979415894, "learning_rate": 1.9935256317340676e-07, "loss": 0.1472, "step": 16571 }, { "epoch": 0.8006957530076823, "grad_norm": 2.1754515171051025, "learning_rate": 1.9930424699231772e-07, "loss": 0.2545, "step": 16572 }, { "epoch": 0.8007440691887713, "grad_norm": 2.624237060546875, "learning_rate": 1.9925593081122868e-07, "loss": 0.1981, "step": 16573 }, { "epoch": 0.8007923853698604, "grad_norm": 2.6514692306518555, "learning_rate": 1.9920761463013962e-07, "loss": 0.3037, "step": 16574 }, { "epoch": 0.8008407015509494, "grad_norm": 2.800281047821045, "learning_rate": 1.9915929844905059e-07, "loss": 0.3014, "step": 16575 }, { "epoch": 0.8008890177320385, "grad_norm": 3.6835739612579346, "learning_rate": 1.9911098226796152e-07, "loss": 0.3182, "step": 16576 }, { "epoch": 0.8009373339131275, "grad_norm": 2.406822681427002, "learning_rate": 1.9906266608687246e-07, "loss": 0.2445, "step": 16577 }, { "epoch": 0.8009856500942165, "grad_norm": 2.593822717666626, "learning_rate": 1.9901434990578345e-07, "loss": 0.1731, "step": 16578 }, { "epoch": 0.8010339662753057, "grad_norm": 4.568345069885254, "learning_rate": 1.989660337246944e-07, "loss": 0.3219, "step": 16579 }, { "epoch": 0.8010822824563947, "grad_norm": 3.5012032985687256, "learning_rate": 1.9891771754360535e-07, "loss": 0.2854, "step": 16580 }, { "epoch": 0.8011305986374837, "grad_norm": 3.444396495819092, "learning_rate": 1.988694013625163e-07, "loss": 0.3338, "step": 16581 }, { "epoch": 0.8011789148185727, "grad_norm": 3.161654472351074, "learning_rate": 1.9882108518142725e-07, "loss": 0.3876, "step": 16582 }, { "epoch": 0.8012272309996618, "grad_norm": 2.4831411838531494, "learning_rate": 1.9877276900033822e-07, "loss": 0.2804, "step": 16583 }, { "epoch": 0.8012755471807508, "grad_norm": 4.74247932434082, "learning_rate": 1.9872445281924915e-07, "loss": 0.383, "step": 16584 }, { "epoch": 0.8013238633618399, "grad_norm": 2.7915141582489014, "learning_rate": 1.986761366381601e-07, "loss": 0.3347, "step": 16585 }, { "epoch": 0.8013721795429289, "grad_norm": 2.163766860961914, "learning_rate": 1.9862782045707108e-07, "loss": 0.1692, "step": 16586 }, { "epoch": 0.801420495724018, "grad_norm": 1.415387749671936, "learning_rate": 1.9857950427598202e-07, "loss": 0.1521, "step": 16587 }, { "epoch": 0.801468811905107, "grad_norm": 3.122431755065918, "learning_rate": 1.9853118809489298e-07, "loss": 0.2348, "step": 16588 }, { "epoch": 0.801517128086196, "grad_norm": 2.592475175857544, "learning_rate": 1.9848287191380392e-07, "loss": 0.3405, "step": 16589 }, { "epoch": 0.8015654442672852, "grad_norm": 2.548877239227295, "learning_rate": 1.9843455573271486e-07, "loss": 0.3084, "step": 16590 }, { "epoch": 0.8016137604483742, "grad_norm": 1.987733006477356, "learning_rate": 1.9838623955162585e-07, "loss": 0.2021, "step": 16591 }, { "epoch": 0.8016620766294632, "grad_norm": 3.4592502117156982, "learning_rate": 1.9833792337053678e-07, "loss": 0.4012, "step": 16592 }, { "epoch": 0.8017103928105522, "grad_norm": 2.481444835662842, "learning_rate": 1.9828960718944772e-07, "loss": 0.2686, "step": 16593 }, { "epoch": 0.8017587089916413, "grad_norm": 2.2803821563720703, "learning_rate": 1.9824129100835868e-07, "loss": 0.1935, "step": 16594 }, { "epoch": 0.8018070251727304, "grad_norm": 2.386317014694214, "learning_rate": 1.9819297482726965e-07, "loss": 0.2991, "step": 16595 }, { "epoch": 0.8018553413538194, "grad_norm": 4.195529460906982, "learning_rate": 1.981446586461806e-07, "loss": 0.2842, "step": 16596 }, { "epoch": 0.8019036575349084, "grad_norm": 3.183776378631592, "learning_rate": 1.9809634246509155e-07, "loss": 0.2621, "step": 16597 }, { "epoch": 0.8019519737159975, "grad_norm": 3.1690664291381836, "learning_rate": 1.980480262840025e-07, "loss": 0.2781, "step": 16598 }, { "epoch": 0.8020002898970865, "grad_norm": 2.6092796325683594, "learning_rate": 1.9799971010291348e-07, "loss": 0.2727, "step": 16599 }, { "epoch": 0.8020486060781756, "grad_norm": 3.2895028591156006, "learning_rate": 1.9795139392182441e-07, "loss": 0.3081, "step": 16600 }, { "epoch": 0.8020969222592647, "grad_norm": 1.829578161239624, "learning_rate": 1.9790307774073535e-07, "loss": 0.2208, "step": 16601 }, { "epoch": 0.8021452384403537, "grad_norm": 2.0825443267822266, "learning_rate": 1.9785476155964632e-07, "loss": 0.1967, "step": 16602 }, { "epoch": 0.8021935546214427, "grad_norm": 2.8340916633605957, "learning_rate": 1.9780644537855725e-07, "loss": 0.2699, "step": 16603 }, { "epoch": 0.8022418708025317, "grad_norm": 4.023831367492676, "learning_rate": 1.9775812919746824e-07, "loss": 0.3192, "step": 16604 }, { "epoch": 0.8022901869836209, "grad_norm": 3.165884256362915, "learning_rate": 1.9770981301637918e-07, "loss": 0.2698, "step": 16605 }, { "epoch": 0.8023385031647099, "grad_norm": 2.3810088634490967, "learning_rate": 1.9766149683529012e-07, "loss": 0.3406, "step": 16606 }, { "epoch": 0.8023868193457989, "grad_norm": 2.4109749794006348, "learning_rate": 1.9761318065420108e-07, "loss": 0.1976, "step": 16607 }, { "epoch": 0.8024351355268879, "grad_norm": 3.509284257888794, "learning_rate": 1.9756486447311204e-07, "loss": 0.3995, "step": 16608 }, { "epoch": 0.802483451707977, "grad_norm": 13.426977157592773, "learning_rate": 1.9751654829202298e-07, "loss": 0.3145, "step": 16609 }, { "epoch": 0.802531767889066, "grad_norm": 1.891316294670105, "learning_rate": 1.9746823211093395e-07, "loss": 0.2072, "step": 16610 }, { "epoch": 0.8025800840701551, "grad_norm": 6.108478546142578, "learning_rate": 1.9741991592984488e-07, "loss": 0.3056, "step": 16611 }, { "epoch": 0.8026284002512442, "grad_norm": 2.341501474380493, "learning_rate": 1.9737159974875587e-07, "loss": 0.2651, "step": 16612 }, { "epoch": 0.8026767164323332, "grad_norm": 13.80064868927002, "learning_rate": 1.973232835676668e-07, "loss": 0.2851, "step": 16613 }, { "epoch": 0.8027250326134222, "grad_norm": 2.2727158069610596, "learning_rate": 1.9727496738657775e-07, "loss": 0.1801, "step": 16614 }, { "epoch": 0.8027733487945112, "grad_norm": 2.3198764324188232, "learning_rate": 1.972266512054887e-07, "loss": 0.2499, "step": 16615 }, { "epoch": 0.8028216649756004, "grad_norm": 2.536834239959717, "learning_rate": 1.9717833502439965e-07, "loss": 0.2705, "step": 16616 }, { "epoch": 0.8028699811566894, "grad_norm": 6.2025299072265625, "learning_rate": 1.971300188433106e-07, "loss": 0.2218, "step": 16617 }, { "epoch": 0.8029182973377784, "grad_norm": 30.946693420410156, "learning_rate": 1.9708170266222158e-07, "loss": 0.2207, "step": 16618 }, { "epoch": 0.8029666135188674, "grad_norm": 2.667545795440674, "learning_rate": 1.9703338648113251e-07, "loss": 0.2977, "step": 16619 }, { "epoch": 0.8030149296999565, "grad_norm": 3.1146018505096436, "learning_rate": 1.9698507030004348e-07, "loss": 0.35, "step": 16620 }, { "epoch": 0.8030632458810456, "grad_norm": 2.5866096019744873, "learning_rate": 1.9693675411895444e-07, "loss": 0.3023, "step": 16621 }, { "epoch": 0.8031115620621346, "grad_norm": 9.600955963134766, "learning_rate": 1.9688843793786538e-07, "loss": 0.3095, "step": 16622 }, { "epoch": 0.8031598782432237, "grad_norm": 2.0177054405212402, "learning_rate": 1.9684012175677634e-07, "loss": 0.2209, "step": 16623 }, { "epoch": 0.8032081944243127, "grad_norm": 2.4616503715515137, "learning_rate": 1.9679180557568728e-07, "loss": 0.276, "step": 16624 }, { "epoch": 0.8032565106054017, "grad_norm": 5.286370754241943, "learning_rate": 1.9674348939459824e-07, "loss": 0.3855, "step": 16625 }, { "epoch": 0.8033048267864908, "grad_norm": 3.070286989212036, "learning_rate": 1.966951732135092e-07, "loss": 0.439, "step": 16626 }, { "epoch": 0.8033531429675799, "grad_norm": 1.8972764015197754, "learning_rate": 1.9664685703242014e-07, "loss": 0.1969, "step": 16627 }, { "epoch": 0.8034014591486689, "grad_norm": 2.661076307296753, "learning_rate": 1.965985408513311e-07, "loss": 0.2228, "step": 16628 }, { "epoch": 0.8034497753297579, "grad_norm": 2.4107892513275146, "learning_rate": 1.9655022467024205e-07, "loss": 0.3534, "step": 16629 }, { "epoch": 0.803498091510847, "grad_norm": 2.928945302963257, "learning_rate": 1.96501908489153e-07, "loss": 0.3756, "step": 16630 }, { "epoch": 0.8035464076919361, "grad_norm": 3.4257051944732666, "learning_rate": 1.9645359230806397e-07, "loss": 0.4662, "step": 16631 }, { "epoch": 0.8035947238730251, "grad_norm": 2.953564405441284, "learning_rate": 1.964052761269749e-07, "loss": 0.3198, "step": 16632 }, { "epoch": 0.8036430400541141, "grad_norm": 1.8598440885543823, "learning_rate": 1.9635695994588585e-07, "loss": 0.1705, "step": 16633 }, { "epoch": 0.8036913562352032, "grad_norm": 2.5605738162994385, "learning_rate": 1.9630864376479684e-07, "loss": 0.3342, "step": 16634 }, { "epoch": 0.8037396724162922, "grad_norm": 2.218430995941162, "learning_rate": 1.9626032758370777e-07, "loss": 0.2628, "step": 16635 }, { "epoch": 0.8037879885973812, "grad_norm": 3.060026168823242, "learning_rate": 1.9621201140261874e-07, "loss": 0.2503, "step": 16636 }, { "epoch": 0.8038363047784703, "grad_norm": 2.601621150970459, "learning_rate": 1.9616369522152968e-07, "loss": 0.2124, "step": 16637 }, { "epoch": 0.8038846209595594, "grad_norm": 2.364433526992798, "learning_rate": 1.9611537904044064e-07, "loss": 0.2145, "step": 16638 }, { "epoch": 0.8039329371406484, "grad_norm": 23.388652801513672, "learning_rate": 1.960670628593516e-07, "loss": 0.3041, "step": 16639 }, { "epoch": 0.8039812533217374, "grad_norm": 2.2872061729431152, "learning_rate": 1.9601874667826254e-07, "loss": 0.2185, "step": 16640 }, { "epoch": 0.8040295695028264, "grad_norm": 4.451651573181152, "learning_rate": 1.9597043049717348e-07, "loss": 0.2738, "step": 16641 }, { "epoch": 0.8040778856839156, "grad_norm": 4.165406703948975, "learning_rate": 1.9592211431608444e-07, "loss": 0.2571, "step": 16642 }, { "epoch": 0.8041262018650046, "grad_norm": 3.079643726348877, "learning_rate": 1.958737981349954e-07, "loss": 0.3548, "step": 16643 }, { "epoch": 0.8041745180460936, "grad_norm": 2.7518062591552734, "learning_rate": 1.9582548195390637e-07, "loss": 0.314, "step": 16644 }, { "epoch": 0.8042228342271827, "grad_norm": 1.8502089977264404, "learning_rate": 1.957771657728173e-07, "loss": 0.2389, "step": 16645 }, { "epoch": 0.8042711504082717, "grad_norm": 2.697803020477295, "learning_rate": 1.9572884959172824e-07, "loss": 0.3328, "step": 16646 }, { "epoch": 0.8043194665893608, "grad_norm": 1.8662309646606445, "learning_rate": 1.9568053341063923e-07, "loss": 0.2207, "step": 16647 }, { "epoch": 0.8043677827704498, "grad_norm": 1.532204508781433, "learning_rate": 1.9563221722955017e-07, "loss": 0.1393, "step": 16648 }, { "epoch": 0.8044160989515389, "grad_norm": 2.4875597953796387, "learning_rate": 1.955839010484611e-07, "loss": 0.3334, "step": 16649 }, { "epoch": 0.8044644151326279, "grad_norm": 1.6812268495559692, "learning_rate": 1.9553558486737207e-07, "loss": 0.1854, "step": 16650 }, { "epoch": 0.8045127313137169, "grad_norm": 2.5041489601135254, "learning_rate": 1.9548726868628304e-07, "loss": 0.2611, "step": 16651 }, { "epoch": 0.8045610474948061, "grad_norm": 2.395444869995117, "learning_rate": 1.95438952505194e-07, "loss": 0.2385, "step": 16652 }, { "epoch": 0.8046093636758951, "grad_norm": 6.94688081741333, "learning_rate": 1.9539063632410494e-07, "loss": 0.2185, "step": 16653 }, { "epoch": 0.8046576798569841, "grad_norm": 2.4449563026428223, "learning_rate": 1.9534232014301587e-07, "loss": 0.2466, "step": 16654 }, { "epoch": 0.8047059960380731, "grad_norm": 2.896484136581421, "learning_rate": 1.9529400396192684e-07, "loss": 0.4328, "step": 16655 }, { "epoch": 0.8047543122191622, "grad_norm": 83.95186614990234, "learning_rate": 1.952456877808378e-07, "loss": 0.3318, "step": 16656 }, { "epoch": 0.8048026284002513, "grad_norm": 2.780709981918335, "learning_rate": 1.9519737159974874e-07, "loss": 0.3017, "step": 16657 }, { "epoch": 0.8048509445813403, "grad_norm": 2.066927194595337, "learning_rate": 1.951490554186597e-07, "loss": 0.1936, "step": 16658 }, { "epoch": 0.8048992607624293, "grad_norm": 1.9218530654907227, "learning_rate": 1.9510073923757064e-07, "loss": 0.1933, "step": 16659 }, { "epoch": 0.8049475769435184, "grad_norm": 2.3841969966888428, "learning_rate": 1.9505242305648163e-07, "loss": 0.2471, "step": 16660 }, { "epoch": 0.8049958931246074, "grad_norm": 6.566352367401123, "learning_rate": 1.9500410687539257e-07, "loss": 0.3087, "step": 16661 }, { "epoch": 0.8050442093056964, "grad_norm": 2.2340915203094482, "learning_rate": 1.949557906943035e-07, "loss": 0.2068, "step": 16662 }, { "epoch": 0.8050925254867856, "grad_norm": 2.8008642196655273, "learning_rate": 1.9490747451321447e-07, "loss": 0.2904, "step": 16663 }, { "epoch": 0.8051408416678746, "grad_norm": 19.5950984954834, "learning_rate": 1.948591583321254e-07, "loss": 0.334, "step": 16664 }, { "epoch": 0.8051891578489636, "grad_norm": 2.647749662399292, "learning_rate": 1.9481084215103637e-07, "loss": 0.2946, "step": 16665 }, { "epoch": 0.8052374740300526, "grad_norm": 2.1166210174560547, "learning_rate": 1.9476252596994733e-07, "loss": 0.1827, "step": 16666 }, { "epoch": 0.8052857902111417, "grad_norm": 1.751428484916687, "learning_rate": 1.9471420978885827e-07, "loss": 0.2213, "step": 16667 }, { "epoch": 0.8053341063922308, "grad_norm": 2.3661861419677734, "learning_rate": 1.9466589360776923e-07, "loss": 0.3056, "step": 16668 }, { "epoch": 0.8053824225733198, "grad_norm": 5.231388092041016, "learning_rate": 1.946175774266802e-07, "loss": 0.2774, "step": 16669 }, { "epoch": 0.8054307387544088, "grad_norm": 2.5937674045562744, "learning_rate": 1.9456926124559114e-07, "loss": 0.2604, "step": 16670 }, { "epoch": 0.8054790549354979, "grad_norm": 3.468712568283081, "learning_rate": 1.945209450645021e-07, "loss": 0.3352, "step": 16671 }, { "epoch": 0.8055273711165869, "grad_norm": 2.285611629486084, "learning_rate": 1.9447262888341304e-07, "loss": 0.3203, "step": 16672 }, { "epoch": 0.805575687297676, "grad_norm": 2.3164920806884766, "learning_rate": 1.94424312702324e-07, "loss": 0.2236, "step": 16673 }, { "epoch": 0.8056240034787651, "grad_norm": 2.846597671508789, "learning_rate": 1.9437599652123496e-07, "loss": 0.3253, "step": 16674 }, { "epoch": 0.8056723196598541, "grad_norm": 2.5090367794036865, "learning_rate": 1.943276803401459e-07, "loss": 0.2483, "step": 16675 }, { "epoch": 0.8057206358409431, "grad_norm": 2.199345350265503, "learning_rate": 1.9427936415905686e-07, "loss": 0.2728, "step": 16676 }, { "epoch": 0.8057689520220321, "grad_norm": 2.4534149169921875, "learning_rate": 1.942310479779678e-07, "loss": 0.316, "step": 16677 }, { "epoch": 0.8058172682031213, "grad_norm": 2.4669978618621826, "learning_rate": 1.9418273179687877e-07, "loss": 0.2727, "step": 16678 }, { "epoch": 0.8058655843842103, "grad_norm": 2.766056776046753, "learning_rate": 1.9413441561578973e-07, "loss": 0.2944, "step": 16679 }, { "epoch": 0.8059139005652993, "grad_norm": 2.692314863204956, "learning_rate": 1.9408609943470067e-07, "loss": 0.2856, "step": 16680 }, { "epoch": 0.8059622167463883, "grad_norm": 2.81868314743042, "learning_rate": 1.940377832536116e-07, "loss": 0.3731, "step": 16681 }, { "epoch": 0.8060105329274774, "grad_norm": 4.291779518127441, "learning_rate": 1.939894670725226e-07, "loss": 0.286, "step": 16682 }, { "epoch": 0.8060588491085665, "grad_norm": 2.7148611545562744, "learning_rate": 1.9394115089143353e-07, "loss": 0.2475, "step": 16683 }, { "epoch": 0.8061071652896555, "grad_norm": 1.5141386985778809, "learning_rate": 1.938928347103445e-07, "loss": 0.1559, "step": 16684 }, { "epoch": 0.8061554814707446, "grad_norm": 3.0220234394073486, "learning_rate": 1.9384451852925543e-07, "loss": 0.3125, "step": 16685 }, { "epoch": 0.8062037976518336, "grad_norm": 3.3815600872039795, "learning_rate": 1.937962023481664e-07, "loss": 0.3215, "step": 16686 }, { "epoch": 0.8062521138329226, "grad_norm": 2.4106051921844482, "learning_rate": 1.9374788616707736e-07, "loss": 0.3371, "step": 16687 }, { "epoch": 0.8063004300140116, "grad_norm": 2.684433698654175, "learning_rate": 1.936995699859883e-07, "loss": 0.2959, "step": 16688 }, { "epoch": 0.8063487461951008, "grad_norm": 1.7890865802764893, "learning_rate": 1.9365125380489923e-07, "loss": 0.1817, "step": 16689 }, { "epoch": 0.8063970623761898, "grad_norm": 3.494203805923462, "learning_rate": 1.936029376238102e-07, "loss": 0.3529, "step": 16690 }, { "epoch": 0.8064453785572788, "grad_norm": 2.199293851852417, "learning_rate": 1.9355462144272116e-07, "loss": 0.2577, "step": 16691 }, { "epoch": 0.8064936947383679, "grad_norm": 2.8472800254821777, "learning_rate": 1.9350630526163213e-07, "loss": 0.3093, "step": 16692 }, { "epoch": 0.8065420109194569, "grad_norm": 2.5113492012023926, "learning_rate": 1.9345798908054306e-07, "loss": 0.3097, "step": 16693 }, { "epoch": 0.806590327100546, "grad_norm": 1.914905309677124, "learning_rate": 1.93409672899454e-07, "loss": 0.1816, "step": 16694 }, { "epoch": 0.806638643281635, "grad_norm": 3.3618664741516113, "learning_rate": 1.93361356718365e-07, "loss": 0.3535, "step": 16695 }, { "epoch": 0.8066869594627241, "grad_norm": 2.4051740169525146, "learning_rate": 1.9331304053727593e-07, "loss": 0.2501, "step": 16696 }, { "epoch": 0.8067352756438131, "grad_norm": 3.696422815322876, "learning_rate": 1.9326472435618687e-07, "loss": 0.2956, "step": 16697 }, { "epoch": 0.8067835918249021, "grad_norm": 2.4779467582702637, "learning_rate": 1.9321640817509783e-07, "loss": 0.1917, "step": 16698 }, { "epoch": 0.8068319080059912, "grad_norm": 3.089768409729004, "learning_rate": 1.931680919940088e-07, "loss": 0.4447, "step": 16699 }, { "epoch": 0.8068802241870803, "grad_norm": 2.6665053367614746, "learning_rate": 1.9311977581291976e-07, "loss": 0.3197, "step": 16700 }, { "epoch": 0.8069285403681693, "grad_norm": 1.9974244832992554, "learning_rate": 1.930714596318307e-07, "loss": 0.2216, "step": 16701 }, { "epoch": 0.8069768565492583, "grad_norm": 2.4653573036193848, "learning_rate": 1.9302314345074163e-07, "loss": 0.217, "step": 16702 }, { "epoch": 0.8070251727303474, "grad_norm": 3.535311460494995, "learning_rate": 1.929748272696526e-07, "loss": 0.2742, "step": 16703 }, { "epoch": 0.8070734889114365, "grad_norm": 2.46089768409729, "learning_rate": 1.9292651108856356e-07, "loss": 0.2757, "step": 16704 }, { "epoch": 0.8071218050925255, "grad_norm": 2.3920769691467285, "learning_rate": 1.928781949074745e-07, "loss": 0.2301, "step": 16705 }, { "epoch": 0.8071701212736145, "grad_norm": 2.6909127235412598, "learning_rate": 1.9282987872638546e-07, "loss": 0.3433, "step": 16706 }, { "epoch": 0.8072184374547036, "grad_norm": 2.3172993659973145, "learning_rate": 1.927815625452964e-07, "loss": 0.2407, "step": 16707 }, { "epoch": 0.8072667536357926, "grad_norm": 2.8404905796051025, "learning_rate": 1.927332463642074e-07, "loss": 0.3253, "step": 16708 }, { "epoch": 0.8073150698168817, "grad_norm": 3.374100923538208, "learning_rate": 1.9268493018311832e-07, "loss": 0.3351, "step": 16709 }, { "epoch": 0.8073633859979708, "grad_norm": 2.467466115951538, "learning_rate": 1.9263661400202926e-07, "loss": 0.278, "step": 16710 }, { "epoch": 0.8074117021790598, "grad_norm": 3.500704526901245, "learning_rate": 1.9258829782094023e-07, "loss": 0.4555, "step": 16711 }, { "epoch": 0.8074600183601488, "grad_norm": 2.497483730316162, "learning_rate": 1.925399816398512e-07, "loss": 0.3619, "step": 16712 }, { "epoch": 0.8075083345412378, "grad_norm": 4.049330711364746, "learning_rate": 1.9249166545876213e-07, "loss": 0.3414, "step": 16713 }, { "epoch": 0.8075566507223269, "grad_norm": 9.18395709991455, "learning_rate": 1.924433492776731e-07, "loss": 0.184, "step": 16714 }, { "epoch": 0.807604966903416, "grad_norm": 2.5813236236572266, "learning_rate": 1.9239503309658403e-07, "loss": 0.3398, "step": 16715 }, { "epoch": 0.807653283084505, "grad_norm": 3.0801546573638916, "learning_rate": 1.9234671691549496e-07, "loss": 0.387, "step": 16716 }, { "epoch": 0.807701599265594, "grad_norm": 3.2479608058929443, "learning_rate": 1.9229840073440596e-07, "loss": 0.3459, "step": 16717 }, { "epoch": 0.8077499154466831, "grad_norm": 2.281907558441162, "learning_rate": 1.922500845533169e-07, "loss": 0.2351, "step": 16718 }, { "epoch": 0.8077982316277721, "grad_norm": 2.617769718170166, "learning_rate": 1.9220176837222786e-07, "loss": 0.3142, "step": 16719 }, { "epoch": 0.8078465478088612, "grad_norm": 9.231194496154785, "learning_rate": 1.921534521911388e-07, "loss": 0.2846, "step": 16720 }, { "epoch": 0.8078948639899503, "grad_norm": 2.849447727203369, "learning_rate": 1.9210513601004976e-07, "loss": 0.3061, "step": 16721 }, { "epoch": 0.8079431801710393, "grad_norm": 1.5936894416809082, "learning_rate": 1.9205681982896072e-07, "loss": 0.1523, "step": 16722 }, { "epoch": 0.8079914963521283, "grad_norm": 5.549564361572266, "learning_rate": 1.9200850364787166e-07, "loss": 0.313, "step": 16723 }, { "epoch": 0.8080398125332173, "grad_norm": 1.6799347400665283, "learning_rate": 1.919601874667826e-07, "loss": 0.1772, "step": 16724 }, { "epoch": 0.8080881287143065, "grad_norm": 1.898573398590088, "learning_rate": 1.9191187128569359e-07, "loss": 0.1929, "step": 16725 }, { "epoch": 0.8081364448953955, "grad_norm": 3.1438839435577393, "learning_rate": 1.9186355510460452e-07, "loss": 0.2746, "step": 16726 }, { "epoch": 0.8081847610764845, "grad_norm": 2.2829244136810303, "learning_rate": 1.9181523892351549e-07, "loss": 0.2474, "step": 16727 }, { "epoch": 0.8082330772575735, "grad_norm": 3.1921236515045166, "learning_rate": 1.9176692274242642e-07, "loss": 0.267, "step": 16728 }, { "epoch": 0.8082813934386626, "grad_norm": 3.7883925437927246, "learning_rate": 1.9171860656133736e-07, "loss": 0.3813, "step": 16729 }, { "epoch": 0.8083297096197517, "grad_norm": 2.3485238552093506, "learning_rate": 1.9167029038024835e-07, "loss": 0.3197, "step": 16730 }, { "epoch": 0.8083780258008407, "grad_norm": 2.304654359817505, "learning_rate": 1.916219741991593e-07, "loss": 0.1845, "step": 16731 }, { "epoch": 0.8084263419819298, "grad_norm": 2.429171562194824, "learning_rate": 1.9157365801807023e-07, "loss": 0.3114, "step": 16732 }, { "epoch": 0.8084746581630188, "grad_norm": 2.972109317779541, "learning_rate": 1.915253418369812e-07, "loss": 0.3066, "step": 16733 }, { "epoch": 0.8085229743441078, "grad_norm": 4.281588077545166, "learning_rate": 1.9147702565589215e-07, "loss": 0.3349, "step": 16734 }, { "epoch": 0.8085712905251969, "grad_norm": 6.2702717781066895, "learning_rate": 1.9142870947480312e-07, "loss": 0.2115, "step": 16735 }, { "epoch": 0.808619606706286, "grad_norm": 2.35949969291687, "learning_rate": 1.9138039329371405e-07, "loss": 0.2096, "step": 16736 }, { "epoch": 0.808667922887375, "grad_norm": 4.251513481140137, "learning_rate": 1.91332077112625e-07, "loss": 0.346, "step": 16737 }, { "epoch": 0.808716239068464, "grad_norm": 3.9981329441070557, "learning_rate": 1.9128376093153598e-07, "loss": 0.3769, "step": 16738 }, { "epoch": 0.808764555249553, "grad_norm": 3.7820653915405273, "learning_rate": 1.9123544475044692e-07, "loss": 0.2976, "step": 16739 }, { "epoch": 0.8088128714306422, "grad_norm": 2.831514358520508, "learning_rate": 1.9118712856935786e-07, "loss": 0.2482, "step": 16740 }, { "epoch": 0.8088611876117312, "grad_norm": 3.0245914459228516, "learning_rate": 1.9113881238826882e-07, "loss": 0.3425, "step": 16741 }, { "epoch": 0.8089095037928202, "grad_norm": 5.187691688537598, "learning_rate": 1.9109049620717976e-07, "loss": 0.4346, "step": 16742 }, { "epoch": 0.8089578199739093, "grad_norm": 3.7935526371002197, "learning_rate": 1.9104218002609075e-07, "loss": 0.2667, "step": 16743 }, { "epoch": 0.8090061361549983, "grad_norm": 3.1573240756988525, "learning_rate": 1.9099386384500168e-07, "loss": 0.3492, "step": 16744 }, { "epoch": 0.8090544523360873, "grad_norm": 3.096212863922119, "learning_rate": 1.9094554766391262e-07, "loss": 0.5447, "step": 16745 }, { "epoch": 0.8091027685171764, "grad_norm": 5.932316303253174, "learning_rate": 1.9089723148282359e-07, "loss": 0.2836, "step": 16746 }, { "epoch": 0.8091510846982655, "grad_norm": 2.0638792514801025, "learning_rate": 1.9084891530173455e-07, "loss": 0.1814, "step": 16747 }, { "epoch": 0.8091994008793545, "grad_norm": 3.3149828910827637, "learning_rate": 1.908005991206455e-07, "loss": 0.2253, "step": 16748 }, { "epoch": 0.8092477170604435, "grad_norm": 6.575047492980957, "learning_rate": 1.9075228293955645e-07, "loss": 0.2183, "step": 16749 }, { "epoch": 0.8092960332415325, "grad_norm": 4.280858516693115, "learning_rate": 1.907039667584674e-07, "loss": 0.3304, "step": 16750 }, { "epoch": 0.8093443494226217, "grad_norm": 2.6835439205169678, "learning_rate": 1.9065565057737838e-07, "loss": 0.3367, "step": 16751 }, { "epoch": 0.8093926656037107, "grad_norm": 2.295825719833374, "learning_rate": 1.9060733439628932e-07, "loss": 0.167, "step": 16752 }, { "epoch": 0.8094409817847997, "grad_norm": 2.398590564727783, "learning_rate": 1.9055901821520025e-07, "loss": 0.2801, "step": 16753 }, { "epoch": 0.8094892979658888, "grad_norm": 2.9317095279693604, "learning_rate": 1.9051070203411122e-07, "loss": 0.3268, "step": 16754 }, { "epoch": 0.8095376141469778, "grad_norm": 3.5925230979919434, "learning_rate": 1.9046238585302215e-07, "loss": 0.2634, "step": 16755 }, { "epoch": 0.8095859303280669, "grad_norm": 2.1205835342407227, "learning_rate": 1.9041406967193312e-07, "loss": 0.2482, "step": 16756 }, { "epoch": 0.8096342465091559, "grad_norm": 2.6862921714782715, "learning_rate": 1.9036575349084408e-07, "loss": 0.2612, "step": 16757 }, { "epoch": 0.809682562690245, "grad_norm": 3.5217697620391846, "learning_rate": 1.9031743730975502e-07, "loss": 0.2941, "step": 16758 }, { "epoch": 0.809730878871334, "grad_norm": 2.7365529537200928, "learning_rate": 1.9026912112866598e-07, "loss": 0.3952, "step": 16759 }, { "epoch": 0.809779195052423, "grad_norm": 3.544274091720581, "learning_rate": 1.9022080494757695e-07, "loss": 0.3919, "step": 16760 }, { "epoch": 0.8098275112335122, "grad_norm": 3.752053737640381, "learning_rate": 1.9017248876648788e-07, "loss": 0.3793, "step": 16761 }, { "epoch": 0.8098758274146012, "grad_norm": 3.7610864639282227, "learning_rate": 1.9012417258539885e-07, "loss": 0.2957, "step": 16762 }, { "epoch": 0.8099241435956902, "grad_norm": 2.7087764739990234, "learning_rate": 1.9007585640430978e-07, "loss": 0.2276, "step": 16763 }, { "epoch": 0.8099724597767792, "grad_norm": 4.6589155197143555, "learning_rate": 1.9002754022322075e-07, "loss": 0.2529, "step": 16764 }, { "epoch": 0.8100207759578683, "grad_norm": 3.58882212638855, "learning_rate": 1.899792240421317e-07, "loss": 0.3007, "step": 16765 }, { "epoch": 0.8100690921389574, "grad_norm": 2.461956024169922, "learning_rate": 1.8993090786104265e-07, "loss": 0.2663, "step": 16766 }, { "epoch": 0.8101174083200464, "grad_norm": 2.9173178672790527, "learning_rate": 1.898825916799536e-07, "loss": 0.3018, "step": 16767 }, { "epoch": 0.8101657245011354, "grad_norm": 3.66076922416687, "learning_rate": 1.8983427549886455e-07, "loss": 0.2985, "step": 16768 }, { "epoch": 0.8102140406822245, "grad_norm": 2.2997753620147705, "learning_rate": 1.8978595931777551e-07, "loss": 0.2083, "step": 16769 }, { "epoch": 0.8102623568633135, "grad_norm": 2.392683982849121, "learning_rate": 1.8973764313668648e-07, "loss": 0.3195, "step": 16770 }, { "epoch": 0.8103106730444025, "grad_norm": 3.227731704711914, "learning_rate": 1.8968932695559741e-07, "loss": 0.3032, "step": 16771 }, { "epoch": 0.8103589892254917, "grad_norm": 2.6577718257904053, "learning_rate": 1.8964101077450835e-07, "loss": 0.2221, "step": 16772 }, { "epoch": 0.8104073054065807, "grad_norm": 2.7736313343048096, "learning_rate": 1.8959269459341934e-07, "loss": 0.3435, "step": 16773 }, { "epoch": 0.8104556215876697, "grad_norm": 2.614109516143799, "learning_rate": 1.8954437841233028e-07, "loss": 0.2614, "step": 16774 }, { "epoch": 0.8105039377687587, "grad_norm": 2.300560474395752, "learning_rate": 1.8949606223124124e-07, "loss": 0.2596, "step": 16775 }, { "epoch": 0.8105522539498478, "grad_norm": 2.588662624359131, "learning_rate": 1.8944774605015218e-07, "loss": 0.4079, "step": 16776 }, { "epoch": 0.8106005701309369, "grad_norm": 2.6010992527008057, "learning_rate": 1.8939942986906314e-07, "loss": 0.2174, "step": 16777 }, { "epoch": 0.8106488863120259, "grad_norm": 3.1733107566833496, "learning_rate": 1.893511136879741e-07, "loss": 0.2153, "step": 16778 }, { "epoch": 0.8106972024931149, "grad_norm": 4.385829448699951, "learning_rate": 1.8930279750688505e-07, "loss": 0.3201, "step": 16779 }, { "epoch": 0.810745518674204, "grad_norm": 1.5997283458709717, "learning_rate": 1.8925448132579598e-07, "loss": 0.1698, "step": 16780 }, { "epoch": 0.810793834855293, "grad_norm": 5.172223091125488, "learning_rate": 1.8920616514470695e-07, "loss": 0.206, "step": 16781 }, { "epoch": 0.8108421510363821, "grad_norm": 3.3394134044647217, "learning_rate": 1.891578489636179e-07, "loss": 0.2404, "step": 16782 }, { "epoch": 0.8108904672174712, "grad_norm": 2.2441565990448, "learning_rate": 1.8910953278252887e-07, "loss": 0.2141, "step": 16783 }, { "epoch": 0.8109387833985602, "grad_norm": 2.3585312366485596, "learning_rate": 1.890612166014398e-07, "loss": 0.1658, "step": 16784 }, { "epoch": 0.8109870995796492, "grad_norm": 2.386810302734375, "learning_rate": 1.8901290042035075e-07, "loss": 0.2733, "step": 16785 }, { "epoch": 0.8110354157607382, "grad_norm": 2.9149439334869385, "learning_rate": 1.8896458423926174e-07, "loss": 0.2643, "step": 16786 }, { "epoch": 0.8110837319418274, "grad_norm": 1.5094915628433228, "learning_rate": 1.8891626805817268e-07, "loss": 0.1324, "step": 16787 }, { "epoch": 0.8111320481229164, "grad_norm": 2.7868354320526123, "learning_rate": 1.8886795187708361e-07, "loss": 0.3083, "step": 16788 }, { "epoch": 0.8111803643040054, "grad_norm": 2.6466195583343506, "learning_rate": 1.8881963569599458e-07, "loss": 0.2569, "step": 16789 }, { "epoch": 0.8112286804850944, "grad_norm": 2.739650249481201, "learning_rate": 1.8877131951490554e-07, "loss": 0.402, "step": 16790 }, { "epoch": 0.8112769966661835, "grad_norm": 3.041618585586548, "learning_rate": 1.887230033338165e-07, "loss": 0.2598, "step": 16791 }, { "epoch": 0.8113253128472726, "grad_norm": 2.651500940322876, "learning_rate": 1.8867468715272744e-07, "loss": 0.2442, "step": 16792 }, { "epoch": 0.8113736290283616, "grad_norm": 3.8705852031707764, "learning_rate": 1.8862637097163838e-07, "loss": 0.3183, "step": 16793 }, { "epoch": 0.8114219452094507, "grad_norm": 9.702193260192871, "learning_rate": 1.8857805479054934e-07, "loss": 0.3673, "step": 16794 }, { "epoch": 0.8114702613905397, "grad_norm": 2.626473903656006, "learning_rate": 1.885297386094603e-07, "loss": 0.2789, "step": 16795 }, { "epoch": 0.8115185775716287, "grad_norm": 2.1159262657165527, "learning_rate": 1.8848142242837124e-07, "loss": 0.2644, "step": 16796 }, { "epoch": 0.8115668937527177, "grad_norm": 3.0993449687957764, "learning_rate": 1.884331062472822e-07, "loss": 0.2666, "step": 16797 }, { "epoch": 0.8116152099338069, "grad_norm": 2.129647970199585, "learning_rate": 1.8838479006619314e-07, "loss": 0.2079, "step": 16798 }, { "epoch": 0.8116635261148959, "grad_norm": 3.78603458404541, "learning_rate": 1.8833647388510414e-07, "loss": 0.2788, "step": 16799 }, { "epoch": 0.8117118422959849, "grad_norm": 2.3391964435577393, "learning_rate": 1.8828815770401507e-07, "loss": 0.2947, "step": 16800 }, { "epoch": 0.811760158477074, "grad_norm": 2.414001226425171, "learning_rate": 1.88239841522926e-07, "loss": 0.2292, "step": 16801 }, { "epoch": 0.811808474658163, "grad_norm": 3.0682320594787598, "learning_rate": 1.8819152534183697e-07, "loss": 0.2963, "step": 16802 }, { "epoch": 0.8118567908392521, "grad_norm": 2.3210222721099854, "learning_rate": 1.881432091607479e-07, "loss": 0.2805, "step": 16803 }, { "epoch": 0.8119051070203411, "grad_norm": 7.508394718170166, "learning_rate": 1.8809489297965887e-07, "loss": 0.4871, "step": 16804 }, { "epoch": 0.8119534232014302, "grad_norm": 2.7770187854766846, "learning_rate": 1.8804657679856984e-07, "loss": 0.3837, "step": 16805 }, { "epoch": 0.8120017393825192, "grad_norm": 2.5202653408050537, "learning_rate": 1.8799826061748078e-07, "loss": 0.3419, "step": 16806 }, { "epoch": 0.8120500555636082, "grad_norm": 2.6005382537841797, "learning_rate": 1.8794994443639174e-07, "loss": 0.352, "step": 16807 }, { "epoch": 0.8120983717446973, "grad_norm": 5.0894856452941895, "learning_rate": 1.879016282553027e-07, "loss": 0.3749, "step": 16808 }, { "epoch": 0.8121466879257864, "grad_norm": 6.577173233032227, "learning_rate": 1.8785331207421364e-07, "loss": 0.4102, "step": 16809 }, { "epoch": 0.8121950041068754, "grad_norm": 2.521217107772827, "learning_rate": 1.878049958931246e-07, "loss": 0.4145, "step": 16810 }, { "epoch": 0.8122433202879644, "grad_norm": 2.259610891342163, "learning_rate": 1.8775667971203554e-07, "loss": 0.3186, "step": 16811 }, { "epoch": 0.8122916364690534, "grad_norm": 3.077544689178467, "learning_rate": 1.877083635309465e-07, "loss": 0.2114, "step": 16812 }, { "epoch": 0.8123399526501426, "grad_norm": 2.579195976257324, "learning_rate": 1.8766004734985747e-07, "loss": 0.2851, "step": 16813 }, { "epoch": 0.8123882688312316, "grad_norm": 5.787399768829346, "learning_rate": 1.876117311687684e-07, "loss": 0.2963, "step": 16814 }, { "epoch": 0.8124365850123206, "grad_norm": 5.999335765838623, "learning_rate": 1.8756341498767937e-07, "loss": 0.4019, "step": 16815 }, { "epoch": 0.8124849011934097, "grad_norm": 3.0586111545562744, "learning_rate": 1.875150988065903e-07, "loss": 0.2415, "step": 16816 }, { "epoch": 0.8125332173744987, "grad_norm": 2.5581295490264893, "learning_rate": 1.8746678262550127e-07, "loss": 0.2764, "step": 16817 }, { "epoch": 0.8125815335555878, "grad_norm": 2.845963716506958, "learning_rate": 1.8741846644441223e-07, "loss": 0.3589, "step": 16818 }, { "epoch": 0.8126298497366768, "grad_norm": 2.5592262744903564, "learning_rate": 1.8737015026332317e-07, "loss": 0.3068, "step": 16819 }, { "epoch": 0.8126781659177659, "grad_norm": 5.0259904861450195, "learning_rate": 1.873218340822341e-07, "loss": 0.2514, "step": 16820 }, { "epoch": 0.8127264820988549, "grad_norm": 9.015401840209961, "learning_rate": 1.872735179011451e-07, "loss": 0.3498, "step": 16821 }, { "epoch": 0.8127747982799439, "grad_norm": 2.300795078277588, "learning_rate": 1.8722520172005604e-07, "loss": 0.2405, "step": 16822 }, { "epoch": 0.812823114461033, "grad_norm": 2.411616802215576, "learning_rate": 1.87176885538967e-07, "loss": 0.2587, "step": 16823 }, { "epoch": 0.8128714306421221, "grad_norm": 2.281567096710205, "learning_rate": 1.8712856935787794e-07, "loss": 0.156, "step": 16824 }, { "epoch": 0.8129197468232111, "grad_norm": 2.522494077682495, "learning_rate": 1.870802531767889e-07, "loss": 0.224, "step": 16825 }, { "epoch": 0.8129680630043001, "grad_norm": 2.6337180137634277, "learning_rate": 1.8703193699569987e-07, "loss": 0.3732, "step": 16826 }, { "epoch": 0.8130163791853892, "grad_norm": 3.7204127311706543, "learning_rate": 1.869836208146108e-07, "loss": 0.3507, "step": 16827 }, { "epoch": 0.8130646953664782, "grad_norm": 3.229043960571289, "learning_rate": 1.8693530463352174e-07, "loss": 0.2621, "step": 16828 }, { "epoch": 0.8131130115475673, "grad_norm": 2.919588327407837, "learning_rate": 1.868869884524327e-07, "loss": 0.367, "step": 16829 }, { "epoch": 0.8131613277286563, "grad_norm": 2.0810494422912598, "learning_rate": 1.8683867227134367e-07, "loss": 0.2377, "step": 16830 }, { "epoch": 0.8132096439097454, "grad_norm": 4.602197647094727, "learning_rate": 1.8679035609025463e-07, "loss": 0.3036, "step": 16831 }, { "epoch": 0.8132579600908344, "grad_norm": 2.8397862911224365, "learning_rate": 1.8674203990916557e-07, "loss": 0.404, "step": 16832 }, { "epoch": 0.8133062762719234, "grad_norm": 6.9126152992248535, "learning_rate": 1.866937237280765e-07, "loss": 0.3125, "step": 16833 }, { "epoch": 0.8133545924530126, "grad_norm": 2.1905622482299805, "learning_rate": 1.866454075469875e-07, "loss": 0.2952, "step": 16834 }, { "epoch": 0.8134029086341016, "grad_norm": 2.541382074356079, "learning_rate": 1.8659709136589843e-07, "loss": 0.2978, "step": 16835 }, { "epoch": 0.8134512248151906, "grad_norm": 2.3598949909210205, "learning_rate": 1.8654877518480937e-07, "loss": 0.1795, "step": 16836 }, { "epoch": 0.8134995409962796, "grad_norm": 3.4771676063537598, "learning_rate": 1.8650045900372033e-07, "loss": 0.3847, "step": 16837 }, { "epoch": 0.8135478571773687, "grad_norm": 2.367781639099121, "learning_rate": 1.864521428226313e-07, "loss": 0.2849, "step": 16838 }, { "epoch": 0.8135961733584578, "grad_norm": 24.646310806274414, "learning_rate": 1.8640382664154226e-07, "loss": 0.2873, "step": 16839 }, { "epoch": 0.8136444895395468, "grad_norm": 30.903369903564453, "learning_rate": 1.863555104604532e-07, "loss": 0.4445, "step": 16840 }, { "epoch": 0.8136928057206358, "grad_norm": 2.3014893531799316, "learning_rate": 1.8630719427936414e-07, "loss": 0.3307, "step": 16841 }, { "epoch": 0.8137411219017249, "grad_norm": 7.448296070098877, "learning_rate": 1.862588780982751e-07, "loss": 0.2994, "step": 16842 }, { "epoch": 0.8137894380828139, "grad_norm": 3.0904619693756104, "learning_rate": 1.8621056191718606e-07, "loss": 0.3285, "step": 16843 }, { "epoch": 0.813837754263903, "grad_norm": 2.672624349594116, "learning_rate": 1.86162245736097e-07, "loss": 0.2734, "step": 16844 }, { "epoch": 0.8138860704449921, "grad_norm": 3.695488214492798, "learning_rate": 1.8611392955500796e-07, "loss": 0.2952, "step": 16845 }, { "epoch": 0.8139343866260811, "grad_norm": 2.449866533279419, "learning_rate": 1.860656133739189e-07, "loss": 0.255, "step": 16846 }, { "epoch": 0.8139827028071701, "grad_norm": 3.086433172225952, "learning_rate": 1.860172971928299e-07, "loss": 0.4105, "step": 16847 }, { "epoch": 0.8140310189882591, "grad_norm": 2.921252727508545, "learning_rate": 1.8596898101174083e-07, "loss": 0.2656, "step": 16848 }, { "epoch": 0.8140793351693482, "grad_norm": 2.5819759368896484, "learning_rate": 1.8592066483065177e-07, "loss": 0.2977, "step": 16849 }, { "epoch": 0.8141276513504373, "grad_norm": 2.835589647293091, "learning_rate": 1.8587234864956273e-07, "loss": 0.4024, "step": 16850 }, { "epoch": 0.8141759675315263, "grad_norm": 6.359674453735352, "learning_rate": 1.858240324684737e-07, "loss": 0.2233, "step": 16851 }, { "epoch": 0.8142242837126153, "grad_norm": 2.4868881702423096, "learning_rate": 1.8577571628738463e-07, "loss": 0.3439, "step": 16852 }, { "epoch": 0.8142725998937044, "grad_norm": 2.2101638317108154, "learning_rate": 1.857274001062956e-07, "loss": 0.1962, "step": 16853 }, { "epoch": 0.8143209160747934, "grad_norm": 2.3687682151794434, "learning_rate": 1.8567908392520653e-07, "loss": 0.3208, "step": 16854 }, { "epoch": 0.8143692322558825, "grad_norm": 3.778751850128174, "learning_rate": 1.856307677441175e-07, "loss": 0.3167, "step": 16855 }, { "epoch": 0.8144175484369716, "grad_norm": 3.9727578163146973, "learning_rate": 1.8558245156302846e-07, "loss": 0.3117, "step": 16856 }, { "epoch": 0.8144658646180606, "grad_norm": 3.1743900775909424, "learning_rate": 1.855341353819394e-07, "loss": 0.3028, "step": 16857 }, { "epoch": 0.8145141807991496, "grad_norm": 5.082581520080566, "learning_rate": 1.8548581920085036e-07, "loss": 0.2846, "step": 16858 }, { "epoch": 0.8145624969802386, "grad_norm": 5.7384138107299805, "learning_rate": 1.854375030197613e-07, "loss": 0.3425, "step": 16859 }, { "epoch": 0.8146108131613278, "grad_norm": 3.5873281955718994, "learning_rate": 1.8538918683867226e-07, "loss": 0.3236, "step": 16860 }, { "epoch": 0.8146591293424168, "grad_norm": 2.5904369354248047, "learning_rate": 1.8534087065758323e-07, "loss": 0.3393, "step": 16861 }, { "epoch": 0.8147074455235058, "grad_norm": 2.2849199771881104, "learning_rate": 1.8529255447649416e-07, "loss": 0.2431, "step": 16862 }, { "epoch": 0.8147557617045948, "grad_norm": 2.1335291862487793, "learning_rate": 1.8524423829540513e-07, "loss": 0.2712, "step": 16863 }, { "epoch": 0.8148040778856839, "grad_norm": 3.051560401916504, "learning_rate": 1.851959221143161e-07, "loss": 0.3475, "step": 16864 }, { "epoch": 0.814852394066773, "grad_norm": 2.9902796745300293, "learning_rate": 1.8514760593322703e-07, "loss": 0.2674, "step": 16865 }, { "epoch": 0.814900710247862, "grad_norm": 3.5073304176330566, "learning_rate": 1.85099289752138e-07, "loss": 0.2911, "step": 16866 }, { "epoch": 0.8149490264289511, "grad_norm": 1.783218502998352, "learning_rate": 1.8505097357104893e-07, "loss": 0.2503, "step": 16867 }, { "epoch": 0.8149973426100401, "grad_norm": 7.095035552978516, "learning_rate": 1.8500265738995987e-07, "loss": 0.3016, "step": 16868 }, { "epoch": 0.8150456587911291, "grad_norm": 7.320489883422852, "learning_rate": 1.8495434120887086e-07, "loss": 0.2329, "step": 16869 }, { "epoch": 0.8150939749722182, "grad_norm": 2.2746219635009766, "learning_rate": 1.849060250277818e-07, "loss": 0.2673, "step": 16870 }, { "epoch": 0.8151422911533073, "grad_norm": 2.515211820602417, "learning_rate": 1.8485770884669276e-07, "loss": 0.3336, "step": 16871 }, { "epoch": 0.8151906073343963, "grad_norm": 2.4934744834899902, "learning_rate": 1.848093926656037e-07, "loss": 0.2771, "step": 16872 }, { "epoch": 0.8152389235154853, "grad_norm": 2.325648546218872, "learning_rate": 1.8476107648451466e-07, "loss": 0.217, "step": 16873 }, { "epoch": 0.8152872396965744, "grad_norm": 3.6332056522369385, "learning_rate": 1.8471276030342562e-07, "loss": 0.5706, "step": 16874 }, { "epoch": 0.8153355558776634, "grad_norm": 4.7147908210754395, "learning_rate": 1.8466444412233656e-07, "loss": 0.3066, "step": 16875 }, { "epoch": 0.8153838720587525, "grad_norm": 5.004394054412842, "learning_rate": 1.846161279412475e-07, "loss": 0.3091, "step": 16876 }, { "epoch": 0.8154321882398415, "grad_norm": 3.87597918510437, "learning_rate": 1.8456781176015849e-07, "loss": 0.2563, "step": 16877 }, { "epoch": 0.8154805044209306, "grad_norm": 4.15110445022583, "learning_rate": 1.8451949557906942e-07, "loss": 0.3166, "step": 16878 }, { "epoch": 0.8155288206020196, "grad_norm": 1.8363595008850098, "learning_rate": 1.844711793979804e-07, "loss": 0.1711, "step": 16879 }, { "epoch": 0.8155771367831086, "grad_norm": 3.1706595420837402, "learning_rate": 1.8442286321689133e-07, "loss": 0.2741, "step": 16880 }, { "epoch": 0.8156254529641978, "grad_norm": 4.160129547119141, "learning_rate": 1.8437454703580226e-07, "loss": 0.3811, "step": 16881 }, { "epoch": 0.8156737691452868, "grad_norm": 1.9310137033462524, "learning_rate": 1.8432623085471325e-07, "loss": 0.2243, "step": 16882 }, { "epoch": 0.8157220853263758, "grad_norm": 5.981140613555908, "learning_rate": 1.842779146736242e-07, "loss": 0.3679, "step": 16883 }, { "epoch": 0.8157704015074648, "grad_norm": 2.8772783279418945, "learning_rate": 1.8422959849253513e-07, "loss": 0.2766, "step": 16884 }, { "epoch": 0.8158187176885539, "grad_norm": 5.578856945037842, "learning_rate": 1.841812823114461e-07, "loss": 0.2667, "step": 16885 }, { "epoch": 0.815867033869643, "grad_norm": 2.9534108638763428, "learning_rate": 1.8413296613035705e-07, "loss": 0.3809, "step": 16886 }, { "epoch": 0.815915350050732, "grad_norm": 2.1122336387634277, "learning_rate": 1.8408464994926802e-07, "loss": 0.2535, "step": 16887 }, { "epoch": 0.815963666231821, "grad_norm": 3.172311305999756, "learning_rate": 1.8403633376817896e-07, "loss": 0.386, "step": 16888 }, { "epoch": 0.8160119824129101, "grad_norm": 2.160633087158203, "learning_rate": 1.839880175870899e-07, "loss": 0.2066, "step": 16889 }, { "epoch": 0.8160602985939991, "grad_norm": 2.3886914253234863, "learning_rate": 1.8393970140600088e-07, "loss": 0.2782, "step": 16890 }, { "epoch": 0.8161086147750882, "grad_norm": 2.833753824234009, "learning_rate": 1.8389138522491182e-07, "loss": 0.3531, "step": 16891 }, { "epoch": 0.8161569309561773, "grad_norm": 3.0026655197143555, "learning_rate": 1.8384306904382276e-07, "loss": 0.342, "step": 16892 }, { "epoch": 0.8162052471372663, "grad_norm": 2.394167900085449, "learning_rate": 1.8379475286273372e-07, "loss": 0.3525, "step": 16893 }, { "epoch": 0.8162535633183553, "grad_norm": 4.216013431549072, "learning_rate": 1.8374643668164466e-07, "loss": 0.205, "step": 16894 }, { "epoch": 0.8163018794994443, "grad_norm": 3.021559238433838, "learning_rate": 1.8369812050055565e-07, "loss": 0.3506, "step": 16895 }, { "epoch": 0.8163501956805335, "grad_norm": 2.2637760639190674, "learning_rate": 1.8364980431946659e-07, "loss": 0.2805, "step": 16896 }, { "epoch": 0.8163985118616225, "grad_norm": 4.77290153503418, "learning_rate": 1.8360148813837752e-07, "loss": 0.2461, "step": 16897 }, { "epoch": 0.8164468280427115, "grad_norm": 2.8271632194519043, "learning_rate": 1.835531719572885e-07, "loss": 0.2495, "step": 16898 }, { "epoch": 0.8164951442238005, "grad_norm": 3.041696071624756, "learning_rate": 1.8350485577619945e-07, "loss": 0.3615, "step": 16899 }, { "epoch": 0.8165434604048896, "grad_norm": 2.2442123889923096, "learning_rate": 1.834565395951104e-07, "loss": 0.2406, "step": 16900 }, { "epoch": 0.8165917765859786, "grad_norm": 4.86159086227417, "learning_rate": 1.8340822341402135e-07, "loss": 0.2843, "step": 16901 }, { "epoch": 0.8166400927670677, "grad_norm": 2.6720380783081055, "learning_rate": 1.833599072329323e-07, "loss": 0.2971, "step": 16902 }, { "epoch": 0.8166884089481568, "grad_norm": 2.4311327934265137, "learning_rate": 1.8331159105184328e-07, "loss": 0.3252, "step": 16903 }, { "epoch": 0.8167367251292458, "grad_norm": 2.774878978729248, "learning_rate": 1.8326327487075422e-07, "loss": 0.2968, "step": 16904 }, { "epoch": 0.8167850413103348, "grad_norm": 2.3452365398406982, "learning_rate": 1.8321495868966515e-07, "loss": 0.2163, "step": 16905 }, { "epoch": 0.8168333574914238, "grad_norm": 3.5865843296051025, "learning_rate": 1.8316664250857612e-07, "loss": 0.3396, "step": 16906 }, { "epoch": 0.816881673672513, "grad_norm": 2.141833782196045, "learning_rate": 1.8311832632748706e-07, "loss": 0.197, "step": 16907 }, { "epoch": 0.816929989853602, "grad_norm": 2.4489293098449707, "learning_rate": 1.8307001014639802e-07, "loss": 0.2669, "step": 16908 }, { "epoch": 0.816978306034691, "grad_norm": 1.7787246704101562, "learning_rate": 1.8302169396530898e-07, "loss": 0.1837, "step": 16909 }, { "epoch": 0.81702662221578, "grad_norm": 4.243058681488037, "learning_rate": 1.8297337778421992e-07, "loss": 0.4029, "step": 16910 }, { "epoch": 0.8170749383968691, "grad_norm": 3.825484275817871, "learning_rate": 1.8292506160313086e-07, "loss": 0.2864, "step": 16911 }, { "epoch": 0.8171232545779582, "grad_norm": 2.613450288772583, "learning_rate": 1.8287674542204185e-07, "loss": 0.3493, "step": 16912 }, { "epoch": 0.8171715707590472, "grad_norm": 2.5818965435028076, "learning_rate": 1.8282842924095278e-07, "loss": 0.3751, "step": 16913 }, { "epoch": 0.8172198869401363, "grad_norm": 2.369617223739624, "learning_rate": 1.8278011305986375e-07, "loss": 0.2649, "step": 16914 }, { "epoch": 0.8172682031212253, "grad_norm": 3.0459537506103516, "learning_rate": 1.8273179687877469e-07, "loss": 0.3047, "step": 16915 }, { "epoch": 0.8173165193023143, "grad_norm": 2.7383532524108887, "learning_rate": 1.8268348069768565e-07, "loss": 0.2724, "step": 16916 }, { "epoch": 0.8173648354834034, "grad_norm": 2.7726316452026367, "learning_rate": 1.826351645165966e-07, "loss": 0.265, "step": 16917 }, { "epoch": 0.8174131516644925, "grad_norm": 2.589986562728882, "learning_rate": 1.8258684833550755e-07, "loss": 0.2464, "step": 16918 }, { "epoch": 0.8174614678455815, "grad_norm": 1.926544189453125, "learning_rate": 1.825385321544185e-07, "loss": 0.2175, "step": 16919 }, { "epoch": 0.8175097840266705, "grad_norm": 2.7494521141052246, "learning_rate": 1.8249021597332945e-07, "loss": 0.4001, "step": 16920 }, { "epoch": 0.8175581002077595, "grad_norm": 2.856727361679077, "learning_rate": 1.8244189979224042e-07, "loss": 0.2674, "step": 16921 }, { "epoch": 0.8176064163888487, "grad_norm": 1.8520681858062744, "learning_rate": 1.8239358361115138e-07, "loss": 0.1894, "step": 16922 }, { "epoch": 0.8176547325699377, "grad_norm": 2.289259910583496, "learning_rate": 1.8234526743006232e-07, "loss": 0.1989, "step": 16923 }, { "epoch": 0.8177030487510267, "grad_norm": 1.6027169227600098, "learning_rate": 1.8229695124897325e-07, "loss": 0.184, "step": 16924 }, { "epoch": 0.8177513649321158, "grad_norm": 2.303446054458618, "learning_rate": 1.8224863506788424e-07, "loss": 0.2768, "step": 16925 }, { "epoch": 0.8177996811132048, "grad_norm": 4.147305488586426, "learning_rate": 1.8220031888679518e-07, "loss": 0.1872, "step": 16926 }, { "epoch": 0.8178479972942938, "grad_norm": 3.1030571460723877, "learning_rate": 1.8215200270570614e-07, "loss": 0.4013, "step": 16927 }, { "epoch": 0.8178963134753829, "grad_norm": 3.659644365310669, "learning_rate": 1.8210368652461708e-07, "loss": 0.3844, "step": 16928 }, { "epoch": 0.817944629656472, "grad_norm": 2.12943696975708, "learning_rate": 1.8205537034352805e-07, "loss": 0.2669, "step": 16929 }, { "epoch": 0.817992945837561, "grad_norm": 19.39227294921875, "learning_rate": 1.82007054162439e-07, "loss": 0.3175, "step": 16930 }, { "epoch": 0.81804126201865, "grad_norm": 3.1655986309051514, "learning_rate": 1.8195873798134995e-07, "loss": 0.2487, "step": 16931 }, { "epoch": 0.818089578199739, "grad_norm": 2.994575262069702, "learning_rate": 1.8191042180026088e-07, "loss": 0.23, "step": 16932 }, { "epoch": 0.8181378943808282, "grad_norm": 2.433669090270996, "learning_rate": 1.8186210561917185e-07, "loss": 0.2326, "step": 16933 }, { "epoch": 0.8181862105619172, "grad_norm": 2.6218740940093994, "learning_rate": 1.818137894380828e-07, "loss": 0.3352, "step": 16934 }, { "epoch": 0.8182345267430062, "grad_norm": 3.9110636711120605, "learning_rate": 1.8176547325699378e-07, "loss": 0.3154, "step": 16935 }, { "epoch": 0.8182828429240953, "grad_norm": 2.5153422355651855, "learning_rate": 1.817171570759047e-07, "loss": 0.2446, "step": 16936 }, { "epoch": 0.8183311591051843, "grad_norm": 2.468717575073242, "learning_rate": 1.8166884089481565e-07, "loss": 0.2231, "step": 16937 }, { "epoch": 0.8183794752862734, "grad_norm": 3.0663270950317383, "learning_rate": 1.8162052471372664e-07, "loss": 0.3908, "step": 16938 }, { "epoch": 0.8184277914673624, "grad_norm": 2.2502734661102295, "learning_rate": 1.8157220853263758e-07, "loss": 0.2501, "step": 16939 }, { "epoch": 0.8184761076484515, "grad_norm": 2.7195167541503906, "learning_rate": 1.8152389235154851e-07, "loss": 0.3767, "step": 16940 }, { "epoch": 0.8185244238295405, "grad_norm": 2.162884473800659, "learning_rate": 1.8147557617045948e-07, "loss": 0.2642, "step": 16941 }, { "epoch": 0.8185727400106295, "grad_norm": 2.3347551822662354, "learning_rate": 1.8142725998937042e-07, "loss": 0.2454, "step": 16942 }, { "epoch": 0.8186210561917187, "grad_norm": 3.8511130809783936, "learning_rate": 1.813789438082814e-07, "loss": 0.2506, "step": 16943 }, { "epoch": 0.8186693723728077, "grad_norm": 5.879289627075195, "learning_rate": 1.8133062762719234e-07, "loss": 0.2958, "step": 16944 }, { "epoch": 0.8187176885538967, "grad_norm": 7.345483779907227, "learning_rate": 1.8128231144610328e-07, "loss": 0.373, "step": 16945 }, { "epoch": 0.8187660047349857, "grad_norm": 3.806547164916992, "learning_rate": 1.8123399526501424e-07, "loss": 0.3688, "step": 16946 }, { "epoch": 0.8188143209160748, "grad_norm": 4.013080596923828, "learning_rate": 1.811856790839252e-07, "loss": 0.3372, "step": 16947 }, { "epoch": 0.8188626370971639, "grad_norm": 2.542678117752075, "learning_rate": 1.8113736290283615e-07, "loss": 0.2822, "step": 16948 }, { "epoch": 0.8189109532782529, "grad_norm": 2.594160556793213, "learning_rate": 1.810890467217471e-07, "loss": 0.3654, "step": 16949 }, { "epoch": 0.8189592694593419, "grad_norm": 1.8740016222000122, "learning_rate": 1.8104073054065805e-07, "loss": 0.222, "step": 16950 }, { "epoch": 0.819007585640431, "grad_norm": 3.7057840824127197, "learning_rate": 1.8099241435956904e-07, "loss": 0.3103, "step": 16951 }, { "epoch": 0.81905590182152, "grad_norm": 6.731388568878174, "learning_rate": 1.8094409817847997e-07, "loss": 0.4099, "step": 16952 }, { "epoch": 0.819104218002609, "grad_norm": 3.0233099460601807, "learning_rate": 1.808957819973909e-07, "loss": 0.3956, "step": 16953 }, { "epoch": 0.8191525341836982, "grad_norm": 3.448509931564331, "learning_rate": 1.8084746581630187e-07, "loss": 0.3316, "step": 16954 }, { "epoch": 0.8192008503647872, "grad_norm": 3.04921293258667, "learning_rate": 1.807991496352128e-07, "loss": 0.2769, "step": 16955 }, { "epoch": 0.8192491665458762, "grad_norm": 2.178177833557129, "learning_rate": 1.8075083345412378e-07, "loss": 0.2992, "step": 16956 }, { "epoch": 0.8192974827269652, "grad_norm": 1.8799407482147217, "learning_rate": 1.8070251727303474e-07, "loss": 0.1985, "step": 16957 }, { "epoch": 0.8193457989080543, "grad_norm": 4.054657459259033, "learning_rate": 1.8065420109194568e-07, "loss": 0.335, "step": 16958 }, { "epoch": 0.8193941150891434, "grad_norm": 2.8728294372558594, "learning_rate": 1.8060588491085661e-07, "loss": 0.2549, "step": 16959 }, { "epoch": 0.8194424312702324, "grad_norm": 2.352189540863037, "learning_rate": 1.805575687297676e-07, "loss": 0.2497, "step": 16960 }, { "epoch": 0.8194907474513214, "grad_norm": 3.365246534347534, "learning_rate": 1.8050925254867854e-07, "loss": 0.4125, "step": 16961 }, { "epoch": 0.8195390636324105, "grad_norm": 2.945733070373535, "learning_rate": 1.804609363675895e-07, "loss": 0.4611, "step": 16962 }, { "epoch": 0.8195873798134995, "grad_norm": 2.283635377883911, "learning_rate": 1.8041262018650044e-07, "loss": 0.3368, "step": 16963 }, { "epoch": 0.8196356959945886, "grad_norm": 2.912475824356079, "learning_rate": 1.803643040054114e-07, "loss": 0.3023, "step": 16964 }, { "epoch": 0.8196840121756777, "grad_norm": 3.2491393089294434, "learning_rate": 1.8031598782432237e-07, "loss": 0.3843, "step": 16965 }, { "epoch": 0.8197323283567667, "grad_norm": 3.6471126079559326, "learning_rate": 1.802676716432333e-07, "loss": 0.3574, "step": 16966 }, { "epoch": 0.8197806445378557, "grad_norm": 2.711259126663208, "learning_rate": 1.8021935546214424e-07, "loss": 0.2143, "step": 16967 }, { "epoch": 0.8198289607189447, "grad_norm": 2.314689874649048, "learning_rate": 1.801710392810552e-07, "loss": 0.2998, "step": 16968 }, { "epoch": 0.8198772769000339, "grad_norm": 2.633718490600586, "learning_rate": 1.8012272309996617e-07, "loss": 0.2319, "step": 16969 }, { "epoch": 0.8199255930811229, "grad_norm": 21.5335750579834, "learning_rate": 1.8007440691887714e-07, "loss": 0.3946, "step": 16970 }, { "epoch": 0.8199739092622119, "grad_norm": 2.6660187244415283, "learning_rate": 1.8002609073778807e-07, "loss": 0.2721, "step": 16971 }, { "epoch": 0.820022225443301, "grad_norm": 2.230834484100342, "learning_rate": 1.79977774556699e-07, "loss": 0.1854, "step": 16972 }, { "epoch": 0.82007054162439, "grad_norm": 4.1316328048706055, "learning_rate": 1.7992945837561e-07, "loss": 0.1754, "step": 16973 }, { "epoch": 0.8201188578054791, "grad_norm": 2.794273614883423, "learning_rate": 1.7988114219452094e-07, "loss": 0.1824, "step": 16974 }, { "epoch": 0.8201671739865681, "grad_norm": 2.894740343093872, "learning_rate": 1.7983282601343188e-07, "loss": 0.3512, "step": 16975 }, { "epoch": 0.8202154901676572, "grad_norm": 1.8883599042892456, "learning_rate": 1.7978450983234284e-07, "loss": 0.1838, "step": 16976 }, { "epoch": 0.8202638063487462, "grad_norm": 3.452741861343384, "learning_rate": 1.797361936512538e-07, "loss": 0.3213, "step": 16977 }, { "epoch": 0.8203121225298352, "grad_norm": 2.5475189685821533, "learning_rate": 1.7968787747016477e-07, "loss": 0.213, "step": 16978 }, { "epoch": 0.8203604387109242, "grad_norm": 18.81500244140625, "learning_rate": 1.796395612890757e-07, "loss": 0.3035, "step": 16979 }, { "epoch": 0.8204087548920134, "grad_norm": 2.020381212234497, "learning_rate": 1.7959124510798664e-07, "loss": 0.2357, "step": 16980 }, { "epoch": 0.8204570710731024, "grad_norm": 2.276240110397339, "learning_rate": 1.795429289268976e-07, "loss": 0.1647, "step": 16981 }, { "epoch": 0.8205053872541914, "grad_norm": 2.7914953231811523, "learning_rate": 1.7949461274580857e-07, "loss": 0.3157, "step": 16982 }, { "epoch": 0.8205537034352804, "grad_norm": 2.3643081188201904, "learning_rate": 1.794462965647195e-07, "loss": 0.2544, "step": 16983 }, { "epoch": 0.8206020196163695, "grad_norm": 3.2517597675323486, "learning_rate": 1.7939798038363047e-07, "loss": 0.2237, "step": 16984 }, { "epoch": 0.8206503357974586, "grad_norm": 9.949568748474121, "learning_rate": 1.793496642025414e-07, "loss": 0.2846, "step": 16985 }, { "epoch": 0.8206986519785476, "grad_norm": 2.783602237701416, "learning_rate": 1.793013480214524e-07, "loss": 0.3385, "step": 16986 }, { "epoch": 0.8207469681596367, "grad_norm": 2.334125518798828, "learning_rate": 1.7925303184036333e-07, "loss": 0.2744, "step": 16987 }, { "epoch": 0.8207952843407257, "grad_norm": 3.3970837593078613, "learning_rate": 1.7920471565927427e-07, "loss": 0.1836, "step": 16988 }, { "epoch": 0.8208436005218147, "grad_norm": 3.390087127685547, "learning_rate": 1.7915639947818524e-07, "loss": 0.4625, "step": 16989 }, { "epoch": 0.8208919167029038, "grad_norm": 3.6501755714416504, "learning_rate": 1.791080832970962e-07, "loss": 0.3718, "step": 16990 }, { "epoch": 0.8209402328839929, "grad_norm": 3.6272177696228027, "learning_rate": 1.7905976711600714e-07, "loss": 0.2279, "step": 16991 }, { "epoch": 0.8209885490650819, "grad_norm": 2.8171823024749756, "learning_rate": 1.790114509349181e-07, "loss": 0.3389, "step": 16992 }, { "epoch": 0.8210368652461709, "grad_norm": 1.8753803968429565, "learning_rate": 1.7896313475382904e-07, "loss": 0.2153, "step": 16993 }, { "epoch": 0.82108518142726, "grad_norm": 2.8541531562805176, "learning_rate": 1.7891481857274e-07, "loss": 0.2994, "step": 16994 }, { "epoch": 0.8211334976083491, "grad_norm": 2.504807710647583, "learning_rate": 1.7886650239165096e-07, "loss": 0.4172, "step": 16995 }, { "epoch": 0.8211818137894381, "grad_norm": 3.384251356124878, "learning_rate": 1.788181862105619e-07, "loss": 0.3326, "step": 16996 }, { "epoch": 0.8212301299705271, "grad_norm": 4.372352123260498, "learning_rate": 1.7876987002947287e-07, "loss": 0.2654, "step": 16997 }, { "epoch": 0.8212784461516162, "grad_norm": 1.8799083232879639, "learning_rate": 1.787215538483838e-07, "loss": 0.1982, "step": 16998 }, { "epoch": 0.8213267623327052, "grad_norm": 2.084141969680786, "learning_rate": 1.7867323766729477e-07, "loss": 0.2409, "step": 16999 }, { "epoch": 0.8213750785137943, "grad_norm": 3.4740235805511475, "learning_rate": 1.7862492148620573e-07, "loss": 0.4039, "step": 17000 }, { "epoch": 0.8214233946948833, "grad_norm": 3.269861936569214, "learning_rate": 1.7857660530511667e-07, "loss": 0.351, "step": 17001 }, { "epoch": 0.8214717108759724, "grad_norm": 1.6908618211746216, "learning_rate": 1.7852828912402763e-07, "loss": 0.1726, "step": 17002 }, { "epoch": 0.8215200270570614, "grad_norm": 2.6850407123565674, "learning_rate": 1.784799729429386e-07, "loss": 0.3481, "step": 17003 }, { "epoch": 0.8215683432381504, "grad_norm": 4.332028865814209, "learning_rate": 1.7843165676184953e-07, "loss": 0.3534, "step": 17004 }, { "epoch": 0.8216166594192394, "grad_norm": 2.4631357192993164, "learning_rate": 1.783833405807605e-07, "loss": 0.2489, "step": 17005 }, { "epoch": 0.8216649756003286, "grad_norm": 23.941650390625, "learning_rate": 1.7833502439967143e-07, "loss": 0.4073, "step": 17006 }, { "epoch": 0.8217132917814176, "grad_norm": 2.1834793090820312, "learning_rate": 1.7828670821858237e-07, "loss": 0.2421, "step": 17007 }, { "epoch": 0.8217616079625066, "grad_norm": 1.9530775547027588, "learning_rate": 1.7823839203749336e-07, "loss": 0.1794, "step": 17008 }, { "epoch": 0.8218099241435957, "grad_norm": 4.266020774841309, "learning_rate": 1.781900758564043e-07, "loss": 0.2386, "step": 17009 }, { "epoch": 0.8218582403246847, "grad_norm": 2.619309186935425, "learning_rate": 1.7814175967531526e-07, "loss": 0.3405, "step": 17010 }, { "epoch": 0.8219065565057738, "grad_norm": 2.609025001525879, "learning_rate": 1.780934434942262e-07, "loss": 0.319, "step": 17011 }, { "epoch": 0.8219548726868628, "grad_norm": 1.8611421585083008, "learning_rate": 1.7804512731313716e-07, "loss": 0.1998, "step": 17012 }, { "epoch": 0.8220031888679519, "grad_norm": 3.9965579509735107, "learning_rate": 1.7799681113204813e-07, "loss": 0.486, "step": 17013 }, { "epoch": 0.8220515050490409, "grad_norm": 3.19842529296875, "learning_rate": 1.7794849495095906e-07, "loss": 0.3511, "step": 17014 }, { "epoch": 0.8220998212301299, "grad_norm": 2.1002116203308105, "learning_rate": 1.7790017876987e-07, "loss": 0.2492, "step": 17015 }, { "epoch": 0.8221481374112191, "grad_norm": 2.140972375869751, "learning_rate": 1.77851862588781e-07, "loss": 0.2344, "step": 17016 }, { "epoch": 0.8221964535923081, "grad_norm": 1.9940698146820068, "learning_rate": 1.7780354640769193e-07, "loss": 0.179, "step": 17017 }, { "epoch": 0.8222447697733971, "grad_norm": 5.685347080230713, "learning_rate": 1.777552302266029e-07, "loss": 0.4336, "step": 17018 }, { "epoch": 0.8222930859544861, "grad_norm": 2.1558284759521484, "learning_rate": 1.7770691404551383e-07, "loss": 0.2141, "step": 17019 }, { "epoch": 0.8223414021355752, "grad_norm": 2.8744804859161377, "learning_rate": 1.7765859786442477e-07, "loss": 0.3074, "step": 17020 }, { "epoch": 0.8223897183166643, "grad_norm": 1.942927598953247, "learning_rate": 1.7761028168333576e-07, "loss": 0.2083, "step": 17021 }, { "epoch": 0.8224380344977533, "grad_norm": 3.0893661975860596, "learning_rate": 1.775619655022467e-07, "loss": 0.3808, "step": 17022 }, { "epoch": 0.8224863506788423, "grad_norm": 6.70225191116333, "learning_rate": 1.7751364932115763e-07, "loss": 0.3166, "step": 17023 }, { "epoch": 0.8225346668599314, "grad_norm": 2.755035400390625, "learning_rate": 1.774653331400686e-07, "loss": 0.2423, "step": 17024 }, { "epoch": 0.8225829830410204, "grad_norm": 3.43925142288208, "learning_rate": 1.7741701695897956e-07, "loss": 0.3448, "step": 17025 }, { "epoch": 0.8226312992221095, "grad_norm": 2.8686602115631104, "learning_rate": 1.7736870077789052e-07, "loss": 0.3431, "step": 17026 }, { "epoch": 0.8226796154031986, "grad_norm": 2.55747652053833, "learning_rate": 1.7732038459680146e-07, "loss": 0.3311, "step": 17027 }, { "epoch": 0.8227279315842876, "grad_norm": 2.769378662109375, "learning_rate": 1.772720684157124e-07, "loss": 0.2752, "step": 17028 }, { "epoch": 0.8227762477653766, "grad_norm": 2.4458718299865723, "learning_rate": 1.772237522346234e-07, "loss": 0.3051, "step": 17029 }, { "epoch": 0.8228245639464656, "grad_norm": 3.623065233230591, "learning_rate": 1.7717543605353433e-07, "loss": 0.3641, "step": 17030 }, { "epoch": 0.8228728801275548, "grad_norm": 2.300078868865967, "learning_rate": 1.7712711987244526e-07, "loss": 0.257, "step": 17031 }, { "epoch": 0.8229211963086438, "grad_norm": 23.083341598510742, "learning_rate": 1.7707880369135623e-07, "loss": 0.1941, "step": 17032 }, { "epoch": 0.8229695124897328, "grad_norm": 2.400455951690674, "learning_rate": 1.7703048751026716e-07, "loss": 0.2999, "step": 17033 }, { "epoch": 0.8230178286708218, "grad_norm": 2.4280636310577393, "learning_rate": 1.7698217132917815e-07, "loss": 0.1836, "step": 17034 }, { "epoch": 0.8230661448519109, "grad_norm": 12.112664222717285, "learning_rate": 1.769338551480891e-07, "loss": 0.3598, "step": 17035 }, { "epoch": 0.8231144610329999, "grad_norm": 2.236271619796753, "learning_rate": 1.7688553896700003e-07, "loss": 0.2846, "step": 17036 }, { "epoch": 0.823162777214089, "grad_norm": 2.526315212249756, "learning_rate": 1.76837222785911e-07, "loss": 0.3418, "step": 17037 }, { "epoch": 0.8232110933951781, "grad_norm": 1.8259650468826294, "learning_rate": 1.7678890660482196e-07, "loss": 0.2067, "step": 17038 }, { "epoch": 0.8232594095762671, "grad_norm": 3.1669881343841553, "learning_rate": 1.767405904237329e-07, "loss": 0.3535, "step": 17039 }, { "epoch": 0.8233077257573561, "grad_norm": 2.5797665119171143, "learning_rate": 1.7669227424264386e-07, "loss": 0.365, "step": 17040 }, { "epoch": 0.8233560419384451, "grad_norm": 4.6483564376831055, "learning_rate": 1.766439580615548e-07, "loss": 0.2152, "step": 17041 }, { "epoch": 0.8234043581195343, "grad_norm": 3.885807752609253, "learning_rate": 1.7659564188046578e-07, "loss": 0.3034, "step": 17042 }, { "epoch": 0.8234526743006233, "grad_norm": 2.2998175621032715, "learning_rate": 1.7654732569937672e-07, "loss": 0.2414, "step": 17043 }, { "epoch": 0.8235009904817123, "grad_norm": 2.634821891784668, "learning_rate": 1.7649900951828766e-07, "loss": 0.3258, "step": 17044 }, { "epoch": 0.8235493066628014, "grad_norm": 3.58288311958313, "learning_rate": 1.7645069333719862e-07, "loss": 0.3043, "step": 17045 }, { "epoch": 0.8235976228438904, "grad_norm": 2.863433361053467, "learning_rate": 1.7640237715610956e-07, "loss": 0.4026, "step": 17046 }, { "epoch": 0.8236459390249795, "grad_norm": 2.5455310344696045, "learning_rate": 1.7635406097502052e-07, "loss": 0.3259, "step": 17047 }, { "epoch": 0.8236942552060685, "grad_norm": 3.9706029891967773, "learning_rate": 1.763057447939315e-07, "loss": 0.291, "step": 17048 }, { "epoch": 0.8237425713871576, "grad_norm": 1.6536304950714111, "learning_rate": 1.7625742861284242e-07, "loss": 0.1379, "step": 17049 }, { "epoch": 0.8237908875682466, "grad_norm": 1.901111125946045, "learning_rate": 1.762091124317534e-07, "loss": 0.213, "step": 17050 }, { "epoch": 0.8238392037493356, "grad_norm": 2.2849113941192627, "learning_rate": 1.7616079625066435e-07, "loss": 0.243, "step": 17051 }, { "epoch": 0.8238875199304247, "grad_norm": 3.432590961456299, "learning_rate": 1.761124800695753e-07, "loss": 0.4081, "step": 17052 }, { "epoch": 0.8239358361115138, "grad_norm": 3.0326168537139893, "learning_rate": 1.7606416388848625e-07, "loss": 0.2964, "step": 17053 }, { "epoch": 0.8239841522926028, "grad_norm": 1.7641290426254272, "learning_rate": 1.760158477073972e-07, "loss": 0.1462, "step": 17054 }, { "epoch": 0.8240324684736918, "grad_norm": 2.56059193611145, "learning_rate": 1.7596753152630815e-07, "loss": 0.2998, "step": 17055 }, { "epoch": 0.8240807846547809, "grad_norm": 2.2373247146606445, "learning_rate": 1.7591921534521912e-07, "loss": 0.2427, "step": 17056 }, { "epoch": 0.82412910083587, "grad_norm": 2.469322443008423, "learning_rate": 1.7587089916413006e-07, "loss": 0.1817, "step": 17057 }, { "epoch": 0.824177417016959, "grad_norm": 4.581174373626709, "learning_rate": 1.7582258298304102e-07, "loss": 0.4412, "step": 17058 }, { "epoch": 0.824225733198048, "grad_norm": 2.8969790935516357, "learning_rate": 1.7577426680195196e-07, "loss": 0.307, "step": 17059 }, { "epoch": 0.8242740493791371, "grad_norm": 2.5852930545806885, "learning_rate": 1.7572595062086292e-07, "loss": 0.3347, "step": 17060 }, { "epoch": 0.8243223655602261, "grad_norm": 3.4567408561706543, "learning_rate": 1.7567763443977388e-07, "loss": 0.2303, "step": 17061 }, { "epoch": 0.8243706817413151, "grad_norm": 2.372396230697632, "learning_rate": 1.7562931825868482e-07, "loss": 0.2712, "step": 17062 }, { "epoch": 0.8244189979224043, "grad_norm": 2.4114105701446533, "learning_rate": 1.7558100207759576e-07, "loss": 0.28, "step": 17063 }, { "epoch": 0.8244673141034933, "grad_norm": 4.138772964477539, "learning_rate": 1.7553268589650675e-07, "loss": 0.2493, "step": 17064 }, { "epoch": 0.8245156302845823, "grad_norm": 3.249109983444214, "learning_rate": 1.7548436971541769e-07, "loss": 0.2231, "step": 17065 }, { "epoch": 0.8245639464656713, "grad_norm": 2.316222906112671, "learning_rate": 1.7543605353432865e-07, "loss": 0.3094, "step": 17066 }, { "epoch": 0.8246122626467604, "grad_norm": 2.4441957473754883, "learning_rate": 1.753877373532396e-07, "loss": 0.2319, "step": 17067 }, { "epoch": 0.8246605788278495, "grad_norm": 2.1393887996673584, "learning_rate": 1.7533942117215055e-07, "loss": 0.2533, "step": 17068 }, { "epoch": 0.8247088950089385, "grad_norm": 3.9531309604644775, "learning_rate": 1.7529110499106151e-07, "loss": 0.1747, "step": 17069 }, { "epoch": 0.8247572111900275, "grad_norm": 2.757279872894287, "learning_rate": 1.7524278880997245e-07, "loss": 0.2906, "step": 17070 }, { "epoch": 0.8248055273711166, "grad_norm": 14.421818733215332, "learning_rate": 1.751944726288834e-07, "loss": 0.3707, "step": 17071 }, { "epoch": 0.8248538435522056, "grad_norm": 3.6026077270507812, "learning_rate": 1.7514615644779435e-07, "loss": 0.3197, "step": 17072 }, { "epoch": 0.8249021597332947, "grad_norm": 2.3774845600128174, "learning_rate": 1.7509784026670532e-07, "loss": 0.2345, "step": 17073 }, { "epoch": 0.8249504759143838, "grad_norm": 2.0649566650390625, "learning_rate": 1.7504952408561628e-07, "loss": 0.2141, "step": 17074 }, { "epoch": 0.8249987920954728, "grad_norm": 3.612854480743408, "learning_rate": 1.7500120790452722e-07, "loss": 0.3607, "step": 17075 }, { "epoch": 0.8250471082765618, "grad_norm": 3.8981339931488037, "learning_rate": 1.7495289172343815e-07, "loss": 0.348, "step": 17076 }, { "epoch": 0.8250954244576508, "grad_norm": 1.8860467672348022, "learning_rate": 1.7490457554234914e-07, "loss": 0.178, "step": 17077 }, { "epoch": 0.82514374063874, "grad_norm": 2.747760057449341, "learning_rate": 1.7485625936126008e-07, "loss": 0.236, "step": 17078 }, { "epoch": 0.825192056819829, "grad_norm": 2.6795451641082764, "learning_rate": 1.7480794318017102e-07, "loss": 0.2217, "step": 17079 }, { "epoch": 0.825240373000918, "grad_norm": 3.287003755569458, "learning_rate": 1.7475962699908198e-07, "loss": 0.28, "step": 17080 }, { "epoch": 0.825288689182007, "grad_norm": 5.696117877960205, "learning_rate": 1.7471131081799295e-07, "loss": 0.3644, "step": 17081 }, { "epoch": 0.8253370053630961, "grad_norm": 3.4407854080200195, "learning_rate": 1.746629946369039e-07, "loss": 0.3171, "step": 17082 }, { "epoch": 0.8253853215441852, "grad_norm": 2.452996253967285, "learning_rate": 1.7461467845581485e-07, "loss": 0.3008, "step": 17083 }, { "epoch": 0.8254336377252742, "grad_norm": 2.263392210006714, "learning_rate": 1.7456636227472579e-07, "loss": 0.3091, "step": 17084 }, { "epoch": 0.8254819539063633, "grad_norm": 1.920061469078064, "learning_rate": 1.7451804609363675e-07, "loss": 0.2323, "step": 17085 }, { "epoch": 0.8255302700874523, "grad_norm": 1.9114094972610474, "learning_rate": 1.744697299125477e-07, "loss": 0.1657, "step": 17086 }, { "epoch": 0.8255785862685413, "grad_norm": 3.3356235027313232, "learning_rate": 1.7442141373145865e-07, "loss": 0.4112, "step": 17087 }, { "epoch": 0.8256269024496303, "grad_norm": 2.9130876064300537, "learning_rate": 1.7437309755036961e-07, "loss": 0.2777, "step": 17088 }, { "epoch": 0.8256752186307195, "grad_norm": 4.778754711151123, "learning_rate": 1.7432478136928055e-07, "loss": 0.3427, "step": 17089 }, { "epoch": 0.8257235348118085, "grad_norm": 1.4626795053482056, "learning_rate": 1.7427646518819154e-07, "loss": 0.1531, "step": 17090 }, { "epoch": 0.8257718509928975, "grad_norm": 3.0724756717681885, "learning_rate": 1.7422814900710248e-07, "loss": 0.3805, "step": 17091 }, { "epoch": 0.8258201671739865, "grad_norm": 2.7712888717651367, "learning_rate": 1.7417983282601342e-07, "loss": 0.3237, "step": 17092 }, { "epoch": 0.8258684833550756, "grad_norm": 3.6840269565582275, "learning_rate": 1.7413151664492438e-07, "loss": 0.3119, "step": 17093 }, { "epoch": 0.8259167995361647, "grad_norm": 9.037928581237793, "learning_rate": 1.7408320046383532e-07, "loss": 0.2638, "step": 17094 }, { "epoch": 0.8259651157172537, "grad_norm": 2.160947322845459, "learning_rate": 1.7403488428274628e-07, "loss": 0.2524, "step": 17095 }, { "epoch": 0.8260134318983428, "grad_norm": 2.4825785160064697, "learning_rate": 1.7398656810165724e-07, "loss": 0.2642, "step": 17096 }, { "epoch": 0.8260617480794318, "grad_norm": 2.100306510925293, "learning_rate": 1.7393825192056818e-07, "loss": 0.1845, "step": 17097 }, { "epoch": 0.8261100642605208, "grad_norm": 2.7190263271331787, "learning_rate": 1.7388993573947915e-07, "loss": 0.3471, "step": 17098 }, { "epoch": 0.8261583804416099, "grad_norm": 3.1407415866851807, "learning_rate": 1.738416195583901e-07, "loss": 0.3053, "step": 17099 }, { "epoch": 0.826206696622699, "grad_norm": 2.458948850631714, "learning_rate": 1.7379330337730105e-07, "loss": 0.2218, "step": 17100 }, { "epoch": 0.826255012803788, "grad_norm": 2.5299949645996094, "learning_rate": 1.73744987196212e-07, "loss": 0.2907, "step": 17101 }, { "epoch": 0.826303328984877, "grad_norm": 2.36143159866333, "learning_rate": 1.7369667101512295e-07, "loss": 0.3078, "step": 17102 }, { "epoch": 0.826351645165966, "grad_norm": 2.400343656539917, "learning_rate": 1.736483548340339e-07, "loss": 0.2631, "step": 17103 }, { "epoch": 0.8263999613470552, "grad_norm": 2.6323747634887695, "learning_rate": 1.7360003865294487e-07, "loss": 0.2817, "step": 17104 }, { "epoch": 0.8264482775281442, "grad_norm": 2.974738121032715, "learning_rate": 1.735517224718558e-07, "loss": 0.2172, "step": 17105 }, { "epoch": 0.8264965937092332, "grad_norm": 2.377845048904419, "learning_rate": 1.7350340629076678e-07, "loss": 0.234, "step": 17106 }, { "epoch": 0.8265449098903223, "grad_norm": 2.371752977371216, "learning_rate": 1.734550901096777e-07, "loss": 0.2476, "step": 17107 }, { "epoch": 0.8265932260714113, "grad_norm": 2.7483415603637695, "learning_rate": 1.7340677392858868e-07, "loss": 0.2225, "step": 17108 }, { "epoch": 0.8266415422525004, "grad_norm": 14.718162536621094, "learning_rate": 1.7335845774749964e-07, "loss": 0.2477, "step": 17109 }, { "epoch": 0.8266898584335894, "grad_norm": 1.9249296188354492, "learning_rate": 1.7331014156641058e-07, "loss": 0.1849, "step": 17110 }, { "epoch": 0.8267381746146785, "grad_norm": 3.4216549396514893, "learning_rate": 1.7326182538532152e-07, "loss": 0.3175, "step": 17111 }, { "epoch": 0.8267864907957675, "grad_norm": 2.1713690757751465, "learning_rate": 1.732135092042325e-07, "loss": 0.1919, "step": 17112 }, { "epoch": 0.8268348069768565, "grad_norm": 4.233209133148193, "learning_rate": 1.7316519302314344e-07, "loss": 0.267, "step": 17113 }, { "epoch": 0.8268831231579455, "grad_norm": 3.1499133110046387, "learning_rate": 1.731168768420544e-07, "loss": 0.3255, "step": 17114 }, { "epoch": 0.8269314393390347, "grad_norm": 2.7202751636505127, "learning_rate": 1.7306856066096534e-07, "loss": 0.2242, "step": 17115 }, { "epoch": 0.8269797555201237, "grad_norm": 2.788658618927002, "learning_rate": 1.730202444798763e-07, "loss": 0.2735, "step": 17116 }, { "epoch": 0.8270280717012127, "grad_norm": 3.120957851409912, "learning_rate": 1.7297192829878727e-07, "loss": 0.3247, "step": 17117 }, { "epoch": 0.8270763878823018, "grad_norm": 2.183588981628418, "learning_rate": 1.729236121176982e-07, "loss": 0.2317, "step": 17118 }, { "epoch": 0.8271247040633908, "grad_norm": 2.8391335010528564, "learning_rate": 1.7287529593660915e-07, "loss": 0.2888, "step": 17119 }, { "epoch": 0.8271730202444799, "grad_norm": 3.908677339553833, "learning_rate": 1.728269797555201e-07, "loss": 0.3517, "step": 17120 }, { "epoch": 0.8272213364255689, "grad_norm": 11.601263046264648, "learning_rate": 1.7277866357443107e-07, "loss": 0.2691, "step": 17121 }, { "epoch": 0.827269652606658, "grad_norm": 1.9165980815887451, "learning_rate": 1.7273034739334204e-07, "loss": 0.1703, "step": 17122 }, { "epoch": 0.827317968787747, "grad_norm": 3.9235639572143555, "learning_rate": 1.7268203121225297e-07, "loss": 0.2953, "step": 17123 }, { "epoch": 0.827366284968836, "grad_norm": 2.8058347702026367, "learning_rate": 1.726337150311639e-07, "loss": 0.317, "step": 17124 }, { "epoch": 0.8274146011499252, "grad_norm": 4.526888847351074, "learning_rate": 1.725853988500749e-07, "loss": 0.2834, "step": 17125 }, { "epoch": 0.8274629173310142, "grad_norm": 4.22421932220459, "learning_rate": 1.7253708266898584e-07, "loss": 0.3646, "step": 17126 }, { "epoch": 0.8275112335121032, "grad_norm": 2.6510252952575684, "learning_rate": 1.7248876648789678e-07, "loss": 0.2244, "step": 17127 }, { "epoch": 0.8275595496931922, "grad_norm": 9.352493286132812, "learning_rate": 1.7244045030680774e-07, "loss": 0.3191, "step": 17128 }, { "epoch": 0.8276078658742813, "grad_norm": 2.3754632472991943, "learning_rate": 1.723921341257187e-07, "loss": 0.2899, "step": 17129 }, { "epoch": 0.8276561820553704, "grad_norm": 2.199812650680542, "learning_rate": 1.7234381794462967e-07, "loss": 0.2004, "step": 17130 }, { "epoch": 0.8277044982364594, "grad_norm": 2.8917949199676514, "learning_rate": 1.722955017635406e-07, "loss": 0.3854, "step": 17131 }, { "epoch": 0.8277528144175484, "grad_norm": 6.591396808624268, "learning_rate": 1.7224718558245154e-07, "loss": 0.2429, "step": 17132 }, { "epoch": 0.8278011305986375, "grad_norm": 2.323018789291382, "learning_rate": 1.721988694013625e-07, "loss": 0.2483, "step": 17133 }, { "epoch": 0.8278494467797265, "grad_norm": 2.8026108741760254, "learning_rate": 1.7215055322027347e-07, "loss": 0.3802, "step": 17134 }, { "epoch": 0.8278977629608156, "grad_norm": 2.6024177074432373, "learning_rate": 1.721022370391844e-07, "loss": 0.2736, "step": 17135 }, { "epoch": 0.8279460791419047, "grad_norm": 3.056574583053589, "learning_rate": 1.7205392085809537e-07, "loss": 0.2423, "step": 17136 }, { "epoch": 0.8279943953229937, "grad_norm": 2.0437331199645996, "learning_rate": 1.720056046770063e-07, "loss": 0.2208, "step": 17137 }, { "epoch": 0.8280427115040827, "grad_norm": 4.720810890197754, "learning_rate": 1.719572884959173e-07, "loss": 0.316, "step": 17138 }, { "epoch": 0.8280910276851717, "grad_norm": 4.278242588043213, "learning_rate": 1.7190897231482824e-07, "loss": 0.2613, "step": 17139 }, { "epoch": 0.8281393438662608, "grad_norm": 1.8795583248138428, "learning_rate": 1.7186065613373917e-07, "loss": 0.1725, "step": 17140 }, { "epoch": 0.8281876600473499, "grad_norm": 3.943688154220581, "learning_rate": 1.7181233995265014e-07, "loss": 0.1997, "step": 17141 }, { "epoch": 0.8282359762284389, "grad_norm": 3.533830165863037, "learning_rate": 1.717640237715611e-07, "loss": 0.3157, "step": 17142 }, { "epoch": 0.828284292409528, "grad_norm": 2.1751081943511963, "learning_rate": 1.7171570759047204e-07, "loss": 0.2822, "step": 17143 }, { "epoch": 0.828332608590617, "grad_norm": 2.7579023838043213, "learning_rate": 1.71667391409383e-07, "loss": 0.2551, "step": 17144 }, { "epoch": 0.828380924771706, "grad_norm": 2.4430558681488037, "learning_rate": 1.7161907522829394e-07, "loss": 0.2868, "step": 17145 }, { "epoch": 0.8284292409527951, "grad_norm": 3.299388885498047, "learning_rate": 1.7157075904720488e-07, "loss": 0.2284, "step": 17146 }, { "epoch": 0.8284775571338842, "grad_norm": 1.350740909576416, "learning_rate": 1.7152244286611587e-07, "loss": 0.1415, "step": 17147 }, { "epoch": 0.8285258733149732, "grad_norm": 3.4123644828796387, "learning_rate": 1.714741266850268e-07, "loss": 0.2593, "step": 17148 }, { "epoch": 0.8285741894960622, "grad_norm": 2.3472816944122314, "learning_rate": 1.7142581050393777e-07, "loss": 0.297, "step": 17149 }, { "epoch": 0.8286225056771512, "grad_norm": 3.0934619903564453, "learning_rate": 1.713774943228487e-07, "loss": 0.3559, "step": 17150 }, { "epoch": 0.8286708218582404, "grad_norm": 3.7485427856445312, "learning_rate": 1.7132917814175967e-07, "loss": 0.2611, "step": 17151 }, { "epoch": 0.8287191380393294, "grad_norm": 2.718231201171875, "learning_rate": 1.7128086196067063e-07, "loss": 0.2383, "step": 17152 }, { "epoch": 0.8287674542204184, "grad_norm": 3.8910701274871826, "learning_rate": 1.7123254577958157e-07, "loss": 0.2972, "step": 17153 }, { "epoch": 0.8288157704015074, "grad_norm": 1.6566468477249146, "learning_rate": 1.711842295984925e-07, "loss": 0.1641, "step": 17154 }, { "epoch": 0.8288640865825965, "grad_norm": 3.5637660026550293, "learning_rate": 1.711359134174035e-07, "loss": 0.2565, "step": 17155 }, { "epoch": 0.8289124027636856, "grad_norm": 2.7457971572875977, "learning_rate": 1.7108759723631443e-07, "loss": 0.3581, "step": 17156 }, { "epoch": 0.8289607189447746, "grad_norm": 3.2064707279205322, "learning_rate": 1.710392810552254e-07, "loss": 0.2274, "step": 17157 }, { "epoch": 0.8290090351258637, "grad_norm": 2.8254573345184326, "learning_rate": 1.7099096487413633e-07, "loss": 0.2759, "step": 17158 }, { "epoch": 0.8290573513069527, "grad_norm": 2.1713762283325195, "learning_rate": 1.7094264869304727e-07, "loss": 0.307, "step": 17159 }, { "epoch": 0.8291056674880417, "grad_norm": 2.781038999557495, "learning_rate": 1.7089433251195826e-07, "loss": 0.372, "step": 17160 }, { "epoch": 0.8291539836691308, "grad_norm": 2.537627696990967, "learning_rate": 1.708460163308692e-07, "loss": 0.3032, "step": 17161 }, { "epoch": 0.8292022998502199, "grad_norm": 2.7272591590881348, "learning_rate": 1.7079770014978014e-07, "loss": 0.3096, "step": 17162 }, { "epoch": 0.8292506160313089, "grad_norm": 2.2895572185516357, "learning_rate": 1.707493839686911e-07, "loss": 0.2303, "step": 17163 }, { "epoch": 0.8292989322123979, "grad_norm": 7.835686206817627, "learning_rate": 1.7070106778760206e-07, "loss": 0.2691, "step": 17164 }, { "epoch": 0.829347248393487, "grad_norm": 3.008256673812866, "learning_rate": 1.7065275160651303e-07, "loss": 0.3203, "step": 17165 }, { "epoch": 0.829395564574576, "grad_norm": 2.740192413330078, "learning_rate": 1.7060443542542397e-07, "loss": 0.1934, "step": 17166 }, { "epoch": 0.8294438807556651, "grad_norm": 3.184741497039795, "learning_rate": 1.705561192443349e-07, "loss": 0.3214, "step": 17167 }, { "epoch": 0.8294921969367541, "grad_norm": 7.618971347808838, "learning_rate": 1.705078030632459e-07, "loss": 0.439, "step": 17168 }, { "epoch": 0.8295405131178432, "grad_norm": 1.9257123470306396, "learning_rate": 1.7045948688215683e-07, "loss": 0.2074, "step": 17169 }, { "epoch": 0.8295888292989322, "grad_norm": 1.9782328605651855, "learning_rate": 1.7041117070106777e-07, "loss": 0.2116, "step": 17170 }, { "epoch": 0.8296371454800212, "grad_norm": 2.014808177947998, "learning_rate": 1.7036285451997873e-07, "loss": 0.2268, "step": 17171 }, { "epoch": 0.8296854616611103, "grad_norm": 2.7679431438446045, "learning_rate": 1.7031453833888967e-07, "loss": 0.2314, "step": 17172 }, { "epoch": 0.8297337778421994, "grad_norm": 3.238813638687134, "learning_rate": 1.7026622215780066e-07, "loss": 0.3103, "step": 17173 }, { "epoch": 0.8297820940232884, "grad_norm": 3.3869850635528564, "learning_rate": 1.702179059767116e-07, "loss": 0.2691, "step": 17174 }, { "epoch": 0.8298304102043774, "grad_norm": 2.600315809249878, "learning_rate": 1.7016958979562253e-07, "loss": 0.2157, "step": 17175 }, { "epoch": 0.8298787263854664, "grad_norm": 3.423384189605713, "learning_rate": 1.701212736145335e-07, "loss": 0.2573, "step": 17176 }, { "epoch": 0.8299270425665556, "grad_norm": 2.4980735778808594, "learning_rate": 1.7007295743344446e-07, "loss": 0.3577, "step": 17177 }, { "epoch": 0.8299753587476446, "grad_norm": 4.107992649078369, "learning_rate": 1.700246412523554e-07, "loss": 0.41, "step": 17178 }, { "epoch": 0.8300236749287336, "grad_norm": 3.357679843902588, "learning_rate": 1.6997632507126636e-07, "loss": 0.3432, "step": 17179 }, { "epoch": 0.8300719911098227, "grad_norm": 2.7003610134124756, "learning_rate": 1.699280088901773e-07, "loss": 0.306, "step": 17180 }, { "epoch": 0.8301203072909117, "grad_norm": 2.564087152481079, "learning_rate": 1.698796927090883e-07, "loss": 0.1952, "step": 17181 }, { "epoch": 0.8301686234720008, "grad_norm": 3.0075888633728027, "learning_rate": 1.6983137652799923e-07, "loss": 0.3372, "step": 17182 }, { "epoch": 0.8302169396530898, "grad_norm": 2.120450258255005, "learning_rate": 1.6978306034691016e-07, "loss": 0.2227, "step": 17183 }, { "epoch": 0.8302652558341789, "grad_norm": 4.109323024749756, "learning_rate": 1.6973474416582113e-07, "loss": 0.2996, "step": 17184 }, { "epoch": 0.8303135720152679, "grad_norm": 5.759344100952148, "learning_rate": 1.6968642798473206e-07, "loss": 0.2114, "step": 17185 }, { "epoch": 0.8303618881963569, "grad_norm": 2.631802558898926, "learning_rate": 1.6963811180364303e-07, "loss": 0.2011, "step": 17186 }, { "epoch": 0.8304102043774461, "grad_norm": 2.8350706100463867, "learning_rate": 1.69589795622554e-07, "loss": 0.4463, "step": 17187 }, { "epoch": 0.8304585205585351, "grad_norm": 2.6384525299072266, "learning_rate": 1.6954147944146493e-07, "loss": 0.2916, "step": 17188 }, { "epoch": 0.8305068367396241, "grad_norm": 2.2495276927948, "learning_rate": 1.694931632603759e-07, "loss": 0.2498, "step": 17189 }, { "epoch": 0.8305551529207131, "grad_norm": 3.1286208629608154, "learning_rate": 1.6944484707928686e-07, "loss": 0.3047, "step": 17190 }, { "epoch": 0.8306034691018022, "grad_norm": 2.7145395278930664, "learning_rate": 1.693965308981978e-07, "loss": 0.2477, "step": 17191 }, { "epoch": 0.8306517852828912, "grad_norm": 4.351510524749756, "learning_rate": 1.6934821471710876e-07, "loss": 0.2921, "step": 17192 }, { "epoch": 0.8307001014639803, "grad_norm": 2.4030635356903076, "learning_rate": 1.692998985360197e-07, "loss": 0.2887, "step": 17193 }, { "epoch": 0.8307484176450693, "grad_norm": 3.213254451751709, "learning_rate": 1.6925158235493066e-07, "loss": 0.4148, "step": 17194 }, { "epoch": 0.8307967338261584, "grad_norm": 7.198740482330322, "learning_rate": 1.6920326617384162e-07, "loss": 0.2931, "step": 17195 }, { "epoch": 0.8308450500072474, "grad_norm": 2.7135913372039795, "learning_rate": 1.6915494999275256e-07, "loss": 0.2552, "step": 17196 }, { "epoch": 0.8308933661883364, "grad_norm": 2.3975281715393066, "learning_rate": 1.6910663381166352e-07, "loss": 0.2937, "step": 17197 }, { "epoch": 0.8309416823694256, "grad_norm": 2.502683162689209, "learning_rate": 1.6905831763057446e-07, "loss": 0.2637, "step": 17198 }, { "epoch": 0.8309899985505146, "grad_norm": 2.3331820964813232, "learning_rate": 1.6901000144948542e-07, "loss": 0.285, "step": 17199 }, { "epoch": 0.8310383147316036, "grad_norm": 2.263507604598999, "learning_rate": 1.689616852683964e-07, "loss": 0.267, "step": 17200 }, { "epoch": 0.8310866309126926, "grad_norm": 2.121673345565796, "learning_rate": 1.6891336908730733e-07, "loss": 0.2244, "step": 17201 }, { "epoch": 0.8311349470937817, "grad_norm": 5.164801120758057, "learning_rate": 1.6886505290621826e-07, "loss": 0.4103, "step": 17202 }, { "epoch": 0.8311832632748708, "grad_norm": 2.9403553009033203, "learning_rate": 1.6881673672512925e-07, "loss": 0.3425, "step": 17203 }, { "epoch": 0.8312315794559598, "grad_norm": 4.916228294372559, "learning_rate": 1.687684205440402e-07, "loss": 0.2626, "step": 17204 }, { "epoch": 0.8312798956370488, "grad_norm": 2.5393521785736084, "learning_rate": 1.6872010436295115e-07, "loss": 0.3726, "step": 17205 }, { "epoch": 0.8313282118181379, "grad_norm": 2.4516355991363525, "learning_rate": 1.686717881818621e-07, "loss": 0.27, "step": 17206 }, { "epoch": 0.8313765279992269, "grad_norm": 2.6883275508880615, "learning_rate": 1.6862347200077306e-07, "loss": 0.2605, "step": 17207 }, { "epoch": 0.831424844180316, "grad_norm": 6.365386962890625, "learning_rate": 1.6857515581968402e-07, "loss": 0.3552, "step": 17208 }, { "epoch": 0.8314731603614051, "grad_norm": 2.829820156097412, "learning_rate": 1.6852683963859496e-07, "loss": 0.3066, "step": 17209 }, { "epoch": 0.8315214765424941, "grad_norm": 2.020824432373047, "learning_rate": 1.684785234575059e-07, "loss": 0.2633, "step": 17210 }, { "epoch": 0.8315697927235831, "grad_norm": 2.542518138885498, "learning_rate": 1.6843020727641686e-07, "loss": 0.2676, "step": 17211 }, { "epoch": 0.8316181089046721, "grad_norm": 2.6561808586120605, "learning_rate": 1.6838189109532782e-07, "loss": 0.2494, "step": 17212 }, { "epoch": 0.8316664250857613, "grad_norm": 2.791189193725586, "learning_rate": 1.6833357491423878e-07, "loss": 0.2529, "step": 17213 }, { "epoch": 0.8317147412668503, "grad_norm": 2.8160336017608643, "learning_rate": 1.6828525873314972e-07, "loss": 0.3259, "step": 17214 }, { "epoch": 0.8317630574479393, "grad_norm": 2.559459924697876, "learning_rate": 1.6823694255206066e-07, "loss": 0.2623, "step": 17215 }, { "epoch": 0.8318113736290283, "grad_norm": 3.7080893516540527, "learning_rate": 1.6818862637097165e-07, "loss": 0.3483, "step": 17216 }, { "epoch": 0.8318596898101174, "grad_norm": 2.666877269744873, "learning_rate": 1.681403101898826e-07, "loss": 0.331, "step": 17217 }, { "epoch": 0.8319080059912064, "grad_norm": 3.4388067722320557, "learning_rate": 1.6809199400879352e-07, "loss": 0.224, "step": 17218 }, { "epoch": 0.8319563221722955, "grad_norm": 2.4685258865356445, "learning_rate": 1.680436778277045e-07, "loss": 0.288, "step": 17219 }, { "epoch": 0.8320046383533846, "grad_norm": 5.037429332733154, "learning_rate": 1.6799536164661545e-07, "loss": 0.2411, "step": 17220 }, { "epoch": 0.8320529545344736, "grad_norm": 2.391658067703247, "learning_rate": 1.6794704546552642e-07, "loss": 0.2392, "step": 17221 }, { "epoch": 0.8321012707155626, "grad_norm": 22.990812301635742, "learning_rate": 1.6789872928443735e-07, "loss": 0.2295, "step": 17222 }, { "epoch": 0.8321495868966516, "grad_norm": 4.247941017150879, "learning_rate": 1.678504131033483e-07, "loss": 0.3552, "step": 17223 }, { "epoch": 0.8321979030777408, "grad_norm": 4.534827709197998, "learning_rate": 1.6780209692225925e-07, "loss": 0.298, "step": 17224 }, { "epoch": 0.8322462192588298, "grad_norm": 4.084892749786377, "learning_rate": 1.6775378074117022e-07, "loss": 0.2811, "step": 17225 }, { "epoch": 0.8322945354399188, "grad_norm": 3.522029161453247, "learning_rate": 1.6770546456008115e-07, "loss": 0.4057, "step": 17226 }, { "epoch": 0.8323428516210079, "grad_norm": 5.867452621459961, "learning_rate": 1.6765714837899212e-07, "loss": 0.2053, "step": 17227 }, { "epoch": 0.8323911678020969, "grad_norm": 2.456974744796753, "learning_rate": 1.6760883219790306e-07, "loss": 0.2928, "step": 17228 }, { "epoch": 0.832439483983186, "grad_norm": 2.4820210933685303, "learning_rate": 1.6756051601681405e-07, "loss": 0.2993, "step": 17229 }, { "epoch": 0.832487800164275, "grad_norm": 1.7716463804244995, "learning_rate": 1.6751219983572498e-07, "loss": 0.2066, "step": 17230 }, { "epoch": 0.8325361163453641, "grad_norm": 2.6370999813079834, "learning_rate": 1.6746388365463592e-07, "loss": 0.325, "step": 17231 }, { "epoch": 0.8325844325264531, "grad_norm": 2.320702314376831, "learning_rate": 1.6741556747354688e-07, "loss": 0.3108, "step": 17232 }, { "epoch": 0.8326327487075421, "grad_norm": 2.98929762840271, "learning_rate": 1.6736725129245782e-07, "loss": 0.3232, "step": 17233 }, { "epoch": 0.8326810648886313, "grad_norm": 3.2020864486694336, "learning_rate": 1.6731893511136879e-07, "loss": 0.2361, "step": 17234 }, { "epoch": 0.8327293810697203, "grad_norm": 2.3439652919769287, "learning_rate": 1.6727061893027975e-07, "loss": 0.2981, "step": 17235 }, { "epoch": 0.8327776972508093, "grad_norm": 32.07022476196289, "learning_rate": 1.6722230274919069e-07, "loss": 0.3423, "step": 17236 }, { "epoch": 0.8328260134318983, "grad_norm": 2.058201313018799, "learning_rate": 1.6717398656810165e-07, "loss": 0.2285, "step": 17237 }, { "epoch": 0.8328743296129874, "grad_norm": 2.720644474029541, "learning_rate": 1.6712567038701261e-07, "loss": 0.3999, "step": 17238 }, { "epoch": 0.8329226457940765, "grad_norm": 3.9242985248565674, "learning_rate": 1.6707735420592355e-07, "loss": 0.3825, "step": 17239 }, { "epoch": 0.8329709619751655, "grad_norm": 2.4906704425811768, "learning_rate": 1.6702903802483451e-07, "loss": 0.2189, "step": 17240 }, { "epoch": 0.8330192781562545, "grad_norm": 2.4446611404418945, "learning_rate": 1.6698072184374545e-07, "loss": 0.251, "step": 17241 }, { "epoch": 0.8330675943373436, "grad_norm": 2.4911506175994873, "learning_rate": 1.6693240566265642e-07, "loss": 0.2839, "step": 17242 }, { "epoch": 0.8331159105184326, "grad_norm": 2.0338895320892334, "learning_rate": 1.6688408948156738e-07, "loss": 0.2459, "step": 17243 }, { "epoch": 0.8331642266995216, "grad_norm": 7.162862300872803, "learning_rate": 1.6683577330047832e-07, "loss": 0.2477, "step": 17244 }, { "epoch": 0.8332125428806108, "grad_norm": 2.3581104278564453, "learning_rate": 1.6678745711938928e-07, "loss": 0.2508, "step": 17245 }, { "epoch": 0.8332608590616998, "grad_norm": 3.320173740386963, "learning_rate": 1.6673914093830022e-07, "loss": 0.2723, "step": 17246 }, { "epoch": 0.8333091752427888, "grad_norm": 1.6041115522384644, "learning_rate": 1.6669082475721118e-07, "loss": 0.1781, "step": 17247 }, { "epoch": 0.8333574914238778, "grad_norm": 3.136855363845825, "learning_rate": 1.6664250857612215e-07, "loss": 0.3697, "step": 17248 }, { "epoch": 0.8334058076049669, "grad_norm": 2.7449729442596436, "learning_rate": 1.6659419239503308e-07, "loss": 0.2849, "step": 17249 }, { "epoch": 0.833454123786056, "grad_norm": 4.051504611968994, "learning_rate": 1.6654587621394402e-07, "loss": 0.2883, "step": 17250 }, { "epoch": 0.833502439967145, "grad_norm": 2.7990217208862305, "learning_rate": 1.66497560032855e-07, "loss": 0.2411, "step": 17251 }, { "epoch": 0.833550756148234, "grad_norm": 3.1900582313537598, "learning_rate": 1.6644924385176595e-07, "loss": 0.3732, "step": 17252 }, { "epoch": 0.8335990723293231, "grad_norm": 13.486212730407715, "learning_rate": 1.664009276706769e-07, "loss": 0.3842, "step": 17253 }, { "epoch": 0.8336473885104121, "grad_norm": 2.575310230255127, "learning_rate": 1.6635261148958785e-07, "loss": 0.294, "step": 17254 }, { "epoch": 0.8336957046915012, "grad_norm": 3.077976703643799, "learning_rate": 1.663042953084988e-07, "loss": 0.3147, "step": 17255 }, { "epoch": 0.8337440208725903, "grad_norm": 4.624264717102051, "learning_rate": 1.6625597912740978e-07, "loss": 0.238, "step": 17256 }, { "epoch": 0.8337923370536793, "grad_norm": 2.490762948989868, "learning_rate": 1.6620766294632071e-07, "loss": 0.3581, "step": 17257 }, { "epoch": 0.8338406532347683, "grad_norm": 2.9479763507843018, "learning_rate": 1.6615934676523165e-07, "loss": 0.3508, "step": 17258 }, { "epoch": 0.8338889694158573, "grad_norm": 2.315281629562378, "learning_rate": 1.6611103058414261e-07, "loss": 0.2532, "step": 17259 }, { "epoch": 0.8339372855969465, "grad_norm": 1.6620450019836426, "learning_rate": 1.6606271440305358e-07, "loss": 0.1596, "step": 17260 }, { "epoch": 0.8339856017780355, "grad_norm": 2.626652479171753, "learning_rate": 1.6601439822196454e-07, "loss": 0.3211, "step": 17261 }, { "epoch": 0.8340339179591245, "grad_norm": 2.278341770172119, "learning_rate": 1.6596608204087548e-07, "loss": 0.2645, "step": 17262 }, { "epoch": 0.8340822341402135, "grad_norm": 12.111666679382324, "learning_rate": 1.6591776585978642e-07, "loss": 0.4063, "step": 17263 }, { "epoch": 0.8341305503213026, "grad_norm": 9.100775718688965, "learning_rate": 1.658694496786974e-07, "loss": 0.4582, "step": 17264 }, { "epoch": 0.8341788665023917, "grad_norm": 2.172769784927368, "learning_rate": 1.6582113349760834e-07, "loss": 0.2443, "step": 17265 }, { "epoch": 0.8342271826834807, "grad_norm": 1.8505843877792358, "learning_rate": 1.6577281731651928e-07, "loss": 0.2083, "step": 17266 }, { "epoch": 0.8342754988645698, "grad_norm": 2.73115873336792, "learning_rate": 1.6572450113543024e-07, "loss": 0.2899, "step": 17267 }, { "epoch": 0.8343238150456588, "grad_norm": 2.9450109004974365, "learning_rate": 1.656761849543412e-07, "loss": 0.2944, "step": 17268 }, { "epoch": 0.8343721312267478, "grad_norm": 1.7706120014190674, "learning_rate": 1.6562786877325217e-07, "loss": 0.1626, "step": 17269 }, { "epoch": 0.8344204474078368, "grad_norm": 2.7687714099884033, "learning_rate": 1.655795525921631e-07, "loss": 0.2106, "step": 17270 }, { "epoch": 0.834468763588926, "grad_norm": 2.27052903175354, "learning_rate": 1.6553123641107405e-07, "loss": 0.2921, "step": 17271 }, { "epoch": 0.834517079770015, "grad_norm": 4.515117168426514, "learning_rate": 1.65482920229985e-07, "loss": 0.3096, "step": 17272 }, { "epoch": 0.834565395951104, "grad_norm": 2.6082839965820312, "learning_rate": 1.6543460404889597e-07, "loss": 0.317, "step": 17273 }, { "epoch": 0.834613712132193, "grad_norm": 8.072124481201172, "learning_rate": 1.653862878678069e-07, "loss": 0.3008, "step": 17274 }, { "epoch": 0.8346620283132821, "grad_norm": 2.8047242164611816, "learning_rate": 1.6533797168671788e-07, "loss": 0.303, "step": 17275 }, { "epoch": 0.8347103444943712, "grad_norm": 2.7927074432373047, "learning_rate": 1.652896555056288e-07, "loss": 0.1626, "step": 17276 }, { "epoch": 0.8347586606754602, "grad_norm": 3.54280686378479, "learning_rate": 1.652413393245398e-07, "loss": 0.3384, "step": 17277 }, { "epoch": 0.8348069768565493, "grad_norm": 2.4771881103515625, "learning_rate": 1.6519302314345074e-07, "loss": 0.3214, "step": 17278 }, { "epoch": 0.8348552930376383, "grad_norm": 3.1016063690185547, "learning_rate": 1.6514470696236168e-07, "loss": 0.2875, "step": 17279 }, { "epoch": 0.8349036092187273, "grad_norm": 1.8874081373214722, "learning_rate": 1.6509639078127264e-07, "loss": 0.2084, "step": 17280 }, { "epoch": 0.8349519253998164, "grad_norm": 2.0757858753204346, "learning_rate": 1.650480746001836e-07, "loss": 0.1699, "step": 17281 }, { "epoch": 0.8350002415809055, "grad_norm": 4.11244010925293, "learning_rate": 1.6499975841909454e-07, "loss": 0.2509, "step": 17282 }, { "epoch": 0.8350485577619945, "grad_norm": 3.2611451148986816, "learning_rate": 1.649514422380055e-07, "loss": 0.4295, "step": 17283 }, { "epoch": 0.8350968739430835, "grad_norm": 2.187833309173584, "learning_rate": 1.6490312605691644e-07, "loss": 0.2361, "step": 17284 }, { "epoch": 0.8351451901241725, "grad_norm": 1.9948166608810425, "learning_rate": 1.648548098758274e-07, "loss": 0.1334, "step": 17285 }, { "epoch": 0.8351935063052617, "grad_norm": 2.8320212364196777, "learning_rate": 1.6480649369473837e-07, "loss": 0.2186, "step": 17286 }, { "epoch": 0.8352418224863507, "grad_norm": 1.84086275100708, "learning_rate": 1.647581775136493e-07, "loss": 0.1495, "step": 17287 }, { "epoch": 0.8352901386674397, "grad_norm": 2.5227675437927246, "learning_rate": 1.6470986133256027e-07, "loss": 0.3435, "step": 17288 }, { "epoch": 0.8353384548485288, "grad_norm": 2.3493494987487793, "learning_rate": 1.646615451514712e-07, "loss": 0.2853, "step": 17289 }, { "epoch": 0.8353867710296178, "grad_norm": 5.474855422973633, "learning_rate": 1.6461322897038217e-07, "loss": 0.2751, "step": 17290 }, { "epoch": 0.8354350872107069, "grad_norm": 3.0648834705352783, "learning_rate": 1.6456491278929314e-07, "loss": 0.2789, "step": 17291 }, { "epoch": 0.8354834033917959, "grad_norm": 2.568574905395508, "learning_rate": 1.6451659660820407e-07, "loss": 0.2896, "step": 17292 }, { "epoch": 0.835531719572885, "grad_norm": 4.2505621910095215, "learning_rate": 1.6446828042711504e-07, "loss": 0.2379, "step": 17293 }, { "epoch": 0.835580035753974, "grad_norm": 2.6235954761505127, "learning_rate": 1.64419964246026e-07, "loss": 0.3575, "step": 17294 }, { "epoch": 0.835628351935063, "grad_norm": 2.6233370304107666, "learning_rate": 1.6437164806493694e-07, "loss": 0.2347, "step": 17295 }, { "epoch": 0.8356766681161522, "grad_norm": 2.7329461574554443, "learning_rate": 1.643233318838479e-07, "loss": 0.3247, "step": 17296 }, { "epoch": 0.8357249842972412, "grad_norm": 3.147529363632202, "learning_rate": 1.6427501570275884e-07, "loss": 0.3721, "step": 17297 }, { "epoch": 0.8357733004783302, "grad_norm": 3.776712656021118, "learning_rate": 1.6422669952166978e-07, "loss": 0.284, "step": 17298 }, { "epoch": 0.8358216166594192, "grad_norm": 76.68071746826172, "learning_rate": 1.6417838334058077e-07, "loss": 0.3542, "step": 17299 }, { "epoch": 0.8358699328405083, "grad_norm": 2.9175636768341064, "learning_rate": 1.641300671594917e-07, "loss": 0.3349, "step": 17300 }, { "epoch": 0.8359182490215973, "grad_norm": 2.5606296062469482, "learning_rate": 1.6408175097840267e-07, "loss": 0.3085, "step": 17301 }, { "epoch": 0.8359665652026864, "grad_norm": 3.025496482849121, "learning_rate": 1.640334347973136e-07, "loss": 0.4564, "step": 17302 }, { "epoch": 0.8360148813837754, "grad_norm": 2.9486098289489746, "learning_rate": 1.6398511861622457e-07, "loss": 0.2403, "step": 17303 }, { "epoch": 0.8360631975648645, "grad_norm": 3.5693044662475586, "learning_rate": 1.6393680243513553e-07, "loss": 0.2651, "step": 17304 }, { "epoch": 0.8361115137459535, "grad_norm": 2.7834818363189697, "learning_rate": 1.6388848625404647e-07, "loss": 0.3826, "step": 17305 }, { "epoch": 0.8361598299270425, "grad_norm": 3.711890459060669, "learning_rate": 1.638401700729574e-07, "loss": 0.3322, "step": 17306 }, { "epoch": 0.8362081461081317, "grad_norm": 3.776594638824463, "learning_rate": 1.637918538918684e-07, "loss": 0.4116, "step": 17307 }, { "epoch": 0.8362564622892207, "grad_norm": 2.3650717735290527, "learning_rate": 1.6374353771077933e-07, "loss": 0.2965, "step": 17308 }, { "epoch": 0.8363047784703097, "grad_norm": 3.112830400466919, "learning_rate": 1.636952215296903e-07, "loss": 0.4325, "step": 17309 }, { "epoch": 0.8363530946513987, "grad_norm": 2.195070505142212, "learning_rate": 1.6364690534860124e-07, "loss": 0.2237, "step": 17310 }, { "epoch": 0.8364014108324878, "grad_norm": 2.3391520977020264, "learning_rate": 1.6359858916751217e-07, "loss": 0.2447, "step": 17311 }, { "epoch": 0.8364497270135769, "grad_norm": 3.4863622188568115, "learning_rate": 1.6355027298642316e-07, "loss": 0.2521, "step": 17312 }, { "epoch": 0.8364980431946659, "grad_norm": 3.634920835494995, "learning_rate": 1.635019568053341e-07, "loss": 0.2752, "step": 17313 }, { "epoch": 0.8365463593757549, "grad_norm": 2.5678064823150635, "learning_rate": 1.6345364062424504e-07, "loss": 0.2057, "step": 17314 }, { "epoch": 0.836594675556844, "grad_norm": 5.117906093597412, "learning_rate": 1.63405324443156e-07, "loss": 0.4056, "step": 17315 }, { "epoch": 0.836642991737933, "grad_norm": 1.9041398763656616, "learning_rate": 1.6335700826206697e-07, "loss": 0.1976, "step": 17316 }, { "epoch": 0.8366913079190221, "grad_norm": 2.332885980606079, "learning_rate": 1.6330869208097793e-07, "loss": 0.2889, "step": 17317 }, { "epoch": 0.8367396241001112, "grad_norm": 3.9857943058013916, "learning_rate": 1.6326037589988887e-07, "loss": 0.2769, "step": 17318 }, { "epoch": 0.8367879402812002, "grad_norm": 2.6578941345214844, "learning_rate": 1.632120597187998e-07, "loss": 0.2421, "step": 17319 }, { "epoch": 0.8368362564622892, "grad_norm": 2.829493522644043, "learning_rate": 1.631637435377108e-07, "loss": 0.2777, "step": 17320 }, { "epoch": 0.8368845726433782, "grad_norm": 3.2554197311401367, "learning_rate": 1.6311542735662173e-07, "loss": 0.3466, "step": 17321 }, { "epoch": 0.8369328888244674, "grad_norm": 2.9196414947509766, "learning_rate": 1.6306711117553267e-07, "loss": 0.2882, "step": 17322 }, { "epoch": 0.8369812050055564, "grad_norm": 2.399207830429077, "learning_rate": 1.6301879499444363e-07, "loss": 0.2383, "step": 17323 }, { "epoch": 0.8370295211866454, "grad_norm": 2.3643746376037598, "learning_rate": 1.6297047881335457e-07, "loss": 0.297, "step": 17324 }, { "epoch": 0.8370778373677344, "grad_norm": 4.676817893981934, "learning_rate": 1.6292216263226556e-07, "loss": 0.2655, "step": 17325 }, { "epoch": 0.8371261535488235, "grad_norm": 3.7512001991271973, "learning_rate": 1.628738464511765e-07, "loss": 0.3881, "step": 17326 }, { "epoch": 0.8371744697299125, "grad_norm": 2.0155856609344482, "learning_rate": 1.6282553027008743e-07, "loss": 0.1441, "step": 17327 }, { "epoch": 0.8372227859110016, "grad_norm": 3.4944024085998535, "learning_rate": 1.627772140889984e-07, "loss": 0.2377, "step": 17328 }, { "epoch": 0.8372711020920907, "grad_norm": 4.123831272125244, "learning_rate": 1.6272889790790936e-07, "loss": 0.1641, "step": 17329 }, { "epoch": 0.8373194182731797, "grad_norm": 2.7653605937957764, "learning_rate": 1.626805817268203e-07, "loss": 0.3093, "step": 17330 }, { "epoch": 0.8373677344542687, "grad_norm": 2.4627697467803955, "learning_rate": 1.6263226554573126e-07, "loss": 0.2869, "step": 17331 }, { "epoch": 0.8374160506353577, "grad_norm": 3.2569491863250732, "learning_rate": 1.625839493646422e-07, "loss": 0.4207, "step": 17332 }, { "epoch": 0.8374643668164469, "grad_norm": 3.437244415283203, "learning_rate": 1.625356331835532e-07, "loss": 0.4292, "step": 17333 }, { "epoch": 0.8375126829975359, "grad_norm": 3.678591728210449, "learning_rate": 1.6248731700246413e-07, "loss": 0.2335, "step": 17334 }, { "epoch": 0.8375609991786249, "grad_norm": 2.506389856338501, "learning_rate": 1.6243900082137506e-07, "loss": 0.2808, "step": 17335 }, { "epoch": 0.837609315359714, "grad_norm": 3.9566562175750732, "learning_rate": 1.6239068464028603e-07, "loss": 0.2141, "step": 17336 }, { "epoch": 0.837657631540803, "grad_norm": 2.2366693019866943, "learning_rate": 1.6234236845919697e-07, "loss": 0.2582, "step": 17337 }, { "epoch": 0.8377059477218921, "grad_norm": 2.0471930503845215, "learning_rate": 1.6229405227810793e-07, "loss": 0.2198, "step": 17338 }, { "epoch": 0.8377542639029811, "grad_norm": 2.6686341762542725, "learning_rate": 1.622457360970189e-07, "loss": 0.3057, "step": 17339 }, { "epoch": 0.8378025800840702, "grad_norm": 3.9272639751434326, "learning_rate": 1.6219741991592983e-07, "loss": 0.2405, "step": 17340 }, { "epoch": 0.8378508962651592, "grad_norm": 3.8493800163269043, "learning_rate": 1.6214910373484077e-07, "loss": 0.3781, "step": 17341 }, { "epoch": 0.8378992124462482, "grad_norm": 2.6177356243133545, "learning_rate": 1.6210078755375176e-07, "loss": 0.3354, "step": 17342 }, { "epoch": 0.8379475286273373, "grad_norm": 6.247360706329346, "learning_rate": 1.620524713726627e-07, "loss": 0.218, "step": 17343 }, { "epoch": 0.8379958448084264, "grad_norm": 2.295665979385376, "learning_rate": 1.6200415519157366e-07, "loss": 0.2987, "step": 17344 }, { "epoch": 0.8380441609895154, "grad_norm": 9.409013748168945, "learning_rate": 1.619558390104846e-07, "loss": 0.2758, "step": 17345 }, { "epoch": 0.8380924771706044, "grad_norm": 2.3664228916168213, "learning_rate": 1.6190752282939556e-07, "loss": 0.1825, "step": 17346 }, { "epoch": 0.8381407933516934, "grad_norm": 32.87590408325195, "learning_rate": 1.6185920664830652e-07, "loss": 0.2119, "step": 17347 }, { "epoch": 0.8381891095327826, "grad_norm": 2.951298713684082, "learning_rate": 1.6181089046721746e-07, "loss": 0.3502, "step": 17348 }, { "epoch": 0.8382374257138716, "grad_norm": 3.1575381755828857, "learning_rate": 1.617625742861284e-07, "loss": 0.3273, "step": 17349 }, { "epoch": 0.8382857418949606, "grad_norm": 2.3446409702301025, "learning_rate": 1.6171425810503936e-07, "loss": 0.2454, "step": 17350 }, { "epoch": 0.8383340580760497, "grad_norm": 2.284238815307617, "learning_rate": 1.6166594192395033e-07, "loss": 0.254, "step": 17351 }, { "epoch": 0.8383823742571387, "grad_norm": 2.434041738510132, "learning_rate": 1.616176257428613e-07, "loss": 0.2299, "step": 17352 }, { "epoch": 0.8384306904382277, "grad_norm": 4.156389236450195, "learning_rate": 1.6156930956177223e-07, "loss": 0.3088, "step": 17353 }, { "epoch": 0.8384790066193168, "grad_norm": 2.042145013809204, "learning_rate": 1.6152099338068316e-07, "loss": 0.2063, "step": 17354 }, { "epoch": 0.8385273228004059, "grad_norm": 2.7544894218444824, "learning_rate": 1.6147267719959415e-07, "loss": 0.2593, "step": 17355 }, { "epoch": 0.8385756389814949, "grad_norm": 2.04606556892395, "learning_rate": 1.614243610185051e-07, "loss": 0.2124, "step": 17356 }, { "epoch": 0.8386239551625839, "grad_norm": 2.9540188312530518, "learning_rate": 1.6137604483741603e-07, "loss": 0.3047, "step": 17357 }, { "epoch": 0.838672271343673, "grad_norm": 2.953993797302246, "learning_rate": 1.61327728656327e-07, "loss": 0.2744, "step": 17358 }, { "epoch": 0.8387205875247621, "grad_norm": 2.9309024810791016, "learning_rate": 1.6127941247523796e-07, "loss": 0.3723, "step": 17359 }, { "epoch": 0.8387689037058511, "grad_norm": 3.3140931129455566, "learning_rate": 1.6123109629414892e-07, "loss": 0.4627, "step": 17360 }, { "epoch": 0.8388172198869401, "grad_norm": 3.007550001144409, "learning_rate": 1.6118278011305986e-07, "loss": 0.2572, "step": 17361 }, { "epoch": 0.8388655360680292, "grad_norm": 5.08951997756958, "learning_rate": 1.611344639319708e-07, "loss": 0.208, "step": 17362 }, { "epoch": 0.8389138522491182, "grad_norm": 7.5853962898254395, "learning_rate": 1.6108614775088176e-07, "loss": 0.3706, "step": 17363 }, { "epoch": 0.8389621684302073, "grad_norm": 3.093660593032837, "learning_rate": 1.6103783156979272e-07, "loss": 0.3035, "step": 17364 }, { "epoch": 0.8390104846112963, "grad_norm": 2.5564286708831787, "learning_rate": 1.6098951538870366e-07, "loss": 0.2638, "step": 17365 }, { "epoch": 0.8390588007923854, "grad_norm": 3.095144271850586, "learning_rate": 1.6094119920761462e-07, "loss": 0.3197, "step": 17366 }, { "epoch": 0.8391071169734744, "grad_norm": 3.969268560409546, "learning_rate": 1.6089288302652556e-07, "loss": 0.4028, "step": 17367 }, { "epoch": 0.8391554331545634, "grad_norm": 2.7878119945526123, "learning_rate": 1.6084456684543655e-07, "loss": 0.3549, "step": 17368 }, { "epoch": 0.8392037493356526, "grad_norm": 2.019423484802246, "learning_rate": 1.607962506643475e-07, "loss": 0.2871, "step": 17369 }, { "epoch": 0.8392520655167416, "grad_norm": 3.7163639068603516, "learning_rate": 1.6074793448325843e-07, "loss": 0.2908, "step": 17370 }, { "epoch": 0.8393003816978306, "grad_norm": 2.056854009628296, "learning_rate": 1.606996183021694e-07, "loss": 0.2009, "step": 17371 }, { "epoch": 0.8393486978789196, "grad_norm": 3.134719133377075, "learning_rate": 1.6065130212108033e-07, "loss": 0.269, "step": 17372 }, { "epoch": 0.8393970140600087, "grad_norm": 2.6848816871643066, "learning_rate": 1.6060298593999132e-07, "loss": 0.3247, "step": 17373 }, { "epoch": 0.8394453302410978, "grad_norm": 2.7165088653564453, "learning_rate": 1.6055466975890225e-07, "loss": 0.2307, "step": 17374 }, { "epoch": 0.8394936464221868, "grad_norm": 3.3535244464874268, "learning_rate": 1.605063535778132e-07, "loss": 0.2535, "step": 17375 }, { "epoch": 0.8395419626032758, "grad_norm": 4.366071701049805, "learning_rate": 1.6045803739672415e-07, "loss": 0.2679, "step": 17376 }, { "epoch": 0.8395902787843649, "grad_norm": 2.8145952224731445, "learning_rate": 1.6040972121563512e-07, "loss": 0.3435, "step": 17377 }, { "epoch": 0.8396385949654539, "grad_norm": 4.931336879730225, "learning_rate": 1.6036140503454606e-07, "loss": 0.2121, "step": 17378 }, { "epoch": 0.8396869111465429, "grad_norm": 2.0333728790283203, "learning_rate": 1.6031308885345702e-07, "loss": 0.2032, "step": 17379 }, { "epoch": 0.8397352273276321, "grad_norm": 3.7383925914764404, "learning_rate": 1.6026477267236796e-07, "loss": 0.4096, "step": 17380 }, { "epoch": 0.8397835435087211, "grad_norm": 2.627206802368164, "learning_rate": 1.6021645649127895e-07, "loss": 0.3956, "step": 17381 }, { "epoch": 0.8398318596898101, "grad_norm": 4.697111129760742, "learning_rate": 1.6016814031018988e-07, "loss": 0.3332, "step": 17382 }, { "epoch": 0.8398801758708991, "grad_norm": 3.009129762649536, "learning_rate": 1.6011982412910082e-07, "loss": 0.3023, "step": 17383 }, { "epoch": 0.8399284920519882, "grad_norm": 2.5081167221069336, "learning_rate": 1.6007150794801179e-07, "loss": 0.3185, "step": 17384 }, { "epoch": 0.8399768082330773, "grad_norm": 2.5355353355407715, "learning_rate": 1.6002319176692272e-07, "loss": 0.2793, "step": 17385 }, { "epoch": 0.8400251244141663, "grad_norm": 3.0842416286468506, "learning_rate": 1.5997487558583369e-07, "loss": 0.2396, "step": 17386 }, { "epoch": 0.8400734405952553, "grad_norm": 2.4715046882629395, "learning_rate": 1.5992655940474465e-07, "loss": 0.2954, "step": 17387 }, { "epoch": 0.8401217567763444, "grad_norm": 2.477074384689331, "learning_rate": 1.598782432236556e-07, "loss": 0.3042, "step": 17388 }, { "epoch": 0.8401700729574334, "grad_norm": 3.4184882640838623, "learning_rate": 1.5982992704256652e-07, "loss": 0.3066, "step": 17389 }, { "epoch": 0.8402183891385225, "grad_norm": 12.953420639038086, "learning_rate": 1.5978161086147752e-07, "loss": 0.2362, "step": 17390 }, { "epoch": 0.8402667053196116, "grad_norm": 2.67187237739563, "learning_rate": 1.5973329468038845e-07, "loss": 0.2919, "step": 17391 }, { "epoch": 0.8403150215007006, "grad_norm": 2.6841094493865967, "learning_rate": 1.5968497849929942e-07, "loss": 0.3045, "step": 17392 }, { "epoch": 0.8403633376817896, "grad_norm": 3.222209930419922, "learning_rate": 1.5963666231821035e-07, "loss": 0.4342, "step": 17393 }, { "epoch": 0.8404116538628786, "grad_norm": 2.0179200172424316, "learning_rate": 1.5958834613712132e-07, "loss": 0.1595, "step": 17394 }, { "epoch": 0.8404599700439678, "grad_norm": 3.8224639892578125, "learning_rate": 1.5954002995603228e-07, "loss": 0.3052, "step": 17395 }, { "epoch": 0.8405082862250568, "grad_norm": 3.1861517429351807, "learning_rate": 1.5949171377494322e-07, "loss": 0.2428, "step": 17396 }, { "epoch": 0.8405566024061458, "grad_norm": 1.684382677078247, "learning_rate": 1.5944339759385416e-07, "loss": 0.1944, "step": 17397 }, { "epoch": 0.8406049185872349, "grad_norm": 2.8716816902160645, "learning_rate": 1.5939508141276512e-07, "loss": 0.2917, "step": 17398 }, { "epoch": 0.8406532347683239, "grad_norm": 4.7191481590271, "learning_rate": 1.5934676523167608e-07, "loss": 0.3321, "step": 17399 }, { "epoch": 0.840701550949413, "grad_norm": 2.3131723403930664, "learning_rate": 1.5929844905058705e-07, "loss": 0.2553, "step": 17400 }, { "epoch": 0.840749867130502, "grad_norm": 2.4343667030334473, "learning_rate": 1.5925013286949798e-07, "loss": 0.277, "step": 17401 }, { "epoch": 0.8407981833115911, "grad_norm": 2.6477572917938232, "learning_rate": 1.5920181668840892e-07, "loss": 0.3333, "step": 17402 }, { "epoch": 0.8408464994926801, "grad_norm": 2.7807154655456543, "learning_rate": 1.591535005073199e-07, "loss": 0.3028, "step": 17403 }, { "epoch": 0.8408948156737691, "grad_norm": 3.002725601196289, "learning_rate": 1.5910518432623085e-07, "loss": 0.4245, "step": 17404 }, { "epoch": 0.8409431318548581, "grad_norm": 2.1466150283813477, "learning_rate": 1.5905686814514179e-07, "loss": 0.2777, "step": 17405 }, { "epoch": 0.8409914480359473, "grad_norm": 5.304934024810791, "learning_rate": 1.5900855196405275e-07, "loss": 0.2229, "step": 17406 }, { "epoch": 0.8410397642170363, "grad_norm": 2.0775394439697266, "learning_rate": 1.5896023578296371e-07, "loss": 0.289, "step": 17407 }, { "epoch": 0.8410880803981253, "grad_norm": 3.0782084465026855, "learning_rate": 1.5891191960187468e-07, "loss": 0.2595, "step": 17408 }, { "epoch": 0.8411363965792144, "grad_norm": 11.099468231201172, "learning_rate": 1.5886360342078561e-07, "loss": 0.3348, "step": 17409 }, { "epoch": 0.8411847127603034, "grad_norm": 2.495994806289673, "learning_rate": 1.5881528723969655e-07, "loss": 0.2448, "step": 17410 }, { "epoch": 0.8412330289413925, "grad_norm": 2.94004487991333, "learning_rate": 1.5876697105860752e-07, "loss": 0.3973, "step": 17411 }, { "epoch": 0.8412813451224815, "grad_norm": 2.1811954975128174, "learning_rate": 1.5871865487751848e-07, "loss": 0.2454, "step": 17412 }, { "epoch": 0.8413296613035706, "grad_norm": 2.790048122406006, "learning_rate": 1.5867033869642942e-07, "loss": 0.2913, "step": 17413 }, { "epoch": 0.8413779774846596, "grad_norm": 3.846705675125122, "learning_rate": 1.5862202251534038e-07, "loss": 0.317, "step": 17414 }, { "epoch": 0.8414262936657486, "grad_norm": 2.2377734184265137, "learning_rate": 1.5857370633425132e-07, "loss": 0.2147, "step": 17415 }, { "epoch": 0.8414746098468378, "grad_norm": 2.3297204971313477, "learning_rate": 1.585253901531623e-07, "loss": 0.3157, "step": 17416 }, { "epoch": 0.8415229260279268, "grad_norm": 2.127002716064453, "learning_rate": 1.5847707397207324e-07, "loss": 0.1883, "step": 17417 }, { "epoch": 0.8415712422090158, "grad_norm": 1.9513883590698242, "learning_rate": 1.5842875779098418e-07, "loss": 0.2037, "step": 17418 }, { "epoch": 0.8416195583901048, "grad_norm": 3.564169406890869, "learning_rate": 1.5838044160989515e-07, "loss": 0.2475, "step": 17419 }, { "epoch": 0.8416678745711939, "grad_norm": 2.2966744899749756, "learning_rate": 1.583321254288061e-07, "loss": 0.2716, "step": 17420 }, { "epoch": 0.841716190752283, "grad_norm": 2.3149054050445557, "learning_rate": 1.5828380924771705e-07, "loss": 0.2255, "step": 17421 }, { "epoch": 0.841764506933372, "grad_norm": 2.6672017574310303, "learning_rate": 1.58235493066628e-07, "loss": 0.1791, "step": 17422 }, { "epoch": 0.841812823114461, "grad_norm": 1.8729573488235474, "learning_rate": 1.5818717688553895e-07, "loss": 0.1991, "step": 17423 }, { "epoch": 0.8418611392955501, "grad_norm": 4.76664924621582, "learning_rate": 1.581388607044499e-07, "loss": 0.2426, "step": 17424 }, { "epoch": 0.8419094554766391, "grad_norm": 3.9787802696228027, "learning_rate": 1.5809054452336088e-07, "loss": 0.3263, "step": 17425 }, { "epoch": 0.8419577716577282, "grad_norm": 9.573637008666992, "learning_rate": 1.580422283422718e-07, "loss": 0.4344, "step": 17426 }, { "epoch": 0.8420060878388173, "grad_norm": 2.6571056842803955, "learning_rate": 1.5799391216118278e-07, "loss": 0.2217, "step": 17427 }, { "epoch": 0.8420544040199063, "grad_norm": 2.777890920639038, "learning_rate": 1.5794559598009371e-07, "loss": 0.2836, "step": 17428 }, { "epoch": 0.8421027202009953, "grad_norm": 1.6534769535064697, "learning_rate": 1.5789727979900468e-07, "loss": 0.1807, "step": 17429 }, { "epoch": 0.8421510363820843, "grad_norm": 3.1337039470672607, "learning_rate": 1.5784896361791564e-07, "loss": 0.3288, "step": 17430 }, { "epoch": 0.8421993525631734, "grad_norm": 15.509149551391602, "learning_rate": 1.5780064743682658e-07, "loss": 0.2717, "step": 17431 }, { "epoch": 0.8422476687442625, "grad_norm": 3.0872397422790527, "learning_rate": 1.5775233125573754e-07, "loss": 0.2806, "step": 17432 }, { "epoch": 0.8422959849253515, "grad_norm": 2.6571359634399414, "learning_rate": 1.577040150746485e-07, "loss": 0.2467, "step": 17433 }, { "epoch": 0.8423443011064405, "grad_norm": 3.954333543777466, "learning_rate": 1.5765569889355944e-07, "loss": 0.2338, "step": 17434 }, { "epoch": 0.8423926172875296, "grad_norm": 7.973721981048584, "learning_rate": 1.576073827124704e-07, "loss": 0.2201, "step": 17435 }, { "epoch": 0.8424409334686186, "grad_norm": 2.600417137145996, "learning_rate": 1.5755906653138134e-07, "loss": 0.2449, "step": 17436 }, { "epoch": 0.8424892496497077, "grad_norm": 2.449819326400757, "learning_rate": 1.5751075035029228e-07, "loss": 0.2055, "step": 17437 }, { "epoch": 0.8425375658307968, "grad_norm": 2.452061176300049, "learning_rate": 1.5746243416920327e-07, "loss": 0.3416, "step": 17438 }, { "epoch": 0.8425858820118858, "grad_norm": 6.5167436599731445, "learning_rate": 1.574141179881142e-07, "loss": 0.3465, "step": 17439 }, { "epoch": 0.8426341981929748, "grad_norm": 2.6460933685302734, "learning_rate": 1.5736580180702517e-07, "loss": 0.3704, "step": 17440 }, { "epoch": 0.8426825143740638, "grad_norm": 6.903487205505371, "learning_rate": 1.573174856259361e-07, "loss": 0.2162, "step": 17441 }, { "epoch": 0.842730830555153, "grad_norm": 2.8838417530059814, "learning_rate": 1.5726916944484707e-07, "loss": 0.3684, "step": 17442 }, { "epoch": 0.842779146736242, "grad_norm": 4.633402347564697, "learning_rate": 1.5722085326375804e-07, "loss": 0.2496, "step": 17443 }, { "epoch": 0.842827462917331, "grad_norm": 2.4598817825317383, "learning_rate": 1.5717253708266897e-07, "loss": 0.1752, "step": 17444 }, { "epoch": 0.84287577909842, "grad_norm": 2.7025182247161865, "learning_rate": 1.571242209015799e-07, "loss": 0.3119, "step": 17445 }, { "epoch": 0.8429240952795091, "grad_norm": 2.73364520072937, "learning_rate": 1.570759047204909e-07, "loss": 0.3946, "step": 17446 }, { "epoch": 0.8429724114605982, "grad_norm": 1.9906210899353027, "learning_rate": 1.5702758853940184e-07, "loss": 0.1688, "step": 17447 }, { "epoch": 0.8430207276416872, "grad_norm": 2.731640338897705, "learning_rate": 1.569792723583128e-07, "loss": 0.2854, "step": 17448 }, { "epoch": 0.8430690438227763, "grad_norm": 3.6388165950775146, "learning_rate": 1.5693095617722374e-07, "loss": 0.5149, "step": 17449 }, { "epoch": 0.8431173600038653, "grad_norm": 2.727311611175537, "learning_rate": 1.5688263999613468e-07, "loss": 0.2651, "step": 17450 }, { "epoch": 0.8431656761849543, "grad_norm": 3.250811815261841, "learning_rate": 1.5683432381504567e-07, "loss": 0.3561, "step": 17451 }, { "epoch": 0.8432139923660434, "grad_norm": 2.1772966384887695, "learning_rate": 1.567860076339566e-07, "loss": 0.2511, "step": 17452 }, { "epoch": 0.8432623085471325, "grad_norm": 2.2125518321990967, "learning_rate": 1.5673769145286754e-07, "loss": 0.2469, "step": 17453 }, { "epoch": 0.8433106247282215, "grad_norm": 3.680041790008545, "learning_rate": 1.566893752717785e-07, "loss": 0.3621, "step": 17454 }, { "epoch": 0.8433589409093105, "grad_norm": 2.817359209060669, "learning_rate": 1.5664105909068947e-07, "loss": 0.3556, "step": 17455 }, { "epoch": 0.8434072570903995, "grad_norm": 2.123189687728882, "learning_rate": 1.5659274290960043e-07, "loss": 0.2193, "step": 17456 }, { "epoch": 0.8434555732714886, "grad_norm": 2.46996808052063, "learning_rate": 1.5654442672851137e-07, "loss": 0.3411, "step": 17457 }, { "epoch": 0.8435038894525777, "grad_norm": 5.190951824188232, "learning_rate": 1.564961105474223e-07, "loss": 0.2839, "step": 17458 }, { "epoch": 0.8435522056336667, "grad_norm": 2.5088930130004883, "learning_rate": 1.564477943663333e-07, "loss": 0.2315, "step": 17459 }, { "epoch": 0.8436005218147558, "grad_norm": 3.154895782470703, "learning_rate": 1.5639947818524424e-07, "loss": 0.2291, "step": 17460 }, { "epoch": 0.8436488379958448, "grad_norm": 2.9214224815368652, "learning_rate": 1.5635116200415517e-07, "loss": 0.3666, "step": 17461 }, { "epoch": 0.8436971541769338, "grad_norm": 3.0221807956695557, "learning_rate": 1.5630284582306614e-07, "loss": 0.2279, "step": 17462 }, { "epoch": 0.8437454703580229, "grad_norm": 3.6718411445617676, "learning_rate": 1.5625452964197707e-07, "loss": 0.3825, "step": 17463 }, { "epoch": 0.843793786539112, "grad_norm": 3.3129494190216064, "learning_rate": 1.5620621346088806e-07, "loss": 0.2521, "step": 17464 }, { "epoch": 0.843842102720201, "grad_norm": 1.8453370332717896, "learning_rate": 1.56157897279799e-07, "loss": 0.2186, "step": 17465 }, { "epoch": 0.84389041890129, "grad_norm": 3.8043951988220215, "learning_rate": 1.5610958109870994e-07, "loss": 0.4608, "step": 17466 }, { "epoch": 0.843938735082379, "grad_norm": 2.900517225265503, "learning_rate": 1.560612649176209e-07, "loss": 0.2633, "step": 17467 }, { "epoch": 0.8439870512634682, "grad_norm": 4.38233757019043, "learning_rate": 1.5601294873653187e-07, "loss": 0.335, "step": 17468 }, { "epoch": 0.8440353674445572, "grad_norm": 2.365140676498413, "learning_rate": 1.559646325554428e-07, "loss": 0.3183, "step": 17469 }, { "epoch": 0.8440836836256462, "grad_norm": 2.543949842453003, "learning_rate": 1.5591631637435377e-07, "loss": 0.2919, "step": 17470 }, { "epoch": 0.8441319998067353, "grad_norm": 4.340764999389648, "learning_rate": 1.558680001932647e-07, "loss": 0.3218, "step": 17471 }, { "epoch": 0.8441803159878243, "grad_norm": 3.2222554683685303, "learning_rate": 1.558196840121757e-07, "loss": 0.4049, "step": 17472 }, { "epoch": 0.8442286321689134, "grad_norm": 2.3463151454925537, "learning_rate": 1.5577136783108663e-07, "loss": 0.2229, "step": 17473 }, { "epoch": 0.8442769483500024, "grad_norm": 3.4100804328918457, "learning_rate": 1.5572305164999757e-07, "loss": 0.2685, "step": 17474 }, { "epoch": 0.8443252645310915, "grad_norm": 3.337472915649414, "learning_rate": 1.5567473546890853e-07, "loss": 0.3026, "step": 17475 }, { "epoch": 0.8443735807121805, "grad_norm": 2.7376139163970947, "learning_rate": 1.5562641928781947e-07, "loss": 0.3315, "step": 17476 }, { "epoch": 0.8444218968932695, "grad_norm": 5.704186916351318, "learning_rate": 1.5557810310673043e-07, "loss": 0.3789, "step": 17477 }, { "epoch": 0.8444702130743587, "grad_norm": 2.52213978767395, "learning_rate": 1.555297869256414e-07, "loss": 0.1567, "step": 17478 }, { "epoch": 0.8445185292554477, "grad_norm": 1.331952691078186, "learning_rate": 1.5548147074455234e-07, "loss": 0.1181, "step": 17479 }, { "epoch": 0.8445668454365367, "grad_norm": 2.655787229537964, "learning_rate": 1.554331545634633e-07, "loss": 0.3085, "step": 17480 }, { "epoch": 0.8446151616176257, "grad_norm": 2.35064697265625, "learning_rate": 1.5538483838237426e-07, "loss": 0.2367, "step": 17481 }, { "epoch": 0.8446634777987148, "grad_norm": 2.9884767532348633, "learning_rate": 1.553365222012852e-07, "loss": 0.4824, "step": 17482 }, { "epoch": 0.8447117939798038, "grad_norm": 2.8633108139038086, "learning_rate": 1.5528820602019616e-07, "loss": 0.1083, "step": 17483 }, { "epoch": 0.8447601101608929, "grad_norm": 2.6488921642303467, "learning_rate": 1.552398898391071e-07, "loss": 0.2765, "step": 17484 }, { "epoch": 0.8448084263419819, "grad_norm": 2.102677583694458, "learning_rate": 1.5519157365801807e-07, "loss": 0.2851, "step": 17485 }, { "epoch": 0.844856742523071, "grad_norm": 3.7159464359283447, "learning_rate": 1.5514325747692903e-07, "loss": 0.4085, "step": 17486 }, { "epoch": 0.84490505870416, "grad_norm": 2.0364677906036377, "learning_rate": 1.5509494129583997e-07, "loss": 0.2192, "step": 17487 }, { "epoch": 0.844953374885249, "grad_norm": 3.4925858974456787, "learning_rate": 1.5504662511475093e-07, "loss": 0.3281, "step": 17488 }, { "epoch": 0.8450016910663382, "grad_norm": 2.397918462753296, "learning_rate": 1.5499830893366187e-07, "loss": 0.2362, "step": 17489 }, { "epoch": 0.8450500072474272, "grad_norm": 2.149364471435547, "learning_rate": 1.5494999275257283e-07, "loss": 0.2954, "step": 17490 }, { "epoch": 0.8450983234285162, "grad_norm": 3.002326250076294, "learning_rate": 1.549016765714838e-07, "loss": 0.3354, "step": 17491 }, { "epoch": 0.8451466396096052, "grad_norm": 3.833597421646118, "learning_rate": 1.5485336039039473e-07, "loss": 0.3787, "step": 17492 }, { "epoch": 0.8451949557906943, "grad_norm": 2.5497169494628906, "learning_rate": 1.5480504420930567e-07, "loss": 0.2772, "step": 17493 }, { "epoch": 0.8452432719717834, "grad_norm": 2.7925877571105957, "learning_rate": 1.5475672802821666e-07, "loss": 0.2695, "step": 17494 }, { "epoch": 0.8452915881528724, "grad_norm": 2.778707265853882, "learning_rate": 1.547084118471276e-07, "loss": 0.2513, "step": 17495 }, { "epoch": 0.8453399043339614, "grad_norm": 3.9061388969421387, "learning_rate": 1.5466009566603856e-07, "loss": 0.4354, "step": 17496 }, { "epoch": 0.8453882205150505, "grad_norm": 1.925632119178772, "learning_rate": 1.546117794849495e-07, "loss": 0.1812, "step": 17497 }, { "epoch": 0.8454365366961395, "grad_norm": 2.2152838706970215, "learning_rate": 1.5456346330386046e-07, "loss": 0.2237, "step": 17498 }, { "epoch": 0.8454848528772286, "grad_norm": 2.516334295272827, "learning_rate": 1.5451514712277143e-07, "loss": 0.3005, "step": 17499 }, { "epoch": 0.8455331690583177, "grad_norm": 2.2817318439483643, "learning_rate": 1.5446683094168236e-07, "loss": 0.2203, "step": 17500 }, { "epoch": 0.8455814852394067, "grad_norm": 1.4945268630981445, "learning_rate": 1.544185147605933e-07, "loss": 0.1542, "step": 17501 }, { "epoch": 0.8456298014204957, "grad_norm": 2.5108957290649414, "learning_rate": 1.5437019857950426e-07, "loss": 0.2979, "step": 17502 }, { "epoch": 0.8456781176015847, "grad_norm": 2.5281248092651367, "learning_rate": 1.5432188239841523e-07, "loss": 0.3015, "step": 17503 }, { "epoch": 0.8457264337826739, "grad_norm": 2.104163885116577, "learning_rate": 1.542735662173262e-07, "loss": 0.1902, "step": 17504 }, { "epoch": 0.8457747499637629, "grad_norm": 2.827684164047241, "learning_rate": 1.5422525003623713e-07, "loss": 0.3135, "step": 17505 }, { "epoch": 0.8458230661448519, "grad_norm": 3.4605133533477783, "learning_rate": 1.5417693385514807e-07, "loss": 0.4472, "step": 17506 }, { "epoch": 0.845871382325941, "grad_norm": 2.5519535541534424, "learning_rate": 1.5412861767405906e-07, "loss": 0.2565, "step": 17507 }, { "epoch": 0.84591969850703, "grad_norm": 34.94184112548828, "learning_rate": 1.5408030149297e-07, "loss": 0.2856, "step": 17508 }, { "epoch": 0.845968014688119, "grad_norm": 2.316328525543213, "learning_rate": 1.5403198531188093e-07, "loss": 0.3182, "step": 17509 }, { "epoch": 0.8460163308692081, "grad_norm": 2.736590623855591, "learning_rate": 1.539836691307919e-07, "loss": 0.2204, "step": 17510 }, { "epoch": 0.8460646470502972, "grad_norm": 3.2876670360565186, "learning_rate": 1.5393535294970283e-07, "loss": 0.3558, "step": 17511 }, { "epoch": 0.8461129632313862, "grad_norm": 2.8795852661132812, "learning_rate": 1.5388703676861382e-07, "loss": 0.2932, "step": 17512 }, { "epoch": 0.8461612794124752, "grad_norm": 3.2165446281433105, "learning_rate": 1.5383872058752476e-07, "loss": 0.3737, "step": 17513 }, { "epoch": 0.8462095955935642, "grad_norm": 2.528095245361328, "learning_rate": 1.537904044064357e-07, "loss": 0.2802, "step": 17514 }, { "epoch": 0.8462579117746534, "grad_norm": 48.638980865478516, "learning_rate": 1.5374208822534666e-07, "loss": 0.2053, "step": 17515 }, { "epoch": 0.8463062279557424, "grad_norm": 3.016223430633545, "learning_rate": 1.5369377204425762e-07, "loss": 0.3186, "step": 17516 }, { "epoch": 0.8463545441368314, "grad_norm": 4.203125476837158, "learning_rate": 1.5364545586316856e-07, "loss": 0.3428, "step": 17517 }, { "epoch": 0.8464028603179204, "grad_norm": 3.0517995357513428, "learning_rate": 1.5359713968207952e-07, "loss": 0.2166, "step": 17518 }, { "epoch": 0.8464511764990095, "grad_norm": 2.662442684173584, "learning_rate": 1.5354882350099046e-07, "loss": 0.2918, "step": 17519 }, { "epoch": 0.8464994926800986, "grad_norm": 2.1431970596313477, "learning_rate": 1.5350050731990145e-07, "loss": 0.2529, "step": 17520 }, { "epoch": 0.8465478088611876, "grad_norm": 2.163593292236328, "learning_rate": 1.534521911388124e-07, "loss": 0.245, "step": 17521 }, { "epoch": 0.8465961250422767, "grad_norm": 3.322563886642456, "learning_rate": 1.5340387495772333e-07, "loss": 0.3279, "step": 17522 }, { "epoch": 0.8466444412233657, "grad_norm": 2.5907280445098877, "learning_rate": 1.533555587766343e-07, "loss": 0.3274, "step": 17523 }, { "epoch": 0.8466927574044547, "grad_norm": 3.5603573322296143, "learning_rate": 1.5330724259554523e-07, "loss": 0.3075, "step": 17524 }, { "epoch": 0.8467410735855438, "grad_norm": 2.6972084045410156, "learning_rate": 1.532589264144562e-07, "loss": 0.2537, "step": 17525 }, { "epoch": 0.8467893897666329, "grad_norm": 5.776915073394775, "learning_rate": 1.5321061023336716e-07, "loss": 0.2861, "step": 17526 }, { "epoch": 0.8468377059477219, "grad_norm": 3.2174274921417236, "learning_rate": 1.531622940522781e-07, "loss": 0.3466, "step": 17527 }, { "epoch": 0.8468860221288109, "grad_norm": 2.530383825302124, "learning_rate": 1.5311397787118903e-07, "loss": 0.2666, "step": 17528 }, { "epoch": 0.8469343383099, "grad_norm": 2.0249476432800293, "learning_rate": 1.5306566169010002e-07, "loss": 0.241, "step": 17529 }, { "epoch": 0.8469826544909891, "grad_norm": 2.374178171157837, "learning_rate": 1.5301734550901096e-07, "loss": 0.2764, "step": 17530 }, { "epoch": 0.8470309706720781, "grad_norm": 3.679190158843994, "learning_rate": 1.5296902932792192e-07, "loss": 0.5307, "step": 17531 }, { "epoch": 0.8470792868531671, "grad_norm": 2.22165584564209, "learning_rate": 1.5292071314683286e-07, "loss": 0.2751, "step": 17532 }, { "epoch": 0.8471276030342562, "grad_norm": 3.2211010456085205, "learning_rate": 1.5287239696574382e-07, "loss": 0.3311, "step": 17533 }, { "epoch": 0.8471759192153452, "grad_norm": 2.31874942779541, "learning_rate": 1.5282408078465479e-07, "loss": 0.2376, "step": 17534 }, { "epoch": 0.8472242353964342, "grad_norm": 2.855137348175049, "learning_rate": 1.5277576460356572e-07, "loss": 0.3495, "step": 17535 }, { "epoch": 0.8472725515775233, "grad_norm": 3.692424774169922, "learning_rate": 1.5272744842247669e-07, "loss": 0.3265, "step": 17536 }, { "epoch": 0.8473208677586124, "grad_norm": 2.638796091079712, "learning_rate": 1.5267913224138762e-07, "loss": 0.2388, "step": 17537 }, { "epoch": 0.8473691839397014, "grad_norm": 10.230581283569336, "learning_rate": 1.526308160602986e-07, "loss": 0.1995, "step": 17538 }, { "epoch": 0.8474175001207904, "grad_norm": 3.0028624534606934, "learning_rate": 1.5258249987920955e-07, "loss": 0.2693, "step": 17539 }, { "epoch": 0.8474658163018794, "grad_norm": 3.66218638420105, "learning_rate": 1.525341836981205e-07, "loss": 0.341, "step": 17540 }, { "epoch": 0.8475141324829686, "grad_norm": 1.7377736568450928, "learning_rate": 1.5248586751703143e-07, "loss": 0.1513, "step": 17541 }, { "epoch": 0.8475624486640576, "grad_norm": 3.1442654132843018, "learning_rate": 1.5243755133594242e-07, "loss": 0.3082, "step": 17542 }, { "epoch": 0.8476107648451466, "grad_norm": 2.8191354274749756, "learning_rate": 1.5238923515485335e-07, "loss": 0.3223, "step": 17543 }, { "epoch": 0.8476590810262357, "grad_norm": 1.8717601299285889, "learning_rate": 1.5234091897376432e-07, "loss": 0.1963, "step": 17544 }, { "epoch": 0.8477073972073247, "grad_norm": 5.602736949920654, "learning_rate": 1.5229260279267525e-07, "loss": 0.2682, "step": 17545 }, { "epoch": 0.8477557133884138, "grad_norm": 3.4933788776397705, "learning_rate": 1.5224428661158622e-07, "loss": 0.5575, "step": 17546 }, { "epoch": 0.8478040295695028, "grad_norm": 2.3192853927612305, "learning_rate": 1.5219597043049718e-07, "loss": 0.2142, "step": 17547 }, { "epoch": 0.8478523457505919, "grad_norm": 3.196192979812622, "learning_rate": 1.5214765424940812e-07, "loss": 0.2943, "step": 17548 }, { "epoch": 0.8479006619316809, "grad_norm": 2.6176793575286865, "learning_rate": 1.5209933806831906e-07, "loss": 0.2983, "step": 17549 }, { "epoch": 0.8479489781127699, "grad_norm": 3.2142341136932373, "learning_rate": 1.5205102188723002e-07, "loss": 0.5725, "step": 17550 }, { "epoch": 0.8479972942938591, "grad_norm": 2.476806640625, "learning_rate": 1.5200270570614098e-07, "loss": 0.3145, "step": 17551 }, { "epoch": 0.8480456104749481, "grad_norm": 3.6689159870147705, "learning_rate": 1.5195438952505195e-07, "loss": 0.2677, "step": 17552 }, { "epoch": 0.8480939266560371, "grad_norm": 2.9330461025238037, "learning_rate": 1.5190607334396289e-07, "loss": 0.3228, "step": 17553 }, { "epoch": 0.8481422428371261, "grad_norm": 2.5978593826293945, "learning_rate": 1.5185775716287382e-07, "loss": 0.3815, "step": 17554 }, { "epoch": 0.8481905590182152, "grad_norm": 2.7673158645629883, "learning_rate": 1.518094409817848e-07, "loss": 0.2842, "step": 17555 }, { "epoch": 0.8482388751993043, "grad_norm": 3.135878562927246, "learning_rate": 1.5176112480069575e-07, "loss": 0.2733, "step": 17556 }, { "epoch": 0.8482871913803933, "grad_norm": 12.818699836730957, "learning_rate": 1.517128086196067e-07, "loss": 0.3276, "step": 17557 }, { "epoch": 0.8483355075614823, "grad_norm": 2.6316757202148438, "learning_rate": 1.5166449243851765e-07, "loss": 0.3066, "step": 17558 }, { "epoch": 0.8483838237425714, "grad_norm": 3.473707914352417, "learning_rate": 1.5161617625742861e-07, "loss": 0.292, "step": 17559 }, { "epoch": 0.8484321399236604, "grad_norm": 2.6415956020355225, "learning_rate": 1.5156786007633958e-07, "loss": 0.2125, "step": 17560 }, { "epoch": 0.8484804561047494, "grad_norm": 2.9041812419891357, "learning_rate": 1.5151954389525052e-07, "loss": 0.2473, "step": 17561 }, { "epoch": 0.8485287722858386, "grad_norm": 3.1244566440582275, "learning_rate": 1.5147122771416145e-07, "loss": 0.3061, "step": 17562 }, { "epoch": 0.8485770884669276, "grad_norm": 3.0802254676818848, "learning_rate": 1.5142291153307242e-07, "loss": 0.3596, "step": 17563 }, { "epoch": 0.8486254046480166, "grad_norm": 2.9560000896453857, "learning_rate": 1.5137459535198338e-07, "loss": 0.3538, "step": 17564 }, { "epoch": 0.8486737208291056, "grad_norm": 2.941765785217285, "learning_rate": 1.5132627917089432e-07, "loss": 0.3582, "step": 17565 }, { "epoch": 0.8487220370101947, "grad_norm": 2.28857421875, "learning_rate": 1.5127796298980528e-07, "loss": 0.2886, "step": 17566 }, { "epoch": 0.8487703531912838, "grad_norm": 5.387705326080322, "learning_rate": 1.5122964680871622e-07, "loss": 0.2108, "step": 17567 }, { "epoch": 0.8488186693723728, "grad_norm": 1.7300156354904175, "learning_rate": 1.511813306276272e-07, "loss": 0.195, "step": 17568 }, { "epoch": 0.8488669855534619, "grad_norm": 3.1646029949188232, "learning_rate": 1.5113301444653815e-07, "loss": 0.4427, "step": 17569 }, { "epoch": 0.8489153017345509, "grad_norm": 2.4844915866851807, "learning_rate": 1.5108469826544908e-07, "loss": 0.2636, "step": 17570 }, { "epoch": 0.8489636179156399, "grad_norm": 2.0421056747436523, "learning_rate": 1.5103638208436005e-07, "loss": 0.186, "step": 17571 }, { "epoch": 0.849011934096729, "grad_norm": 2.183803081512451, "learning_rate": 1.50988065903271e-07, "loss": 0.2283, "step": 17572 }, { "epoch": 0.8490602502778181, "grad_norm": 1.6626712083816528, "learning_rate": 1.5093974972218195e-07, "loss": 0.1497, "step": 17573 }, { "epoch": 0.8491085664589071, "grad_norm": 2.7572665214538574, "learning_rate": 1.508914335410929e-07, "loss": 0.2352, "step": 17574 }, { "epoch": 0.8491568826399961, "grad_norm": 2.922779083251953, "learning_rate": 1.5084311736000385e-07, "loss": 0.2973, "step": 17575 }, { "epoch": 0.8492051988210851, "grad_norm": 2.8296258449554443, "learning_rate": 1.5079480117891479e-07, "loss": 0.3885, "step": 17576 }, { "epoch": 0.8492535150021743, "grad_norm": 2.7498762607574463, "learning_rate": 1.5074648499782578e-07, "loss": 0.3719, "step": 17577 }, { "epoch": 0.8493018311832633, "grad_norm": 4.335966110229492, "learning_rate": 1.5069816881673671e-07, "loss": 0.1857, "step": 17578 }, { "epoch": 0.8493501473643523, "grad_norm": 2.560004711151123, "learning_rate": 1.5064985263564768e-07, "loss": 0.3652, "step": 17579 }, { "epoch": 0.8493984635454414, "grad_norm": 2.5789220333099365, "learning_rate": 1.5060153645455862e-07, "loss": 0.3342, "step": 17580 }, { "epoch": 0.8494467797265304, "grad_norm": 2.3889451026916504, "learning_rate": 1.5055322027346958e-07, "loss": 0.2351, "step": 17581 }, { "epoch": 0.8494950959076195, "grad_norm": 2.17677640914917, "learning_rate": 1.5050490409238054e-07, "loss": 0.2703, "step": 17582 }, { "epoch": 0.8495434120887085, "grad_norm": 5.0132341384887695, "learning_rate": 1.5045658791129148e-07, "loss": 0.2885, "step": 17583 }, { "epoch": 0.8495917282697976, "grad_norm": 6.4768266677856445, "learning_rate": 1.5040827173020242e-07, "loss": 0.1712, "step": 17584 }, { "epoch": 0.8496400444508866, "grad_norm": 3.423672914505005, "learning_rate": 1.503599555491134e-07, "loss": 0.318, "step": 17585 }, { "epoch": 0.8496883606319756, "grad_norm": 2.9348597526550293, "learning_rate": 1.5031163936802434e-07, "loss": 0.2259, "step": 17586 }, { "epoch": 0.8497366768130648, "grad_norm": 3.4796228408813477, "learning_rate": 1.502633231869353e-07, "loss": 0.4259, "step": 17587 }, { "epoch": 0.8497849929941538, "grad_norm": 3.9796948432922363, "learning_rate": 1.5021500700584625e-07, "loss": 0.244, "step": 17588 }, { "epoch": 0.8498333091752428, "grad_norm": 2.0388214588165283, "learning_rate": 1.5016669082475718e-07, "loss": 0.2689, "step": 17589 }, { "epoch": 0.8498816253563318, "grad_norm": 6.556214332580566, "learning_rate": 1.5011837464366817e-07, "loss": 0.2813, "step": 17590 }, { "epoch": 0.8499299415374209, "grad_norm": 1.7938748598098755, "learning_rate": 1.500700584625791e-07, "loss": 0.1695, "step": 17591 }, { "epoch": 0.8499782577185099, "grad_norm": 17.51791763305664, "learning_rate": 1.5002174228149005e-07, "loss": 0.339, "step": 17592 }, { "epoch": 0.850026573899599, "grad_norm": 5.537509441375732, "learning_rate": 1.49973426100401e-07, "loss": 0.2479, "step": 17593 }, { "epoch": 0.850074890080688, "grad_norm": 2.0768144130706787, "learning_rate": 1.4992510991931198e-07, "loss": 0.2967, "step": 17594 }, { "epoch": 0.8501232062617771, "grad_norm": 2.9064159393310547, "learning_rate": 1.4987679373822294e-07, "loss": 0.3221, "step": 17595 }, { "epoch": 0.8501715224428661, "grad_norm": 2.4644229412078857, "learning_rate": 1.4982847755713388e-07, "loss": 0.2753, "step": 17596 }, { "epoch": 0.8502198386239551, "grad_norm": 2.584097146987915, "learning_rate": 1.4978016137604481e-07, "loss": 0.2329, "step": 17597 }, { "epoch": 0.8502681548050443, "grad_norm": 2.6375226974487305, "learning_rate": 1.497318451949558e-07, "loss": 0.3858, "step": 17598 }, { "epoch": 0.8503164709861333, "grad_norm": 2.3642642498016357, "learning_rate": 1.4968352901386674e-07, "loss": 0.2517, "step": 17599 }, { "epoch": 0.8503647871672223, "grad_norm": 5.022940635681152, "learning_rate": 1.4963521283277768e-07, "loss": 0.3527, "step": 17600 }, { "epoch": 0.8504131033483113, "grad_norm": 3.039045810699463, "learning_rate": 1.4958689665168864e-07, "loss": 0.3989, "step": 17601 }, { "epoch": 0.8504614195294004, "grad_norm": 2.324164628982544, "learning_rate": 1.4953858047059958e-07, "loss": 0.2253, "step": 17602 }, { "epoch": 0.8505097357104895, "grad_norm": 2.193307638168335, "learning_rate": 1.4949026428951057e-07, "loss": 0.2494, "step": 17603 }, { "epoch": 0.8505580518915785, "grad_norm": 2.520878314971924, "learning_rate": 1.494419481084215e-07, "loss": 0.2489, "step": 17604 }, { "epoch": 0.8506063680726675, "grad_norm": 2.661454677581787, "learning_rate": 1.4939363192733244e-07, "loss": 0.3975, "step": 17605 }, { "epoch": 0.8506546842537566, "grad_norm": 4.520151138305664, "learning_rate": 1.493453157462434e-07, "loss": 0.3099, "step": 17606 }, { "epoch": 0.8507030004348456, "grad_norm": 2.4614737033843994, "learning_rate": 1.4929699956515437e-07, "loss": 0.3487, "step": 17607 }, { "epoch": 0.8507513166159347, "grad_norm": 2.850299596786499, "learning_rate": 1.492486833840653e-07, "loss": 0.3724, "step": 17608 }, { "epoch": 0.8507996327970238, "grad_norm": 5.591697692871094, "learning_rate": 1.4920036720297627e-07, "loss": 0.3626, "step": 17609 }, { "epoch": 0.8508479489781128, "grad_norm": 2.2976527214050293, "learning_rate": 1.491520510218872e-07, "loss": 0.253, "step": 17610 }, { "epoch": 0.8508962651592018, "grad_norm": 2.5563852787017822, "learning_rate": 1.491037348407982e-07, "loss": 0.271, "step": 17611 }, { "epoch": 0.8509445813402908, "grad_norm": 3.4010355472564697, "learning_rate": 1.4905541865970914e-07, "loss": 0.3274, "step": 17612 }, { "epoch": 0.85099289752138, "grad_norm": 2.5973968505859375, "learning_rate": 1.4900710247862007e-07, "loss": 0.3332, "step": 17613 }, { "epoch": 0.851041213702469, "grad_norm": 3.173313856124878, "learning_rate": 1.4895878629753104e-07, "loss": 0.3651, "step": 17614 }, { "epoch": 0.851089529883558, "grad_norm": 4.882840633392334, "learning_rate": 1.4891047011644198e-07, "loss": 0.2476, "step": 17615 }, { "epoch": 0.851137846064647, "grad_norm": 3.219261646270752, "learning_rate": 1.4886215393535294e-07, "loss": 0.3003, "step": 17616 }, { "epoch": 0.8511861622457361, "grad_norm": 2.4865615367889404, "learning_rate": 1.488138377542639e-07, "loss": 0.2467, "step": 17617 }, { "epoch": 0.8512344784268251, "grad_norm": 1.9693161249160767, "learning_rate": 1.4876552157317484e-07, "loss": 0.2091, "step": 17618 }, { "epoch": 0.8512827946079142, "grad_norm": 2.9952924251556396, "learning_rate": 1.487172053920858e-07, "loss": 0.3393, "step": 17619 }, { "epoch": 0.8513311107890033, "grad_norm": 2.933790683746338, "learning_rate": 1.4866888921099677e-07, "loss": 0.2551, "step": 17620 }, { "epoch": 0.8513794269700923, "grad_norm": 1.8568462133407593, "learning_rate": 1.486205730299077e-07, "loss": 0.1891, "step": 17621 }, { "epoch": 0.8514277431511813, "grad_norm": 5.142446041107178, "learning_rate": 1.4857225684881867e-07, "loss": 0.2857, "step": 17622 }, { "epoch": 0.8514760593322703, "grad_norm": 2.1441268920898438, "learning_rate": 1.485239406677296e-07, "loss": 0.2551, "step": 17623 }, { "epoch": 0.8515243755133595, "grad_norm": 1.966378092765808, "learning_rate": 1.4847562448664057e-07, "loss": 0.1736, "step": 17624 }, { "epoch": 0.8515726916944485, "grad_norm": 1.628737211227417, "learning_rate": 1.4842730830555153e-07, "loss": 0.1874, "step": 17625 }, { "epoch": 0.8516210078755375, "grad_norm": 225.8810577392578, "learning_rate": 1.4837899212446247e-07, "loss": 0.4262, "step": 17626 }, { "epoch": 0.8516693240566265, "grad_norm": 3.2426698207855225, "learning_rate": 1.4833067594337343e-07, "loss": 0.2817, "step": 17627 }, { "epoch": 0.8517176402377156, "grad_norm": 2.7972412109375, "learning_rate": 1.4828235976228437e-07, "loss": 0.296, "step": 17628 }, { "epoch": 0.8517659564188047, "grad_norm": 2.8295106887817383, "learning_rate": 1.4823404358119534e-07, "loss": 0.2323, "step": 17629 }, { "epoch": 0.8518142725998937, "grad_norm": 3.04679274559021, "learning_rate": 1.481857274001063e-07, "loss": 0.255, "step": 17630 }, { "epoch": 0.8518625887809828, "grad_norm": 11.039766311645508, "learning_rate": 1.4813741121901724e-07, "loss": 0.2764, "step": 17631 }, { "epoch": 0.8519109049620718, "grad_norm": 2.4710450172424316, "learning_rate": 1.4808909503792817e-07, "loss": 0.2982, "step": 17632 }, { "epoch": 0.8519592211431608, "grad_norm": 3.206422805786133, "learning_rate": 1.4804077885683916e-07, "loss": 0.3984, "step": 17633 }, { "epoch": 0.8520075373242499, "grad_norm": 2.0394482612609863, "learning_rate": 1.479924626757501e-07, "loss": 0.2425, "step": 17634 }, { "epoch": 0.852055853505339, "grad_norm": 1.7299002408981323, "learning_rate": 1.4794414649466107e-07, "loss": 0.2033, "step": 17635 }, { "epoch": 0.852104169686428, "grad_norm": 2.8819284439086914, "learning_rate": 1.47895830313572e-07, "loss": 0.3238, "step": 17636 }, { "epoch": 0.852152485867517, "grad_norm": 2.5474603176116943, "learning_rate": 1.4784751413248297e-07, "loss": 0.2139, "step": 17637 }, { "epoch": 0.852200802048606, "grad_norm": 2.886622190475464, "learning_rate": 1.4779919795139393e-07, "loss": 0.3258, "step": 17638 }, { "epoch": 0.8522491182296952, "grad_norm": 3.059312343597412, "learning_rate": 1.4775088177030487e-07, "loss": 0.378, "step": 17639 }, { "epoch": 0.8522974344107842, "grad_norm": 2.351930618286133, "learning_rate": 1.477025655892158e-07, "loss": 0.1892, "step": 17640 }, { "epoch": 0.8523457505918732, "grad_norm": 3.2757487297058105, "learning_rate": 1.4765424940812677e-07, "loss": 0.3836, "step": 17641 }, { "epoch": 0.8523940667729623, "grad_norm": 2.1194651126861572, "learning_rate": 1.4760593322703773e-07, "loss": 0.2394, "step": 17642 }, { "epoch": 0.8524423829540513, "grad_norm": 2.7325029373168945, "learning_rate": 1.475576170459487e-07, "loss": 0.2928, "step": 17643 }, { "epoch": 0.8524906991351403, "grad_norm": 3.1017909049987793, "learning_rate": 1.4750930086485963e-07, "loss": 0.3001, "step": 17644 }, { "epoch": 0.8525390153162294, "grad_norm": 2.445941209793091, "learning_rate": 1.4746098468377057e-07, "loss": 0.2446, "step": 17645 }, { "epoch": 0.8525873314973185, "grad_norm": 3.831916332244873, "learning_rate": 1.4741266850268156e-07, "loss": 0.2074, "step": 17646 }, { "epoch": 0.8526356476784075, "grad_norm": 2.789862632751465, "learning_rate": 1.473643523215925e-07, "loss": 0.2583, "step": 17647 }, { "epoch": 0.8526839638594965, "grad_norm": 3.025463104248047, "learning_rate": 1.4731603614050344e-07, "loss": 0.3261, "step": 17648 }, { "epoch": 0.8527322800405855, "grad_norm": 3.1608076095581055, "learning_rate": 1.472677199594144e-07, "loss": 0.3262, "step": 17649 }, { "epoch": 0.8527805962216747, "grad_norm": 2.1693341732025146, "learning_rate": 1.4721940377832534e-07, "loss": 0.2714, "step": 17650 }, { "epoch": 0.8528289124027637, "grad_norm": 1.6010884046554565, "learning_rate": 1.4717108759723633e-07, "loss": 0.1643, "step": 17651 }, { "epoch": 0.8528772285838527, "grad_norm": 1.548915982246399, "learning_rate": 1.4712277141614726e-07, "loss": 0.1951, "step": 17652 }, { "epoch": 0.8529255447649418, "grad_norm": 2.928666591644287, "learning_rate": 1.470744552350582e-07, "loss": 0.2887, "step": 17653 }, { "epoch": 0.8529738609460308, "grad_norm": 3.697892665863037, "learning_rate": 1.4702613905396916e-07, "loss": 0.4253, "step": 17654 }, { "epoch": 0.8530221771271199, "grad_norm": 2.8685953617095947, "learning_rate": 1.4697782287288013e-07, "loss": 0.4025, "step": 17655 }, { "epoch": 0.8530704933082089, "grad_norm": 2.355590343475342, "learning_rate": 1.4692950669179107e-07, "loss": 0.288, "step": 17656 }, { "epoch": 0.853118809489298, "grad_norm": 1.9889116287231445, "learning_rate": 1.4688119051070203e-07, "loss": 0.2348, "step": 17657 }, { "epoch": 0.853167125670387, "grad_norm": 2.37774658203125, "learning_rate": 1.4683287432961297e-07, "loss": 0.3256, "step": 17658 }, { "epoch": 0.853215441851476, "grad_norm": 2.8368897438049316, "learning_rate": 1.4678455814852396e-07, "loss": 0.3445, "step": 17659 }, { "epoch": 0.8532637580325652, "grad_norm": 3.0206637382507324, "learning_rate": 1.467362419674349e-07, "loss": 0.3758, "step": 17660 }, { "epoch": 0.8533120742136542, "grad_norm": 2.9051918983459473, "learning_rate": 1.4668792578634583e-07, "loss": 0.3988, "step": 17661 }, { "epoch": 0.8533603903947432, "grad_norm": 3.4321975708007812, "learning_rate": 1.466396096052568e-07, "loss": 0.3611, "step": 17662 }, { "epoch": 0.8534087065758322, "grad_norm": 2.3909780979156494, "learning_rate": 1.4659129342416773e-07, "loss": 0.3272, "step": 17663 }, { "epoch": 0.8534570227569213, "grad_norm": 3.290820598602295, "learning_rate": 1.465429772430787e-07, "loss": 0.3739, "step": 17664 }, { "epoch": 0.8535053389380104, "grad_norm": 2.776397943496704, "learning_rate": 1.4649466106198966e-07, "loss": 0.3136, "step": 17665 }, { "epoch": 0.8535536551190994, "grad_norm": 2.250131607055664, "learning_rate": 1.464463448809006e-07, "loss": 0.2133, "step": 17666 }, { "epoch": 0.8536019713001884, "grad_norm": 2.8651297092437744, "learning_rate": 1.4639802869981156e-07, "loss": 0.1931, "step": 17667 }, { "epoch": 0.8536502874812775, "grad_norm": 2.206974983215332, "learning_rate": 1.4634971251872252e-07, "loss": 0.232, "step": 17668 }, { "epoch": 0.8536986036623665, "grad_norm": 12.638077735900879, "learning_rate": 1.4630139633763346e-07, "loss": 0.3441, "step": 17669 }, { "epoch": 0.8537469198434555, "grad_norm": 3.1261494159698486, "learning_rate": 1.4625308015654443e-07, "loss": 0.3673, "step": 17670 }, { "epoch": 0.8537952360245447, "grad_norm": 2.3706727027893066, "learning_rate": 1.4620476397545536e-07, "loss": 0.2964, "step": 17671 }, { "epoch": 0.8538435522056337, "grad_norm": 2.711276054382324, "learning_rate": 1.4615644779436633e-07, "loss": 0.2637, "step": 17672 }, { "epoch": 0.8538918683867227, "grad_norm": 2.2776098251342773, "learning_rate": 1.461081316132773e-07, "loss": 0.2543, "step": 17673 }, { "epoch": 0.8539401845678117, "grad_norm": 9.337637901306152, "learning_rate": 1.4605981543218823e-07, "loss": 0.3188, "step": 17674 }, { "epoch": 0.8539885007489008, "grad_norm": 2.344902276992798, "learning_rate": 1.460114992510992e-07, "loss": 0.2267, "step": 17675 }, { "epoch": 0.8540368169299899, "grad_norm": 2.2407004833221436, "learning_rate": 1.4596318307001013e-07, "loss": 0.1416, "step": 17676 }, { "epoch": 0.8540851331110789, "grad_norm": 2.2038066387176514, "learning_rate": 1.459148668889211e-07, "loss": 0.2537, "step": 17677 }, { "epoch": 0.854133449292168, "grad_norm": 6.397116184234619, "learning_rate": 1.4586655070783206e-07, "loss": 0.2889, "step": 17678 }, { "epoch": 0.854181765473257, "grad_norm": 5.591981887817383, "learning_rate": 1.45818234526743e-07, "loss": 0.2364, "step": 17679 }, { "epoch": 0.854230081654346, "grad_norm": 2.0922110080718994, "learning_rate": 1.4576991834565393e-07, "loss": 0.2004, "step": 17680 }, { "epoch": 0.8542783978354351, "grad_norm": 5.093725681304932, "learning_rate": 1.4572160216456492e-07, "loss": 0.3069, "step": 17681 }, { "epoch": 0.8543267140165242, "grad_norm": 3.864767551422119, "learning_rate": 1.4567328598347586e-07, "loss": 0.2649, "step": 17682 }, { "epoch": 0.8543750301976132, "grad_norm": 2.2096357345581055, "learning_rate": 1.4562496980238682e-07, "loss": 0.3071, "step": 17683 }, { "epoch": 0.8544233463787022, "grad_norm": 2.6230998039245605, "learning_rate": 1.4557665362129776e-07, "loss": 0.3301, "step": 17684 }, { "epoch": 0.8544716625597912, "grad_norm": 3.0450832843780518, "learning_rate": 1.4552833744020872e-07, "loss": 0.3859, "step": 17685 }, { "epoch": 0.8545199787408804, "grad_norm": 3.158787488937378, "learning_rate": 1.454800212591197e-07, "loss": 0.3339, "step": 17686 }, { "epoch": 0.8545682949219694, "grad_norm": 2.1377408504486084, "learning_rate": 1.4543170507803062e-07, "loss": 0.2531, "step": 17687 }, { "epoch": 0.8546166111030584, "grad_norm": 2.277223587036133, "learning_rate": 1.4538338889694156e-07, "loss": 0.294, "step": 17688 }, { "epoch": 0.8546649272841474, "grad_norm": 2.6401126384735107, "learning_rate": 1.4533507271585253e-07, "loss": 0.3623, "step": 17689 }, { "epoch": 0.8547132434652365, "grad_norm": 1.6746872663497925, "learning_rate": 1.452867565347635e-07, "loss": 0.1797, "step": 17690 }, { "epoch": 0.8547615596463256, "grad_norm": 4.124258041381836, "learning_rate": 1.4523844035367445e-07, "loss": 0.2732, "step": 17691 }, { "epoch": 0.8548098758274146, "grad_norm": 2.2139108180999756, "learning_rate": 1.451901241725854e-07, "loss": 0.3127, "step": 17692 }, { "epoch": 0.8548581920085037, "grad_norm": 7.217626571655273, "learning_rate": 1.4514180799149633e-07, "loss": 0.4101, "step": 17693 }, { "epoch": 0.8549065081895927, "grad_norm": 2.930570602416992, "learning_rate": 1.4509349181040732e-07, "loss": 0.3535, "step": 17694 }, { "epoch": 0.8549548243706817, "grad_norm": 1.9683862924575806, "learning_rate": 1.4504517562931825e-07, "loss": 0.1781, "step": 17695 }, { "epoch": 0.8550031405517707, "grad_norm": 4.431149005889893, "learning_rate": 1.449968594482292e-07, "loss": 0.2771, "step": 17696 }, { "epoch": 0.8550514567328599, "grad_norm": 6.458134651184082, "learning_rate": 1.4494854326714016e-07, "loss": 0.3168, "step": 17697 }, { "epoch": 0.8550997729139489, "grad_norm": 3.9666285514831543, "learning_rate": 1.4490022708605112e-07, "loss": 0.2965, "step": 17698 }, { "epoch": 0.8551480890950379, "grad_norm": 4.484841346740723, "learning_rate": 1.4485191090496208e-07, "loss": 0.311, "step": 17699 }, { "epoch": 0.855196405276127, "grad_norm": 2.0615787506103516, "learning_rate": 1.4480359472387302e-07, "loss": 0.1949, "step": 17700 }, { "epoch": 0.855244721457216, "grad_norm": 2.070486068725586, "learning_rate": 1.4475527854278396e-07, "loss": 0.1834, "step": 17701 }, { "epoch": 0.8552930376383051, "grad_norm": 2.6065189838409424, "learning_rate": 1.4470696236169492e-07, "loss": 0.3555, "step": 17702 }, { "epoch": 0.8553413538193941, "grad_norm": 2.348569393157959, "learning_rate": 1.4465864618060589e-07, "loss": 0.3025, "step": 17703 }, { "epoch": 0.8553896700004832, "grad_norm": 1.9376262426376343, "learning_rate": 1.4461032999951682e-07, "loss": 0.2393, "step": 17704 }, { "epoch": 0.8554379861815722, "grad_norm": 4.118010520935059, "learning_rate": 1.4456201381842779e-07, "loss": 0.31, "step": 17705 }, { "epoch": 0.8554863023626612, "grad_norm": 2.797096014022827, "learning_rate": 1.4451369763733872e-07, "loss": 0.35, "step": 17706 }, { "epoch": 0.8555346185437503, "grad_norm": 2.758744478225708, "learning_rate": 1.4446538145624971e-07, "loss": 0.2729, "step": 17707 }, { "epoch": 0.8555829347248394, "grad_norm": 3.422753095626831, "learning_rate": 1.4441706527516065e-07, "loss": 0.2708, "step": 17708 }, { "epoch": 0.8556312509059284, "grad_norm": 5.362987995147705, "learning_rate": 1.443687490940716e-07, "loss": 0.2103, "step": 17709 }, { "epoch": 0.8556795670870174, "grad_norm": 2.969055652618408, "learning_rate": 1.4432043291298255e-07, "loss": 0.3313, "step": 17710 }, { "epoch": 0.8557278832681064, "grad_norm": 2.4181694984436035, "learning_rate": 1.4427211673189352e-07, "loss": 0.2903, "step": 17711 }, { "epoch": 0.8557761994491956, "grad_norm": 2.8657162189483643, "learning_rate": 1.4422380055080445e-07, "loss": 0.3042, "step": 17712 }, { "epoch": 0.8558245156302846, "grad_norm": 2.5403289794921875, "learning_rate": 1.4417548436971542e-07, "loss": 0.2955, "step": 17713 }, { "epoch": 0.8558728318113736, "grad_norm": 4.084091663360596, "learning_rate": 1.4412716818862635e-07, "loss": 0.2549, "step": 17714 }, { "epoch": 0.8559211479924627, "grad_norm": 2.6813883781433105, "learning_rate": 1.4407885200753732e-07, "loss": 0.3677, "step": 17715 }, { "epoch": 0.8559694641735517, "grad_norm": 2.4329264163970947, "learning_rate": 1.4403053582644828e-07, "loss": 0.2598, "step": 17716 }, { "epoch": 0.8560177803546408, "grad_norm": 2.4117274284362793, "learning_rate": 1.4398221964535922e-07, "loss": 0.2768, "step": 17717 }, { "epoch": 0.8560660965357298, "grad_norm": 2.792623281478882, "learning_rate": 1.4393390346427018e-07, "loss": 0.3253, "step": 17718 }, { "epoch": 0.8561144127168189, "grad_norm": 3.8786380290985107, "learning_rate": 1.4388558728318112e-07, "loss": 0.2756, "step": 17719 }, { "epoch": 0.8561627288979079, "grad_norm": 3.5484862327575684, "learning_rate": 1.4383727110209208e-07, "loss": 0.3331, "step": 17720 }, { "epoch": 0.8562110450789969, "grad_norm": 8.646921157836914, "learning_rate": 1.4378895492100305e-07, "loss": 0.2525, "step": 17721 }, { "epoch": 0.856259361260086, "grad_norm": 4.063703536987305, "learning_rate": 1.4374063873991398e-07, "loss": 0.3341, "step": 17722 }, { "epoch": 0.8563076774411751, "grad_norm": 3.0234785079956055, "learning_rate": 1.4369232255882495e-07, "loss": 0.3287, "step": 17723 }, { "epoch": 0.8563559936222641, "grad_norm": 3.2304275035858154, "learning_rate": 1.436440063777359e-07, "loss": 0.2589, "step": 17724 }, { "epoch": 0.8564043098033531, "grad_norm": 2.8443126678466797, "learning_rate": 1.4359569019664685e-07, "loss": 0.3166, "step": 17725 }, { "epoch": 0.8564526259844422, "grad_norm": 3.769665241241455, "learning_rate": 1.435473740155578e-07, "loss": 0.1602, "step": 17726 }, { "epoch": 0.8565009421655312, "grad_norm": 2.330085277557373, "learning_rate": 1.4349905783446875e-07, "loss": 0.2243, "step": 17727 }, { "epoch": 0.8565492583466203, "grad_norm": 1.9252026081085205, "learning_rate": 1.434507416533797e-07, "loss": 0.1779, "step": 17728 }, { "epoch": 0.8565975745277093, "grad_norm": 4.948566436767578, "learning_rate": 1.4340242547229068e-07, "loss": 0.3311, "step": 17729 }, { "epoch": 0.8566458907087984, "grad_norm": 4.066060543060303, "learning_rate": 1.4335410929120162e-07, "loss": 0.3012, "step": 17730 }, { "epoch": 0.8566942068898874, "grad_norm": 2.732046604156494, "learning_rate": 1.4330579311011258e-07, "loss": 0.3277, "step": 17731 }, { "epoch": 0.8567425230709764, "grad_norm": 2.5783255100250244, "learning_rate": 1.4325747692902352e-07, "loss": 0.3592, "step": 17732 }, { "epoch": 0.8567908392520656, "grad_norm": 2.4101336002349854, "learning_rate": 1.4320916074793448e-07, "loss": 0.2385, "step": 17733 }, { "epoch": 0.8568391554331546, "grad_norm": 2.321396589279175, "learning_rate": 1.4316084456684544e-07, "loss": 0.2745, "step": 17734 }, { "epoch": 0.8568874716142436, "grad_norm": 2.947582483291626, "learning_rate": 1.4311252838575638e-07, "loss": 0.2223, "step": 17735 }, { "epoch": 0.8569357877953326, "grad_norm": 3.525831460952759, "learning_rate": 1.4306421220466732e-07, "loss": 0.2778, "step": 17736 }, { "epoch": 0.8569841039764217, "grad_norm": 5.157817840576172, "learning_rate": 1.430158960235783e-07, "loss": 0.403, "step": 17737 }, { "epoch": 0.8570324201575108, "grad_norm": 3.0070183277130127, "learning_rate": 1.4296757984248925e-07, "loss": 0.3288, "step": 17738 }, { "epoch": 0.8570807363385998, "grad_norm": 5.9952802658081055, "learning_rate": 1.429192636614002e-07, "loss": 0.3134, "step": 17739 }, { "epoch": 0.8571290525196888, "grad_norm": 2.2351207733154297, "learning_rate": 1.4287094748031115e-07, "loss": 0.2316, "step": 17740 }, { "epoch": 0.8571773687007779, "grad_norm": 3.7330493927001953, "learning_rate": 1.4282263129922208e-07, "loss": 0.363, "step": 17741 }, { "epoch": 0.8572256848818669, "grad_norm": 2.6279098987579346, "learning_rate": 1.4277431511813307e-07, "loss": 0.2917, "step": 17742 }, { "epoch": 0.857274001062956, "grad_norm": 1.6574513912200928, "learning_rate": 1.42725998937044e-07, "loss": 0.168, "step": 17743 }, { "epoch": 0.8573223172440451, "grad_norm": 2.474592924118042, "learning_rate": 1.4267768275595495e-07, "loss": 0.2898, "step": 17744 }, { "epoch": 0.8573706334251341, "grad_norm": 3.405240297317505, "learning_rate": 1.426293665748659e-07, "loss": 0.3734, "step": 17745 }, { "epoch": 0.8574189496062231, "grad_norm": 3.8114237785339355, "learning_rate": 1.4258105039377688e-07, "loss": 0.322, "step": 17746 }, { "epoch": 0.8574672657873121, "grad_norm": 1.921595811843872, "learning_rate": 1.4253273421268784e-07, "loss": 0.2302, "step": 17747 }, { "epoch": 0.8575155819684012, "grad_norm": 3.784829616546631, "learning_rate": 1.4248441803159878e-07, "loss": 0.3087, "step": 17748 }, { "epoch": 0.8575638981494903, "grad_norm": 2.5719809532165527, "learning_rate": 1.4243610185050971e-07, "loss": 0.358, "step": 17749 }, { "epoch": 0.8576122143305793, "grad_norm": 4.639178276062012, "learning_rate": 1.423877856694207e-07, "loss": 0.3354, "step": 17750 }, { "epoch": 0.8576605305116684, "grad_norm": 3.8623836040496826, "learning_rate": 1.4233946948833164e-07, "loss": 0.4391, "step": 17751 }, { "epoch": 0.8577088466927574, "grad_norm": 10.752788543701172, "learning_rate": 1.4229115330724258e-07, "loss": 0.3129, "step": 17752 }, { "epoch": 0.8577571628738464, "grad_norm": 1.815140962600708, "learning_rate": 1.4224283712615354e-07, "loss": 0.1663, "step": 17753 }, { "epoch": 0.8578054790549355, "grad_norm": 2.067870616912842, "learning_rate": 1.4219452094506448e-07, "loss": 0.1919, "step": 17754 }, { "epoch": 0.8578537952360246, "grad_norm": 7.224113464355469, "learning_rate": 1.4214620476397547e-07, "loss": 0.3583, "step": 17755 }, { "epoch": 0.8579021114171136, "grad_norm": 2.629863739013672, "learning_rate": 1.420978885828864e-07, "loss": 0.293, "step": 17756 }, { "epoch": 0.8579504275982026, "grad_norm": 1.990259051322937, "learning_rate": 1.4204957240179735e-07, "loss": 0.219, "step": 17757 }, { "epoch": 0.8579987437792916, "grad_norm": 2.4539740085601807, "learning_rate": 1.420012562207083e-07, "loss": 0.3079, "step": 17758 }, { "epoch": 0.8580470599603808, "grad_norm": 2.3552021980285645, "learning_rate": 1.4195294003961927e-07, "loss": 0.2736, "step": 17759 }, { "epoch": 0.8580953761414698, "grad_norm": 54.06694412231445, "learning_rate": 1.419046238585302e-07, "loss": 0.3287, "step": 17760 }, { "epoch": 0.8581436923225588, "grad_norm": 2.655337333679199, "learning_rate": 1.4185630767744117e-07, "loss": 0.2436, "step": 17761 }, { "epoch": 0.8581920085036479, "grad_norm": 3.34267520904541, "learning_rate": 1.418079914963521e-07, "loss": 0.2879, "step": 17762 }, { "epoch": 0.8582403246847369, "grad_norm": 3.5674352645874023, "learning_rate": 1.417596753152631e-07, "loss": 0.2953, "step": 17763 }, { "epoch": 0.858288640865826, "grad_norm": 2.490710973739624, "learning_rate": 1.4171135913417404e-07, "loss": 0.3061, "step": 17764 }, { "epoch": 0.858336957046915, "grad_norm": 4.498745441436768, "learning_rate": 1.4166304295308498e-07, "loss": 0.2427, "step": 17765 }, { "epoch": 0.8583852732280041, "grad_norm": 1.8028359413146973, "learning_rate": 1.4161472677199594e-07, "loss": 0.2127, "step": 17766 }, { "epoch": 0.8584335894090931, "grad_norm": 2.995760679244995, "learning_rate": 1.4156641059090688e-07, "loss": 0.333, "step": 17767 }, { "epoch": 0.8584819055901821, "grad_norm": 3.7480804920196533, "learning_rate": 1.4151809440981784e-07, "loss": 0.3224, "step": 17768 }, { "epoch": 0.8585302217712713, "grad_norm": 1.856207251548767, "learning_rate": 1.414697782287288e-07, "loss": 0.2057, "step": 17769 }, { "epoch": 0.8585785379523603, "grad_norm": 2.48563289642334, "learning_rate": 1.4142146204763974e-07, "loss": 0.3661, "step": 17770 }, { "epoch": 0.8586268541334493, "grad_norm": 3.1174685955047607, "learning_rate": 1.4137314586655068e-07, "loss": 0.2828, "step": 17771 }, { "epoch": 0.8586751703145383, "grad_norm": 2.606891632080078, "learning_rate": 1.4132482968546167e-07, "loss": 0.2656, "step": 17772 }, { "epoch": 0.8587234864956274, "grad_norm": 2.5766429901123047, "learning_rate": 1.412765135043726e-07, "loss": 0.2035, "step": 17773 }, { "epoch": 0.8587718026767164, "grad_norm": 2.67350435256958, "learning_rate": 1.4122819732328357e-07, "loss": 0.3268, "step": 17774 }, { "epoch": 0.8588201188578055, "grad_norm": 2.954119920730591, "learning_rate": 1.411798811421945e-07, "loss": 0.4171, "step": 17775 }, { "epoch": 0.8588684350388945, "grad_norm": 2.54726505279541, "learning_rate": 1.4113156496110547e-07, "loss": 0.2218, "step": 17776 }, { "epoch": 0.8589167512199836, "grad_norm": 2.2497329711914062, "learning_rate": 1.4108324878001643e-07, "loss": 0.2402, "step": 17777 }, { "epoch": 0.8589650674010726, "grad_norm": 3.395582675933838, "learning_rate": 1.4103493259892737e-07, "loss": 0.3231, "step": 17778 }, { "epoch": 0.8590133835821616, "grad_norm": 2.368548631668091, "learning_rate": 1.409866164178383e-07, "loss": 0.2981, "step": 17779 }, { "epoch": 0.8590616997632508, "grad_norm": 3.170529842376709, "learning_rate": 1.4093830023674927e-07, "loss": 0.3215, "step": 17780 }, { "epoch": 0.8591100159443398, "grad_norm": 5.8314924240112305, "learning_rate": 1.4088998405566024e-07, "loss": 0.3213, "step": 17781 }, { "epoch": 0.8591583321254288, "grad_norm": 2.459672689437866, "learning_rate": 1.408416678745712e-07, "loss": 0.3169, "step": 17782 }, { "epoch": 0.8592066483065178, "grad_norm": 1.788213849067688, "learning_rate": 1.4079335169348214e-07, "loss": 0.1992, "step": 17783 }, { "epoch": 0.8592549644876069, "grad_norm": 2.8804759979248047, "learning_rate": 1.4074503551239308e-07, "loss": 0.3258, "step": 17784 }, { "epoch": 0.859303280668696, "grad_norm": 2.9536938667297363, "learning_rate": 1.4069671933130407e-07, "loss": 0.331, "step": 17785 }, { "epoch": 0.859351596849785, "grad_norm": 3.340749740600586, "learning_rate": 1.40648403150215e-07, "loss": 0.2918, "step": 17786 }, { "epoch": 0.859399913030874, "grad_norm": 3.039198160171509, "learning_rate": 1.4060008696912594e-07, "loss": 0.265, "step": 17787 }, { "epoch": 0.8594482292119631, "grad_norm": 2.6793675422668457, "learning_rate": 1.405517707880369e-07, "loss": 0.3002, "step": 17788 }, { "epoch": 0.8594965453930521, "grad_norm": 2.0843145847320557, "learning_rate": 1.4050345460694787e-07, "loss": 0.218, "step": 17789 }, { "epoch": 0.8595448615741412, "grad_norm": 1.8258216381072998, "learning_rate": 1.4045513842585883e-07, "loss": 0.2282, "step": 17790 }, { "epoch": 0.8595931777552303, "grad_norm": 2.4463398456573486, "learning_rate": 1.4040682224476977e-07, "loss": 0.2116, "step": 17791 }, { "epoch": 0.8596414939363193, "grad_norm": 3.6904823780059814, "learning_rate": 1.403585060636807e-07, "loss": 0.3899, "step": 17792 }, { "epoch": 0.8596898101174083, "grad_norm": 1.9074007272720337, "learning_rate": 1.4031018988259167e-07, "loss": 0.2157, "step": 17793 }, { "epoch": 0.8597381262984973, "grad_norm": 10.611675262451172, "learning_rate": 1.4026187370150263e-07, "loss": 0.4528, "step": 17794 }, { "epoch": 0.8597864424795865, "grad_norm": 5.061300277709961, "learning_rate": 1.4021355752041357e-07, "loss": 0.2023, "step": 17795 }, { "epoch": 0.8598347586606755, "grad_norm": 2.479231119155884, "learning_rate": 1.4016524133932453e-07, "loss": 0.2064, "step": 17796 }, { "epoch": 0.8598830748417645, "grad_norm": 2.5470006465911865, "learning_rate": 1.4011692515823547e-07, "loss": 0.2479, "step": 17797 }, { "epoch": 0.8599313910228535, "grad_norm": 5.95849609375, "learning_rate": 1.4006860897714646e-07, "loss": 0.2696, "step": 17798 }, { "epoch": 0.8599797072039426, "grad_norm": 2.7789018154144287, "learning_rate": 1.400202927960574e-07, "loss": 0.354, "step": 17799 }, { "epoch": 0.8600280233850316, "grad_norm": 4.9635725021362305, "learning_rate": 1.3997197661496834e-07, "loss": 0.2365, "step": 17800 }, { "epoch": 0.8600763395661207, "grad_norm": 3.530247449874878, "learning_rate": 1.399236604338793e-07, "loss": 0.2653, "step": 17801 }, { "epoch": 0.8601246557472098, "grad_norm": 2.5107953548431396, "learning_rate": 1.3987534425279024e-07, "loss": 0.2637, "step": 17802 }, { "epoch": 0.8601729719282988, "grad_norm": 3.381439208984375, "learning_rate": 1.398270280717012e-07, "loss": 0.3418, "step": 17803 }, { "epoch": 0.8602212881093878, "grad_norm": 2.3237698078155518, "learning_rate": 1.3977871189061216e-07, "loss": 0.2151, "step": 17804 }, { "epoch": 0.8602696042904768, "grad_norm": 2.4134676456451416, "learning_rate": 1.397303957095231e-07, "loss": 0.2522, "step": 17805 }, { "epoch": 0.860317920471566, "grad_norm": 3.2323691844940186, "learning_rate": 1.3968207952843407e-07, "loss": 0.2482, "step": 17806 }, { "epoch": 0.860366236652655, "grad_norm": 1.7793667316436768, "learning_rate": 1.3963376334734503e-07, "loss": 0.1902, "step": 17807 }, { "epoch": 0.860414552833744, "grad_norm": 2.2043304443359375, "learning_rate": 1.3958544716625597e-07, "loss": 0.2147, "step": 17808 }, { "epoch": 0.860462869014833, "grad_norm": 1.743414282798767, "learning_rate": 1.3953713098516693e-07, "loss": 0.2032, "step": 17809 }, { "epoch": 0.8605111851959221, "grad_norm": 4.816310882568359, "learning_rate": 1.3948881480407787e-07, "loss": 0.2492, "step": 17810 }, { "epoch": 0.8605595013770112, "grad_norm": 2.4188852310180664, "learning_rate": 1.3944049862298883e-07, "loss": 0.3053, "step": 17811 }, { "epoch": 0.8606078175581002, "grad_norm": 2.00542950630188, "learning_rate": 1.393921824418998e-07, "loss": 0.2666, "step": 17812 }, { "epoch": 0.8606561337391893, "grad_norm": 2.5663373470306396, "learning_rate": 1.3934386626081073e-07, "loss": 0.3411, "step": 17813 }, { "epoch": 0.8607044499202783, "grad_norm": 2.2264907360076904, "learning_rate": 1.392955500797217e-07, "loss": 0.2196, "step": 17814 }, { "epoch": 0.8607527661013673, "grad_norm": 13.819181442260742, "learning_rate": 1.3924723389863263e-07, "loss": 0.3793, "step": 17815 }, { "epoch": 0.8608010822824564, "grad_norm": 3.652292013168335, "learning_rate": 1.391989177175436e-07, "loss": 0.3392, "step": 17816 }, { "epoch": 0.8608493984635455, "grad_norm": 4.192506313323975, "learning_rate": 1.3915060153645456e-07, "loss": 0.3449, "step": 17817 }, { "epoch": 0.8608977146446345, "grad_norm": 2.628190040588379, "learning_rate": 1.391022853553655e-07, "loss": 0.2687, "step": 17818 }, { "epoch": 0.8609460308257235, "grad_norm": 4.2891621589660645, "learning_rate": 1.3905396917427644e-07, "loss": 0.3611, "step": 17819 }, { "epoch": 0.8609943470068125, "grad_norm": 29.640024185180664, "learning_rate": 1.3900565299318743e-07, "loss": 0.2752, "step": 17820 }, { "epoch": 0.8610426631879017, "grad_norm": 3.4569168090820312, "learning_rate": 1.3895733681209836e-07, "loss": 0.3427, "step": 17821 }, { "epoch": 0.8610909793689907, "grad_norm": 2.9125478267669678, "learning_rate": 1.3890902063100933e-07, "loss": 0.2083, "step": 17822 }, { "epoch": 0.8611392955500797, "grad_norm": 3.6389427185058594, "learning_rate": 1.3886070444992026e-07, "loss": 0.1912, "step": 17823 }, { "epoch": 0.8611876117311688, "grad_norm": 2.4094653129577637, "learning_rate": 1.3881238826883123e-07, "loss": 0.3296, "step": 17824 }, { "epoch": 0.8612359279122578, "grad_norm": 2.500314474105835, "learning_rate": 1.387640720877422e-07, "loss": 0.263, "step": 17825 }, { "epoch": 0.8612842440933468, "grad_norm": 2.8293371200561523, "learning_rate": 1.3871575590665313e-07, "loss": 0.3547, "step": 17826 }, { "epoch": 0.8613325602744359, "grad_norm": 2.42313551902771, "learning_rate": 1.3866743972556407e-07, "loss": 0.2254, "step": 17827 }, { "epoch": 0.861380876455525, "grad_norm": 2.4783120155334473, "learning_rate": 1.3861912354447503e-07, "loss": 0.2971, "step": 17828 }, { "epoch": 0.861429192636614, "grad_norm": 2.524791717529297, "learning_rate": 1.38570807363386e-07, "loss": 0.303, "step": 17829 }, { "epoch": 0.861477508817703, "grad_norm": 2.9579508304595947, "learning_rate": 1.3852249118229696e-07, "loss": 0.3084, "step": 17830 }, { "epoch": 0.861525824998792, "grad_norm": 5.792121410369873, "learning_rate": 1.384741750012079e-07, "loss": 0.2314, "step": 17831 }, { "epoch": 0.8615741411798812, "grad_norm": 2.6516895294189453, "learning_rate": 1.3842585882011883e-07, "loss": 0.3078, "step": 17832 }, { "epoch": 0.8616224573609702, "grad_norm": 2.479339361190796, "learning_rate": 1.3837754263902982e-07, "loss": 0.2397, "step": 17833 }, { "epoch": 0.8616707735420592, "grad_norm": 2.843109607696533, "learning_rate": 1.3832922645794076e-07, "loss": 0.3897, "step": 17834 }, { "epoch": 0.8617190897231483, "grad_norm": 5.108142375946045, "learning_rate": 1.382809102768517e-07, "loss": 0.283, "step": 17835 }, { "epoch": 0.8617674059042373, "grad_norm": 2.7287473678588867, "learning_rate": 1.3823259409576266e-07, "loss": 0.3006, "step": 17836 }, { "epoch": 0.8618157220853264, "grad_norm": 2.897303581237793, "learning_rate": 1.3818427791467362e-07, "loss": 0.3844, "step": 17837 }, { "epoch": 0.8618640382664154, "grad_norm": 3.446516275405884, "learning_rate": 1.381359617335846e-07, "loss": 0.4019, "step": 17838 }, { "epoch": 0.8619123544475045, "grad_norm": 3.5634167194366455, "learning_rate": 1.3808764555249553e-07, "loss": 0.3603, "step": 17839 }, { "epoch": 0.8619606706285935, "grad_norm": 2.6925277709960938, "learning_rate": 1.3803932937140646e-07, "loss": 0.3237, "step": 17840 }, { "epoch": 0.8620089868096825, "grad_norm": 3.6968488693237305, "learning_rate": 1.3799101319031743e-07, "loss": 0.3791, "step": 17841 }, { "epoch": 0.8620573029907717, "grad_norm": 3.5250113010406494, "learning_rate": 1.379426970092284e-07, "loss": 0.2755, "step": 17842 }, { "epoch": 0.8621056191718607, "grad_norm": 2.4303009510040283, "learning_rate": 1.3789438082813933e-07, "loss": 0.2904, "step": 17843 }, { "epoch": 0.8621539353529497, "grad_norm": 2.5693368911743164, "learning_rate": 1.378460646470503e-07, "loss": 0.3064, "step": 17844 }, { "epoch": 0.8622022515340387, "grad_norm": 5.097300052642822, "learning_rate": 1.3779774846596123e-07, "loss": 0.2673, "step": 17845 }, { "epoch": 0.8622505677151278, "grad_norm": 3.3243601322174072, "learning_rate": 1.3774943228487222e-07, "loss": 0.4576, "step": 17846 }, { "epoch": 0.8622988838962169, "grad_norm": 2.5207083225250244, "learning_rate": 1.3770111610378316e-07, "loss": 0.2219, "step": 17847 }, { "epoch": 0.8623472000773059, "grad_norm": 2.3686087131500244, "learning_rate": 1.376527999226941e-07, "loss": 0.2534, "step": 17848 }, { "epoch": 0.862395516258395, "grad_norm": 2.383700132369995, "learning_rate": 1.3760448374160506e-07, "loss": 0.2664, "step": 17849 }, { "epoch": 0.862443832439484, "grad_norm": 2.897310972213745, "learning_rate": 1.3755616756051602e-07, "loss": 0.2986, "step": 17850 }, { "epoch": 0.862492148620573, "grad_norm": 1.969055414199829, "learning_rate": 1.3750785137942696e-07, "loss": 0.1894, "step": 17851 }, { "epoch": 0.862540464801662, "grad_norm": 3.8376822471618652, "learning_rate": 1.3745953519833792e-07, "loss": 0.2983, "step": 17852 }, { "epoch": 0.8625887809827512, "grad_norm": 2.2879419326782227, "learning_rate": 1.3741121901724886e-07, "loss": 0.2018, "step": 17853 }, { "epoch": 0.8626370971638402, "grad_norm": 1.9416464567184448, "learning_rate": 1.3736290283615982e-07, "loss": 0.2287, "step": 17854 }, { "epoch": 0.8626854133449292, "grad_norm": 2.2693424224853516, "learning_rate": 1.3731458665507079e-07, "loss": 0.2282, "step": 17855 }, { "epoch": 0.8627337295260182, "grad_norm": 1.1450918912887573, "learning_rate": 1.3726627047398172e-07, "loss": 0.1185, "step": 17856 }, { "epoch": 0.8627820457071073, "grad_norm": 3.6490478515625, "learning_rate": 1.372179542928927e-07, "loss": 0.264, "step": 17857 }, { "epoch": 0.8628303618881964, "grad_norm": 3.610548496246338, "learning_rate": 1.3716963811180362e-07, "loss": 0.3252, "step": 17858 }, { "epoch": 0.8628786780692854, "grad_norm": 5.592649459838867, "learning_rate": 1.371213219307146e-07, "loss": 0.2091, "step": 17859 }, { "epoch": 0.8629269942503744, "grad_norm": 2.5500991344451904, "learning_rate": 1.3707300574962555e-07, "loss": 0.3003, "step": 17860 }, { "epoch": 0.8629753104314635, "grad_norm": 3.3315536975860596, "learning_rate": 1.370246895685365e-07, "loss": 0.3634, "step": 17861 }, { "epoch": 0.8630236266125525, "grad_norm": 2.777575969696045, "learning_rate": 1.3697637338744745e-07, "loss": 0.3382, "step": 17862 }, { "epoch": 0.8630719427936416, "grad_norm": 2.5195631980895996, "learning_rate": 1.3692805720635842e-07, "loss": 0.2837, "step": 17863 }, { "epoch": 0.8631202589747307, "grad_norm": 3.2037105560302734, "learning_rate": 1.3687974102526935e-07, "loss": 0.2466, "step": 17864 }, { "epoch": 0.8631685751558197, "grad_norm": 2.089031457901001, "learning_rate": 1.3683142484418032e-07, "loss": 0.237, "step": 17865 }, { "epoch": 0.8632168913369087, "grad_norm": 2.0916078090667725, "learning_rate": 1.3678310866309126e-07, "loss": 0.177, "step": 17866 }, { "epoch": 0.8632652075179977, "grad_norm": 2.2167551517486572, "learning_rate": 1.367347924820022e-07, "loss": 0.284, "step": 17867 }, { "epoch": 0.8633135236990869, "grad_norm": 5.007218837738037, "learning_rate": 1.3668647630091318e-07, "loss": 0.4009, "step": 17868 }, { "epoch": 0.8633618398801759, "grad_norm": 6.116081714630127, "learning_rate": 1.3663816011982412e-07, "loss": 0.3009, "step": 17869 }, { "epoch": 0.8634101560612649, "grad_norm": 2.8437132835388184, "learning_rate": 1.3658984393873508e-07, "loss": 0.3474, "step": 17870 }, { "epoch": 0.863458472242354, "grad_norm": 3.26515531539917, "learning_rate": 1.3654152775764602e-07, "loss": 0.2596, "step": 17871 }, { "epoch": 0.863506788423443, "grad_norm": 3.189899444580078, "learning_rate": 1.3649321157655698e-07, "loss": 0.2946, "step": 17872 }, { "epoch": 0.8635551046045321, "grad_norm": 12.931365966796875, "learning_rate": 1.3644489539546795e-07, "loss": 0.194, "step": 17873 }, { "epoch": 0.8636034207856211, "grad_norm": 2.9124741554260254, "learning_rate": 1.3639657921437889e-07, "loss": 0.391, "step": 17874 }, { "epoch": 0.8636517369667102, "grad_norm": 4.888509273529053, "learning_rate": 1.3634826303328982e-07, "loss": 0.2747, "step": 17875 }, { "epoch": 0.8637000531477992, "grad_norm": 4.229952812194824, "learning_rate": 1.3629994685220081e-07, "loss": 0.2962, "step": 17876 }, { "epoch": 0.8637483693288882, "grad_norm": 3.3919060230255127, "learning_rate": 1.3625163067111175e-07, "loss": 0.3789, "step": 17877 }, { "epoch": 0.8637966855099773, "grad_norm": 2.1621062755584717, "learning_rate": 1.3620331449002271e-07, "loss": 0.2405, "step": 17878 }, { "epoch": 0.8638450016910664, "grad_norm": 2.232009172439575, "learning_rate": 1.3615499830893365e-07, "loss": 0.2685, "step": 17879 }, { "epoch": 0.8638933178721554, "grad_norm": 3.1422557830810547, "learning_rate": 1.361066821278446e-07, "loss": 0.3543, "step": 17880 }, { "epoch": 0.8639416340532444, "grad_norm": 3.1152141094207764, "learning_rate": 1.3605836594675558e-07, "loss": 0.3294, "step": 17881 }, { "epoch": 0.8639899502343334, "grad_norm": 2.156850814819336, "learning_rate": 1.3601004976566652e-07, "loss": 0.2116, "step": 17882 }, { "epoch": 0.8640382664154225, "grad_norm": 23.54070281982422, "learning_rate": 1.3596173358457745e-07, "loss": 0.329, "step": 17883 }, { "epoch": 0.8640865825965116, "grad_norm": 2.430039167404175, "learning_rate": 1.3591341740348842e-07, "loss": 0.3245, "step": 17884 }, { "epoch": 0.8641348987776006, "grad_norm": 3.2227284908294678, "learning_rate": 1.3586510122239938e-07, "loss": 0.2412, "step": 17885 }, { "epoch": 0.8641832149586897, "grad_norm": 3.31492018699646, "learning_rate": 1.3581678504131034e-07, "loss": 0.2614, "step": 17886 }, { "epoch": 0.8642315311397787, "grad_norm": 2.044652223587036, "learning_rate": 1.3576846886022128e-07, "loss": 0.2101, "step": 17887 }, { "epoch": 0.8642798473208677, "grad_norm": 1.8872085809707642, "learning_rate": 1.3572015267913222e-07, "loss": 0.2424, "step": 17888 }, { "epoch": 0.8643281635019568, "grad_norm": 2.485473155975342, "learning_rate": 1.356718364980432e-07, "loss": 0.3699, "step": 17889 }, { "epoch": 0.8643764796830459, "grad_norm": 2.8798177242279053, "learning_rate": 1.3562352031695415e-07, "loss": 0.3185, "step": 17890 }, { "epoch": 0.8644247958641349, "grad_norm": 2.8846213817596436, "learning_rate": 1.3557520413586508e-07, "loss": 0.2776, "step": 17891 }, { "epoch": 0.8644731120452239, "grad_norm": 2.542625665664673, "learning_rate": 1.3552688795477605e-07, "loss": 0.3111, "step": 17892 }, { "epoch": 0.864521428226313, "grad_norm": 2.4454891681671143, "learning_rate": 1.3547857177368699e-07, "loss": 0.2956, "step": 17893 }, { "epoch": 0.8645697444074021, "grad_norm": 1.9858168363571167, "learning_rate": 1.3543025559259798e-07, "loss": 0.2268, "step": 17894 }, { "epoch": 0.8646180605884911, "grad_norm": 1.6668741703033447, "learning_rate": 1.353819394115089e-07, "loss": 0.2029, "step": 17895 }, { "epoch": 0.8646663767695801, "grad_norm": 3.4445202350616455, "learning_rate": 1.3533362323041985e-07, "loss": 0.2824, "step": 17896 }, { "epoch": 0.8647146929506692, "grad_norm": 3.893406867980957, "learning_rate": 1.3528530704933081e-07, "loss": 0.3734, "step": 17897 }, { "epoch": 0.8647630091317582, "grad_norm": 2.367920398712158, "learning_rate": 1.3523699086824178e-07, "loss": 0.1496, "step": 17898 }, { "epoch": 0.8648113253128473, "grad_norm": 3.8443069458007812, "learning_rate": 1.3518867468715271e-07, "loss": 0.4035, "step": 17899 }, { "epoch": 0.8648596414939363, "grad_norm": 3.5578856468200684, "learning_rate": 1.3514035850606368e-07, "loss": 0.4176, "step": 17900 }, { "epoch": 0.8649079576750254, "grad_norm": 2.3650336265563965, "learning_rate": 1.3509204232497462e-07, "loss": 0.2671, "step": 17901 }, { "epoch": 0.8649562738561144, "grad_norm": 4.139988899230957, "learning_rate": 1.350437261438856e-07, "loss": 0.3467, "step": 17902 }, { "epoch": 0.8650045900372034, "grad_norm": 1.9611732959747314, "learning_rate": 1.3499540996279654e-07, "loss": 0.2135, "step": 17903 }, { "epoch": 0.8650529062182926, "grad_norm": 2.3346610069274902, "learning_rate": 1.3494709378170748e-07, "loss": 0.2445, "step": 17904 }, { "epoch": 0.8651012223993816, "grad_norm": 3.345600128173828, "learning_rate": 1.3489877760061844e-07, "loss": 0.3398, "step": 17905 }, { "epoch": 0.8651495385804706, "grad_norm": 2.0961058139801025, "learning_rate": 1.3485046141952938e-07, "loss": 0.2561, "step": 17906 }, { "epoch": 0.8651978547615596, "grad_norm": 2.440922260284424, "learning_rate": 1.3480214523844035e-07, "loss": 0.223, "step": 17907 }, { "epoch": 0.8652461709426487, "grad_norm": 5.24702262878418, "learning_rate": 1.347538290573513e-07, "loss": 0.3289, "step": 17908 }, { "epoch": 0.8652944871237377, "grad_norm": 2.338228464126587, "learning_rate": 1.3470551287626225e-07, "loss": 0.2553, "step": 17909 }, { "epoch": 0.8653428033048268, "grad_norm": 2.5082178115844727, "learning_rate": 1.346571966951732e-07, "loss": 0.1978, "step": 17910 }, { "epoch": 0.8653911194859158, "grad_norm": 1.576050043106079, "learning_rate": 1.3460888051408417e-07, "loss": 0.2399, "step": 17911 }, { "epoch": 0.8654394356670049, "grad_norm": 2.706528425216675, "learning_rate": 1.345605643329951e-07, "loss": 0.2896, "step": 17912 }, { "epoch": 0.8654877518480939, "grad_norm": 2.2522568702697754, "learning_rate": 1.3451224815190607e-07, "loss": 0.235, "step": 17913 }, { "epoch": 0.8655360680291829, "grad_norm": 2.000248670578003, "learning_rate": 1.34463931970817e-07, "loss": 0.2401, "step": 17914 }, { "epoch": 0.8655843842102721, "grad_norm": 2.7923521995544434, "learning_rate": 1.3441561578972798e-07, "loss": 0.3835, "step": 17915 }, { "epoch": 0.8656327003913611, "grad_norm": 2.0797181129455566, "learning_rate": 1.3436729960863894e-07, "loss": 0.2274, "step": 17916 }, { "epoch": 0.8656810165724501, "grad_norm": 8.797137260437012, "learning_rate": 1.3431898342754988e-07, "loss": 0.3289, "step": 17917 }, { "epoch": 0.8657293327535391, "grad_norm": 3.2955262660980225, "learning_rate": 1.3427066724646084e-07, "loss": 0.2952, "step": 17918 }, { "epoch": 0.8657776489346282, "grad_norm": 2.5103185176849365, "learning_rate": 1.3422235106537178e-07, "loss": 0.3459, "step": 17919 }, { "epoch": 0.8658259651157173, "grad_norm": 2.9137041568756104, "learning_rate": 1.3417403488428274e-07, "loss": 0.3262, "step": 17920 }, { "epoch": 0.8658742812968063, "grad_norm": 4.289802074432373, "learning_rate": 1.341257187031937e-07, "loss": 0.3688, "step": 17921 }, { "epoch": 0.8659225974778954, "grad_norm": 2.742950677871704, "learning_rate": 1.3407740252210464e-07, "loss": 0.3559, "step": 17922 }, { "epoch": 0.8659709136589844, "grad_norm": 4.1646928787231445, "learning_rate": 1.3402908634101558e-07, "loss": 0.423, "step": 17923 }, { "epoch": 0.8660192298400734, "grad_norm": 5.517480850219727, "learning_rate": 1.3398077015992657e-07, "loss": 0.3047, "step": 17924 }, { "epoch": 0.8660675460211625, "grad_norm": 2.443227767944336, "learning_rate": 1.339324539788375e-07, "loss": 0.2761, "step": 17925 }, { "epoch": 0.8661158622022516, "grad_norm": 2.5818653106689453, "learning_rate": 1.3388413779774847e-07, "loss": 0.3253, "step": 17926 }, { "epoch": 0.8661641783833406, "grad_norm": 2.8707733154296875, "learning_rate": 1.338358216166594e-07, "loss": 0.3163, "step": 17927 }, { "epoch": 0.8662124945644296, "grad_norm": 6.071489334106445, "learning_rate": 1.3378750543557037e-07, "loss": 0.3228, "step": 17928 }, { "epoch": 0.8662608107455186, "grad_norm": 2.5856363773345947, "learning_rate": 1.3373918925448134e-07, "loss": 0.3335, "step": 17929 }, { "epoch": 0.8663091269266078, "grad_norm": 3.006115436553955, "learning_rate": 1.3369087307339227e-07, "loss": 0.4027, "step": 17930 }, { "epoch": 0.8663574431076968, "grad_norm": 2.1149468421936035, "learning_rate": 1.336425568923032e-07, "loss": 0.1917, "step": 17931 }, { "epoch": 0.8664057592887858, "grad_norm": 2.929889678955078, "learning_rate": 1.3359424071121417e-07, "loss": 0.2089, "step": 17932 }, { "epoch": 0.8664540754698749, "grad_norm": 3.445176601409912, "learning_rate": 1.3354592453012514e-07, "loss": 0.4034, "step": 17933 }, { "epoch": 0.8665023916509639, "grad_norm": 10.561186790466309, "learning_rate": 1.334976083490361e-07, "loss": 0.3858, "step": 17934 }, { "epoch": 0.8665507078320529, "grad_norm": 2.8371262550354004, "learning_rate": 1.3344929216794704e-07, "loss": 0.3416, "step": 17935 }, { "epoch": 0.866599024013142, "grad_norm": 2.3553762435913086, "learning_rate": 1.3340097598685798e-07, "loss": 0.2506, "step": 17936 }, { "epoch": 0.8666473401942311, "grad_norm": 7.455921649932861, "learning_rate": 1.3335265980576897e-07, "loss": 0.4225, "step": 17937 }, { "epoch": 0.8666956563753201, "grad_norm": 4.1274614334106445, "learning_rate": 1.333043436246799e-07, "loss": 0.2907, "step": 17938 }, { "epoch": 0.8667439725564091, "grad_norm": 2.0871469974517822, "learning_rate": 1.3325602744359084e-07, "loss": 0.2883, "step": 17939 }, { "epoch": 0.8667922887374981, "grad_norm": 2.4963481426239014, "learning_rate": 1.332077112625018e-07, "loss": 0.266, "step": 17940 }, { "epoch": 0.8668406049185873, "grad_norm": 91.74504089355469, "learning_rate": 1.3315939508141274e-07, "loss": 0.3027, "step": 17941 }, { "epoch": 0.8668889210996763, "grad_norm": 1.8471187353134155, "learning_rate": 1.3311107890032373e-07, "loss": 0.2208, "step": 17942 }, { "epoch": 0.8669372372807653, "grad_norm": 5.287723064422607, "learning_rate": 1.3306276271923467e-07, "loss": 0.3271, "step": 17943 }, { "epoch": 0.8669855534618544, "grad_norm": 1.7446168661117554, "learning_rate": 1.330144465381456e-07, "loss": 0.204, "step": 17944 }, { "epoch": 0.8670338696429434, "grad_norm": 2.665546178817749, "learning_rate": 1.3296613035705657e-07, "loss": 0.2162, "step": 17945 }, { "epoch": 0.8670821858240325, "grad_norm": 4.889764308929443, "learning_rate": 1.3291781417596753e-07, "loss": 0.3394, "step": 17946 }, { "epoch": 0.8671305020051215, "grad_norm": 9.976469039916992, "learning_rate": 1.3286949799487847e-07, "loss": 0.3958, "step": 17947 }, { "epoch": 0.8671788181862106, "grad_norm": 2.95212459564209, "learning_rate": 1.3282118181378944e-07, "loss": 0.32, "step": 17948 }, { "epoch": 0.8672271343672996, "grad_norm": 2.540302038192749, "learning_rate": 1.3277286563270037e-07, "loss": 0.2032, "step": 17949 }, { "epoch": 0.8672754505483886, "grad_norm": 2.563002824783325, "learning_rate": 1.3272454945161136e-07, "loss": 0.2916, "step": 17950 }, { "epoch": 0.8673237667294778, "grad_norm": 10.074248313903809, "learning_rate": 1.326762332705223e-07, "loss": 0.348, "step": 17951 }, { "epoch": 0.8673720829105668, "grad_norm": 6.279904842376709, "learning_rate": 1.3262791708943324e-07, "loss": 0.2578, "step": 17952 }, { "epoch": 0.8674203990916558, "grad_norm": 2.976912260055542, "learning_rate": 1.325796009083442e-07, "loss": 0.443, "step": 17953 }, { "epoch": 0.8674687152727448, "grad_norm": 3.405214309692383, "learning_rate": 1.3253128472725514e-07, "loss": 0.4567, "step": 17954 }, { "epoch": 0.8675170314538339, "grad_norm": 3.1877694129943848, "learning_rate": 1.324829685461661e-07, "loss": 0.3537, "step": 17955 }, { "epoch": 0.867565347634923, "grad_norm": 2.982081890106201, "learning_rate": 1.3243465236507707e-07, "loss": 0.3316, "step": 17956 }, { "epoch": 0.867613663816012, "grad_norm": 2.8838961124420166, "learning_rate": 1.32386336183988e-07, "loss": 0.3566, "step": 17957 }, { "epoch": 0.867661979997101, "grad_norm": 2.7960522174835205, "learning_rate": 1.3233802000289894e-07, "loss": 0.3526, "step": 17958 }, { "epoch": 0.8677102961781901, "grad_norm": 2.417619228363037, "learning_rate": 1.3228970382180993e-07, "loss": 0.2454, "step": 17959 }, { "epoch": 0.8677586123592791, "grad_norm": 2.4700284004211426, "learning_rate": 1.3224138764072087e-07, "loss": 0.318, "step": 17960 }, { "epoch": 0.8678069285403681, "grad_norm": 2.62361216545105, "learning_rate": 1.3219307145963183e-07, "loss": 0.3173, "step": 17961 }, { "epoch": 0.8678552447214573, "grad_norm": 4.379849433898926, "learning_rate": 1.3214475527854277e-07, "loss": 0.3212, "step": 17962 }, { "epoch": 0.8679035609025463, "grad_norm": 2.2355477809906006, "learning_rate": 1.3209643909745373e-07, "loss": 0.2557, "step": 17963 }, { "epoch": 0.8679518770836353, "grad_norm": 3.0187366008758545, "learning_rate": 1.320481229163647e-07, "loss": 0.2009, "step": 17964 }, { "epoch": 0.8680001932647243, "grad_norm": 4.172095775604248, "learning_rate": 1.3199980673527563e-07, "loss": 0.2928, "step": 17965 }, { "epoch": 0.8680485094458134, "grad_norm": 3.703922748565674, "learning_rate": 1.3195149055418657e-07, "loss": 0.2634, "step": 17966 }, { "epoch": 0.8680968256269025, "grad_norm": 2.3694093227386475, "learning_rate": 1.3190317437309753e-07, "loss": 0.2601, "step": 17967 }, { "epoch": 0.8681451418079915, "grad_norm": 3.241372585296631, "learning_rate": 1.318548581920085e-07, "loss": 0.339, "step": 17968 }, { "epoch": 0.8681934579890805, "grad_norm": 1.5132099390029907, "learning_rate": 1.3180654201091946e-07, "loss": 0.1998, "step": 17969 }, { "epoch": 0.8682417741701696, "grad_norm": 5.006902694702148, "learning_rate": 1.317582258298304e-07, "loss": 0.2691, "step": 17970 }, { "epoch": 0.8682900903512586, "grad_norm": 2.264113664627075, "learning_rate": 1.3170990964874134e-07, "loss": 0.2249, "step": 17971 }, { "epoch": 0.8683384065323477, "grad_norm": 2.115201473236084, "learning_rate": 1.3166159346765233e-07, "loss": 0.284, "step": 17972 }, { "epoch": 0.8683867227134368, "grad_norm": 2.488391637802124, "learning_rate": 1.3161327728656326e-07, "loss": 0.3135, "step": 17973 }, { "epoch": 0.8684350388945258, "grad_norm": 2.6494593620300293, "learning_rate": 1.315649611054742e-07, "loss": 0.2647, "step": 17974 }, { "epoch": 0.8684833550756148, "grad_norm": 3.2788097858428955, "learning_rate": 1.3151664492438517e-07, "loss": 0.3616, "step": 17975 }, { "epoch": 0.8685316712567038, "grad_norm": 2.649698257446289, "learning_rate": 1.3146832874329613e-07, "loss": 0.1618, "step": 17976 }, { "epoch": 0.868579987437793, "grad_norm": 2.4795162677764893, "learning_rate": 1.314200125622071e-07, "loss": 0.3017, "step": 17977 }, { "epoch": 0.868628303618882, "grad_norm": 3.8974881172180176, "learning_rate": 1.3137169638111803e-07, "loss": 0.3302, "step": 17978 }, { "epoch": 0.868676619799971, "grad_norm": 2.4375975131988525, "learning_rate": 1.3132338020002897e-07, "loss": 0.3303, "step": 17979 }, { "epoch": 0.86872493598106, "grad_norm": 4.074283599853516, "learning_rate": 1.3127506401893993e-07, "loss": 0.2303, "step": 17980 }, { "epoch": 0.8687732521621491, "grad_norm": 1.8338536024093628, "learning_rate": 1.312267478378509e-07, "loss": 0.1541, "step": 17981 }, { "epoch": 0.8688215683432382, "grad_norm": 2.120000123977661, "learning_rate": 1.3117843165676186e-07, "loss": 0.2122, "step": 17982 }, { "epoch": 0.8688698845243272, "grad_norm": 3.5102622509002686, "learning_rate": 1.311301154756728e-07, "loss": 0.3793, "step": 17983 }, { "epoch": 0.8689182007054163, "grad_norm": 6.290352821350098, "learning_rate": 1.3108179929458373e-07, "loss": 0.2943, "step": 17984 }, { "epoch": 0.8689665168865053, "grad_norm": 1.9083597660064697, "learning_rate": 1.3103348311349472e-07, "loss": 0.1708, "step": 17985 }, { "epoch": 0.8690148330675943, "grad_norm": 1.9134514331817627, "learning_rate": 1.3098516693240566e-07, "loss": 0.2015, "step": 17986 }, { "epoch": 0.8690631492486833, "grad_norm": 1.8943676948547363, "learning_rate": 1.309368507513166e-07, "loss": 0.1879, "step": 17987 }, { "epoch": 0.8691114654297725, "grad_norm": 2.8258843421936035, "learning_rate": 1.3088853457022756e-07, "loss": 0.1638, "step": 17988 }, { "epoch": 0.8691597816108615, "grad_norm": 3.2312026023864746, "learning_rate": 1.3084021838913853e-07, "loss": 0.3683, "step": 17989 }, { "epoch": 0.8692080977919505, "grad_norm": 4.321529865264893, "learning_rate": 1.307919022080495e-07, "loss": 0.3501, "step": 17990 }, { "epoch": 0.8692564139730395, "grad_norm": 2.310239315032959, "learning_rate": 1.3074358602696043e-07, "loss": 0.2378, "step": 17991 }, { "epoch": 0.8693047301541286, "grad_norm": 2.346614360809326, "learning_rate": 1.3069526984587136e-07, "loss": 0.3163, "step": 17992 }, { "epoch": 0.8693530463352177, "grad_norm": 1.9948610067367554, "learning_rate": 1.3064695366478233e-07, "loss": 0.2004, "step": 17993 }, { "epoch": 0.8694013625163067, "grad_norm": 3.9206955432891846, "learning_rate": 1.305986374836933e-07, "loss": 0.3455, "step": 17994 }, { "epoch": 0.8694496786973958, "grad_norm": 2.5146172046661377, "learning_rate": 1.3055032130260423e-07, "loss": 0.3266, "step": 17995 }, { "epoch": 0.8694979948784848, "grad_norm": 3.3105592727661133, "learning_rate": 1.305020051215152e-07, "loss": 0.3467, "step": 17996 }, { "epoch": 0.8695463110595738, "grad_norm": 2.5610146522521973, "learning_rate": 1.3045368894042613e-07, "loss": 0.279, "step": 17997 }, { "epoch": 0.8695946272406629, "grad_norm": 2.8253841400146484, "learning_rate": 1.3040537275933712e-07, "loss": 0.3789, "step": 17998 }, { "epoch": 0.869642943421752, "grad_norm": 2.8507561683654785, "learning_rate": 1.3035705657824806e-07, "loss": 0.3513, "step": 17999 }, { "epoch": 0.869691259602841, "grad_norm": 2.4948859214782715, "learning_rate": 1.30308740397159e-07, "loss": 0.3011, "step": 18000 }, { "epoch": 0.86973957578393, "grad_norm": 2.0644941329956055, "learning_rate": 1.3026042421606996e-07, "loss": 0.2431, "step": 18001 }, { "epoch": 0.869787891965019, "grad_norm": 6.580483436584473, "learning_rate": 1.3021210803498092e-07, "loss": 0.416, "step": 18002 }, { "epoch": 0.8698362081461082, "grad_norm": 4.301342010498047, "learning_rate": 1.3016379185389186e-07, "loss": 0.2887, "step": 18003 }, { "epoch": 0.8698845243271972, "grad_norm": 2.162412643432617, "learning_rate": 1.3011547567280282e-07, "loss": 0.2527, "step": 18004 }, { "epoch": 0.8699328405082862, "grad_norm": 1.7665014266967773, "learning_rate": 1.3006715949171376e-07, "loss": 0.213, "step": 18005 }, { "epoch": 0.8699811566893753, "grad_norm": 2.5992395877838135, "learning_rate": 1.300188433106247e-07, "loss": 0.2892, "step": 18006 }, { "epoch": 0.8700294728704643, "grad_norm": 3.2390778064727783, "learning_rate": 1.299705271295357e-07, "loss": 0.4, "step": 18007 }, { "epoch": 0.8700777890515534, "grad_norm": 3.13571834564209, "learning_rate": 1.2992221094844662e-07, "loss": 0.3853, "step": 18008 }, { "epoch": 0.8701261052326424, "grad_norm": 3.155461549758911, "learning_rate": 1.298738947673576e-07, "loss": 0.4143, "step": 18009 }, { "epoch": 0.8701744214137315, "grad_norm": 1.2321354150772095, "learning_rate": 1.2982557858626853e-07, "loss": 0.113, "step": 18010 }, { "epoch": 0.8702227375948205, "grad_norm": 3.044830560684204, "learning_rate": 1.297772624051795e-07, "loss": 0.4291, "step": 18011 }, { "epoch": 0.8702710537759095, "grad_norm": 2.137453317642212, "learning_rate": 1.2972894622409045e-07, "loss": 0.2769, "step": 18012 }, { "epoch": 0.8703193699569985, "grad_norm": 1.803870439529419, "learning_rate": 1.296806300430014e-07, "loss": 0.201, "step": 18013 }, { "epoch": 0.8703676861380877, "grad_norm": 2.641556978225708, "learning_rate": 1.2963231386191233e-07, "loss": 0.3206, "step": 18014 }, { "epoch": 0.8704160023191767, "grad_norm": 2.835719347000122, "learning_rate": 1.2958399768082332e-07, "loss": 0.3458, "step": 18015 }, { "epoch": 0.8704643185002657, "grad_norm": 1.9746508598327637, "learning_rate": 1.2953568149973426e-07, "loss": 0.2099, "step": 18016 }, { "epoch": 0.8705126346813548, "grad_norm": 3.970426559448242, "learning_rate": 1.2948736531864522e-07, "loss": 0.2164, "step": 18017 }, { "epoch": 0.8705609508624438, "grad_norm": 2.9677011966705322, "learning_rate": 1.2943904913755616e-07, "loss": 0.3519, "step": 18018 }, { "epoch": 0.8706092670435329, "grad_norm": 2.2513506412506104, "learning_rate": 1.293907329564671e-07, "loss": 0.2369, "step": 18019 }, { "epoch": 0.870657583224622, "grad_norm": 1.9547756910324097, "learning_rate": 1.2934241677537808e-07, "loss": 0.2455, "step": 18020 }, { "epoch": 0.870705899405711, "grad_norm": 4.514400959014893, "learning_rate": 1.2929410059428902e-07, "loss": 0.3481, "step": 18021 }, { "epoch": 0.8707542155868, "grad_norm": 3.462585687637329, "learning_rate": 1.2924578441319996e-07, "loss": 0.2972, "step": 18022 }, { "epoch": 0.870802531767889, "grad_norm": 3.019413471221924, "learning_rate": 1.2919746823211092e-07, "loss": 0.28, "step": 18023 }, { "epoch": 0.8708508479489782, "grad_norm": 2.9293479919433594, "learning_rate": 1.2914915205102189e-07, "loss": 0.2416, "step": 18024 }, { "epoch": 0.8708991641300672, "grad_norm": 1.6402806043624878, "learning_rate": 1.2910083586993285e-07, "loss": 0.2189, "step": 18025 }, { "epoch": 0.8709474803111562, "grad_norm": 2.125507116317749, "learning_rate": 1.290525196888438e-07, "loss": 0.2627, "step": 18026 }, { "epoch": 0.8709957964922452, "grad_norm": 2.398625135421753, "learning_rate": 1.2900420350775472e-07, "loss": 0.2963, "step": 18027 }, { "epoch": 0.8710441126733343, "grad_norm": 4.4389214515686035, "learning_rate": 1.2895588732666571e-07, "loss": 0.3882, "step": 18028 }, { "epoch": 0.8710924288544234, "grad_norm": 3.7247064113616943, "learning_rate": 1.2890757114557665e-07, "loss": 0.296, "step": 18029 }, { "epoch": 0.8711407450355124, "grad_norm": 2.3639333248138428, "learning_rate": 1.288592549644876e-07, "loss": 0.1966, "step": 18030 }, { "epoch": 0.8711890612166014, "grad_norm": 2.9198739528656006, "learning_rate": 1.2881093878339855e-07, "loss": 0.278, "step": 18031 }, { "epoch": 0.8712373773976905, "grad_norm": 2.948704719543457, "learning_rate": 1.287626226023095e-07, "loss": 0.3269, "step": 18032 }, { "epoch": 0.8712856935787795, "grad_norm": 3.9141626358032227, "learning_rate": 1.2871430642122048e-07, "loss": 0.3481, "step": 18033 }, { "epoch": 0.8713340097598686, "grad_norm": 2.726503610610962, "learning_rate": 1.2866599024013142e-07, "loss": 0.3232, "step": 18034 }, { "epoch": 0.8713823259409577, "grad_norm": 2.7765979766845703, "learning_rate": 1.2861767405904235e-07, "loss": 0.2555, "step": 18035 }, { "epoch": 0.8714306421220467, "grad_norm": 1.7345051765441895, "learning_rate": 1.2856935787795332e-07, "loss": 0.1905, "step": 18036 }, { "epoch": 0.8714789583031357, "grad_norm": 2.6957998275756836, "learning_rate": 1.2852104169686428e-07, "loss": 0.3572, "step": 18037 }, { "epoch": 0.8715272744842247, "grad_norm": 3.645047664642334, "learning_rate": 1.2847272551577522e-07, "loss": 0.3615, "step": 18038 }, { "epoch": 0.8715755906653138, "grad_norm": 1.987317681312561, "learning_rate": 1.2842440933468618e-07, "loss": 0.2595, "step": 18039 }, { "epoch": 0.8716239068464029, "grad_norm": 2.61678409576416, "learning_rate": 1.2837609315359712e-07, "loss": 0.3573, "step": 18040 }, { "epoch": 0.8716722230274919, "grad_norm": 2.578157424926758, "learning_rate": 1.283277769725081e-07, "loss": 0.277, "step": 18041 }, { "epoch": 0.871720539208581, "grad_norm": 2.202357530593872, "learning_rate": 1.2827946079141905e-07, "loss": 0.2542, "step": 18042 }, { "epoch": 0.87176885538967, "grad_norm": 2.5782554149627686, "learning_rate": 1.2823114461032999e-07, "loss": 0.2588, "step": 18043 }, { "epoch": 0.871817171570759, "grad_norm": 2.180903911590576, "learning_rate": 1.2818282842924095e-07, "loss": 0.2771, "step": 18044 }, { "epoch": 0.8718654877518481, "grad_norm": 3.4508891105651855, "learning_rate": 1.2813451224815189e-07, "loss": 0.4588, "step": 18045 }, { "epoch": 0.8719138039329372, "grad_norm": 2.443624973297119, "learning_rate": 1.2808619606706285e-07, "loss": 0.226, "step": 18046 }, { "epoch": 0.8719621201140262, "grad_norm": 9.432350158691406, "learning_rate": 1.2803787988597381e-07, "loss": 0.4194, "step": 18047 }, { "epoch": 0.8720104362951152, "grad_norm": 2.27944016456604, "learning_rate": 1.2798956370488475e-07, "loss": 0.2406, "step": 18048 }, { "epoch": 0.8720587524762042, "grad_norm": 3.808568000793457, "learning_rate": 1.2794124752379571e-07, "loss": 0.2315, "step": 18049 }, { "epoch": 0.8721070686572934, "grad_norm": 5.718408584594727, "learning_rate": 1.2789293134270668e-07, "loss": 0.4385, "step": 18050 }, { "epoch": 0.8721553848383824, "grad_norm": 1.7777347564697266, "learning_rate": 1.2784461516161762e-07, "loss": 0.1949, "step": 18051 }, { "epoch": 0.8722037010194714, "grad_norm": 2.053420066833496, "learning_rate": 1.2779629898052858e-07, "loss": 0.2452, "step": 18052 }, { "epoch": 0.8722520172005604, "grad_norm": 2.9670791625976562, "learning_rate": 1.2774798279943952e-07, "loss": 0.3382, "step": 18053 }, { "epoch": 0.8723003333816495, "grad_norm": 2.8820788860321045, "learning_rate": 1.2769966661835048e-07, "loss": 0.4004, "step": 18054 }, { "epoch": 0.8723486495627386, "grad_norm": 2.607712984085083, "learning_rate": 1.2765135043726144e-07, "loss": 0.1914, "step": 18055 }, { "epoch": 0.8723969657438276, "grad_norm": 2.6535284519195557, "learning_rate": 1.2760303425617238e-07, "loss": 0.3481, "step": 18056 }, { "epoch": 0.8724452819249167, "grad_norm": 11.372424125671387, "learning_rate": 1.2755471807508335e-07, "loss": 0.3148, "step": 18057 }, { "epoch": 0.8724935981060057, "grad_norm": 2.768871784210205, "learning_rate": 1.2750640189399428e-07, "loss": 0.3625, "step": 18058 }, { "epoch": 0.8725419142870947, "grad_norm": 2.5614662170410156, "learning_rate": 1.2745808571290525e-07, "loss": 0.2609, "step": 18059 }, { "epoch": 0.8725902304681838, "grad_norm": 5.674498558044434, "learning_rate": 1.274097695318162e-07, "loss": 0.3853, "step": 18060 }, { "epoch": 0.8726385466492729, "grad_norm": 3.109753131866455, "learning_rate": 1.2736145335072715e-07, "loss": 0.348, "step": 18061 }, { "epoch": 0.8726868628303619, "grad_norm": 12.14460277557373, "learning_rate": 1.2731313716963808e-07, "loss": 0.2357, "step": 18062 }, { "epoch": 0.8727351790114509, "grad_norm": 7.14979362487793, "learning_rate": 1.2726482098854907e-07, "loss": 0.3753, "step": 18063 }, { "epoch": 0.87278349519254, "grad_norm": 3.036330223083496, "learning_rate": 1.2721650480746e-07, "loss": 0.318, "step": 18064 }, { "epoch": 0.872831811373629, "grad_norm": 3.6270956993103027, "learning_rate": 1.2716818862637098e-07, "loss": 0.2883, "step": 18065 }, { "epoch": 0.8728801275547181, "grad_norm": 3.551041603088379, "learning_rate": 1.2711987244528191e-07, "loss": 0.3095, "step": 18066 }, { "epoch": 0.8729284437358071, "grad_norm": 2.0504183769226074, "learning_rate": 1.2707155626419288e-07, "loss": 0.2461, "step": 18067 }, { "epoch": 0.8729767599168962, "grad_norm": 2.564060688018799, "learning_rate": 1.2702324008310384e-07, "loss": 0.3069, "step": 18068 }, { "epoch": 0.8730250760979852, "grad_norm": 2.150563955307007, "learning_rate": 1.2697492390201478e-07, "loss": 0.2744, "step": 18069 }, { "epoch": 0.8730733922790742, "grad_norm": 4.1740312576293945, "learning_rate": 1.2692660772092572e-07, "loss": 0.5057, "step": 18070 }, { "epoch": 0.8731217084601633, "grad_norm": 2.9993090629577637, "learning_rate": 1.2687829153983668e-07, "loss": 0.2961, "step": 18071 }, { "epoch": 0.8731700246412524, "grad_norm": 2.5874345302581787, "learning_rate": 1.2682997535874764e-07, "loss": 0.2965, "step": 18072 }, { "epoch": 0.8732183408223414, "grad_norm": 6.085785388946533, "learning_rate": 1.267816591776586e-07, "loss": 0.3287, "step": 18073 }, { "epoch": 0.8732666570034304, "grad_norm": 45.723628997802734, "learning_rate": 1.2673334299656954e-07, "loss": 0.286, "step": 18074 }, { "epoch": 0.8733149731845194, "grad_norm": 2.267630100250244, "learning_rate": 1.2668502681548048e-07, "loss": 0.2575, "step": 18075 }, { "epoch": 0.8733632893656086, "grad_norm": 2.186974048614502, "learning_rate": 1.2663671063439147e-07, "loss": 0.2446, "step": 18076 }, { "epoch": 0.8734116055466976, "grad_norm": 3.2508909702301025, "learning_rate": 1.265883944533024e-07, "loss": 0.4202, "step": 18077 }, { "epoch": 0.8734599217277866, "grad_norm": 2.1396868228912354, "learning_rate": 1.2654007827221335e-07, "loss": 0.2213, "step": 18078 }, { "epoch": 0.8735082379088757, "grad_norm": 3.3666815757751465, "learning_rate": 1.264917620911243e-07, "loss": 0.44, "step": 18079 }, { "epoch": 0.8735565540899647, "grad_norm": 4.732492446899414, "learning_rate": 1.2644344591003525e-07, "loss": 0.2586, "step": 18080 }, { "epoch": 0.8736048702710538, "grad_norm": 2.5873024463653564, "learning_rate": 1.2639512972894624e-07, "loss": 0.1497, "step": 18081 }, { "epoch": 0.8736531864521428, "grad_norm": 3.8014891147613525, "learning_rate": 1.2634681354785717e-07, "loss": 0.3997, "step": 18082 }, { "epoch": 0.8737015026332319, "grad_norm": 3.062223434448242, "learning_rate": 1.262984973667681e-07, "loss": 0.3193, "step": 18083 }, { "epoch": 0.8737498188143209, "grad_norm": 2.822221040725708, "learning_rate": 1.2625018118567908e-07, "loss": 0.3419, "step": 18084 }, { "epoch": 0.8737981349954099, "grad_norm": 2.312417507171631, "learning_rate": 1.2620186500459004e-07, "loss": 0.205, "step": 18085 }, { "epoch": 0.8738464511764991, "grad_norm": 3.04730224609375, "learning_rate": 1.2615354882350098e-07, "loss": 0.4826, "step": 18086 }, { "epoch": 0.8738947673575881, "grad_norm": 3.3351309299468994, "learning_rate": 1.2610523264241194e-07, "loss": 0.4077, "step": 18087 }, { "epoch": 0.8739430835386771, "grad_norm": 13.407524108886719, "learning_rate": 1.2605691646132288e-07, "loss": 0.289, "step": 18088 }, { "epoch": 0.8739913997197661, "grad_norm": 6.856489658355713, "learning_rate": 1.2600860028023387e-07, "loss": 0.2066, "step": 18089 }, { "epoch": 0.8740397159008552, "grad_norm": 1.9008567333221436, "learning_rate": 1.259602840991448e-07, "loss": 0.2012, "step": 18090 }, { "epoch": 0.8740880320819442, "grad_norm": 2.206707000732422, "learning_rate": 1.2591196791805574e-07, "loss": 0.2823, "step": 18091 }, { "epoch": 0.8741363482630333, "grad_norm": 1.9202334880828857, "learning_rate": 1.258636517369667e-07, "loss": 0.233, "step": 18092 }, { "epoch": 0.8741846644441224, "grad_norm": 3.1214005947113037, "learning_rate": 1.2581533555587764e-07, "loss": 0.405, "step": 18093 }, { "epoch": 0.8742329806252114, "grad_norm": 2.651670217514038, "learning_rate": 1.257670193747886e-07, "loss": 0.3196, "step": 18094 }, { "epoch": 0.8742812968063004, "grad_norm": 2.3511905670166016, "learning_rate": 1.2571870319369957e-07, "loss": 0.1857, "step": 18095 }, { "epoch": 0.8743296129873894, "grad_norm": 4.838155746459961, "learning_rate": 1.256703870126105e-07, "loss": 0.3044, "step": 18096 }, { "epoch": 0.8743779291684786, "grad_norm": 8.99012279510498, "learning_rate": 1.2562207083152147e-07, "loss": 0.2131, "step": 18097 }, { "epoch": 0.8744262453495676, "grad_norm": 2.8429925441741943, "learning_rate": 1.2557375465043244e-07, "loss": 0.4021, "step": 18098 }, { "epoch": 0.8744745615306566, "grad_norm": 4.05272102355957, "learning_rate": 1.2552543846934337e-07, "loss": 0.2383, "step": 18099 }, { "epoch": 0.8745228777117456, "grad_norm": 2.8251125812530518, "learning_rate": 1.2547712228825434e-07, "loss": 0.2888, "step": 18100 }, { "epoch": 0.8745711938928347, "grad_norm": 2.7495975494384766, "learning_rate": 1.2542880610716527e-07, "loss": 0.2381, "step": 18101 }, { "epoch": 0.8746195100739238, "grad_norm": 2.9374566078186035, "learning_rate": 1.2538048992607624e-07, "loss": 0.2678, "step": 18102 }, { "epoch": 0.8746678262550128, "grad_norm": 2.568924903869629, "learning_rate": 1.253321737449872e-07, "loss": 0.3093, "step": 18103 }, { "epoch": 0.8747161424361019, "grad_norm": 18.19991111755371, "learning_rate": 1.2528385756389814e-07, "loss": 0.2714, "step": 18104 }, { "epoch": 0.8747644586171909, "grad_norm": 1.6755867004394531, "learning_rate": 1.252355413828091e-07, "loss": 0.164, "step": 18105 }, { "epoch": 0.8748127747982799, "grad_norm": 2.467318058013916, "learning_rate": 1.2518722520172004e-07, "loss": 0.2629, "step": 18106 }, { "epoch": 0.874861090979369, "grad_norm": 1.6298969984054565, "learning_rate": 1.25138909020631e-07, "loss": 0.1874, "step": 18107 }, { "epoch": 0.8749094071604581, "grad_norm": 2.8308756351470947, "learning_rate": 1.2509059283954197e-07, "loss": 0.2627, "step": 18108 }, { "epoch": 0.8749577233415471, "grad_norm": 2.998887538909912, "learning_rate": 1.250422766584529e-07, "loss": 0.3548, "step": 18109 }, { "epoch": 0.8750060395226361, "grad_norm": 2.828923463821411, "learning_rate": 1.2499396047736387e-07, "loss": 0.2784, "step": 18110 }, { "epoch": 0.8750543557037251, "grad_norm": 2.7499501705169678, "learning_rate": 1.249456442962748e-07, "loss": 0.3276, "step": 18111 }, { "epoch": 0.8751026718848143, "grad_norm": 2.3867990970611572, "learning_rate": 1.2489732811518577e-07, "loss": 0.263, "step": 18112 }, { "epoch": 0.8751509880659033, "grad_norm": 2.304340362548828, "learning_rate": 1.2484901193409673e-07, "loss": 0.183, "step": 18113 }, { "epoch": 0.8751993042469923, "grad_norm": 3.3499574661254883, "learning_rate": 1.2480069575300767e-07, "loss": 0.2895, "step": 18114 }, { "epoch": 0.8752476204280814, "grad_norm": 2.3199565410614014, "learning_rate": 1.2475237957191863e-07, "loss": 0.2415, "step": 18115 }, { "epoch": 0.8752959366091704, "grad_norm": 3.047612428665161, "learning_rate": 1.247040633908296e-07, "loss": 0.3508, "step": 18116 }, { "epoch": 0.8753442527902594, "grad_norm": 2.4000093936920166, "learning_rate": 1.2465574720974053e-07, "loss": 0.1887, "step": 18117 }, { "epoch": 0.8753925689713485, "grad_norm": 2.5360324382781982, "learning_rate": 1.2460743102865147e-07, "loss": 0.2667, "step": 18118 }, { "epoch": 0.8754408851524376, "grad_norm": 1.8270659446716309, "learning_rate": 1.2455911484756244e-07, "loss": 0.1694, "step": 18119 }, { "epoch": 0.8754892013335266, "grad_norm": 3.146007537841797, "learning_rate": 1.245107986664734e-07, "loss": 0.3919, "step": 18120 }, { "epoch": 0.8755375175146156, "grad_norm": 3.3027701377868652, "learning_rate": 1.2446248248538436e-07, "loss": 0.4311, "step": 18121 }, { "epoch": 0.8755858336957046, "grad_norm": 2.4062390327453613, "learning_rate": 1.244141663042953e-07, "loss": 0.2611, "step": 18122 }, { "epoch": 0.8756341498767938, "grad_norm": 1.7431385517120361, "learning_rate": 1.2436585012320626e-07, "loss": 0.1702, "step": 18123 }, { "epoch": 0.8756824660578828, "grad_norm": 2.746494770050049, "learning_rate": 1.243175339421172e-07, "loss": 0.3211, "step": 18124 }, { "epoch": 0.8757307822389718, "grad_norm": 2.5746243000030518, "learning_rate": 1.2426921776102817e-07, "loss": 0.3241, "step": 18125 }, { "epoch": 0.8757790984200609, "grad_norm": 1.933170199394226, "learning_rate": 1.242209015799391e-07, "loss": 0.2552, "step": 18126 }, { "epoch": 0.8758274146011499, "grad_norm": 3.0028960704803467, "learning_rate": 1.2417258539885007e-07, "loss": 0.2747, "step": 18127 }, { "epoch": 0.875875730782239, "grad_norm": 2.4171762466430664, "learning_rate": 1.2412426921776103e-07, "loss": 0.2308, "step": 18128 }, { "epoch": 0.875924046963328, "grad_norm": 2.9679369926452637, "learning_rate": 1.24075953036672e-07, "loss": 0.3006, "step": 18129 }, { "epoch": 0.8759723631444171, "grad_norm": 2.9701104164123535, "learning_rate": 1.2402763685558293e-07, "loss": 0.4142, "step": 18130 }, { "epoch": 0.8760206793255061, "grad_norm": 9.30978012084961, "learning_rate": 1.2397932067449387e-07, "loss": 0.3304, "step": 18131 }, { "epoch": 0.8760689955065951, "grad_norm": 10.293246269226074, "learning_rate": 1.2393100449340483e-07, "loss": 0.2802, "step": 18132 }, { "epoch": 0.8761173116876843, "grad_norm": 4.324476718902588, "learning_rate": 1.238826883123158e-07, "loss": 0.312, "step": 18133 }, { "epoch": 0.8761656278687733, "grad_norm": 3.1973140239715576, "learning_rate": 1.2383437213122673e-07, "loss": 0.2734, "step": 18134 }, { "epoch": 0.8762139440498623, "grad_norm": 2.687896966934204, "learning_rate": 1.237860559501377e-07, "loss": 0.2234, "step": 18135 }, { "epoch": 0.8762622602309513, "grad_norm": 3.843912124633789, "learning_rate": 1.2373773976904866e-07, "loss": 0.2148, "step": 18136 }, { "epoch": 0.8763105764120404, "grad_norm": 2.6443400382995605, "learning_rate": 1.236894235879596e-07, "loss": 0.2777, "step": 18137 }, { "epoch": 0.8763588925931295, "grad_norm": 2.416127920150757, "learning_rate": 1.2364110740687056e-07, "loss": 0.2755, "step": 18138 }, { "epoch": 0.8764072087742185, "grad_norm": 2.503718614578247, "learning_rate": 1.235927912257815e-07, "loss": 0.2758, "step": 18139 }, { "epoch": 0.8764555249553075, "grad_norm": 2.4627938270568848, "learning_rate": 1.2354447504469246e-07, "loss": 0.2741, "step": 18140 }, { "epoch": 0.8765038411363966, "grad_norm": 2.5986201763153076, "learning_rate": 1.2349615886360343e-07, "loss": 0.2255, "step": 18141 }, { "epoch": 0.8765521573174856, "grad_norm": 2.4414470195770264, "learning_rate": 1.2344784268251436e-07, "loss": 0.2125, "step": 18142 }, { "epoch": 0.8766004734985746, "grad_norm": 2.5864098072052, "learning_rate": 1.2339952650142533e-07, "loss": 0.3663, "step": 18143 }, { "epoch": 0.8766487896796638, "grad_norm": 4.912001132965088, "learning_rate": 1.2335121032033626e-07, "loss": 0.4261, "step": 18144 }, { "epoch": 0.8766971058607528, "grad_norm": 3.979708433151245, "learning_rate": 1.2330289413924723e-07, "loss": 0.3369, "step": 18145 }, { "epoch": 0.8767454220418418, "grad_norm": 3.0594019889831543, "learning_rate": 1.2325457795815817e-07, "loss": 0.2955, "step": 18146 }, { "epoch": 0.8767937382229308, "grad_norm": 3.058788537979126, "learning_rate": 1.2320626177706913e-07, "loss": 0.3971, "step": 18147 }, { "epoch": 0.8768420544040199, "grad_norm": 1.9122395515441895, "learning_rate": 1.231579455959801e-07, "loss": 0.2143, "step": 18148 }, { "epoch": 0.876890370585109, "grad_norm": 1.6023610830307007, "learning_rate": 1.2310962941489106e-07, "loss": 0.1408, "step": 18149 }, { "epoch": 0.876938686766198, "grad_norm": 2.394815683364868, "learning_rate": 1.23061313233802e-07, "loss": 0.265, "step": 18150 }, { "epoch": 0.876987002947287, "grad_norm": 3.105862617492676, "learning_rate": 1.2301299705271296e-07, "loss": 0.3237, "step": 18151 }, { "epoch": 0.8770353191283761, "grad_norm": 27.060457229614258, "learning_rate": 1.229646808716239e-07, "loss": 0.2194, "step": 18152 }, { "epoch": 0.8770836353094651, "grad_norm": 2.641904592514038, "learning_rate": 1.2291636469053486e-07, "loss": 0.3329, "step": 18153 }, { "epoch": 0.8771319514905542, "grad_norm": 2.391197919845581, "learning_rate": 1.228680485094458e-07, "loss": 0.2385, "step": 18154 }, { "epoch": 0.8771802676716433, "grad_norm": 4.157001972198486, "learning_rate": 1.2281973232835676e-07, "loss": 0.4823, "step": 18155 }, { "epoch": 0.8772285838527323, "grad_norm": 4.509152889251709, "learning_rate": 1.2277141614726772e-07, "loss": 0.2595, "step": 18156 }, { "epoch": 0.8772769000338213, "grad_norm": 1.9922329187393188, "learning_rate": 1.2272309996617866e-07, "loss": 0.1898, "step": 18157 }, { "epoch": 0.8773252162149103, "grad_norm": 3.0924899578094482, "learning_rate": 1.2267478378508962e-07, "loss": 0.2222, "step": 18158 }, { "epoch": 0.8773735323959995, "grad_norm": 2.83896541595459, "learning_rate": 1.2262646760400056e-07, "loss": 0.2661, "step": 18159 }, { "epoch": 0.8774218485770885, "grad_norm": 3.3629379272460938, "learning_rate": 1.2257815142291153e-07, "loss": 0.2808, "step": 18160 }, { "epoch": 0.8774701647581775, "grad_norm": 6.910697937011719, "learning_rate": 1.225298352418225e-07, "loss": 0.4121, "step": 18161 }, { "epoch": 0.8775184809392665, "grad_norm": 3.1448442935943604, "learning_rate": 1.2248151906073343e-07, "loss": 0.3453, "step": 18162 }, { "epoch": 0.8775667971203556, "grad_norm": 3.16752552986145, "learning_rate": 1.224332028796444e-07, "loss": 0.3812, "step": 18163 }, { "epoch": 0.8776151133014447, "grad_norm": 1.8657010793685913, "learning_rate": 1.2238488669855535e-07, "loss": 0.2101, "step": 18164 }, { "epoch": 0.8776634294825337, "grad_norm": 2.6419615745544434, "learning_rate": 1.223365705174663e-07, "loss": 0.3414, "step": 18165 }, { "epoch": 0.8777117456636228, "grad_norm": 2.7963263988494873, "learning_rate": 1.2228825433637726e-07, "loss": 0.3059, "step": 18166 }, { "epoch": 0.8777600618447118, "grad_norm": 2.8581013679504395, "learning_rate": 1.222399381552882e-07, "loss": 0.364, "step": 18167 }, { "epoch": 0.8778083780258008, "grad_norm": 4.067201614379883, "learning_rate": 1.2219162197419916e-07, "loss": 0.3232, "step": 18168 }, { "epoch": 0.8778566942068899, "grad_norm": 2.9017157554626465, "learning_rate": 1.2214330579311012e-07, "loss": 0.2353, "step": 18169 }, { "epoch": 0.877905010387979, "grad_norm": 2.215423107147217, "learning_rate": 1.2209498961202106e-07, "loss": 0.221, "step": 18170 }, { "epoch": 0.877953326569068, "grad_norm": 2.8096258640289307, "learning_rate": 1.2204667343093202e-07, "loss": 0.3437, "step": 18171 }, { "epoch": 0.878001642750157, "grad_norm": 3.515648126602173, "learning_rate": 1.2199835724984296e-07, "loss": 0.3661, "step": 18172 }, { "epoch": 0.878049958931246, "grad_norm": 2.6943790912628174, "learning_rate": 1.2195004106875392e-07, "loss": 0.2775, "step": 18173 }, { "epoch": 0.8780982751123351, "grad_norm": 4.091357707977295, "learning_rate": 1.2190172488766486e-07, "loss": 0.403, "step": 18174 }, { "epoch": 0.8781465912934242, "grad_norm": 2.040487051010132, "learning_rate": 1.2185340870657582e-07, "loss": 0.2271, "step": 18175 }, { "epoch": 0.8781949074745132, "grad_norm": 6.493741035461426, "learning_rate": 1.218050925254868e-07, "loss": 0.1888, "step": 18176 }, { "epoch": 0.8782432236556023, "grad_norm": 4.2216315269470215, "learning_rate": 1.2175677634439775e-07, "loss": 0.3276, "step": 18177 }, { "epoch": 0.8782915398366913, "grad_norm": 2.4128193855285645, "learning_rate": 1.217084601633087e-07, "loss": 0.2609, "step": 18178 }, { "epoch": 0.8783398560177803, "grad_norm": 2.7497739791870117, "learning_rate": 1.2166014398221965e-07, "loss": 0.322, "step": 18179 }, { "epoch": 0.8783881721988694, "grad_norm": 5.764045715332031, "learning_rate": 1.216118278011306e-07, "loss": 0.3706, "step": 18180 }, { "epoch": 0.8784364883799585, "grad_norm": 2.495400905609131, "learning_rate": 1.2156351162004153e-07, "loss": 0.3113, "step": 18181 }, { "epoch": 0.8784848045610475, "grad_norm": 2.2913920879364014, "learning_rate": 1.215151954389525e-07, "loss": 0.28, "step": 18182 }, { "epoch": 0.8785331207421365, "grad_norm": 4.402527332305908, "learning_rate": 1.2146687925786345e-07, "loss": 0.2826, "step": 18183 }, { "epoch": 0.8785814369232255, "grad_norm": 2.782839059829712, "learning_rate": 1.2141856307677442e-07, "loss": 0.3325, "step": 18184 }, { "epoch": 0.8786297531043147, "grad_norm": 3.1402220726013184, "learning_rate": 1.2137024689568535e-07, "loss": 0.3661, "step": 18185 }, { "epoch": 0.8786780692854037, "grad_norm": 1.802984595298767, "learning_rate": 1.2132193071459632e-07, "loss": 0.1932, "step": 18186 }, { "epoch": 0.8787263854664927, "grad_norm": 2.2835569381713867, "learning_rate": 1.2127361453350726e-07, "loss": 0.2941, "step": 18187 }, { "epoch": 0.8787747016475818, "grad_norm": 3.86191987991333, "learning_rate": 1.2122529835241822e-07, "loss": 0.3055, "step": 18188 }, { "epoch": 0.8788230178286708, "grad_norm": 3.5585899353027344, "learning_rate": 1.2117698217132916e-07, "loss": 0.5825, "step": 18189 }, { "epoch": 0.8788713340097599, "grad_norm": 22.586288452148438, "learning_rate": 1.2112866599024012e-07, "loss": 0.359, "step": 18190 }, { "epoch": 0.878919650190849, "grad_norm": 7.624368190765381, "learning_rate": 1.2108034980915108e-07, "loss": 0.3439, "step": 18191 }, { "epoch": 0.878967966371938, "grad_norm": 2.00602650642395, "learning_rate": 1.2103203362806205e-07, "loss": 0.1969, "step": 18192 }, { "epoch": 0.879016282553027, "grad_norm": 3.8095993995666504, "learning_rate": 1.2098371744697299e-07, "loss": 0.2584, "step": 18193 }, { "epoch": 0.879064598734116, "grad_norm": 2.5372633934020996, "learning_rate": 1.2093540126588392e-07, "loss": 0.2303, "step": 18194 }, { "epoch": 0.8791129149152052, "grad_norm": 3.326547622680664, "learning_rate": 1.2088708508479489e-07, "loss": 0.2654, "step": 18195 }, { "epoch": 0.8791612310962942, "grad_norm": 2.4701006412506104, "learning_rate": 1.2083876890370585e-07, "loss": 0.3329, "step": 18196 }, { "epoch": 0.8792095472773832, "grad_norm": 2.5507774353027344, "learning_rate": 1.207904527226168e-07, "loss": 0.2963, "step": 18197 }, { "epoch": 0.8792578634584722, "grad_norm": 3.0748515129089355, "learning_rate": 1.2074213654152775e-07, "loss": 0.4358, "step": 18198 }, { "epoch": 0.8793061796395613, "grad_norm": 2.7677011489868164, "learning_rate": 1.2069382036043872e-07, "loss": 0.3796, "step": 18199 }, { "epoch": 0.8793544958206503, "grad_norm": 2.772397518157959, "learning_rate": 1.2064550417934965e-07, "loss": 0.2494, "step": 18200 }, { "epoch": 0.8794028120017394, "grad_norm": 4.948770999908447, "learning_rate": 1.2059718799826062e-07, "loss": 0.4844, "step": 18201 }, { "epoch": 0.8794511281828284, "grad_norm": 1.5072373151779175, "learning_rate": 1.2054887181717155e-07, "loss": 0.201, "step": 18202 }, { "epoch": 0.8794994443639175, "grad_norm": 3.141012191772461, "learning_rate": 1.2050055563608252e-07, "loss": 0.2969, "step": 18203 }, { "epoch": 0.8795477605450065, "grad_norm": 2.5201799869537354, "learning_rate": 1.2045223945499348e-07, "loss": 0.3577, "step": 18204 }, { "epoch": 0.8795960767260955, "grad_norm": 2.3694913387298584, "learning_rate": 1.2040392327390444e-07, "loss": 0.3186, "step": 18205 }, { "epoch": 0.8796443929071847, "grad_norm": 3.149667739868164, "learning_rate": 1.2035560709281538e-07, "loss": 0.3325, "step": 18206 }, { "epoch": 0.8796927090882737, "grad_norm": 2.2976322174072266, "learning_rate": 1.2030729091172632e-07, "loss": 0.2773, "step": 18207 }, { "epoch": 0.8797410252693627, "grad_norm": 4.192543983459473, "learning_rate": 1.2025897473063728e-07, "loss": 0.2034, "step": 18208 }, { "epoch": 0.8797893414504517, "grad_norm": 6.178609371185303, "learning_rate": 1.2021065854954822e-07, "loss": 0.3275, "step": 18209 }, { "epoch": 0.8798376576315408, "grad_norm": 3.1904187202453613, "learning_rate": 1.2016234236845918e-07, "loss": 0.2627, "step": 18210 }, { "epoch": 0.8798859738126299, "grad_norm": 2.056358575820923, "learning_rate": 1.2011402618737015e-07, "loss": 0.2305, "step": 18211 }, { "epoch": 0.8799342899937189, "grad_norm": 3.2125673294067383, "learning_rate": 1.200657100062811e-07, "loss": 0.3902, "step": 18212 }, { "epoch": 0.879982606174808, "grad_norm": 3.7079880237579346, "learning_rate": 1.2001739382519205e-07, "loss": 0.2515, "step": 18213 }, { "epoch": 0.880030922355897, "grad_norm": 2.483551263809204, "learning_rate": 1.19969077644103e-07, "loss": 0.2169, "step": 18214 }, { "epoch": 0.880079238536986, "grad_norm": 2.3620545864105225, "learning_rate": 1.1992076146301395e-07, "loss": 0.2389, "step": 18215 }, { "epoch": 0.8801275547180751, "grad_norm": 3.9832077026367188, "learning_rate": 1.1987244528192491e-07, "loss": 0.3169, "step": 18216 }, { "epoch": 0.8801758708991642, "grad_norm": 3.8872952461242676, "learning_rate": 1.1982412910083585e-07, "loss": 0.3864, "step": 18217 }, { "epoch": 0.8802241870802532, "grad_norm": 2.230262041091919, "learning_rate": 1.1977581291974681e-07, "loss": 0.196, "step": 18218 }, { "epoch": 0.8802725032613422, "grad_norm": 2.5391685962677, "learning_rate": 1.1972749673865778e-07, "loss": 0.3562, "step": 18219 }, { "epoch": 0.8803208194424312, "grad_norm": 3.4557390213012695, "learning_rate": 1.1967918055756872e-07, "loss": 0.2322, "step": 18220 }, { "epoch": 0.8803691356235204, "grad_norm": 4.097918510437012, "learning_rate": 1.1963086437647968e-07, "loss": 0.3199, "step": 18221 }, { "epoch": 0.8804174518046094, "grad_norm": 2.344684362411499, "learning_rate": 1.1958254819539062e-07, "loss": 0.2663, "step": 18222 }, { "epoch": 0.8804657679856984, "grad_norm": 3.527566432952881, "learning_rate": 1.1953423201430158e-07, "loss": 0.2068, "step": 18223 }, { "epoch": 0.8805140841667874, "grad_norm": 2.6729233264923096, "learning_rate": 1.1948591583321254e-07, "loss": 0.3007, "step": 18224 }, { "epoch": 0.8805624003478765, "grad_norm": 4.731503486633301, "learning_rate": 1.1943759965212348e-07, "loss": 0.2375, "step": 18225 }, { "epoch": 0.8806107165289655, "grad_norm": 2.931440591812134, "learning_rate": 1.1938928347103445e-07, "loss": 0.3534, "step": 18226 }, { "epoch": 0.8806590327100546, "grad_norm": 4.113996505737305, "learning_rate": 1.193409672899454e-07, "loss": 0.4897, "step": 18227 }, { "epoch": 0.8807073488911437, "grad_norm": 2.0514376163482666, "learning_rate": 1.1929265110885635e-07, "loss": 0.2054, "step": 18228 }, { "epoch": 0.8807556650722327, "grad_norm": 2.1424288749694824, "learning_rate": 1.192443349277673e-07, "loss": 0.2417, "step": 18229 }, { "epoch": 0.8808039812533217, "grad_norm": 2.6273610591888428, "learning_rate": 1.1919601874667826e-07, "loss": 0.3209, "step": 18230 }, { "epoch": 0.8808522974344107, "grad_norm": 2.334749698638916, "learning_rate": 1.1914770256558921e-07, "loss": 0.22, "step": 18231 }, { "epoch": 0.8809006136154999, "grad_norm": 2.7407801151275635, "learning_rate": 1.1909938638450016e-07, "loss": 0.3246, "step": 18232 }, { "epoch": 0.8809489297965889, "grad_norm": 2.4830777645111084, "learning_rate": 1.1905107020341111e-07, "loss": 0.2658, "step": 18233 }, { "epoch": 0.8809972459776779, "grad_norm": 3.204216480255127, "learning_rate": 1.1900275402232208e-07, "loss": 0.2656, "step": 18234 }, { "epoch": 0.881045562158767, "grad_norm": 2.0511891841888428, "learning_rate": 1.1895443784123301e-07, "loss": 0.2064, "step": 18235 }, { "epoch": 0.881093878339856, "grad_norm": 2.33805251121521, "learning_rate": 1.1890612166014398e-07, "loss": 0.2918, "step": 18236 }, { "epoch": 0.8811421945209451, "grad_norm": 2.248464345932007, "learning_rate": 1.1885780547905493e-07, "loss": 0.1973, "step": 18237 }, { "epoch": 0.8811905107020341, "grad_norm": 3.4285058975219727, "learning_rate": 1.1880948929796589e-07, "loss": 0.3262, "step": 18238 }, { "epoch": 0.8812388268831232, "grad_norm": 2.0820677280426025, "learning_rate": 1.1876117311687683e-07, "loss": 0.2185, "step": 18239 }, { "epoch": 0.8812871430642122, "grad_norm": 2.6720635890960693, "learning_rate": 1.1871285693578779e-07, "loss": 0.2868, "step": 18240 }, { "epoch": 0.8813354592453012, "grad_norm": 10.395806312561035, "learning_rate": 1.1866454075469874e-07, "loss": 0.3193, "step": 18241 }, { "epoch": 0.8813837754263903, "grad_norm": 2.740523338317871, "learning_rate": 1.186162245736097e-07, "loss": 0.3799, "step": 18242 }, { "epoch": 0.8814320916074794, "grad_norm": 2.016744375228882, "learning_rate": 1.1856790839252064e-07, "loss": 0.1833, "step": 18243 }, { "epoch": 0.8814804077885684, "grad_norm": 3.3761038780212402, "learning_rate": 1.1851959221143161e-07, "loss": 0.4876, "step": 18244 }, { "epoch": 0.8815287239696574, "grad_norm": 2.1337108612060547, "learning_rate": 1.1847127603034256e-07, "loss": 0.2569, "step": 18245 }, { "epoch": 0.8815770401507464, "grad_norm": 2.5599253177642822, "learning_rate": 1.1842295984925351e-07, "loss": 0.314, "step": 18246 }, { "epoch": 0.8816253563318356, "grad_norm": 2.049319267272949, "learning_rate": 1.1837464366816446e-07, "loss": 0.2413, "step": 18247 }, { "epoch": 0.8816736725129246, "grad_norm": 4.430449485778809, "learning_rate": 1.1832632748707541e-07, "loss": 0.3033, "step": 18248 }, { "epoch": 0.8817219886940136, "grad_norm": 3.0152199268341064, "learning_rate": 1.1827801130598637e-07, "loss": 0.3087, "step": 18249 }, { "epoch": 0.8817703048751027, "grad_norm": 2.384470224380493, "learning_rate": 1.1822969512489732e-07, "loss": 0.2592, "step": 18250 }, { "epoch": 0.8818186210561917, "grad_norm": 2.7647364139556885, "learning_rate": 1.1818137894380827e-07, "loss": 0.2616, "step": 18251 }, { "epoch": 0.8818669372372807, "grad_norm": 2.419844150543213, "learning_rate": 1.1813306276271922e-07, "loss": 0.2031, "step": 18252 }, { "epoch": 0.8819152534183698, "grad_norm": 7.216305732727051, "learning_rate": 1.1808474658163019e-07, "loss": 0.2215, "step": 18253 }, { "epoch": 0.8819635695994589, "grad_norm": 104.66384887695312, "learning_rate": 1.1803643040054114e-07, "loss": 0.4815, "step": 18254 }, { "epoch": 0.8820118857805479, "grad_norm": 3.860734701156616, "learning_rate": 1.1798811421945209e-07, "loss": 0.3111, "step": 18255 }, { "epoch": 0.8820602019616369, "grad_norm": 1.6547988653182983, "learning_rate": 1.1793979803836304e-07, "loss": 0.2028, "step": 18256 }, { "epoch": 0.882108518142726, "grad_norm": 3.2442400455474854, "learning_rate": 1.1789148185727399e-07, "loss": 0.3699, "step": 18257 }, { "epoch": 0.8821568343238151, "grad_norm": 2.2192420959472656, "learning_rate": 1.1784316567618495e-07, "loss": 0.245, "step": 18258 }, { "epoch": 0.8822051505049041, "grad_norm": 2.3108742237091064, "learning_rate": 1.1779484949509589e-07, "loss": 0.2522, "step": 18259 }, { "epoch": 0.8822534666859931, "grad_norm": 1.8441931009292603, "learning_rate": 1.1774653331400686e-07, "loss": 0.2173, "step": 18260 }, { "epoch": 0.8823017828670822, "grad_norm": 3.554415225982666, "learning_rate": 1.176982171329178e-07, "loss": 0.407, "step": 18261 }, { "epoch": 0.8823500990481712, "grad_norm": 2.728259563446045, "learning_rate": 1.1764990095182877e-07, "loss": 0.3934, "step": 18262 }, { "epoch": 0.8823984152292603, "grad_norm": 2.905658721923828, "learning_rate": 1.176015847707397e-07, "loss": 0.2178, "step": 18263 }, { "epoch": 0.8824467314103493, "grad_norm": 2.208369731903076, "learning_rate": 1.1755326858965067e-07, "loss": 0.2928, "step": 18264 }, { "epoch": 0.8824950475914384, "grad_norm": 1.8889024257659912, "learning_rate": 1.1750495240856162e-07, "loss": 0.1772, "step": 18265 }, { "epoch": 0.8825433637725274, "grad_norm": 3.00119686126709, "learning_rate": 1.1745663622747258e-07, "loss": 0.4156, "step": 18266 }, { "epoch": 0.8825916799536164, "grad_norm": 2.2160744667053223, "learning_rate": 1.1740832004638352e-07, "loss": 0.2457, "step": 18267 }, { "epoch": 0.8826399961347056, "grad_norm": 2.9288573265075684, "learning_rate": 1.1736000386529449e-07, "loss": 0.1879, "step": 18268 }, { "epoch": 0.8826883123157946, "grad_norm": 2.43806791305542, "learning_rate": 1.1731168768420544e-07, "loss": 0.258, "step": 18269 }, { "epoch": 0.8827366284968836, "grad_norm": 5.2289204597473145, "learning_rate": 1.1726337150311639e-07, "loss": 0.3273, "step": 18270 }, { "epoch": 0.8827849446779726, "grad_norm": 2.039843797683716, "learning_rate": 1.1721505532202734e-07, "loss": 0.2388, "step": 18271 }, { "epoch": 0.8828332608590617, "grad_norm": 1.582183837890625, "learning_rate": 1.1716673914093829e-07, "loss": 0.174, "step": 18272 }, { "epoch": 0.8828815770401508, "grad_norm": 2.8307979106903076, "learning_rate": 1.1711842295984925e-07, "loss": 0.3237, "step": 18273 }, { "epoch": 0.8829298932212398, "grad_norm": 2.3810060024261475, "learning_rate": 1.170701067787602e-07, "loss": 0.2697, "step": 18274 }, { "epoch": 0.8829782094023289, "grad_norm": 3.4285030364990234, "learning_rate": 1.1702179059767115e-07, "loss": 0.3798, "step": 18275 }, { "epoch": 0.8830265255834179, "grad_norm": 3.9788267612457275, "learning_rate": 1.169734744165821e-07, "loss": 0.4602, "step": 18276 }, { "epoch": 0.8830748417645069, "grad_norm": 3.8521978855133057, "learning_rate": 1.1692515823549307e-07, "loss": 0.3224, "step": 18277 }, { "epoch": 0.8831231579455959, "grad_norm": 3.3004884719848633, "learning_rate": 1.1687684205440402e-07, "loss": 0.347, "step": 18278 }, { "epoch": 0.8831714741266851, "grad_norm": 2.4577417373657227, "learning_rate": 1.1682852587331497e-07, "loss": 0.2567, "step": 18279 }, { "epoch": 0.8832197903077741, "grad_norm": 2.894890785217285, "learning_rate": 1.1678020969222592e-07, "loss": 0.3448, "step": 18280 }, { "epoch": 0.8832681064888631, "grad_norm": 1.6270898580551147, "learning_rate": 1.1673189351113688e-07, "loss": 0.1398, "step": 18281 }, { "epoch": 0.8833164226699521, "grad_norm": 3.4007370471954346, "learning_rate": 1.1668357733004783e-07, "loss": 0.357, "step": 18282 }, { "epoch": 0.8833647388510412, "grad_norm": 1.6617425680160522, "learning_rate": 1.1663526114895877e-07, "loss": 0.1927, "step": 18283 }, { "epoch": 0.8834130550321303, "grad_norm": 16.510501861572266, "learning_rate": 1.1658694496786973e-07, "loss": 0.31, "step": 18284 }, { "epoch": 0.8834613712132193, "grad_norm": 1.3632283210754395, "learning_rate": 1.1653862878678068e-07, "loss": 0.154, "step": 18285 }, { "epoch": 0.8835096873943084, "grad_norm": 5.4606218338012695, "learning_rate": 1.1649031260569165e-07, "loss": 0.3256, "step": 18286 }, { "epoch": 0.8835580035753974, "grad_norm": 3.1103405952453613, "learning_rate": 1.1644199642460259e-07, "loss": 0.2677, "step": 18287 }, { "epoch": 0.8836063197564864, "grad_norm": 2.6328887939453125, "learning_rate": 1.1639368024351355e-07, "loss": 0.2964, "step": 18288 }, { "epoch": 0.8836546359375755, "grad_norm": 3.637897253036499, "learning_rate": 1.163453640624245e-07, "loss": 0.4202, "step": 18289 }, { "epoch": 0.8837029521186646, "grad_norm": 2.5993010997772217, "learning_rate": 1.1629704788133546e-07, "loss": 0.3934, "step": 18290 }, { "epoch": 0.8837512682997536, "grad_norm": 5.2101287841796875, "learning_rate": 1.162487317002464e-07, "loss": 0.3156, "step": 18291 }, { "epoch": 0.8837995844808426, "grad_norm": 3.0332674980163574, "learning_rate": 1.1620041551915736e-07, "loss": 0.3141, "step": 18292 }, { "epoch": 0.8838479006619316, "grad_norm": 2.9130866527557373, "learning_rate": 1.1615209933806831e-07, "loss": 0.2566, "step": 18293 }, { "epoch": 0.8838962168430208, "grad_norm": 3.319368839263916, "learning_rate": 1.1610378315697928e-07, "loss": 0.2407, "step": 18294 }, { "epoch": 0.8839445330241098, "grad_norm": 1.9707306623458862, "learning_rate": 1.1605546697589022e-07, "loss": 0.2138, "step": 18295 }, { "epoch": 0.8839928492051988, "grad_norm": 3.1967225074768066, "learning_rate": 1.1600715079480117e-07, "loss": 0.1308, "step": 18296 }, { "epoch": 0.8840411653862879, "grad_norm": 2.479849100112915, "learning_rate": 1.1595883461371213e-07, "loss": 0.2831, "step": 18297 }, { "epoch": 0.8840894815673769, "grad_norm": 2.0263330936431885, "learning_rate": 1.1591051843262308e-07, "loss": 0.2275, "step": 18298 }, { "epoch": 0.884137797748466, "grad_norm": 4.877132892608643, "learning_rate": 1.1586220225153403e-07, "loss": 0.3095, "step": 18299 }, { "epoch": 0.884186113929555, "grad_norm": 3.0941829681396484, "learning_rate": 1.1581388607044498e-07, "loss": 0.284, "step": 18300 }, { "epoch": 0.8842344301106441, "grad_norm": 2.3991546630859375, "learning_rate": 1.1576556988935595e-07, "loss": 0.3122, "step": 18301 }, { "epoch": 0.8842827462917331, "grad_norm": 5.957760810852051, "learning_rate": 1.157172537082669e-07, "loss": 0.334, "step": 18302 }, { "epoch": 0.8843310624728221, "grad_norm": 2.9836668968200684, "learning_rate": 1.1566893752717785e-07, "loss": 0.3272, "step": 18303 }, { "epoch": 0.8843793786539111, "grad_norm": 2.454005718231201, "learning_rate": 1.156206213460888e-07, "loss": 0.2195, "step": 18304 }, { "epoch": 0.8844276948350003, "grad_norm": 23.89756965637207, "learning_rate": 1.1557230516499976e-07, "loss": 0.3907, "step": 18305 }, { "epoch": 0.8844760110160893, "grad_norm": 12.493182182312012, "learning_rate": 1.1552398898391071e-07, "loss": 0.2385, "step": 18306 }, { "epoch": 0.8845243271971783, "grad_norm": 3.2426509857177734, "learning_rate": 1.1547567280282166e-07, "loss": 0.2453, "step": 18307 }, { "epoch": 0.8845726433782674, "grad_norm": 2.3998188972473145, "learning_rate": 1.1542735662173261e-07, "loss": 0.2232, "step": 18308 }, { "epoch": 0.8846209595593564, "grad_norm": 3.3501834869384766, "learning_rate": 1.1537904044064356e-07, "loss": 0.2583, "step": 18309 }, { "epoch": 0.8846692757404455, "grad_norm": 2.3493621349334717, "learning_rate": 1.1533072425955453e-07, "loss": 0.3193, "step": 18310 }, { "epoch": 0.8847175919215345, "grad_norm": 2.439605712890625, "learning_rate": 1.1528240807846546e-07, "loss": 0.2761, "step": 18311 }, { "epoch": 0.8847659081026236, "grad_norm": 4.80808162689209, "learning_rate": 1.1523409189737643e-07, "loss": 0.3536, "step": 18312 }, { "epoch": 0.8848142242837126, "grad_norm": 2.2739813327789307, "learning_rate": 1.1518577571628738e-07, "loss": 0.2785, "step": 18313 }, { "epoch": 0.8848625404648016, "grad_norm": 1.9190764427185059, "learning_rate": 1.1513745953519834e-07, "loss": 0.1823, "step": 18314 }, { "epoch": 0.8849108566458908, "grad_norm": 3.637232780456543, "learning_rate": 1.1508914335410928e-07, "loss": 0.3508, "step": 18315 }, { "epoch": 0.8849591728269798, "grad_norm": 3.0386886596679688, "learning_rate": 1.1504082717302024e-07, "loss": 0.2919, "step": 18316 }, { "epoch": 0.8850074890080688, "grad_norm": 3.1774275302886963, "learning_rate": 1.1499251099193119e-07, "loss": 0.1368, "step": 18317 }, { "epoch": 0.8850558051891578, "grad_norm": 2.7929961681365967, "learning_rate": 1.1494419481084216e-07, "loss": 0.3297, "step": 18318 }, { "epoch": 0.8851041213702469, "grad_norm": 8.690749168395996, "learning_rate": 1.148958786297531e-07, "loss": 0.4469, "step": 18319 }, { "epoch": 0.885152437551336, "grad_norm": 6.925000190734863, "learning_rate": 1.1484756244866406e-07, "loss": 0.3323, "step": 18320 }, { "epoch": 0.885200753732425, "grad_norm": 1.9232819080352783, "learning_rate": 1.1479924626757501e-07, "loss": 0.2228, "step": 18321 }, { "epoch": 0.885249069913514, "grad_norm": 2.0566000938415527, "learning_rate": 1.1475093008648596e-07, "loss": 0.2401, "step": 18322 }, { "epoch": 0.8852973860946031, "grad_norm": 2.8699488639831543, "learning_rate": 1.1470261390539691e-07, "loss": 0.3637, "step": 18323 }, { "epoch": 0.8853457022756921, "grad_norm": 2.30146861076355, "learning_rate": 1.1465429772430786e-07, "loss": 0.1984, "step": 18324 }, { "epoch": 0.8853940184567812, "grad_norm": 3.174494981765747, "learning_rate": 1.1460598154321882e-07, "loss": 0.3424, "step": 18325 }, { "epoch": 0.8854423346378703, "grad_norm": 2.3157548904418945, "learning_rate": 1.1455766536212977e-07, "loss": 0.303, "step": 18326 }, { "epoch": 0.8854906508189593, "grad_norm": 1.7882781028747559, "learning_rate": 1.1450934918104072e-07, "loss": 0.1841, "step": 18327 }, { "epoch": 0.8855389670000483, "grad_norm": 2.7563412189483643, "learning_rate": 1.1446103299995168e-07, "loss": 0.2825, "step": 18328 }, { "epoch": 0.8855872831811373, "grad_norm": 2.979754686355591, "learning_rate": 1.1441271681886264e-07, "loss": 0.275, "step": 18329 }, { "epoch": 0.8856355993622264, "grad_norm": 2.460162878036499, "learning_rate": 1.1436440063777359e-07, "loss": 0.3048, "step": 18330 }, { "epoch": 0.8856839155433155, "grad_norm": 2.1735548973083496, "learning_rate": 1.1431608445668454e-07, "loss": 0.2482, "step": 18331 }, { "epoch": 0.8857322317244045, "grad_norm": 4.50866174697876, "learning_rate": 1.1426776827559549e-07, "loss": 0.2627, "step": 18332 }, { "epoch": 0.8857805479054935, "grad_norm": 2.2215542793273926, "learning_rate": 1.1421945209450644e-07, "loss": 0.2678, "step": 18333 }, { "epoch": 0.8858288640865826, "grad_norm": 2.897585153579712, "learning_rate": 1.141711359134174e-07, "loss": 0.2912, "step": 18334 }, { "epoch": 0.8858771802676716, "grad_norm": 2.614724636077881, "learning_rate": 1.1412281973232834e-07, "loss": 0.2419, "step": 18335 }, { "epoch": 0.8859254964487607, "grad_norm": 1.764196753501892, "learning_rate": 1.140745035512393e-07, "loss": 0.1705, "step": 18336 }, { "epoch": 0.8859738126298498, "grad_norm": 2.2502012252807617, "learning_rate": 1.1402618737015026e-07, "loss": 0.177, "step": 18337 }, { "epoch": 0.8860221288109388, "grad_norm": 3.170424222946167, "learning_rate": 1.1397787118906122e-07, "loss": 0.305, "step": 18338 }, { "epoch": 0.8860704449920278, "grad_norm": 3.072606086730957, "learning_rate": 1.1392955500797216e-07, "loss": 0.2599, "step": 18339 }, { "epoch": 0.8861187611731168, "grad_norm": 4.246242046356201, "learning_rate": 1.1388123882688312e-07, "loss": 0.3687, "step": 18340 }, { "epoch": 0.886167077354206, "grad_norm": 3.757277727127075, "learning_rate": 1.1383292264579407e-07, "loss": 0.2479, "step": 18341 }, { "epoch": 0.886215393535295, "grad_norm": 3.7426974773406982, "learning_rate": 1.1378460646470504e-07, "loss": 0.2587, "step": 18342 }, { "epoch": 0.886263709716384, "grad_norm": 2.496541738510132, "learning_rate": 1.1373629028361597e-07, "loss": 0.1989, "step": 18343 }, { "epoch": 0.886312025897473, "grad_norm": 4.210475444793701, "learning_rate": 1.1368797410252694e-07, "loss": 0.2813, "step": 18344 }, { "epoch": 0.8863603420785621, "grad_norm": 2.5979292392730713, "learning_rate": 1.1363965792143789e-07, "loss": 0.3704, "step": 18345 }, { "epoch": 0.8864086582596512, "grad_norm": 3.139280080795288, "learning_rate": 1.1359134174034884e-07, "loss": 0.2034, "step": 18346 }, { "epoch": 0.8864569744407402, "grad_norm": 4.151144504547119, "learning_rate": 1.1354302555925979e-07, "loss": 0.3483, "step": 18347 }, { "epoch": 0.8865052906218293, "grad_norm": 2.754220485687256, "learning_rate": 1.1349470937817074e-07, "loss": 0.2684, "step": 18348 }, { "epoch": 0.8865536068029183, "grad_norm": 2.776724338531494, "learning_rate": 1.134463931970817e-07, "loss": 0.4283, "step": 18349 }, { "epoch": 0.8866019229840073, "grad_norm": 3.1799709796905518, "learning_rate": 1.1339807701599265e-07, "loss": 0.3679, "step": 18350 }, { "epoch": 0.8866502391650964, "grad_norm": 1.9817348718643188, "learning_rate": 1.133497608349036e-07, "loss": 0.1815, "step": 18351 }, { "epoch": 0.8866985553461855, "grad_norm": 3.285304069519043, "learning_rate": 1.1330144465381455e-07, "loss": 0.369, "step": 18352 }, { "epoch": 0.8867468715272745, "grad_norm": 2.683840751647949, "learning_rate": 1.1325312847272552e-07, "loss": 0.3044, "step": 18353 }, { "epoch": 0.8867951877083635, "grad_norm": 2.9365074634552, "learning_rate": 1.1320481229163647e-07, "loss": 0.3239, "step": 18354 }, { "epoch": 0.8868435038894525, "grad_norm": 2.701082468032837, "learning_rate": 1.1315649611054742e-07, "loss": 0.404, "step": 18355 }, { "epoch": 0.8868918200705416, "grad_norm": 3.5742275714874268, "learning_rate": 1.1310817992945837e-07, "loss": 0.4669, "step": 18356 }, { "epoch": 0.8869401362516307, "grad_norm": 2.692838668823242, "learning_rate": 1.1305986374836933e-07, "loss": 0.2892, "step": 18357 }, { "epoch": 0.8869884524327197, "grad_norm": 2.75278377532959, "learning_rate": 1.1301154756728028e-07, "loss": 0.1778, "step": 18358 }, { "epoch": 0.8870367686138088, "grad_norm": 2.3038928508758545, "learning_rate": 1.1296323138619122e-07, "loss": 0.1425, "step": 18359 }, { "epoch": 0.8870850847948978, "grad_norm": 141.4427947998047, "learning_rate": 1.1291491520510218e-07, "loss": 0.2286, "step": 18360 }, { "epoch": 0.8871334009759868, "grad_norm": 5.543612480163574, "learning_rate": 1.1286659902401313e-07, "loss": 0.2544, "step": 18361 }, { "epoch": 0.8871817171570759, "grad_norm": 3.2831547260284424, "learning_rate": 1.128182828429241e-07, "loss": 0.3262, "step": 18362 }, { "epoch": 0.887230033338165, "grad_norm": 3.926147937774658, "learning_rate": 1.1276996666183504e-07, "loss": 0.4897, "step": 18363 }, { "epoch": 0.887278349519254, "grad_norm": 3.888887643814087, "learning_rate": 1.12721650480746e-07, "loss": 0.3515, "step": 18364 }, { "epoch": 0.887326665700343, "grad_norm": 2.898705244064331, "learning_rate": 1.1267333429965695e-07, "loss": 0.2372, "step": 18365 }, { "epoch": 0.887374981881432, "grad_norm": 2.0469961166381836, "learning_rate": 1.1262501811856791e-07, "loss": 0.2467, "step": 18366 }, { "epoch": 0.8874232980625212, "grad_norm": 19.020265579223633, "learning_rate": 1.1257670193747885e-07, "loss": 0.2766, "step": 18367 }, { "epoch": 0.8874716142436102, "grad_norm": 1.9658199548721313, "learning_rate": 1.1252838575638981e-07, "loss": 0.1531, "step": 18368 }, { "epoch": 0.8875199304246992, "grad_norm": 2.7429358959198, "learning_rate": 1.1248006957530077e-07, "loss": 0.3442, "step": 18369 }, { "epoch": 0.8875682466057883, "grad_norm": 2.4847912788391113, "learning_rate": 1.1243175339421173e-07, "loss": 0.2484, "step": 18370 }, { "epoch": 0.8876165627868773, "grad_norm": 4.551248073577881, "learning_rate": 1.1238343721312267e-07, "loss": 0.3454, "step": 18371 }, { "epoch": 0.8876648789679664, "grad_norm": 2.7989256381988525, "learning_rate": 1.1233512103203362e-07, "loss": 0.2694, "step": 18372 }, { "epoch": 0.8877131951490554, "grad_norm": 2.9983911514282227, "learning_rate": 1.1228680485094458e-07, "loss": 0.2586, "step": 18373 }, { "epoch": 0.8877615113301445, "grad_norm": 2.159891366958618, "learning_rate": 1.1223848866985553e-07, "loss": 0.2124, "step": 18374 }, { "epoch": 0.8878098275112335, "grad_norm": 2.33463454246521, "learning_rate": 1.1219017248876648e-07, "loss": 0.2814, "step": 18375 }, { "epoch": 0.8878581436923225, "grad_norm": 2.5889132022857666, "learning_rate": 1.1214185630767743e-07, "loss": 0.1667, "step": 18376 }, { "epoch": 0.8879064598734117, "grad_norm": 3.729874610900879, "learning_rate": 1.120935401265884e-07, "loss": 0.4226, "step": 18377 }, { "epoch": 0.8879547760545007, "grad_norm": 2.640442371368408, "learning_rate": 1.1204522394549935e-07, "loss": 0.2294, "step": 18378 }, { "epoch": 0.8880030922355897, "grad_norm": 2.8343658447265625, "learning_rate": 1.119969077644103e-07, "loss": 0.4354, "step": 18379 }, { "epoch": 0.8880514084166787, "grad_norm": 2.753039598464966, "learning_rate": 1.1194859158332125e-07, "loss": 0.3006, "step": 18380 }, { "epoch": 0.8880997245977678, "grad_norm": 2.9209699630737305, "learning_rate": 1.1190027540223221e-07, "loss": 0.3043, "step": 18381 }, { "epoch": 0.8881480407788568, "grad_norm": 2.797814130783081, "learning_rate": 1.1185195922114316e-07, "loss": 0.3417, "step": 18382 }, { "epoch": 0.8881963569599459, "grad_norm": 2.1740915775299072, "learning_rate": 1.1180364304005411e-07, "loss": 0.2052, "step": 18383 }, { "epoch": 0.888244673141035, "grad_norm": 3.656921148300171, "learning_rate": 1.1175532685896506e-07, "loss": 0.2182, "step": 18384 }, { "epoch": 0.888292989322124, "grad_norm": 3.3844826221466064, "learning_rate": 1.1170701067787601e-07, "loss": 0.2952, "step": 18385 }, { "epoch": 0.888341305503213, "grad_norm": 6.520571708679199, "learning_rate": 1.1165869449678698e-07, "loss": 0.3425, "step": 18386 }, { "epoch": 0.888389621684302, "grad_norm": 5.550695896148682, "learning_rate": 1.1161037831569791e-07, "loss": 0.4084, "step": 18387 }, { "epoch": 0.8884379378653912, "grad_norm": 2.7677273750305176, "learning_rate": 1.1156206213460888e-07, "loss": 0.3142, "step": 18388 }, { "epoch": 0.8884862540464802, "grad_norm": 2.112295389175415, "learning_rate": 1.1151374595351983e-07, "loss": 0.2434, "step": 18389 }, { "epoch": 0.8885345702275692, "grad_norm": 3.2629852294921875, "learning_rate": 1.1146542977243079e-07, "loss": 0.3264, "step": 18390 }, { "epoch": 0.8885828864086582, "grad_norm": 3.1296184062957764, "learning_rate": 1.1141711359134173e-07, "loss": 0.3529, "step": 18391 }, { "epoch": 0.8886312025897473, "grad_norm": 2.506700038909912, "learning_rate": 1.1136879741025269e-07, "loss": 0.2231, "step": 18392 }, { "epoch": 0.8886795187708364, "grad_norm": 2.822854518890381, "learning_rate": 1.1132048122916364e-07, "loss": 0.2227, "step": 18393 }, { "epoch": 0.8887278349519254, "grad_norm": 2.4383673667907715, "learning_rate": 1.1127216504807461e-07, "loss": 0.2815, "step": 18394 }, { "epoch": 0.8887761511330144, "grad_norm": 3.123535633087158, "learning_rate": 1.1122384886698554e-07, "loss": 0.2727, "step": 18395 }, { "epoch": 0.8888244673141035, "grad_norm": 2.4009478092193604, "learning_rate": 1.111755326858965e-07, "loss": 0.3418, "step": 18396 }, { "epoch": 0.8888727834951925, "grad_norm": 2.1238481998443604, "learning_rate": 1.1112721650480746e-07, "loss": 0.2546, "step": 18397 }, { "epoch": 0.8889210996762816, "grad_norm": 2.8775272369384766, "learning_rate": 1.110789003237184e-07, "loss": 0.2981, "step": 18398 }, { "epoch": 0.8889694158573707, "grad_norm": 2.449294090270996, "learning_rate": 1.1103058414262936e-07, "loss": 0.2987, "step": 18399 }, { "epoch": 0.8890177320384597, "grad_norm": 2.4387238025665283, "learning_rate": 1.1098226796154031e-07, "loss": 0.293, "step": 18400 }, { "epoch": 0.8890660482195487, "grad_norm": 3.3726537227630615, "learning_rate": 1.1093395178045127e-07, "loss": 0.4392, "step": 18401 }, { "epoch": 0.8891143644006377, "grad_norm": 1.9908950328826904, "learning_rate": 1.1088563559936221e-07, "loss": 0.245, "step": 18402 }, { "epoch": 0.8891626805817269, "grad_norm": 3.4645979404449463, "learning_rate": 1.1083731941827318e-07, "loss": 0.3594, "step": 18403 }, { "epoch": 0.8892109967628159, "grad_norm": 2.737217426300049, "learning_rate": 1.1078900323718413e-07, "loss": 0.3498, "step": 18404 }, { "epoch": 0.8892593129439049, "grad_norm": 3.379908561706543, "learning_rate": 1.1074068705609509e-07, "loss": 0.2466, "step": 18405 }, { "epoch": 0.889307629124994, "grad_norm": 2.594106674194336, "learning_rate": 1.1069237087500603e-07, "loss": 0.2612, "step": 18406 }, { "epoch": 0.889355945306083, "grad_norm": 1.9137754440307617, "learning_rate": 1.1064405469391699e-07, "loss": 0.1728, "step": 18407 }, { "epoch": 0.889404261487172, "grad_norm": 3.248884439468384, "learning_rate": 1.1059573851282794e-07, "loss": 0.2839, "step": 18408 }, { "epoch": 0.8894525776682611, "grad_norm": 2.7540969848632812, "learning_rate": 1.1054742233173889e-07, "loss": 0.278, "step": 18409 }, { "epoch": 0.8895008938493502, "grad_norm": 4.05664587020874, "learning_rate": 1.1049910615064984e-07, "loss": 0.3635, "step": 18410 }, { "epoch": 0.8895492100304392, "grad_norm": 6.726964473724365, "learning_rate": 1.1045078996956079e-07, "loss": 0.2111, "step": 18411 }, { "epoch": 0.8895975262115282, "grad_norm": 2.5157268047332764, "learning_rate": 1.1040247378847176e-07, "loss": 0.2705, "step": 18412 }, { "epoch": 0.8896458423926172, "grad_norm": 3.0803141593933105, "learning_rate": 1.1035415760738271e-07, "loss": 0.2829, "step": 18413 }, { "epoch": 0.8896941585737064, "grad_norm": 2.696709156036377, "learning_rate": 1.1030584142629366e-07, "loss": 0.3165, "step": 18414 }, { "epoch": 0.8897424747547954, "grad_norm": 7.680639743804932, "learning_rate": 1.1025752524520461e-07, "loss": 0.2984, "step": 18415 }, { "epoch": 0.8897907909358844, "grad_norm": 2.3189077377319336, "learning_rate": 1.1020920906411557e-07, "loss": 0.2262, "step": 18416 }, { "epoch": 0.8898391071169734, "grad_norm": 3.311309337615967, "learning_rate": 1.1016089288302652e-07, "loss": 0.2369, "step": 18417 }, { "epoch": 0.8898874232980625, "grad_norm": 2.4299376010894775, "learning_rate": 1.1011257670193747e-07, "loss": 0.2989, "step": 18418 }, { "epoch": 0.8899357394791516, "grad_norm": 2.054886817932129, "learning_rate": 1.1006426052084842e-07, "loss": 0.2577, "step": 18419 }, { "epoch": 0.8899840556602406, "grad_norm": 5.982266426086426, "learning_rate": 1.1001594433975939e-07, "loss": 0.2718, "step": 18420 }, { "epoch": 0.8900323718413297, "grad_norm": 5.157499313354492, "learning_rate": 1.0996762815867034e-07, "loss": 0.3339, "step": 18421 }, { "epoch": 0.8900806880224187, "grad_norm": 5.542489528656006, "learning_rate": 1.0991931197758127e-07, "loss": 0.2267, "step": 18422 }, { "epoch": 0.8901290042035077, "grad_norm": 2.717655897140503, "learning_rate": 1.0987099579649224e-07, "loss": 0.2465, "step": 18423 }, { "epoch": 0.8901773203845968, "grad_norm": 1.9211606979370117, "learning_rate": 1.0982267961540319e-07, "loss": 0.2023, "step": 18424 }, { "epoch": 0.8902256365656859, "grad_norm": 2.1679065227508545, "learning_rate": 1.0977436343431415e-07, "loss": 0.286, "step": 18425 }, { "epoch": 0.8902739527467749, "grad_norm": 1.6611987352371216, "learning_rate": 1.0972604725322509e-07, "loss": 0.1337, "step": 18426 }, { "epoch": 0.8903222689278639, "grad_norm": 2.034550905227661, "learning_rate": 1.0967773107213605e-07, "loss": 0.2291, "step": 18427 }, { "epoch": 0.890370585108953, "grad_norm": 3.5954768657684326, "learning_rate": 1.09629414891047e-07, "loss": 0.2329, "step": 18428 }, { "epoch": 0.8904189012900421, "grad_norm": 2.25099515914917, "learning_rate": 1.0958109870995797e-07, "loss": 0.2919, "step": 18429 }, { "epoch": 0.8904672174711311, "grad_norm": 2.9402554035186768, "learning_rate": 1.095327825288689e-07, "loss": 0.3116, "step": 18430 }, { "epoch": 0.8905155336522201, "grad_norm": 2.164194107055664, "learning_rate": 1.0948446634777987e-07, "loss": 0.2605, "step": 18431 }, { "epoch": 0.8905638498333092, "grad_norm": 3.643321990966797, "learning_rate": 1.0943615016669082e-07, "loss": 0.3698, "step": 18432 }, { "epoch": 0.8906121660143982, "grad_norm": 2.935894250869751, "learning_rate": 1.0938783398560178e-07, "loss": 0.3315, "step": 18433 }, { "epoch": 0.8906604821954873, "grad_norm": 2.955862045288086, "learning_rate": 1.0933951780451272e-07, "loss": 0.3336, "step": 18434 }, { "epoch": 0.8907087983765763, "grad_norm": 2.907104730606079, "learning_rate": 1.0929120162342367e-07, "loss": 0.3065, "step": 18435 }, { "epoch": 0.8907571145576654, "grad_norm": 2.412593364715576, "learning_rate": 1.0924288544233463e-07, "loss": 0.211, "step": 18436 }, { "epoch": 0.8908054307387544, "grad_norm": 2.299900770187378, "learning_rate": 1.0919456926124559e-07, "loss": 0.266, "step": 18437 }, { "epoch": 0.8908537469198434, "grad_norm": 31.45005226135254, "learning_rate": 1.0914625308015654e-07, "loss": 0.3514, "step": 18438 }, { "epoch": 0.8909020631009325, "grad_norm": 3.2362184524536133, "learning_rate": 1.0909793689906749e-07, "loss": 0.3555, "step": 18439 }, { "epoch": 0.8909503792820216, "grad_norm": 2.157590389251709, "learning_rate": 1.0904962071797845e-07, "loss": 0.1959, "step": 18440 }, { "epoch": 0.8909986954631106, "grad_norm": 2.2302937507629395, "learning_rate": 1.090013045368894e-07, "loss": 0.1953, "step": 18441 }, { "epoch": 0.8910470116441996, "grad_norm": 2.5069243907928467, "learning_rate": 1.0895298835580035e-07, "loss": 0.3376, "step": 18442 }, { "epoch": 0.8910953278252887, "grad_norm": 3.481581211090088, "learning_rate": 1.089046721747113e-07, "loss": 0.2851, "step": 18443 }, { "epoch": 0.8911436440063777, "grad_norm": 2.1998517513275146, "learning_rate": 1.0885635599362227e-07, "loss": 0.1985, "step": 18444 }, { "epoch": 0.8911919601874668, "grad_norm": 3.8887832164764404, "learning_rate": 1.0880803981253322e-07, "loss": 0.2538, "step": 18445 }, { "epoch": 0.8912402763685559, "grad_norm": 2.432314872741699, "learning_rate": 1.0875972363144417e-07, "loss": 0.2996, "step": 18446 }, { "epoch": 0.8912885925496449, "grad_norm": 10.681747436523438, "learning_rate": 1.0871140745035512e-07, "loss": 0.4151, "step": 18447 }, { "epoch": 0.8913369087307339, "grad_norm": 2.6453003883361816, "learning_rate": 1.0866309126926607e-07, "loss": 0.2625, "step": 18448 }, { "epoch": 0.8913852249118229, "grad_norm": 2.172694206237793, "learning_rate": 1.0861477508817703e-07, "loss": 0.1983, "step": 18449 }, { "epoch": 0.8914335410929121, "grad_norm": 2.964906930923462, "learning_rate": 1.0856645890708797e-07, "loss": 0.2947, "step": 18450 }, { "epoch": 0.8914818572740011, "grad_norm": 3.8956656455993652, "learning_rate": 1.0851814272599893e-07, "loss": 0.4214, "step": 18451 }, { "epoch": 0.8915301734550901, "grad_norm": 1.9036120176315308, "learning_rate": 1.0846982654490988e-07, "loss": 0.2084, "step": 18452 }, { "epoch": 0.8915784896361791, "grad_norm": 1.843198299407959, "learning_rate": 1.0842151036382085e-07, "loss": 0.1978, "step": 18453 }, { "epoch": 0.8916268058172682, "grad_norm": 2.8239681720733643, "learning_rate": 1.0837319418273178e-07, "loss": 0.1775, "step": 18454 }, { "epoch": 0.8916751219983573, "grad_norm": 3.048546552658081, "learning_rate": 1.0832487800164275e-07, "loss": 0.3434, "step": 18455 }, { "epoch": 0.8917234381794463, "grad_norm": 2.9484434127807617, "learning_rate": 1.082765618205537e-07, "loss": 0.2691, "step": 18456 }, { "epoch": 0.8917717543605354, "grad_norm": 2.7576284408569336, "learning_rate": 1.0822824563946466e-07, "loss": 0.2707, "step": 18457 }, { "epoch": 0.8918200705416244, "grad_norm": 4.282049655914307, "learning_rate": 1.081799294583756e-07, "loss": 0.1926, "step": 18458 }, { "epoch": 0.8918683867227134, "grad_norm": 2.3122432231903076, "learning_rate": 1.0813161327728656e-07, "loss": 0.3126, "step": 18459 }, { "epoch": 0.8919167029038025, "grad_norm": 2.4579129219055176, "learning_rate": 1.0808329709619751e-07, "loss": 0.2395, "step": 18460 }, { "epoch": 0.8919650190848916, "grad_norm": 2.45186710357666, "learning_rate": 1.0803498091510846e-07, "loss": 0.2985, "step": 18461 }, { "epoch": 0.8920133352659806, "grad_norm": 3.656665325164795, "learning_rate": 1.0798666473401941e-07, "loss": 0.2407, "step": 18462 }, { "epoch": 0.8920616514470696, "grad_norm": 3.643686056137085, "learning_rate": 1.0793834855293036e-07, "loss": 0.3465, "step": 18463 }, { "epoch": 0.8921099676281586, "grad_norm": 3.7393717765808105, "learning_rate": 1.0789003237184133e-07, "loss": 0.3386, "step": 18464 }, { "epoch": 0.8921582838092477, "grad_norm": 2.6824169158935547, "learning_rate": 1.0784171619075228e-07, "loss": 0.3093, "step": 18465 }, { "epoch": 0.8922065999903368, "grad_norm": 2.136524200439453, "learning_rate": 1.0779340000966323e-07, "loss": 0.2332, "step": 18466 }, { "epoch": 0.8922549161714258, "grad_norm": 2.838526964187622, "learning_rate": 1.0774508382857418e-07, "loss": 0.3338, "step": 18467 }, { "epoch": 0.8923032323525149, "grad_norm": 1.8621799945831299, "learning_rate": 1.0769676764748514e-07, "loss": 0.186, "step": 18468 }, { "epoch": 0.8923515485336039, "grad_norm": 2.1448428630828857, "learning_rate": 1.076484514663961e-07, "loss": 0.2141, "step": 18469 }, { "epoch": 0.8923998647146929, "grad_norm": 3.125807046890259, "learning_rate": 1.0760013528530704e-07, "loss": 0.3467, "step": 18470 }, { "epoch": 0.892448180895782, "grad_norm": 4.377630710601807, "learning_rate": 1.07551819104218e-07, "loss": 0.3557, "step": 18471 }, { "epoch": 0.8924964970768711, "grad_norm": 4.298348426818848, "learning_rate": 1.0750350292312895e-07, "loss": 0.3387, "step": 18472 }, { "epoch": 0.8925448132579601, "grad_norm": 3.1365411281585693, "learning_rate": 1.0745518674203991e-07, "loss": 0.1971, "step": 18473 }, { "epoch": 0.8925931294390491, "grad_norm": 2.422173261642456, "learning_rate": 1.0740687056095085e-07, "loss": 0.2598, "step": 18474 }, { "epoch": 0.8926414456201381, "grad_norm": 3.3654448986053467, "learning_rate": 1.0735855437986181e-07, "loss": 0.3051, "step": 18475 }, { "epoch": 0.8926897618012273, "grad_norm": 2.329472064971924, "learning_rate": 1.0731023819877276e-07, "loss": 0.222, "step": 18476 }, { "epoch": 0.8927380779823163, "grad_norm": 4.244966983795166, "learning_rate": 1.0726192201768372e-07, "loss": 0.233, "step": 18477 }, { "epoch": 0.8927863941634053, "grad_norm": 2.6957361698150635, "learning_rate": 1.0721360583659466e-07, "loss": 0.287, "step": 18478 }, { "epoch": 0.8928347103444944, "grad_norm": 2.317878246307373, "learning_rate": 1.0716528965550563e-07, "loss": 0.2371, "step": 18479 }, { "epoch": 0.8928830265255834, "grad_norm": 2.6292200088500977, "learning_rate": 1.0711697347441658e-07, "loss": 0.3423, "step": 18480 }, { "epoch": 0.8929313427066725, "grad_norm": 1.9604231119155884, "learning_rate": 1.0706865729332754e-07, "loss": 0.1991, "step": 18481 }, { "epoch": 0.8929796588877615, "grad_norm": 2.504636764526367, "learning_rate": 1.0702034111223848e-07, "loss": 0.2301, "step": 18482 }, { "epoch": 0.8930279750688506, "grad_norm": 2.0216712951660156, "learning_rate": 1.0697202493114944e-07, "loss": 0.27, "step": 18483 }, { "epoch": 0.8930762912499396, "grad_norm": 2.2796249389648438, "learning_rate": 1.0692370875006039e-07, "loss": 0.267, "step": 18484 }, { "epoch": 0.8931246074310286, "grad_norm": 2.370161533355713, "learning_rate": 1.0687539256897134e-07, "loss": 0.2755, "step": 18485 }, { "epoch": 0.8931729236121178, "grad_norm": 2.271251916885376, "learning_rate": 1.0682707638788229e-07, "loss": 0.2321, "step": 18486 }, { "epoch": 0.8932212397932068, "grad_norm": 3.2510788440704346, "learning_rate": 1.0677876020679324e-07, "loss": 0.2828, "step": 18487 }, { "epoch": 0.8932695559742958, "grad_norm": 2.293539524078369, "learning_rate": 1.0673044402570421e-07, "loss": 0.2392, "step": 18488 }, { "epoch": 0.8933178721553848, "grad_norm": 1.710170030593872, "learning_rate": 1.0668212784461516e-07, "loss": 0.1531, "step": 18489 }, { "epoch": 0.8933661883364739, "grad_norm": 2.4472432136535645, "learning_rate": 1.0663381166352611e-07, "loss": 0.2701, "step": 18490 }, { "epoch": 0.8934145045175629, "grad_norm": 3.557553768157959, "learning_rate": 1.0658549548243706e-07, "loss": 0.3292, "step": 18491 }, { "epoch": 0.893462820698652, "grad_norm": 2.4920923709869385, "learning_rate": 1.0653717930134802e-07, "loss": 0.2304, "step": 18492 }, { "epoch": 0.893511136879741, "grad_norm": 2.1100313663482666, "learning_rate": 1.0648886312025897e-07, "loss": 0.2331, "step": 18493 }, { "epoch": 0.8935594530608301, "grad_norm": 2.708857536315918, "learning_rate": 1.0644054693916992e-07, "loss": 0.2643, "step": 18494 }, { "epoch": 0.8936077692419191, "grad_norm": 2.528282880783081, "learning_rate": 1.0639223075808087e-07, "loss": 0.3137, "step": 18495 }, { "epoch": 0.8936560854230081, "grad_norm": 2.6430728435516357, "learning_rate": 1.0634391457699184e-07, "loss": 0.2581, "step": 18496 }, { "epoch": 0.8937044016040973, "grad_norm": 9.109210014343262, "learning_rate": 1.0629559839590279e-07, "loss": 0.3168, "step": 18497 }, { "epoch": 0.8937527177851863, "grad_norm": 10.45206069946289, "learning_rate": 1.0624728221481373e-07, "loss": 0.3397, "step": 18498 }, { "epoch": 0.8938010339662753, "grad_norm": 3.2983109951019287, "learning_rate": 1.0619896603372469e-07, "loss": 0.3188, "step": 18499 }, { "epoch": 0.8938493501473643, "grad_norm": 2.646066665649414, "learning_rate": 1.0615064985263564e-07, "loss": 0.3104, "step": 18500 }, { "epoch": 0.8938976663284534, "grad_norm": 3.0573036670684814, "learning_rate": 1.061023336715466e-07, "loss": 0.3958, "step": 18501 }, { "epoch": 0.8939459825095425, "grad_norm": 1.9498941898345947, "learning_rate": 1.0605401749045754e-07, "loss": 0.2389, "step": 18502 }, { "epoch": 0.8939942986906315, "grad_norm": 2.3901398181915283, "learning_rate": 1.060057013093685e-07, "loss": 0.3261, "step": 18503 }, { "epoch": 0.8940426148717205, "grad_norm": 4.732509613037109, "learning_rate": 1.0595738512827945e-07, "loss": 0.266, "step": 18504 }, { "epoch": 0.8940909310528096, "grad_norm": 3.0911896228790283, "learning_rate": 1.0590906894719042e-07, "loss": 0.3136, "step": 18505 }, { "epoch": 0.8941392472338986, "grad_norm": 2.002234697341919, "learning_rate": 1.0586075276610136e-07, "loss": 0.1963, "step": 18506 }, { "epoch": 0.8941875634149877, "grad_norm": 1.9833906888961792, "learning_rate": 1.0581243658501232e-07, "loss": 0.2059, "step": 18507 }, { "epoch": 0.8942358795960768, "grad_norm": 2.6193392276763916, "learning_rate": 1.0576412040392327e-07, "loss": 0.3269, "step": 18508 }, { "epoch": 0.8942841957771658, "grad_norm": 3.074882745742798, "learning_rate": 1.0571580422283423e-07, "loss": 0.3318, "step": 18509 }, { "epoch": 0.8943325119582548, "grad_norm": 2.223376512527466, "learning_rate": 1.0566748804174517e-07, "loss": 0.1984, "step": 18510 }, { "epoch": 0.8943808281393438, "grad_norm": 2.138911008834839, "learning_rate": 1.0561917186065612e-07, "loss": 0.2529, "step": 18511 }, { "epoch": 0.894429144320433, "grad_norm": 2.332679033279419, "learning_rate": 1.0557085567956709e-07, "loss": 0.2372, "step": 18512 }, { "epoch": 0.894477460501522, "grad_norm": 2.4875242710113525, "learning_rate": 1.0552253949847804e-07, "loss": 0.2429, "step": 18513 }, { "epoch": 0.894525776682611, "grad_norm": 3.6896040439605713, "learning_rate": 1.0547422331738899e-07, "loss": 0.4599, "step": 18514 }, { "epoch": 0.8945740928637, "grad_norm": 2.6701104640960693, "learning_rate": 1.0542590713629994e-07, "loss": 0.3703, "step": 18515 }, { "epoch": 0.8946224090447891, "grad_norm": 2.2220582962036133, "learning_rate": 1.053775909552109e-07, "loss": 0.2405, "step": 18516 }, { "epoch": 0.8946707252258781, "grad_norm": 4.328278541564941, "learning_rate": 1.0532927477412185e-07, "loss": 0.2724, "step": 18517 }, { "epoch": 0.8947190414069672, "grad_norm": 3.703421115875244, "learning_rate": 1.052809585930328e-07, "loss": 0.3649, "step": 18518 }, { "epoch": 0.8947673575880563, "grad_norm": 8.130453109741211, "learning_rate": 1.0523264241194375e-07, "loss": 0.3578, "step": 18519 }, { "epoch": 0.8948156737691453, "grad_norm": 3.0195984840393066, "learning_rate": 1.0518432623085472e-07, "loss": 0.2978, "step": 18520 }, { "epoch": 0.8948639899502343, "grad_norm": 2.305548667907715, "learning_rate": 1.0513601004976567e-07, "loss": 0.1472, "step": 18521 }, { "epoch": 0.8949123061313233, "grad_norm": 1.8437305688858032, "learning_rate": 1.0508769386867662e-07, "loss": 0.1911, "step": 18522 }, { "epoch": 0.8949606223124125, "grad_norm": 1.6588349342346191, "learning_rate": 1.0503937768758757e-07, "loss": 0.1744, "step": 18523 }, { "epoch": 0.8950089384935015, "grad_norm": 12.383773803710938, "learning_rate": 1.0499106150649852e-07, "loss": 0.3136, "step": 18524 }, { "epoch": 0.8950572546745905, "grad_norm": 5.255821704864502, "learning_rate": 1.0494274532540948e-07, "loss": 0.3258, "step": 18525 }, { "epoch": 0.8951055708556795, "grad_norm": 2.3224799633026123, "learning_rate": 1.0489442914432042e-07, "loss": 0.2347, "step": 18526 }, { "epoch": 0.8951538870367686, "grad_norm": 1.5152599811553955, "learning_rate": 1.0484611296323138e-07, "loss": 0.1506, "step": 18527 }, { "epoch": 0.8952022032178577, "grad_norm": 6.36182975769043, "learning_rate": 1.0479779678214233e-07, "loss": 0.223, "step": 18528 }, { "epoch": 0.8952505193989467, "grad_norm": 5.066771507263184, "learning_rate": 1.047494806010533e-07, "loss": 0.2743, "step": 18529 }, { "epoch": 0.8952988355800358, "grad_norm": 2.854048728942871, "learning_rate": 1.0470116441996423e-07, "loss": 0.276, "step": 18530 }, { "epoch": 0.8953471517611248, "grad_norm": 3.9921462535858154, "learning_rate": 1.046528482388752e-07, "loss": 0.2766, "step": 18531 }, { "epoch": 0.8953954679422138, "grad_norm": 8.358230590820312, "learning_rate": 1.0460453205778615e-07, "loss": 0.3137, "step": 18532 }, { "epoch": 0.8954437841233029, "grad_norm": 3.1754183769226074, "learning_rate": 1.0455621587669711e-07, "loss": 0.2133, "step": 18533 }, { "epoch": 0.895492100304392, "grad_norm": 2.178497552871704, "learning_rate": 1.0450789969560805e-07, "loss": 0.2655, "step": 18534 }, { "epoch": 0.895540416485481, "grad_norm": 4.358216285705566, "learning_rate": 1.0445958351451901e-07, "loss": 0.2653, "step": 18535 }, { "epoch": 0.89558873266657, "grad_norm": 3.8811264038085938, "learning_rate": 1.0441126733342996e-07, "loss": 0.1816, "step": 18536 }, { "epoch": 0.895637048847659, "grad_norm": 2.645730972290039, "learning_rate": 1.0436295115234091e-07, "loss": 0.3293, "step": 18537 }, { "epoch": 0.8956853650287482, "grad_norm": 2.940169334411621, "learning_rate": 1.0431463497125186e-07, "loss": 0.3562, "step": 18538 }, { "epoch": 0.8957336812098372, "grad_norm": 2.660282850265503, "learning_rate": 1.0426631879016282e-07, "loss": 0.286, "step": 18539 }, { "epoch": 0.8957819973909262, "grad_norm": 2.286120653152466, "learning_rate": 1.0421800260907378e-07, "loss": 0.3141, "step": 18540 }, { "epoch": 0.8958303135720153, "grad_norm": 2.3280820846557617, "learning_rate": 1.0416968642798473e-07, "loss": 0.2703, "step": 18541 }, { "epoch": 0.8958786297531043, "grad_norm": 3.4782216548919678, "learning_rate": 1.0412137024689568e-07, "loss": 0.2375, "step": 18542 }, { "epoch": 0.8959269459341933, "grad_norm": 2.321352958679199, "learning_rate": 1.0407305406580663e-07, "loss": 0.2553, "step": 18543 }, { "epoch": 0.8959752621152824, "grad_norm": 2.7413368225097656, "learning_rate": 1.040247378847176e-07, "loss": 0.2118, "step": 18544 }, { "epoch": 0.8960235782963715, "grad_norm": 14.755990982055664, "learning_rate": 1.0397642170362854e-07, "loss": 0.2704, "step": 18545 }, { "epoch": 0.8960718944774605, "grad_norm": 2.6142423152923584, "learning_rate": 1.039281055225395e-07, "loss": 0.2533, "step": 18546 }, { "epoch": 0.8961202106585495, "grad_norm": 7.697299003601074, "learning_rate": 1.0387978934145045e-07, "loss": 0.4391, "step": 18547 }, { "epoch": 0.8961685268396385, "grad_norm": 2.2028419971466064, "learning_rate": 1.038314731603614e-07, "loss": 0.1418, "step": 18548 }, { "epoch": 0.8962168430207277, "grad_norm": 5.590784072875977, "learning_rate": 1.0378315697927236e-07, "loss": 0.3321, "step": 18549 }, { "epoch": 0.8962651592018167, "grad_norm": 2.0333251953125, "learning_rate": 1.037348407981833e-07, "loss": 0.2316, "step": 18550 }, { "epoch": 0.8963134753829057, "grad_norm": 3.667174816131592, "learning_rate": 1.0368652461709426e-07, "loss": 0.2747, "step": 18551 }, { "epoch": 0.8963617915639948, "grad_norm": 2.695204496383667, "learning_rate": 1.0363820843600521e-07, "loss": 0.2867, "step": 18552 }, { "epoch": 0.8964101077450838, "grad_norm": 2.937626361846924, "learning_rate": 1.0358989225491618e-07, "loss": 0.2643, "step": 18553 }, { "epoch": 0.8964584239261729, "grad_norm": 2.7324182987213135, "learning_rate": 1.0354157607382711e-07, "loss": 0.3399, "step": 18554 }, { "epoch": 0.896506740107262, "grad_norm": 1.9783912897109985, "learning_rate": 1.0349325989273808e-07, "loss": 0.2426, "step": 18555 }, { "epoch": 0.896555056288351, "grad_norm": 2.8713972568511963, "learning_rate": 1.0344494371164903e-07, "loss": 0.2675, "step": 18556 }, { "epoch": 0.89660337246944, "grad_norm": 3.478327512741089, "learning_rate": 1.0339662753055999e-07, "loss": 0.3497, "step": 18557 }, { "epoch": 0.896651688650529, "grad_norm": 3.354473114013672, "learning_rate": 1.0334831134947093e-07, "loss": 0.3117, "step": 18558 }, { "epoch": 0.8967000048316182, "grad_norm": 2.252211809158325, "learning_rate": 1.0329999516838189e-07, "loss": 0.2121, "step": 18559 }, { "epoch": 0.8967483210127072, "grad_norm": 2.3031177520751953, "learning_rate": 1.0325167898729284e-07, "loss": 0.279, "step": 18560 }, { "epoch": 0.8967966371937962, "grad_norm": 2.7747559547424316, "learning_rate": 1.0320336280620379e-07, "loss": 0.2469, "step": 18561 }, { "epoch": 0.8968449533748852, "grad_norm": 2.915616989135742, "learning_rate": 1.0315504662511474e-07, "loss": 0.3687, "step": 18562 }, { "epoch": 0.8968932695559743, "grad_norm": 3.549025058746338, "learning_rate": 1.031067304440257e-07, "loss": 0.2433, "step": 18563 }, { "epoch": 0.8969415857370634, "grad_norm": 3.2215898036956787, "learning_rate": 1.0305841426293666e-07, "loss": 0.245, "step": 18564 }, { "epoch": 0.8969899019181524, "grad_norm": 3.3786513805389404, "learning_rate": 1.0301009808184761e-07, "loss": 0.2506, "step": 18565 }, { "epoch": 0.8970382180992414, "grad_norm": 2.661029577255249, "learning_rate": 1.0296178190075856e-07, "loss": 0.2355, "step": 18566 }, { "epoch": 0.8970865342803305, "grad_norm": 1.850199580192566, "learning_rate": 1.0291346571966951e-07, "loss": 0.2118, "step": 18567 }, { "epoch": 0.8971348504614195, "grad_norm": 2.180478572845459, "learning_rate": 1.0286514953858047e-07, "loss": 0.2769, "step": 18568 }, { "epoch": 0.8971831666425085, "grad_norm": 2.9207282066345215, "learning_rate": 1.0281683335749142e-07, "loss": 0.2981, "step": 18569 }, { "epoch": 0.8972314828235977, "grad_norm": 2.713772773742676, "learning_rate": 1.0276851717640237e-07, "loss": 0.2544, "step": 18570 }, { "epoch": 0.8972797990046867, "grad_norm": 4.733936786651611, "learning_rate": 1.0272020099531332e-07, "loss": 0.3516, "step": 18571 }, { "epoch": 0.8973281151857757, "grad_norm": 3.3264176845550537, "learning_rate": 1.0267188481422429e-07, "loss": 0.3676, "step": 18572 }, { "epoch": 0.8973764313668647, "grad_norm": 3.7294979095458984, "learning_rate": 1.0262356863313524e-07, "loss": 0.2985, "step": 18573 }, { "epoch": 0.8974247475479538, "grad_norm": 2.3916544914245605, "learning_rate": 1.0257525245204618e-07, "loss": 0.3063, "step": 18574 }, { "epoch": 0.8974730637290429, "grad_norm": 3.4165103435516357, "learning_rate": 1.0252693627095714e-07, "loss": 0.3566, "step": 18575 }, { "epoch": 0.8975213799101319, "grad_norm": 7.057642936706543, "learning_rate": 1.0247862008986809e-07, "loss": 0.3646, "step": 18576 }, { "epoch": 0.897569696091221, "grad_norm": 2.2203073501586914, "learning_rate": 1.0243030390877905e-07, "loss": 0.2236, "step": 18577 }, { "epoch": 0.89761801227231, "grad_norm": 2.480928897857666, "learning_rate": 1.0238198772768999e-07, "loss": 0.2415, "step": 18578 }, { "epoch": 0.897666328453399, "grad_norm": 1.6814407110214233, "learning_rate": 1.0233367154660095e-07, "loss": 0.1353, "step": 18579 }, { "epoch": 0.8977146446344881, "grad_norm": 2.798016309738159, "learning_rate": 1.022853553655119e-07, "loss": 0.2795, "step": 18580 }, { "epoch": 0.8977629608155772, "grad_norm": 3.5511648654937744, "learning_rate": 1.0223703918442287e-07, "loss": 0.3093, "step": 18581 }, { "epoch": 0.8978112769966662, "grad_norm": 4.0514678955078125, "learning_rate": 1.021887230033338e-07, "loss": 0.3512, "step": 18582 }, { "epoch": 0.8978595931777552, "grad_norm": 4.851337432861328, "learning_rate": 1.0214040682224477e-07, "loss": 0.282, "step": 18583 }, { "epoch": 0.8979079093588442, "grad_norm": 2.918109178543091, "learning_rate": 1.0209209064115572e-07, "loss": 0.3173, "step": 18584 }, { "epoch": 0.8979562255399334, "grad_norm": 2.7746152877807617, "learning_rate": 1.0204377446006668e-07, "loss": 0.2602, "step": 18585 }, { "epoch": 0.8980045417210224, "grad_norm": 2.6655728816986084, "learning_rate": 1.0199545827897762e-07, "loss": 0.3048, "step": 18586 }, { "epoch": 0.8980528579021114, "grad_norm": 5.196576118469238, "learning_rate": 1.0194714209788857e-07, "loss": 0.1969, "step": 18587 }, { "epoch": 0.8981011740832004, "grad_norm": 3.4286153316497803, "learning_rate": 1.0189882591679954e-07, "loss": 0.3318, "step": 18588 }, { "epoch": 0.8981494902642895, "grad_norm": 3.3240160942077637, "learning_rate": 1.0185050973571049e-07, "loss": 0.4111, "step": 18589 }, { "epoch": 0.8981978064453786, "grad_norm": 3.1537046432495117, "learning_rate": 1.0180219355462144e-07, "loss": 0.2644, "step": 18590 }, { "epoch": 0.8982461226264676, "grad_norm": 7.378652572631836, "learning_rate": 1.0175387737353239e-07, "loss": 0.349, "step": 18591 }, { "epoch": 0.8982944388075567, "grad_norm": 5.967849254608154, "learning_rate": 1.0170556119244335e-07, "loss": 0.2909, "step": 18592 }, { "epoch": 0.8983427549886457, "grad_norm": 11.871134757995605, "learning_rate": 1.016572450113543e-07, "loss": 0.372, "step": 18593 }, { "epoch": 0.8983910711697347, "grad_norm": 2.2308714389801025, "learning_rate": 1.0160892883026525e-07, "loss": 0.2766, "step": 18594 }, { "epoch": 0.8984393873508237, "grad_norm": 2.605586051940918, "learning_rate": 1.015606126491762e-07, "loss": 0.3204, "step": 18595 }, { "epoch": 0.8984877035319129, "grad_norm": 1.5015347003936768, "learning_rate": 1.0151229646808717e-07, "loss": 0.1511, "step": 18596 }, { "epoch": 0.8985360197130019, "grad_norm": 2.2549242973327637, "learning_rate": 1.0146398028699812e-07, "loss": 0.1896, "step": 18597 }, { "epoch": 0.8985843358940909, "grad_norm": 4.605568885803223, "learning_rate": 1.0141566410590907e-07, "loss": 0.3529, "step": 18598 }, { "epoch": 0.89863265207518, "grad_norm": 2.5821800231933594, "learning_rate": 1.0136734792482002e-07, "loss": 0.306, "step": 18599 }, { "epoch": 0.898680968256269, "grad_norm": 4.092065811157227, "learning_rate": 1.0131903174373097e-07, "loss": 0.3421, "step": 18600 }, { "epoch": 0.8987292844373581, "grad_norm": 2.24103045463562, "learning_rate": 1.0127071556264193e-07, "loss": 0.246, "step": 18601 }, { "epoch": 0.8987776006184471, "grad_norm": 7.399600028991699, "learning_rate": 1.0122239938155287e-07, "loss": 0.3297, "step": 18602 }, { "epoch": 0.8988259167995362, "grad_norm": 3.9145913124084473, "learning_rate": 1.0117408320046383e-07, "loss": 0.4213, "step": 18603 }, { "epoch": 0.8988742329806252, "grad_norm": 14.300390243530273, "learning_rate": 1.0112576701937478e-07, "loss": 0.236, "step": 18604 }, { "epoch": 0.8989225491617142, "grad_norm": 2.1384265422821045, "learning_rate": 1.0107745083828575e-07, "loss": 0.199, "step": 18605 }, { "epoch": 0.8989708653428033, "grad_norm": 2.649448871612549, "learning_rate": 1.0102913465719668e-07, "loss": 0.2526, "step": 18606 }, { "epoch": 0.8990191815238924, "grad_norm": 7.870757579803467, "learning_rate": 1.0098081847610765e-07, "loss": 0.2515, "step": 18607 }, { "epoch": 0.8990674977049814, "grad_norm": 3.129939317703247, "learning_rate": 1.009325022950186e-07, "loss": 0.2926, "step": 18608 }, { "epoch": 0.8991158138860704, "grad_norm": 3.5834367275238037, "learning_rate": 1.0088418611392956e-07, "loss": 0.2917, "step": 18609 }, { "epoch": 0.8991641300671595, "grad_norm": 3.388171672821045, "learning_rate": 1.008358699328405e-07, "loss": 0.3345, "step": 18610 }, { "epoch": 0.8992124462482486, "grad_norm": 2.983027458190918, "learning_rate": 1.0078755375175145e-07, "loss": 0.4331, "step": 18611 }, { "epoch": 0.8992607624293376, "grad_norm": 3.91475772857666, "learning_rate": 1.0073923757066241e-07, "loss": 0.318, "step": 18612 }, { "epoch": 0.8993090786104266, "grad_norm": 3.0373337268829346, "learning_rate": 1.0069092138957335e-07, "loss": 0.3844, "step": 18613 }, { "epoch": 0.8993573947915157, "grad_norm": 2.733919382095337, "learning_rate": 1.0064260520848432e-07, "loss": 0.2887, "step": 18614 }, { "epoch": 0.8994057109726047, "grad_norm": 3.4783990383148193, "learning_rate": 1.0059428902739527e-07, "loss": 0.2776, "step": 18615 }, { "epoch": 0.8994540271536938, "grad_norm": 2.1345067024230957, "learning_rate": 1.0054597284630623e-07, "loss": 0.1791, "step": 18616 }, { "epoch": 0.8995023433347829, "grad_norm": 2.9411745071411133, "learning_rate": 1.0049765666521717e-07, "loss": 0.3219, "step": 18617 }, { "epoch": 0.8995506595158719, "grad_norm": 2.4899165630340576, "learning_rate": 1.0044934048412813e-07, "loss": 0.2852, "step": 18618 }, { "epoch": 0.8995989756969609, "grad_norm": 2.6853713989257812, "learning_rate": 1.0040102430303908e-07, "loss": 0.3735, "step": 18619 }, { "epoch": 0.8996472918780499, "grad_norm": 2.546497106552124, "learning_rate": 1.0035270812195004e-07, "loss": 0.313, "step": 18620 }, { "epoch": 0.899695608059139, "grad_norm": 2.9151008129119873, "learning_rate": 1.0030439194086098e-07, "loss": 0.3028, "step": 18621 }, { "epoch": 0.8997439242402281, "grad_norm": 5.2554802894592285, "learning_rate": 1.0025607575977195e-07, "loss": 0.3613, "step": 18622 }, { "epoch": 0.8997922404213171, "grad_norm": 2.4513776302337646, "learning_rate": 1.002077595786829e-07, "loss": 0.3854, "step": 18623 }, { "epoch": 0.8998405566024061, "grad_norm": 5.069086074829102, "learning_rate": 1.0015944339759385e-07, "loss": 0.2447, "step": 18624 }, { "epoch": 0.8998888727834952, "grad_norm": 2.24501895904541, "learning_rate": 1.001111272165048e-07, "loss": 0.2401, "step": 18625 }, { "epoch": 0.8999371889645842, "grad_norm": 3.9386255741119385, "learning_rate": 1.0006281103541575e-07, "loss": 0.1679, "step": 18626 }, { "epoch": 0.8999855051456733, "grad_norm": 3.715390682220459, "learning_rate": 1.0001449485432671e-07, "loss": 0.2299, "step": 18627 }, { "epoch": 0.9000338213267624, "grad_norm": 2.081315517425537, "learning_rate": 9.996617867323766e-08, "loss": 0.2513, "step": 18628 }, { "epoch": 0.9000821375078514, "grad_norm": 2.314091444015503, "learning_rate": 9.991786249214861e-08, "loss": 0.2257, "step": 18629 }, { "epoch": 0.9001304536889404, "grad_norm": 2.226393699645996, "learning_rate": 9.986954631105956e-08, "loss": 0.2267, "step": 18630 }, { "epoch": 0.9001787698700294, "grad_norm": 2.277860641479492, "learning_rate": 9.982123012997053e-08, "loss": 0.2084, "step": 18631 }, { "epoch": 0.9002270860511186, "grad_norm": 5.518989086151123, "learning_rate": 9.977291394888148e-08, "loss": 0.1886, "step": 18632 }, { "epoch": 0.9002754022322076, "grad_norm": 2.7304866313934326, "learning_rate": 9.972459776779243e-08, "loss": 0.296, "step": 18633 }, { "epoch": 0.9003237184132966, "grad_norm": 2.507542371749878, "learning_rate": 9.967628158670338e-08, "loss": 0.1622, "step": 18634 }, { "epoch": 0.9003720345943856, "grad_norm": 2.821686267852783, "learning_rate": 9.962796540561434e-08, "loss": 0.3631, "step": 18635 }, { "epoch": 0.9004203507754747, "grad_norm": 4.7886762619018555, "learning_rate": 9.957964922452529e-08, "loss": 0.2066, "step": 18636 }, { "epoch": 0.9004686669565638, "grad_norm": 3.333984136581421, "learning_rate": 9.953133304343623e-08, "loss": 0.2559, "step": 18637 }, { "epoch": 0.9005169831376528, "grad_norm": 2.6056058406829834, "learning_rate": 9.94830168623472e-08, "loss": 0.2315, "step": 18638 }, { "epoch": 0.9005652993187419, "grad_norm": 2.587412118911743, "learning_rate": 9.943470068125814e-08, "loss": 0.339, "step": 18639 }, { "epoch": 0.9006136154998309, "grad_norm": 2.527738332748413, "learning_rate": 9.938638450016911e-08, "loss": 0.2949, "step": 18640 }, { "epoch": 0.9006619316809199, "grad_norm": 3.3953182697296143, "learning_rate": 9.933806831908005e-08, "loss": 0.3196, "step": 18641 }, { "epoch": 0.900710247862009, "grad_norm": 2.259638547897339, "learning_rate": 9.928975213799101e-08, "loss": 0.2397, "step": 18642 }, { "epoch": 0.9007585640430981, "grad_norm": 21.566133499145508, "learning_rate": 9.924143595690196e-08, "loss": 0.1946, "step": 18643 }, { "epoch": 0.9008068802241871, "grad_norm": 2.10823130607605, "learning_rate": 9.919311977581292e-08, "loss": 0.2395, "step": 18644 }, { "epoch": 0.9008551964052761, "grad_norm": 2.6856067180633545, "learning_rate": 9.914480359472386e-08, "loss": 0.3077, "step": 18645 }, { "epoch": 0.9009035125863651, "grad_norm": 3.2323923110961914, "learning_rate": 9.909648741363482e-08, "loss": 0.3184, "step": 18646 }, { "epoch": 0.9009518287674542, "grad_norm": 2.7998013496398926, "learning_rate": 9.904817123254577e-08, "loss": 0.265, "step": 18647 }, { "epoch": 0.9010001449485433, "grad_norm": 3.7291810512542725, "learning_rate": 9.899985505145674e-08, "loss": 0.3706, "step": 18648 }, { "epoch": 0.9010484611296323, "grad_norm": 4.970522403717041, "learning_rate": 9.895153887036768e-08, "loss": 0.2859, "step": 18649 }, { "epoch": 0.9010967773107214, "grad_norm": 2.8629181385040283, "learning_rate": 9.890322268927863e-08, "loss": 0.1861, "step": 18650 }, { "epoch": 0.9011450934918104, "grad_norm": 3.1975440979003906, "learning_rate": 9.885490650818959e-08, "loss": 0.3562, "step": 18651 }, { "epoch": 0.9011934096728994, "grad_norm": 2.105004072189331, "learning_rate": 9.880659032710054e-08, "loss": 0.2367, "step": 18652 }, { "epoch": 0.9012417258539885, "grad_norm": 2.9897358417510986, "learning_rate": 9.875827414601149e-08, "loss": 0.3119, "step": 18653 }, { "epoch": 0.9012900420350776, "grad_norm": 2.015756845474243, "learning_rate": 9.870995796492244e-08, "loss": 0.1622, "step": 18654 }, { "epoch": 0.9013383582161666, "grad_norm": 3.1169326305389404, "learning_rate": 9.86616417838334e-08, "loss": 0.3704, "step": 18655 }, { "epoch": 0.9013866743972556, "grad_norm": 23.98021697998047, "learning_rate": 9.861332560274436e-08, "loss": 0.4061, "step": 18656 }, { "epoch": 0.9014349905783446, "grad_norm": 2.297367811203003, "learning_rate": 9.85650094216553e-08, "loss": 0.2752, "step": 18657 }, { "epoch": 0.9014833067594338, "grad_norm": 2.838334083557129, "learning_rate": 9.851669324056626e-08, "loss": 0.344, "step": 18658 }, { "epoch": 0.9015316229405228, "grad_norm": 3.2392425537109375, "learning_rate": 9.846837705947722e-08, "loss": 0.2682, "step": 18659 }, { "epoch": 0.9015799391216118, "grad_norm": 1.8210043907165527, "learning_rate": 9.842006087838817e-08, "loss": 0.2055, "step": 18660 }, { "epoch": 0.9016282553027009, "grad_norm": 9.384611129760742, "learning_rate": 9.837174469729912e-08, "loss": 0.342, "step": 18661 }, { "epoch": 0.9016765714837899, "grad_norm": 2.3690731525421143, "learning_rate": 9.832342851621007e-08, "loss": 0.2312, "step": 18662 }, { "epoch": 0.901724887664879, "grad_norm": 4.106116771697998, "learning_rate": 9.827511233512102e-08, "loss": 0.3311, "step": 18663 }, { "epoch": 0.901773203845968, "grad_norm": 2.7836198806762695, "learning_rate": 9.822679615403199e-08, "loss": 0.3309, "step": 18664 }, { "epoch": 0.9018215200270571, "grad_norm": 4.879960060119629, "learning_rate": 9.817847997294292e-08, "loss": 0.2484, "step": 18665 }, { "epoch": 0.9018698362081461, "grad_norm": 2.523963212966919, "learning_rate": 9.813016379185389e-08, "loss": 0.2048, "step": 18666 }, { "epoch": 0.9019181523892351, "grad_norm": 2.765984535217285, "learning_rate": 9.808184761076484e-08, "loss": 0.3614, "step": 18667 }, { "epoch": 0.9019664685703243, "grad_norm": 2.3491761684417725, "learning_rate": 9.80335314296758e-08, "loss": 0.3398, "step": 18668 }, { "epoch": 0.9020147847514133, "grad_norm": 2.291677474975586, "learning_rate": 9.798521524858674e-08, "loss": 0.3005, "step": 18669 }, { "epoch": 0.9020631009325023, "grad_norm": 2.7825052738189697, "learning_rate": 9.79368990674977e-08, "loss": 0.2866, "step": 18670 }, { "epoch": 0.9021114171135913, "grad_norm": 2.3367578983306885, "learning_rate": 9.788858288640865e-08, "loss": 0.2545, "step": 18671 }, { "epoch": 0.9021597332946804, "grad_norm": 3.590820789337158, "learning_rate": 9.784026670531962e-08, "loss": 0.4188, "step": 18672 }, { "epoch": 0.9022080494757694, "grad_norm": 8.284183502197266, "learning_rate": 9.779195052423055e-08, "loss": 0.5304, "step": 18673 }, { "epoch": 0.9022563656568585, "grad_norm": 2.8959479331970215, "learning_rate": 9.774363434314152e-08, "loss": 0.3294, "step": 18674 }, { "epoch": 0.9023046818379475, "grad_norm": 4.139759540557861, "learning_rate": 9.769531816205247e-08, "loss": 0.3112, "step": 18675 }, { "epoch": 0.9023529980190366, "grad_norm": 2.9060781002044678, "learning_rate": 9.764700198096342e-08, "loss": 0.2679, "step": 18676 }, { "epoch": 0.9024013142001256, "grad_norm": 2.6926252841949463, "learning_rate": 9.759868579987437e-08, "loss": 0.3015, "step": 18677 }, { "epoch": 0.9024496303812146, "grad_norm": 2.549530029296875, "learning_rate": 9.755036961878532e-08, "loss": 0.2259, "step": 18678 }, { "epoch": 0.9024979465623038, "grad_norm": 7.015931606292725, "learning_rate": 9.750205343769628e-08, "loss": 0.3312, "step": 18679 }, { "epoch": 0.9025462627433928, "grad_norm": 26.677701950073242, "learning_rate": 9.745373725660723e-08, "loss": 0.2736, "step": 18680 }, { "epoch": 0.9025945789244818, "grad_norm": 4.369446754455566, "learning_rate": 9.740542107551818e-08, "loss": 0.3828, "step": 18681 }, { "epoch": 0.9026428951055708, "grad_norm": 1.9417310953140259, "learning_rate": 9.735710489442914e-08, "loss": 0.2233, "step": 18682 }, { "epoch": 0.9026912112866599, "grad_norm": 2.415797233581543, "learning_rate": 9.73087887133401e-08, "loss": 0.2901, "step": 18683 }, { "epoch": 0.902739527467749, "grad_norm": 2.374251365661621, "learning_rate": 9.726047253225105e-08, "loss": 0.2641, "step": 18684 }, { "epoch": 0.902787843648838, "grad_norm": 1.9314537048339844, "learning_rate": 9.7212156351162e-08, "loss": 0.1961, "step": 18685 }, { "epoch": 0.902836159829927, "grad_norm": 2.288606643676758, "learning_rate": 9.716384017007295e-08, "loss": 0.1748, "step": 18686 }, { "epoch": 0.9028844760110161, "grad_norm": 2.3727214336395264, "learning_rate": 9.71155239889839e-08, "loss": 0.3069, "step": 18687 }, { "epoch": 0.9029327921921051, "grad_norm": 4.42308235168457, "learning_rate": 9.706720780789486e-08, "loss": 0.2919, "step": 18688 }, { "epoch": 0.9029811083731942, "grad_norm": 10.091219902038574, "learning_rate": 9.70188916268058e-08, "loss": 0.3273, "step": 18689 }, { "epoch": 0.9030294245542833, "grad_norm": 2.3173162937164307, "learning_rate": 9.697057544571677e-08, "loss": 0.2524, "step": 18690 }, { "epoch": 0.9030777407353723, "grad_norm": 1.9452674388885498, "learning_rate": 9.692225926462772e-08, "loss": 0.1718, "step": 18691 }, { "epoch": 0.9031260569164613, "grad_norm": 3.5373470783233643, "learning_rate": 9.687394308353868e-08, "loss": 0.2429, "step": 18692 }, { "epoch": 0.9031743730975503, "grad_norm": 2.2582740783691406, "learning_rate": 9.682562690244962e-08, "loss": 0.2447, "step": 18693 }, { "epoch": 0.9032226892786395, "grad_norm": 2.8683152198791504, "learning_rate": 9.677731072136058e-08, "loss": 0.3052, "step": 18694 }, { "epoch": 0.9032710054597285, "grad_norm": 8.639643669128418, "learning_rate": 9.672899454027153e-08, "loss": 0.2757, "step": 18695 }, { "epoch": 0.9033193216408175, "grad_norm": 2.891353130340576, "learning_rate": 9.66806783591825e-08, "loss": 0.4096, "step": 18696 }, { "epoch": 0.9033676378219065, "grad_norm": 2.576645851135254, "learning_rate": 9.663236217809343e-08, "loss": 0.3988, "step": 18697 }, { "epoch": 0.9034159540029956, "grad_norm": 3.1892833709716797, "learning_rate": 9.65840459970044e-08, "loss": 0.4883, "step": 18698 }, { "epoch": 0.9034642701840846, "grad_norm": 2.0570335388183594, "learning_rate": 9.653572981591535e-08, "loss": 0.2052, "step": 18699 }, { "epoch": 0.9035125863651737, "grad_norm": 10.209425926208496, "learning_rate": 9.64874136348263e-08, "loss": 0.2321, "step": 18700 }, { "epoch": 0.9035609025462628, "grad_norm": 2.638963460922241, "learning_rate": 9.643909745373725e-08, "loss": 0.3331, "step": 18701 }, { "epoch": 0.9036092187273518, "grad_norm": 3.0622665882110596, "learning_rate": 9.63907812726482e-08, "loss": 0.2905, "step": 18702 }, { "epoch": 0.9036575349084408, "grad_norm": 2.5515806674957275, "learning_rate": 9.634246509155916e-08, "loss": 0.1602, "step": 18703 }, { "epoch": 0.9037058510895298, "grad_norm": 3.7925405502319336, "learning_rate": 9.629414891047011e-08, "loss": 0.4821, "step": 18704 }, { "epoch": 0.903754167270619, "grad_norm": 2.9077134132385254, "learning_rate": 9.624583272938106e-08, "loss": 0.3102, "step": 18705 }, { "epoch": 0.903802483451708, "grad_norm": 9.554817199707031, "learning_rate": 9.619751654829201e-08, "loss": 0.3372, "step": 18706 }, { "epoch": 0.903850799632797, "grad_norm": 8.647250175476074, "learning_rate": 9.614920036720298e-08, "loss": 0.3484, "step": 18707 }, { "epoch": 0.903899115813886, "grad_norm": 7.152533054351807, "learning_rate": 9.610088418611393e-08, "loss": 0.2231, "step": 18708 }, { "epoch": 0.9039474319949751, "grad_norm": 1.645652174949646, "learning_rate": 9.605256800502488e-08, "loss": 0.1379, "step": 18709 }, { "epoch": 0.9039957481760642, "grad_norm": 4.000421524047852, "learning_rate": 9.600425182393583e-08, "loss": 0.2663, "step": 18710 }, { "epoch": 0.9040440643571532, "grad_norm": 2.665032386779785, "learning_rate": 9.595593564284679e-08, "loss": 0.2421, "step": 18711 }, { "epoch": 0.9040923805382423, "grad_norm": 1.697080373764038, "learning_rate": 9.590761946175774e-08, "loss": 0.1644, "step": 18712 }, { "epoch": 0.9041406967193313, "grad_norm": 2.686476945877075, "learning_rate": 9.585930328066868e-08, "loss": 0.3525, "step": 18713 }, { "epoch": 0.9041890129004203, "grad_norm": 2.327342987060547, "learning_rate": 9.581098709957964e-08, "loss": 0.2147, "step": 18714 }, { "epoch": 0.9042373290815094, "grad_norm": 2.7259485721588135, "learning_rate": 9.57626709184906e-08, "loss": 0.4011, "step": 18715 }, { "epoch": 0.9042856452625985, "grad_norm": 2.646648645401001, "learning_rate": 9.571435473740156e-08, "loss": 0.3579, "step": 18716 }, { "epoch": 0.9043339614436875, "grad_norm": 3.75445294380188, "learning_rate": 9.56660385563125e-08, "loss": 0.3274, "step": 18717 }, { "epoch": 0.9043822776247765, "grad_norm": 2.523651599884033, "learning_rate": 9.561772237522346e-08, "loss": 0.2241, "step": 18718 }, { "epoch": 0.9044305938058655, "grad_norm": 2.808504819869995, "learning_rate": 9.556940619413441e-08, "loss": 0.3471, "step": 18719 }, { "epoch": 0.9044789099869547, "grad_norm": 2.5121002197265625, "learning_rate": 9.552109001304537e-08, "loss": 0.379, "step": 18720 }, { "epoch": 0.9045272261680437, "grad_norm": 2.038527727127075, "learning_rate": 9.547277383195631e-08, "loss": 0.2694, "step": 18721 }, { "epoch": 0.9045755423491327, "grad_norm": 2.2659363746643066, "learning_rate": 9.542445765086727e-08, "loss": 0.2665, "step": 18722 }, { "epoch": 0.9046238585302218, "grad_norm": 3.7453126907348633, "learning_rate": 9.537614146977823e-08, "loss": 0.3746, "step": 18723 }, { "epoch": 0.9046721747113108, "grad_norm": 3.2983648777008057, "learning_rate": 9.532782528868919e-08, "loss": 0.4269, "step": 18724 }, { "epoch": 0.9047204908923999, "grad_norm": 2.8444809913635254, "learning_rate": 9.527950910760013e-08, "loss": 0.2471, "step": 18725 }, { "epoch": 0.904768807073489, "grad_norm": 5.160414218902588, "learning_rate": 9.523119292651108e-08, "loss": 0.2465, "step": 18726 }, { "epoch": 0.904817123254578, "grad_norm": 2.9283995628356934, "learning_rate": 9.518287674542204e-08, "loss": 0.3812, "step": 18727 }, { "epoch": 0.904865439435667, "grad_norm": 2.5278449058532715, "learning_rate": 9.513456056433299e-08, "loss": 0.2846, "step": 18728 }, { "epoch": 0.904913755616756, "grad_norm": 1.878501296043396, "learning_rate": 9.508624438324394e-08, "loss": 0.171, "step": 18729 }, { "epoch": 0.904962071797845, "grad_norm": 1.8091033697128296, "learning_rate": 9.503792820215489e-08, "loss": 0.2184, "step": 18730 }, { "epoch": 0.9050103879789342, "grad_norm": 4.603839874267578, "learning_rate": 9.498961202106586e-08, "loss": 0.3085, "step": 18731 }, { "epoch": 0.9050587041600232, "grad_norm": 2.7144718170166016, "learning_rate": 9.49412958399768e-08, "loss": 0.2649, "step": 18732 }, { "epoch": 0.9051070203411122, "grad_norm": 2.0314996242523193, "learning_rate": 9.489297965888776e-08, "loss": 0.194, "step": 18733 }, { "epoch": 0.9051553365222013, "grad_norm": 3.284252405166626, "learning_rate": 9.484466347779871e-08, "loss": 0.2587, "step": 18734 }, { "epoch": 0.9052036527032903, "grad_norm": 2.9109413623809814, "learning_rate": 9.479634729670967e-08, "loss": 0.2816, "step": 18735 }, { "epoch": 0.9052519688843794, "grad_norm": 3.4252471923828125, "learning_rate": 9.474803111562062e-08, "loss": 0.31, "step": 18736 }, { "epoch": 0.9053002850654684, "grad_norm": 2.3864591121673584, "learning_rate": 9.469971493453157e-08, "loss": 0.2334, "step": 18737 }, { "epoch": 0.9053486012465575, "grad_norm": 2.890070915222168, "learning_rate": 9.465139875344252e-08, "loss": 0.3455, "step": 18738 }, { "epoch": 0.9053969174276465, "grad_norm": 3.4199352264404297, "learning_rate": 9.460308257235347e-08, "loss": 0.3963, "step": 18739 }, { "epoch": 0.9054452336087355, "grad_norm": 2.6677348613739014, "learning_rate": 9.455476639126444e-08, "loss": 0.2853, "step": 18740 }, { "epoch": 0.9054935497898247, "grad_norm": 3.001255989074707, "learning_rate": 9.450645021017537e-08, "loss": 0.2265, "step": 18741 }, { "epoch": 0.9055418659709137, "grad_norm": 2.6682002544403076, "learning_rate": 9.445813402908634e-08, "loss": 0.3193, "step": 18742 }, { "epoch": 0.9055901821520027, "grad_norm": 3.1717989444732666, "learning_rate": 9.440981784799729e-08, "loss": 0.2965, "step": 18743 }, { "epoch": 0.9056384983330917, "grad_norm": 3.777540922164917, "learning_rate": 9.436150166690825e-08, "loss": 0.2956, "step": 18744 }, { "epoch": 0.9056868145141808, "grad_norm": 1.9734278917312622, "learning_rate": 9.431318548581919e-08, "loss": 0.187, "step": 18745 }, { "epoch": 0.9057351306952699, "grad_norm": 4.8603715896606445, "learning_rate": 9.426486930473015e-08, "loss": 0.3343, "step": 18746 }, { "epoch": 0.9057834468763589, "grad_norm": 2.726775646209717, "learning_rate": 9.42165531236411e-08, "loss": 0.2902, "step": 18747 }, { "epoch": 0.905831763057448, "grad_norm": 1.9271646738052368, "learning_rate": 9.416823694255207e-08, "loss": 0.245, "step": 18748 }, { "epoch": 0.905880079238537, "grad_norm": 2.2345433235168457, "learning_rate": 9.4119920761463e-08, "loss": 0.2705, "step": 18749 }, { "epoch": 0.905928395419626, "grad_norm": 2.567298412322998, "learning_rate": 9.407160458037396e-08, "loss": 0.2309, "step": 18750 }, { "epoch": 0.9059767116007151, "grad_norm": 1.5564346313476562, "learning_rate": 9.402328839928492e-08, "loss": 0.1578, "step": 18751 }, { "epoch": 0.9060250277818042, "grad_norm": 2.4944818019866943, "learning_rate": 9.397497221819587e-08, "loss": 0.2115, "step": 18752 }, { "epoch": 0.9060733439628932, "grad_norm": 2.4815196990966797, "learning_rate": 9.392665603710682e-08, "loss": 0.3265, "step": 18753 }, { "epoch": 0.9061216601439822, "grad_norm": 3.425196409225464, "learning_rate": 9.387833985601777e-08, "loss": 0.37, "step": 18754 }, { "epoch": 0.9061699763250712, "grad_norm": 2.645895481109619, "learning_rate": 9.383002367492873e-08, "loss": 0.2959, "step": 18755 }, { "epoch": 0.9062182925061603, "grad_norm": 1.967429518699646, "learning_rate": 9.378170749383968e-08, "loss": 0.1603, "step": 18756 }, { "epoch": 0.9062666086872494, "grad_norm": 3.9325995445251465, "learning_rate": 9.373339131275064e-08, "loss": 0.3674, "step": 18757 }, { "epoch": 0.9063149248683384, "grad_norm": 2.4373581409454346, "learning_rate": 9.368507513166159e-08, "loss": 0.2821, "step": 18758 }, { "epoch": 0.9063632410494274, "grad_norm": 2.973031520843506, "learning_rate": 9.363675895057255e-08, "loss": 0.3725, "step": 18759 }, { "epoch": 0.9064115572305165, "grad_norm": 2.887697219848633, "learning_rate": 9.35884427694835e-08, "loss": 0.5012, "step": 18760 }, { "epoch": 0.9064598734116055, "grad_norm": 2.374321460723877, "learning_rate": 9.354012658839445e-08, "loss": 0.2304, "step": 18761 }, { "epoch": 0.9065081895926946, "grad_norm": 2.436833620071411, "learning_rate": 9.34918104073054e-08, "loss": 0.2814, "step": 18762 }, { "epoch": 0.9065565057737837, "grad_norm": 2.182234764099121, "learning_rate": 9.344349422621635e-08, "loss": 0.3094, "step": 18763 }, { "epoch": 0.9066048219548727, "grad_norm": 2.9907140731811523, "learning_rate": 9.339517804512732e-08, "loss": 0.3846, "step": 18764 }, { "epoch": 0.9066531381359617, "grad_norm": 1.7650120258331299, "learning_rate": 9.334686186403825e-08, "loss": 0.1904, "step": 18765 }, { "epoch": 0.9067014543170507, "grad_norm": 3.5848639011383057, "learning_rate": 9.329854568294922e-08, "loss": 0.3145, "step": 18766 }, { "epoch": 0.9067497704981399, "grad_norm": 2.2973856925964355, "learning_rate": 9.325022950186017e-08, "loss": 0.18, "step": 18767 }, { "epoch": 0.9067980866792289, "grad_norm": 2.314276695251465, "learning_rate": 9.320191332077113e-08, "loss": 0.2722, "step": 18768 }, { "epoch": 0.9068464028603179, "grad_norm": 2.519582748413086, "learning_rate": 9.315359713968207e-08, "loss": 0.2685, "step": 18769 }, { "epoch": 0.906894719041407, "grad_norm": 2.109124183654785, "learning_rate": 9.310528095859303e-08, "loss": 0.1814, "step": 18770 }, { "epoch": 0.906943035222496, "grad_norm": 4.870119094848633, "learning_rate": 9.305696477750398e-08, "loss": 0.2866, "step": 18771 }, { "epoch": 0.9069913514035851, "grad_norm": 2.398977518081665, "learning_rate": 9.300864859641495e-08, "loss": 0.2188, "step": 18772 }, { "epoch": 0.9070396675846741, "grad_norm": 3.081587314605713, "learning_rate": 9.296033241532588e-08, "loss": 0.3024, "step": 18773 }, { "epoch": 0.9070879837657632, "grad_norm": 2.9301600456237793, "learning_rate": 9.291201623423685e-08, "loss": 0.4155, "step": 18774 }, { "epoch": 0.9071362999468522, "grad_norm": 2.750204563140869, "learning_rate": 9.28637000531478e-08, "loss": 0.252, "step": 18775 }, { "epoch": 0.9071846161279412, "grad_norm": 1.5704829692840576, "learning_rate": 9.281538387205875e-08, "loss": 0.1586, "step": 18776 }, { "epoch": 0.9072329323090303, "grad_norm": 2.952889919281006, "learning_rate": 9.27670676909697e-08, "loss": 0.251, "step": 18777 }, { "epoch": 0.9072812484901194, "grad_norm": 2.905599594116211, "learning_rate": 9.271875150988065e-08, "loss": 0.3409, "step": 18778 }, { "epoch": 0.9073295646712084, "grad_norm": 2.1279501914978027, "learning_rate": 9.267043532879161e-08, "loss": 0.2305, "step": 18779 }, { "epoch": 0.9073778808522974, "grad_norm": 4.160427570343018, "learning_rate": 9.262211914770256e-08, "loss": 0.2317, "step": 18780 }, { "epoch": 0.9074261970333865, "grad_norm": 2.6391544342041016, "learning_rate": 9.257380296661351e-08, "loss": 0.3009, "step": 18781 }, { "epoch": 0.9074745132144755, "grad_norm": 6.66144323348999, "learning_rate": 9.252548678552446e-08, "loss": 0.3062, "step": 18782 }, { "epoch": 0.9075228293955646, "grad_norm": 2.6594350337982178, "learning_rate": 9.247717060443543e-08, "loss": 0.3341, "step": 18783 }, { "epoch": 0.9075711455766536, "grad_norm": 2.9162495136260986, "learning_rate": 9.242885442334638e-08, "loss": 0.2659, "step": 18784 }, { "epoch": 0.9076194617577427, "grad_norm": 2.628119945526123, "learning_rate": 9.238053824225733e-08, "loss": 0.2766, "step": 18785 }, { "epoch": 0.9076677779388317, "grad_norm": 2.87813138961792, "learning_rate": 9.233222206116828e-08, "loss": 0.2679, "step": 18786 }, { "epoch": 0.9077160941199207, "grad_norm": 2.579859733581543, "learning_rate": 9.228390588007924e-08, "loss": 0.2941, "step": 18787 }, { "epoch": 0.9077644103010098, "grad_norm": 1.8889856338500977, "learning_rate": 9.22355896989902e-08, "loss": 0.2085, "step": 18788 }, { "epoch": 0.9078127264820989, "grad_norm": 5.254733085632324, "learning_rate": 9.218727351790113e-08, "loss": 0.2768, "step": 18789 }, { "epoch": 0.9078610426631879, "grad_norm": 3.487293004989624, "learning_rate": 9.21389573368121e-08, "loss": 0.3625, "step": 18790 }, { "epoch": 0.9079093588442769, "grad_norm": 2.8861331939697266, "learning_rate": 9.209064115572305e-08, "loss": 0.2697, "step": 18791 }, { "epoch": 0.907957675025366, "grad_norm": 3.0265040397644043, "learning_rate": 9.204232497463401e-08, "loss": 0.3202, "step": 18792 }, { "epoch": 0.9080059912064551, "grad_norm": 2.7344067096710205, "learning_rate": 9.199400879354495e-08, "loss": 0.3241, "step": 18793 }, { "epoch": 0.9080543073875441, "grad_norm": 2.5521178245544434, "learning_rate": 9.194569261245591e-08, "loss": 0.2029, "step": 18794 }, { "epoch": 0.9081026235686331, "grad_norm": 2.5440728664398193, "learning_rate": 9.189737643136686e-08, "loss": 0.2228, "step": 18795 }, { "epoch": 0.9081509397497222, "grad_norm": 2.6947262287139893, "learning_rate": 9.184906025027782e-08, "loss": 0.2382, "step": 18796 }, { "epoch": 0.9081992559308112, "grad_norm": 3.469614028930664, "learning_rate": 9.180074406918876e-08, "loss": 0.298, "step": 18797 }, { "epoch": 0.9082475721119003, "grad_norm": 3.602609872817993, "learning_rate": 9.175242788809973e-08, "loss": 0.2775, "step": 18798 }, { "epoch": 0.9082958882929894, "grad_norm": 1.2788763046264648, "learning_rate": 9.170411170701068e-08, "loss": 0.1226, "step": 18799 }, { "epoch": 0.9083442044740784, "grad_norm": 2.079371452331543, "learning_rate": 9.165579552592164e-08, "loss": 0.2426, "step": 18800 }, { "epoch": 0.9083925206551674, "grad_norm": 1.9274600744247437, "learning_rate": 9.160747934483258e-08, "loss": 0.1778, "step": 18801 }, { "epoch": 0.9084408368362564, "grad_norm": 2.432833433151245, "learning_rate": 9.155916316374353e-08, "loss": 0.3402, "step": 18802 }, { "epoch": 0.9084891530173456, "grad_norm": 2.8539068698883057, "learning_rate": 9.151084698265449e-08, "loss": 0.3624, "step": 18803 }, { "epoch": 0.9085374691984346, "grad_norm": 3.3457233905792236, "learning_rate": 9.146253080156543e-08, "loss": 0.4312, "step": 18804 }, { "epoch": 0.9085857853795236, "grad_norm": 2.2759523391723633, "learning_rate": 9.141421462047639e-08, "loss": 0.2894, "step": 18805 }, { "epoch": 0.9086341015606126, "grad_norm": 2.1795711517333984, "learning_rate": 9.136589843938734e-08, "loss": 0.2319, "step": 18806 }, { "epoch": 0.9086824177417017, "grad_norm": 1.8389394283294678, "learning_rate": 9.13175822582983e-08, "loss": 0.1798, "step": 18807 }, { "epoch": 0.9087307339227907, "grad_norm": 3.792661428451538, "learning_rate": 9.126926607720924e-08, "loss": 0.3285, "step": 18808 }, { "epoch": 0.9087790501038798, "grad_norm": 4.064174652099609, "learning_rate": 9.122094989612021e-08, "loss": 0.3127, "step": 18809 }, { "epoch": 0.9088273662849689, "grad_norm": 1.9386218786239624, "learning_rate": 9.117263371503116e-08, "loss": 0.2439, "step": 18810 }, { "epoch": 0.9088756824660579, "grad_norm": 2.6708269119262695, "learning_rate": 9.112431753394212e-08, "loss": 0.282, "step": 18811 }, { "epoch": 0.9089239986471469, "grad_norm": 4.525595188140869, "learning_rate": 9.107600135285307e-08, "loss": 0.4007, "step": 18812 }, { "epoch": 0.9089723148282359, "grad_norm": 1.770896553993225, "learning_rate": 9.102768517176402e-08, "loss": 0.211, "step": 18813 }, { "epoch": 0.9090206310093251, "grad_norm": 2.93503475189209, "learning_rate": 9.097936899067497e-08, "loss": 0.3981, "step": 18814 }, { "epoch": 0.9090689471904141, "grad_norm": 3.0642337799072266, "learning_rate": 9.093105280958592e-08, "loss": 0.3209, "step": 18815 }, { "epoch": 0.9091172633715031, "grad_norm": 2.0534017086029053, "learning_rate": 9.088273662849689e-08, "loss": 0.2554, "step": 18816 }, { "epoch": 0.9091655795525921, "grad_norm": 3.7001776695251465, "learning_rate": 9.083442044740782e-08, "loss": 0.3338, "step": 18817 }, { "epoch": 0.9092138957336812, "grad_norm": 3.8593404293060303, "learning_rate": 9.078610426631879e-08, "loss": 0.2782, "step": 18818 }, { "epoch": 0.9092622119147703, "grad_norm": 2.470332622528076, "learning_rate": 9.073778808522974e-08, "loss": 0.2372, "step": 18819 }, { "epoch": 0.9093105280958593, "grad_norm": 2.7396514415740967, "learning_rate": 9.06894719041407e-08, "loss": 0.3066, "step": 18820 }, { "epoch": 0.9093588442769484, "grad_norm": 2.980768918991089, "learning_rate": 9.064115572305164e-08, "loss": 0.3665, "step": 18821 }, { "epoch": 0.9094071604580374, "grad_norm": 2.6919233798980713, "learning_rate": 9.05928395419626e-08, "loss": 0.2603, "step": 18822 }, { "epoch": 0.9094554766391264, "grad_norm": 2.2113823890686035, "learning_rate": 9.054452336087355e-08, "loss": 0.249, "step": 18823 }, { "epoch": 0.9095037928202155, "grad_norm": 2.011427402496338, "learning_rate": 9.049620717978452e-08, "loss": 0.2364, "step": 18824 }, { "epoch": 0.9095521090013046, "grad_norm": 2.5373566150665283, "learning_rate": 9.044789099869546e-08, "loss": 0.1878, "step": 18825 }, { "epoch": 0.9096004251823936, "grad_norm": 5.8395094871521, "learning_rate": 9.03995748176064e-08, "loss": 0.3695, "step": 18826 }, { "epoch": 0.9096487413634826, "grad_norm": 3.317857265472412, "learning_rate": 9.035125863651737e-08, "loss": 0.312, "step": 18827 }, { "epoch": 0.9096970575445716, "grad_norm": 2.5549581050872803, "learning_rate": 9.030294245542831e-08, "loss": 0.2491, "step": 18828 }, { "epoch": 0.9097453737256608, "grad_norm": 2.5457704067230225, "learning_rate": 9.025462627433927e-08, "loss": 0.3122, "step": 18829 }, { "epoch": 0.9097936899067498, "grad_norm": 2.3818249702453613, "learning_rate": 9.020631009325022e-08, "loss": 0.2274, "step": 18830 }, { "epoch": 0.9098420060878388, "grad_norm": 5.726845741271973, "learning_rate": 9.015799391216118e-08, "loss": 0.2955, "step": 18831 }, { "epoch": 0.9098903222689279, "grad_norm": 2.1267004013061523, "learning_rate": 9.010967773107212e-08, "loss": 0.2259, "step": 18832 }, { "epoch": 0.9099386384500169, "grad_norm": 2.146934747695923, "learning_rate": 9.006136154998309e-08, "loss": 0.2153, "step": 18833 }, { "epoch": 0.9099869546311059, "grad_norm": 2.984225034713745, "learning_rate": 9.001304536889404e-08, "loss": 0.3736, "step": 18834 }, { "epoch": 0.910035270812195, "grad_norm": 2.5364813804626465, "learning_rate": 8.9964729187805e-08, "loss": 0.3191, "step": 18835 }, { "epoch": 0.9100835869932841, "grad_norm": 2.265486240386963, "learning_rate": 8.991641300671594e-08, "loss": 0.2963, "step": 18836 }, { "epoch": 0.9101319031743731, "grad_norm": 2.966527223587036, "learning_rate": 8.98680968256269e-08, "loss": 0.242, "step": 18837 }, { "epoch": 0.9101802193554621, "grad_norm": 2.5548360347747803, "learning_rate": 8.981978064453785e-08, "loss": 0.3542, "step": 18838 }, { "epoch": 0.9102285355365511, "grad_norm": 2.8706367015838623, "learning_rate": 8.97714644634488e-08, "loss": 0.2609, "step": 18839 }, { "epoch": 0.9102768517176403, "grad_norm": 2.528743028640747, "learning_rate": 8.972314828235975e-08, "loss": 0.2494, "step": 18840 }, { "epoch": 0.9103251678987293, "grad_norm": 1.949573040008545, "learning_rate": 8.96748321012707e-08, "loss": 0.2077, "step": 18841 }, { "epoch": 0.9103734840798183, "grad_norm": 1.987481951713562, "learning_rate": 8.962651592018167e-08, "loss": 0.244, "step": 18842 }, { "epoch": 0.9104218002609074, "grad_norm": 2.2815980911254883, "learning_rate": 8.957819973909262e-08, "loss": 0.2708, "step": 18843 }, { "epoch": 0.9104701164419964, "grad_norm": 1.5709106922149658, "learning_rate": 8.952988355800357e-08, "loss": 0.1796, "step": 18844 }, { "epoch": 0.9105184326230855, "grad_norm": 2.4231340885162354, "learning_rate": 8.948156737691452e-08, "loss": 0.2113, "step": 18845 }, { "epoch": 0.9105667488041745, "grad_norm": 2.903127670288086, "learning_rate": 8.943325119582548e-08, "loss": 0.3868, "step": 18846 }, { "epoch": 0.9106150649852636, "grad_norm": 2.921909809112549, "learning_rate": 8.938493501473643e-08, "loss": 0.3092, "step": 18847 }, { "epoch": 0.9106633811663526, "grad_norm": 2.65120530128479, "learning_rate": 8.933661883364738e-08, "loss": 0.2545, "step": 18848 }, { "epoch": 0.9107116973474416, "grad_norm": 2.306187868118286, "learning_rate": 8.928830265255833e-08, "loss": 0.2851, "step": 18849 }, { "epoch": 0.9107600135285308, "grad_norm": 2.783355712890625, "learning_rate": 8.92399864714693e-08, "loss": 0.2486, "step": 18850 }, { "epoch": 0.9108083297096198, "grad_norm": 3.5294816493988037, "learning_rate": 8.919167029038025e-08, "loss": 0.3838, "step": 18851 }, { "epoch": 0.9108566458907088, "grad_norm": 2.58695387840271, "learning_rate": 8.914335410929119e-08, "loss": 0.2877, "step": 18852 }, { "epoch": 0.9109049620717978, "grad_norm": 2.5909907817840576, "learning_rate": 8.909503792820215e-08, "loss": 0.2633, "step": 18853 }, { "epoch": 0.9109532782528869, "grad_norm": 2.4124081134796143, "learning_rate": 8.90467217471131e-08, "loss": 0.2202, "step": 18854 }, { "epoch": 0.911001594433976, "grad_norm": 3.9948885440826416, "learning_rate": 8.899840556602406e-08, "loss": 0.2829, "step": 18855 }, { "epoch": 0.911049910615065, "grad_norm": 2.190577983856201, "learning_rate": 8.8950089384935e-08, "loss": 0.1896, "step": 18856 }, { "epoch": 0.911098226796154, "grad_norm": 2.5085103511810303, "learning_rate": 8.890177320384596e-08, "loss": 0.2416, "step": 18857 }, { "epoch": 0.9111465429772431, "grad_norm": 2.2872400283813477, "learning_rate": 8.885345702275691e-08, "loss": 0.1811, "step": 18858 }, { "epoch": 0.9111948591583321, "grad_norm": 2.619499921798706, "learning_rate": 8.880514084166788e-08, "loss": 0.3313, "step": 18859 }, { "epoch": 0.9112431753394211, "grad_norm": 2.2998201847076416, "learning_rate": 8.875682466057882e-08, "loss": 0.2547, "step": 18860 }, { "epoch": 0.9112914915205103, "grad_norm": 2.658200979232788, "learning_rate": 8.870850847948978e-08, "loss": 0.3617, "step": 18861 }, { "epoch": 0.9113398077015993, "grad_norm": 2.9041693210601807, "learning_rate": 8.866019229840073e-08, "loss": 0.3315, "step": 18862 }, { "epoch": 0.9113881238826883, "grad_norm": 2.1189188957214355, "learning_rate": 8.86118761173117e-08, "loss": 0.2343, "step": 18863 }, { "epoch": 0.9114364400637773, "grad_norm": 4.1335062980651855, "learning_rate": 8.856355993622263e-08, "loss": 0.3615, "step": 18864 }, { "epoch": 0.9114847562448664, "grad_norm": 3.0379202365875244, "learning_rate": 8.851524375513358e-08, "loss": 0.3896, "step": 18865 }, { "epoch": 0.9115330724259555, "grad_norm": 3.9218509197235107, "learning_rate": 8.846692757404455e-08, "loss": 0.2993, "step": 18866 }, { "epoch": 0.9115813886070445, "grad_norm": 2.2502524852752686, "learning_rate": 8.84186113929555e-08, "loss": 0.2689, "step": 18867 }, { "epoch": 0.9116297047881335, "grad_norm": 4.278073310852051, "learning_rate": 8.837029521186645e-08, "loss": 0.3274, "step": 18868 }, { "epoch": 0.9116780209692226, "grad_norm": 4.072285175323486, "learning_rate": 8.83219790307774e-08, "loss": 0.2765, "step": 18869 }, { "epoch": 0.9117263371503116, "grad_norm": 3.701789140701294, "learning_rate": 8.827366284968836e-08, "loss": 0.4091, "step": 18870 }, { "epoch": 0.9117746533314007, "grad_norm": 2.20025372505188, "learning_rate": 8.822534666859931e-08, "loss": 0.2409, "step": 18871 }, { "epoch": 0.9118229695124898, "grad_norm": 2.8694450855255127, "learning_rate": 8.817703048751026e-08, "loss": 0.3118, "step": 18872 }, { "epoch": 0.9118712856935788, "grad_norm": 2.4797725677490234, "learning_rate": 8.812871430642121e-08, "loss": 0.252, "step": 18873 }, { "epoch": 0.9119196018746678, "grad_norm": 2.0771913528442383, "learning_rate": 8.808039812533218e-08, "loss": 0.1671, "step": 18874 }, { "epoch": 0.9119679180557568, "grad_norm": 2.962559938430786, "learning_rate": 8.803208194424313e-08, "loss": 0.2599, "step": 18875 }, { "epoch": 0.912016234236846, "grad_norm": 2.406092405319214, "learning_rate": 8.798376576315408e-08, "loss": 0.2656, "step": 18876 }, { "epoch": 0.912064550417935, "grad_norm": 3.2523422241210938, "learning_rate": 8.793544958206503e-08, "loss": 0.3693, "step": 18877 }, { "epoch": 0.912112866599024, "grad_norm": 2.6233623027801514, "learning_rate": 8.788713340097598e-08, "loss": 0.1756, "step": 18878 }, { "epoch": 0.912161182780113, "grad_norm": 2.8330676555633545, "learning_rate": 8.783881721988694e-08, "loss": 0.3473, "step": 18879 }, { "epoch": 0.9122094989612021, "grad_norm": 2.3895585536956787, "learning_rate": 8.779050103879788e-08, "loss": 0.2321, "step": 18880 }, { "epoch": 0.9122578151422912, "grad_norm": 2.4029271602630615, "learning_rate": 8.774218485770884e-08, "loss": 0.1612, "step": 18881 }, { "epoch": 0.9123061313233802, "grad_norm": 7.463865280151367, "learning_rate": 8.76938686766198e-08, "loss": 0.2105, "step": 18882 }, { "epoch": 0.9123544475044693, "grad_norm": 3.379364013671875, "learning_rate": 8.764555249553076e-08, "loss": 0.3742, "step": 18883 }, { "epoch": 0.9124027636855583, "grad_norm": 5.714110374450684, "learning_rate": 8.75972363144417e-08, "loss": 0.2799, "step": 18884 }, { "epoch": 0.9124510798666473, "grad_norm": 2.3928380012512207, "learning_rate": 8.754892013335266e-08, "loss": 0.2861, "step": 18885 }, { "epoch": 0.9124993960477363, "grad_norm": 3.2865095138549805, "learning_rate": 8.750060395226361e-08, "loss": 0.3005, "step": 18886 }, { "epoch": 0.9125477122288255, "grad_norm": 3.7390263080596924, "learning_rate": 8.745228777117457e-08, "loss": 0.2159, "step": 18887 }, { "epoch": 0.9125960284099145, "grad_norm": 2.9964098930358887, "learning_rate": 8.740397159008551e-08, "loss": 0.4137, "step": 18888 }, { "epoch": 0.9126443445910035, "grad_norm": 3.2004446983337402, "learning_rate": 8.735565540899647e-08, "loss": 0.3873, "step": 18889 }, { "epoch": 0.9126926607720925, "grad_norm": 4.74766731262207, "learning_rate": 8.730733922790742e-08, "loss": 0.3665, "step": 18890 }, { "epoch": 0.9127409769531816, "grad_norm": 1.8284499645233154, "learning_rate": 8.725902304681837e-08, "loss": 0.2674, "step": 18891 }, { "epoch": 0.9127892931342707, "grad_norm": 2.8117754459381104, "learning_rate": 8.721070686572932e-08, "loss": 0.2763, "step": 18892 }, { "epoch": 0.9128376093153597, "grad_norm": 2.1097354888916016, "learning_rate": 8.716239068464028e-08, "loss": 0.2454, "step": 18893 }, { "epoch": 0.9128859254964488, "grad_norm": 2.8779542446136475, "learning_rate": 8.711407450355124e-08, "loss": 0.214, "step": 18894 }, { "epoch": 0.9129342416775378, "grad_norm": 3.435239315032959, "learning_rate": 8.706575832246219e-08, "loss": 0.2073, "step": 18895 }, { "epoch": 0.9129825578586268, "grad_norm": 3.2052853107452393, "learning_rate": 8.701744214137314e-08, "loss": 0.356, "step": 18896 }, { "epoch": 0.913030874039716, "grad_norm": 3.8264708518981934, "learning_rate": 8.696912596028409e-08, "loss": 0.3364, "step": 18897 }, { "epoch": 0.913079190220805, "grad_norm": 3.549224853515625, "learning_rate": 8.692080977919505e-08, "loss": 0.4063, "step": 18898 }, { "epoch": 0.913127506401894, "grad_norm": 4.860581874847412, "learning_rate": 8.6872493598106e-08, "loss": 0.3238, "step": 18899 }, { "epoch": 0.913175822582983, "grad_norm": 2.297393798828125, "learning_rate": 8.682417741701696e-08, "loss": 0.2748, "step": 18900 }, { "epoch": 0.913224138764072, "grad_norm": 2.454127788543701, "learning_rate": 8.67758612359279e-08, "loss": 0.2754, "step": 18901 }, { "epoch": 0.9132724549451612, "grad_norm": 6.149909496307373, "learning_rate": 8.672754505483886e-08, "loss": 0.2663, "step": 18902 }, { "epoch": 0.9133207711262502, "grad_norm": 2.810934543609619, "learning_rate": 8.667922887374982e-08, "loss": 0.2219, "step": 18903 }, { "epoch": 0.9133690873073392, "grad_norm": 2.9663877487182617, "learning_rate": 8.663091269266076e-08, "loss": 0.2498, "step": 18904 }, { "epoch": 0.9134174034884283, "grad_norm": 2.9959828853607178, "learning_rate": 8.658259651157172e-08, "loss": 0.412, "step": 18905 }, { "epoch": 0.9134657196695173, "grad_norm": 2.206552028656006, "learning_rate": 8.653428033048267e-08, "loss": 0.2565, "step": 18906 }, { "epoch": 0.9135140358506064, "grad_norm": 2.3611488342285156, "learning_rate": 8.648596414939364e-08, "loss": 0.1905, "step": 18907 }, { "epoch": 0.9135623520316954, "grad_norm": 2.062065839767456, "learning_rate": 8.643764796830457e-08, "loss": 0.1972, "step": 18908 }, { "epoch": 0.9136106682127845, "grad_norm": 2.9496445655822754, "learning_rate": 8.638933178721554e-08, "loss": 0.4482, "step": 18909 }, { "epoch": 0.9136589843938735, "grad_norm": 2.5558035373687744, "learning_rate": 8.634101560612649e-08, "loss": 0.3434, "step": 18910 }, { "epoch": 0.9137073005749625, "grad_norm": 1.998443365097046, "learning_rate": 8.629269942503745e-08, "loss": 0.1968, "step": 18911 }, { "epoch": 0.9137556167560515, "grad_norm": 2.405320167541504, "learning_rate": 8.624438324394839e-08, "loss": 0.2684, "step": 18912 }, { "epoch": 0.9138039329371407, "grad_norm": 3.2492668628692627, "learning_rate": 8.619606706285935e-08, "loss": 0.2451, "step": 18913 }, { "epoch": 0.9138522491182297, "grad_norm": 2.60125732421875, "learning_rate": 8.61477508817703e-08, "loss": 0.272, "step": 18914 }, { "epoch": 0.9139005652993187, "grad_norm": 4.153304100036621, "learning_rate": 8.609943470068125e-08, "loss": 0.208, "step": 18915 }, { "epoch": 0.9139488814804078, "grad_norm": 4.7493896484375, "learning_rate": 8.60511185195922e-08, "loss": 0.4307, "step": 18916 }, { "epoch": 0.9139971976614968, "grad_norm": 3.244504928588867, "learning_rate": 8.600280233850315e-08, "loss": 0.3109, "step": 18917 }, { "epoch": 0.9140455138425859, "grad_norm": 3.440134048461914, "learning_rate": 8.595448615741412e-08, "loss": 0.3459, "step": 18918 }, { "epoch": 0.914093830023675, "grad_norm": 2.9063565731048584, "learning_rate": 8.590616997632507e-08, "loss": 0.4531, "step": 18919 }, { "epoch": 0.914142146204764, "grad_norm": 3.2740697860717773, "learning_rate": 8.585785379523602e-08, "loss": 0.2842, "step": 18920 }, { "epoch": 0.914190462385853, "grad_norm": 3.1498916149139404, "learning_rate": 8.580953761414697e-08, "loss": 0.2722, "step": 18921 }, { "epoch": 0.914238778566942, "grad_norm": 8.944080352783203, "learning_rate": 8.576122143305793e-08, "loss": 0.2148, "step": 18922 }, { "epoch": 0.9142870947480312, "grad_norm": 3.083798885345459, "learning_rate": 8.571290525196888e-08, "loss": 0.3512, "step": 18923 }, { "epoch": 0.9143354109291202, "grad_norm": 3.1158087253570557, "learning_rate": 8.566458907087983e-08, "loss": 0.2996, "step": 18924 }, { "epoch": 0.9143837271102092, "grad_norm": 2.8890323638916016, "learning_rate": 8.561627288979078e-08, "loss": 0.3576, "step": 18925 }, { "epoch": 0.9144320432912982, "grad_norm": 2.7291646003723145, "learning_rate": 8.556795670870175e-08, "loss": 0.42, "step": 18926 }, { "epoch": 0.9144803594723873, "grad_norm": 2.6324658393859863, "learning_rate": 8.55196405276127e-08, "loss": 0.3634, "step": 18927 }, { "epoch": 0.9145286756534764, "grad_norm": 2.412137746810913, "learning_rate": 8.547132434652364e-08, "loss": 0.2944, "step": 18928 }, { "epoch": 0.9145769918345654, "grad_norm": 2.5608959197998047, "learning_rate": 8.54230081654346e-08, "loss": 0.3432, "step": 18929 }, { "epoch": 0.9146253080156544, "grad_norm": 2.3086307048797607, "learning_rate": 8.537469198434555e-08, "loss": 0.2892, "step": 18930 }, { "epoch": 0.9146736241967435, "grad_norm": 2.9489798545837402, "learning_rate": 8.532637580325651e-08, "loss": 0.3143, "step": 18931 }, { "epoch": 0.9147219403778325, "grad_norm": 1.8562390804290771, "learning_rate": 8.527805962216745e-08, "loss": 0.2309, "step": 18932 }, { "epoch": 0.9147702565589216, "grad_norm": 2.3928797245025635, "learning_rate": 8.522974344107842e-08, "loss": 0.2443, "step": 18933 }, { "epoch": 0.9148185727400107, "grad_norm": 2.7864131927490234, "learning_rate": 8.518142725998937e-08, "loss": 0.2857, "step": 18934 }, { "epoch": 0.9148668889210997, "grad_norm": 1.8470622301101685, "learning_rate": 8.513311107890033e-08, "loss": 0.215, "step": 18935 }, { "epoch": 0.9149152051021887, "grad_norm": 2.7980329990386963, "learning_rate": 8.508479489781127e-08, "loss": 0.2742, "step": 18936 }, { "epoch": 0.9149635212832777, "grad_norm": 2.6562812328338623, "learning_rate": 8.503647871672223e-08, "loss": 0.2449, "step": 18937 }, { "epoch": 0.9150118374643668, "grad_norm": 2.7919247150421143, "learning_rate": 8.498816253563318e-08, "loss": 0.3619, "step": 18938 }, { "epoch": 0.9150601536454559, "grad_norm": 6.497939586639404, "learning_rate": 8.493984635454414e-08, "loss": 0.2016, "step": 18939 }, { "epoch": 0.9151084698265449, "grad_norm": 9.767253875732422, "learning_rate": 8.489153017345508e-08, "loss": 0.3237, "step": 18940 }, { "epoch": 0.915156786007634, "grad_norm": 2.944845676422119, "learning_rate": 8.484321399236603e-08, "loss": 0.2501, "step": 18941 }, { "epoch": 0.915205102188723, "grad_norm": 10.47232723236084, "learning_rate": 8.4794897811277e-08, "loss": 0.2939, "step": 18942 }, { "epoch": 0.915253418369812, "grad_norm": 3.239441156387329, "learning_rate": 8.474658163018795e-08, "loss": 0.3128, "step": 18943 }, { "epoch": 0.9153017345509011, "grad_norm": 1.9399940967559814, "learning_rate": 8.46982654490989e-08, "loss": 0.194, "step": 18944 }, { "epoch": 0.9153500507319902, "grad_norm": 3.5230941772460938, "learning_rate": 8.464994926800985e-08, "loss": 0.4452, "step": 18945 }, { "epoch": 0.9153983669130792, "grad_norm": 3.2677419185638428, "learning_rate": 8.460163308692081e-08, "loss": 0.2032, "step": 18946 }, { "epoch": 0.9154466830941682, "grad_norm": 18.367950439453125, "learning_rate": 8.455331690583176e-08, "loss": 0.3118, "step": 18947 }, { "epoch": 0.9154949992752572, "grad_norm": 2.1968331336975098, "learning_rate": 8.450500072474271e-08, "loss": 0.1871, "step": 18948 }, { "epoch": 0.9155433154563464, "grad_norm": 2.5091657638549805, "learning_rate": 8.445668454365366e-08, "loss": 0.3311, "step": 18949 }, { "epoch": 0.9155916316374354, "grad_norm": 4.558913230895996, "learning_rate": 8.440836836256463e-08, "loss": 0.3121, "step": 18950 }, { "epoch": 0.9156399478185244, "grad_norm": 2.1888821125030518, "learning_rate": 8.436005218147558e-08, "loss": 0.2301, "step": 18951 }, { "epoch": 0.9156882639996134, "grad_norm": 2.774587392807007, "learning_rate": 8.431173600038653e-08, "loss": 0.3814, "step": 18952 }, { "epoch": 0.9157365801807025, "grad_norm": 3.0075714588165283, "learning_rate": 8.426341981929748e-08, "loss": 0.3635, "step": 18953 }, { "epoch": 0.9157848963617916, "grad_norm": 2.6342360973358154, "learning_rate": 8.421510363820843e-08, "loss": 0.2606, "step": 18954 }, { "epoch": 0.9158332125428806, "grad_norm": 2.9741251468658447, "learning_rate": 8.416678745711939e-08, "loss": 0.2808, "step": 18955 }, { "epoch": 0.9158815287239697, "grad_norm": 2.782402276992798, "learning_rate": 8.411847127603033e-08, "loss": 0.306, "step": 18956 }, { "epoch": 0.9159298449050587, "grad_norm": 3.9868788719177246, "learning_rate": 8.40701550949413e-08, "loss": 0.2186, "step": 18957 }, { "epoch": 0.9159781610861477, "grad_norm": 2.3797409534454346, "learning_rate": 8.402183891385224e-08, "loss": 0.2495, "step": 18958 }, { "epoch": 0.9160264772672368, "grad_norm": 2.402904748916626, "learning_rate": 8.397352273276321e-08, "loss": 0.32, "step": 18959 }, { "epoch": 0.9160747934483259, "grad_norm": 2.9717700481414795, "learning_rate": 8.392520655167414e-08, "loss": 0.3278, "step": 18960 }, { "epoch": 0.9161231096294149, "grad_norm": 3.0445213317871094, "learning_rate": 8.387689037058511e-08, "loss": 0.3113, "step": 18961 }, { "epoch": 0.9161714258105039, "grad_norm": 3.081376791000366, "learning_rate": 8.382857418949606e-08, "loss": 0.2611, "step": 18962 }, { "epoch": 0.916219741991593, "grad_norm": 3.060255289077759, "learning_rate": 8.378025800840702e-08, "loss": 0.3213, "step": 18963 }, { "epoch": 0.916268058172682, "grad_norm": 9.724661827087402, "learning_rate": 8.373194182731796e-08, "loss": 0.3889, "step": 18964 }, { "epoch": 0.9163163743537711, "grad_norm": 3.800945997238159, "learning_rate": 8.368362564622891e-08, "loss": 0.267, "step": 18965 }, { "epoch": 0.9163646905348601, "grad_norm": 4.593433380126953, "learning_rate": 8.363530946513987e-08, "loss": 0.2931, "step": 18966 }, { "epoch": 0.9164130067159492, "grad_norm": 4.903666019439697, "learning_rate": 8.358699328405083e-08, "loss": 0.3869, "step": 18967 }, { "epoch": 0.9164613228970382, "grad_norm": 3.6876919269561768, "learning_rate": 8.353867710296178e-08, "loss": 0.3224, "step": 18968 }, { "epoch": 0.9165096390781272, "grad_norm": 3.4517457485198975, "learning_rate": 8.349036092187273e-08, "loss": 0.4007, "step": 18969 }, { "epoch": 0.9165579552592164, "grad_norm": 4.1421217918396, "learning_rate": 8.344204474078369e-08, "loss": 0.4261, "step": 18970 }, { "epoch": 0.9166062714403054, "grad_norm": 3.6263198852539062, "learning_rate": 8.339372855969464e-08, "loss": 0.3638, "step": 18971 }, { "epoch": 0.9166545876213944, "grad_norm": 1.7236069440841675, "learning_rate": 8.334541237860559e-08, "loss": 0.1863, "step": 18972 }, { "epoch": 0.9167029038024834, "grad_norm": 3.9007303714752197, "learning_rate": 8.329709619751654e-08, "loss": 0.2797, "step": 18973 }, { "epoch": 0.9167512199835725, "grad_norm": 2.6502318382263184, "learning_rate": 8.32487800164275e-08, "loss": 0.2564, "step": 18974 }, { "epoch": 0.9167995361646616, "grad_norm": 2.9851725101470947, "learning_rate": 8.320046383533846e-08, "loss": 0.3619, "step": 18975 }, { "epoch": 0.9168478523457506, "grad_norm": 2.8417627811431885, "learning_rate": 8.31521476542494e-08, "loss": 0.3839, "step": 18976 }, { "epoch": 0.9168961685268396, "grad_norm": 1.9664353132247925, "learning_rate": 8.310383147316036e-08, "loss": 0.1939, "step": 18977 }, { "epoch": 0.9169444847079287, "grad_norm": 2.7336578369140625, "learning_rate": 8.305551529207131e-08, "loss": 0.2765, "step": 18978 }, { "epoch": 0.9169928008890177, "grad_norm": 3.1071434020996094, "learning_rate": 8.300719911098227e-08, "loss": 0.3351, "step": 18979 }, { "epoch": 0.9170411170701068, "grad_norm": 2.2107508182525635, "learning_rate": 8.295888292989321e-08, "loss": 0.1849, "step": 18980 }, { "epoch": 0.9170894332511959, "grad_norm": 2.6568500995635986, "learning_rate": 8.291056674880417e-08, "loss": 0.2625, "step": 18981 }, { "epoch": 0.9171377494322849, "grad_norm": 12.193902015686035, "learning_rate": 8.286225056771512e-08, "loss": 0.2612, "step": 18982 }, { "epoch": 0.9171860656133739, "grad_norm": 2.5925161838531494, "learning_rate": 8.281393438662609e-08, "loss": 0.2542, "step": 18983 }, { "epoch": 0.9172343817944629, "grad_norm": 2.570636510848999, "learning_rate": 8.276561820553702e-08, "loss": 0.2495, "step": 18984 }, { "epoch": 0.9172826979755521, "grad_norm": 2.5380032062530518, "learning_rate": 8.271730202444799e-08, "loss": 0.2471, "step": 18985 }, { "epoch": 0.9173310141566411, "grad_norm": 29.499771118164062, "learning_rate": 8.266898584335894e-08, "loss": 0.3755, "step": 18986 }, { "epoch": 0.9173793303377301, "grad_norm": 2.0314242839813232, "learning_rate": 8.26206696622699e-08, "loss": 0.235, "step": 18987 }, { "epoch": 0.9174276465188191, "grad_norm": 2.3364734649658203, "learning_rate": 8.257235348118084e-08, "loss": 0.2836, "step": 18988 }, { "epoch": 0.9174759626999082, "grad_norm": 3.019855260848999, "learning_rate": 8.25240373000918e-08, "loss": 0.3223, "step": 18989 }, { "epoch": 0.9175242788809972, "grad_norm": 3.9109275341033936, "learning_rate": 8.247572111900275e-08, "loss": 0.4034, "step": 18990 }, { "epoch": 0.9175725950620863, "grad_norm": 2.6967661380767822, "learning_rate": 8.24274049379137e-08, "loss": 0.3494, "step": 18991 }, { "epoch": 0.9176209112431754, "grad_norm": 3.3715975284576416, "learning_rate": 8.237908875682465e-08, "loss": 0.389, "step": 18992 }, { "epoch": 0.9176692274242644, "grad_norm": 3.0475592613220215, "learning_rate": 8.23307725757356e-08, "loss": 0.3863, "step": 18993 }, { "epoch": 0.9177175436053534, "grad_norm": 3.1584391593933105, "learning_rate": 8.228245639464657e-08, "loss": 0.2126, "step": 18994 }, { "epoch": 0.9177658597864424, "grad_norm": 3.0046937465667725, "learning_rate": 8.223414021355752e-08, "loss": 0.2843, "step": 18995 }, { "epoch": 0.9178141759675316, "grad_norm": 1.7584130764007568, "learning_rate": 8.218582403246847e-08, "loss": 0.2614, "step": 18996 }, { "epoch": 0.9178624921486206, "grad_norm": 2.7341349124908447, "learning_rate": 8.213750785137942e-08, "loss": 0.3823, "step": 18997 }, { "epoch": 0.9179108083297096, "grad_norm": 3.0779976844787598, "learning_rate": 8.208919167029038e-08, "loss": 0.3852, "step": 18998 }, { "epoch": 0.9179591245107986, "grad_norm": 2.4308981895446777, "learning_rate": 8.204087548920133e-08, "loss": 0.3103, "step": 18999 }, { "epoch": 0.9180074406918877, "grad_norm": 3.263763904571533, "learning_rate": 8.199255930811228e-08, "loss": 0.357, "step": 19000 }, { "epoch": 0.9180557568729768, "grad_norm": 2.772817373275757, "learning_rate": 8.194424312702324e-08, "loss": 0.3654, "step": 19001 }, { "epoch": 0.9181040730540658, "grad_norm": 2.639843702316284, "learning_rate": 8.18959269459342e-08, "loss": 0.322, "step": 19002 }, { "epoch": 0.9181523892351549, "grad_norm": 2.4111328125, "learning_rate": 8.184761076484515e-08, "loss": 0.2306, "step": 19003 }, { "epoch": 0.9182007054162439, "grad_norm": 2.6430985927581787, "learning_rate": 8.179929458375609e-08, "loss": 0.407, "step": 19004 }, { "epoch": 0.9182490215973329, "grad_norm": 5.647798538208008, "learning_rate": 8.175097840266705e-08, "loss": 0.2575, "step": 19005 }, { "epoch": 0.918297337778422, "grad_norm": 2.7852094173431396, "learning_rate": 8.1702662221578e-08, "loss": 0.4001, "step": 19006 }, { "epoch": 0.9183456539595111, "grad_norm": 7.2014875411987305, "learning_rate": 8.165434604048896e-08, "loss": 0.3347, "step": 19007 }, { "epoch": 0.9183939701406001, "grad_norm": 2.972531318664551, "learning_rate": 8.16060298593999e-08, "loss": 0.2562, "step": 19008 }, { "epoch": 0.9184422863216891, "grad_norm": 1.98923659324646, "learning_rate": 8.155771367831087e-08, "loss": 0.1877, "step": 19009 }, { "epoch": 0.9184906025027781, "grad_norm": 3.273293972015381, "learning_rate": 8.150939749722182e-08, "loss": 0.3599, "step": 19010 }, { "epoch": 0.9185389186838673, "grad_norm": 3.583875894546509, "learning_rate": 8.146108131613278e-08, "loss": 0.2321, "step": 19011 }, { "epoch": 0.9185872348649563, "grad_norm": 2.519949436187744, "learning_rate": 8.141276513504372e-08, "loss": 0.3101, "step": 19012 }, { "epoch": 0.9186355510460453, "grad_norm": 3.3675448894500732, "learning_rate": 8.136444895395468e-08, "loss": 0.3089, "step": 19013 }, { "epoch": 0.9186838672271344, "grad_norm": 3.535520076751709, "learning_rate": 8.131613277286563e-08, "loss": 0.3831, "step": 19014 }, { "epoch": 0.9187321834082234, "grad_norm": 6.914755344390869, "learning_rate": 8.12678165917766e-08, "loss": 0.3522, "step": 19015 }, { "epoch": 0.9187804995893125, "grad_norm": 4.279550552368164, "learning_rate": 8.121950041068753e-08, "loss": 0.5133, "step": 19016 }, { "epoch": 0.9188288157704015, "grad_norm": 3.6135213375091553, "learning_rate": 8.117118422959848e-08, "loss": 0.3072, "step": 19017 }, { "epoch": 0.9188771319514906, "grad_norm": 2.2085719108581543, "learning_rate": 8.112286804850945e-08, "loss": 0.2423, "step": 19018 }, { "epoch": 0.9189254481325796, "grad_norm": 4.6490631103515625, "learning_rate": 8.107455186742038e-08, "loss": 0.3172, "step": 19019 }, { "epoch": 0.9189737643136686, "grad_norm": 3.5886406898498535, "learning_rate": 8.102623568633135e-08, "loss": 0.2809, "step": 19020 }, { "epoch": 0.9190220804947576, "grad_norm": 2.387608289718628, "learning_rate": 8.09779195052423e-08, "loss": 0.2488, "step": 19021 }, { "epoch": 0.9190703966758468, "grad_norm": 2.751286506652832, "learning_rate": 8.092960332415326e-08, "loss": 0.2335, "step": 19022 }, { "epoch": 0.9191187128569358, "grad_norm": 2.151434898376465, "learning_rate": 8.08812871430642e-08, "loss": 0.2132, "step": 19023 }, { "epoch": 0.9191670290380248, "grad_norm": 2.6851978302001953, "learning_rate": 8.083297096197516e-08, "loss": 0.3798, "step": 19024 }, { "epoch": 0.9192153452191139, "grad_norm": 2.6997451782226562, "learning_rate": 8.078465478088611e-08, "loss": 0.3527, "step": 19025 }, { "epoch": 0.9192636614002029, "grad_norm": 10.437475204467773, "learning_rate": 8.073633859979708e-08, "loss": 0.3036, "step": 19026 }, { "epoch": 0.919311977581292, "grad_norm": 3.8525469303131104, "learning_rate": 8.068802241870801e-08, "loss": 0.2726, "step": 19027 }, { "epoch": 0.919360293762381, "grad_norm": 2.6752865314483643, "learning_rate": 8.063970623761898e-08, "loss": 0.2817, "step": 19028 }, { "epoch": 0.9194086099434701, "grad_norm": 2.740863084793091, "learning_rate": 8.059139005652993e-08, "loss": 0.2974, "step": 19029 }, { "epoch": 0.9194569261245591, "grad_norm": 3.0018701553344727, "learning_rate": 8.054307387544088e-08, "loss": 0.4639, "step": 19030 }, { "epoch": 0.9195052423056481, "grad_norm": 1.7043986320495605, "learning_rate": 8.049475769435183e-08, "loss": 0.1763, "step": 19031 }, { "epoch": 0.9195535584867373, "grad_norm": 2.8840720653533936, "learning_rate": 8.044644151326278e-08, "loss": 0.3433, "step": 19032 }, { "epoch": 0.9196018746678263, "grad_norm": 2.347348213195801, "learning_rate": 8.039812533217374e-08, "loss": 0.3694, "step": 19033 }, { "epoch": 0.9196501908489153, "grad_norm": 2.595494031906128, "learning_rate": 8.03498091510847e-08, "loss": 0.3163, "step": 19034 }, { "epoch": 0.9196985070300043, "grad_norm": 3.0891830921173096, "learning_rate": 8.030149296999566e-08, "loss": 0.3665, "step": 19035 }, { "epoch": 0.9197468232110934, "grad_norm": 3.1842613220214844, "learning_rate": 8.02531767889066e-08, "loss": 0.2941, "step": 19036 }, { "epoch": 0.9197951393921825, "grad_norm": 3.91371488571167, "learning_rate": 8.020486060781756e-08, "loss": 0.2925, "step": 19037 }, { "epoch": 0.9198434555732715, "grad_norm": 7.880457878112793, "learning_rate": 8.015654442672851e-08, "loss": 0.3599, "step": 19038 }, { "epoch": 0.9198917717543605, "grad_norm": 2.136793375015259, "learning_rate": 8.010822824563947e-08, "loss": 0.242, "step": 19039 }, { "epoch": 0.9199400879354496, "grad_norm": 5.972498893737793, "learning_rate": 8.005991206455041e-08, "loss": 0.2725, "step": 19040 }, { "epoch": 0.9199884041165386, "grad_norm": 2.9005672931671143, "learning_rate": 8.001159588346136e-08, "loss": 0.3533, "step": 19041 }, { "epoch": 0.9200367202976277, "grad_norm": 2.6894922256469727, "learning_rate": 7.996327970237233e-08, "loss": 0.2707, "step": 19042 }, { "epoch": 0.9200850364787168, "grad_norm": 3.4877991676330566, "learning_rate": 7.991496352128326e-08, "loss": 0.379, "step": 19043 }, { "epoch": 0.9201333526598058, "grad_norm": 2.1437599658966064, "learning_rate": 7.986664734019423e-08, "loss": 0.2495, "step": 19044 }, { "epoch": 0.9201816688408948, "grad_norm": 2.049596071243286, "learning_rate": 7.981833115910518e-08, "loss": 0.2654, "step": 19045 }, { "epoch": 0.9202299850219838, "grad_norm": 2.646946668624878, "learning_rate": 7.977001497801614e-08, "loss": 0.2413, "step": 19046 }, { "epoch": 0.9202783012030729, "grad_norm": 2.8291525840759277, "learning_rate": 7.972169879692708e-08, "loss": 0.3125, "step": 19047 }, { "epoch": 0.920326617384162, "grad_norm": 2.5509510040283203, "learning_rate": 7.967338261583804e-08, "loss": 0.2708, "step": 19048 }, { "epoch": 0.920374933565251, "grad_norm": 3.0243616104125977, "learning_rate": 7.962506643474899e-08, "loss": 0.292, "step": 19049 }, { "epoch": 0.92042324974634, "grad_norm": 3.1983423233032227, "learning_rate": 7.957675025365996e-08, "loss": 0.3842, "step": 19050 }, { "epoch": 0.9204715659274291, "grad_norm": 2.772047519683838, "learning_rate": 7.952843407257089e-08, "loss": 0.3008, "step": 19051 }, { "epoch": 0.9205198821085181, "grad_norm": 2.690408706665039, "learning_rate": 7.948011789148186e-08, "loss": 0.3297, "step": 19052 }, { "epoch": 0.9205681982896072, "grad_norm": 4.232253074645996, "learning_rate": 7.943180171039281e-08, "loss": 0.3244, "step": 19053 }, { "epoch": 0.9206165144706963, "grad_norm": 2.9358413219451904, "learning_rate": 7.938348552930376e-08, "loss": 0.2085, "step": 19054 }, { "epoch": 0.9206648306517853, "grad_norm": 4.767272472381592, "learning_rate": 7.933516934821471e-08, "loss": 0.3392, "step": 19055 }, { "epoch": 0.9207131468328743, "grad_norm": 2.1790692806243896, "learning_rate": 7.928685316712566e-08, "loss": 0.2098, "step": 19056 }, { "epoch": 0.9207614630139633, "grad_norm": 2.409872531890869, "learning_rate": 7.923853698603662e-08, "loss": 0.2774, "step": 19057 }, { "epoch": 0.9208097791950525, "grad_norm": 4.119383335113525, "learning_rate": 7.919022080494757e-08, "loss": 0.3812, "step": 19058 }, { "epoch": 0.9208580953761415, "grad_norm": 2.3153181076049805, "learning_rate": 7.914190462385852e-08, "loss": 0.2555, "step": 19059 }, { "epoch": 0.9209064115572305, "grad_norm": 3.4597506523132324, "learning_rate": 7.909358844276947e-08, "loss": 0.3263, "step": 19060 }, { "epoch": 0.9209547277383195, "grad_norm": 2.596468210220337, "learning_rate": 7.904527226168044e-08, "loss": 0.33, "step": 19061 }, { "epoch": 0.9210030439194086, "grad_norm": 3.975632429122925, "learning_rate": 7.899695608059139e-08, "loss": 0.4517, "step": 19062 }, { "epoch": 0.9210513601004977, "grad_norm": 2.7985377311706543, "learning_rate": 7.894863989950234e-08, "loss": 0.285, "step": 19063 }, { "epoch": 0.9210996762815867, "grad_norm": 8.609846115112305, "learning_rate": 7.890032371841329e-08, "loss": 0.4144, "step": 19064 }, { "epoch": 0.9211479924626758, "grad_norm": 3.859135389328003, "learning_rate": 7.885200753732425e-08, "loss": 0.2899, "step": 19065 }, { "epoch": 0.9211963086437648, "grad_norm": 2.5157923698425293, "learning_rate": 7.88036913562352e-08, "loss": 0.3203, "step": 19066 }, { "epoch": 0.9212446248248538, "grad_norm": 2.9623894691467285, "learning_rate": 7.875537517514614e-08, "loss": 0.2902, "step": 19067 }, { "epoch": 0.921292941005943, "grad_norm": 3.7685840129852295, "learning_rate": 7.87070589940571e-08, "loss": 0.3991, "step": 19068 }, { "epoch": 0.921341257187032, "grad_norm": 2.7792937755584717, "learning_rate": 7.865874281296806e-08, "loss": 0.3563, "step": 19069 }, { "epoch": 0.921389573368121, "grad_norm": 5.933513164520264, "learning_rate": 7.861042663187902e-08, "loss": 0.3005, "step": 19070 }, { "epoch": 0.92143788954921, "grad_norm": 2.085742235183716, "learning_rate": 7.856211045078996e-08, "loss": 0.2174, "step": 19071 }, { "epoch": 0.921486205730299, "grad_norm": 2.8372714519500732, "learning_rate": 7.851379426970092e-08, "loss": 0.2461, "step": 19072 }, { "epoch": 0.9215345219113881, "grad_norm": 3.9903666973114014, "learning_rate": 7.846547808861187e-08, "loss": 0.38, "step": 19073 }, { "epoch": 0.9215828380924772, "grad_norm": 2.7219080924987793, "learning_rate": 7.841716190752283e-08, "loss": 0.3415, "step": 19074 }, { "epoch": 0.9216311542735662, "grad_norm": 2.7383995056152344, "learning_rate": 7.836884572643377e-08, "loss": 0.3607, "step": 19075 }, { "epoch": 0.9216794704546553, "grad_norm": 2.518385648727417, "learning_rate": 7.832052954534474e-08, "loss": 0.2434, "step": 19076 }, { "epoch": 0.9217277866357443, "grad_norm": 11.258025169372559, "learning_rate": 7.827221336425569e-08, "loss": 0.2525, "step": 19077 }, { "epoch": 0.9217761028168333, "grad_norm": 3.0537734031677246, "learning_rate": 7.822389718316665e-08, "loss": 0.3867, "step": 19078 }, { "epoch": 0.9218244189979224, "grad_norm": 3.1705780029296875, "learning_rate": 7.817558100207759e-08, "loss": 0.3139, "step": 19079 }, { "epoch": 0.9218727351790115, "grad_norm": 2.5494062900543213, "learning_rate": 7.812726482098854e-08, "loss": 0.3216, "step": 19080 }, { "epoch": 0.9219210513601005, "grad_norm": 2.6445679664611816, "learning_rate": 7.80789486398995e-08, "loss": 0.2395, "step": 19081 }, { "epoch": 0.9219693675411895, "grad_norm": 1.6696293354034424, "learning_rate": 7.803063245881045e-08, "loss": 0.1917, "step": 19082 }, { "epoch": 0.9220176837222785, "grad_norm": 1.7010221481323242, "learning_rate": 7.79823162777214e-08, "loss": 0.1792, "step": 19083 }, { "epoch": 0.9220659999033677, "grad_norm": 2.4895882606506348, "learning_rate": 7.793400009663235e-08, "loss": 0.2747, "step": 19084 }, { "epoch": 0.9221143160844567, "grad_norm": 2.9460232257843018, "learning_rate": 7.788568391554332e-08, "loss": 0.2535, "step": 19085 }, { "epoch": 0.9221626322655457, "grad_norm": 3.565650463104248, "learning_rate": 7.783736773445427e-08, "loss": 0.4452, "step": 19086 }, { "epoch": 0.9222109484466348, "grad_norm": 4.301132678985596, "learning_rate": 7.778905155336522e-08, "loss": 0.2486, "step": 19087 }, { "epoch": 0.9222592646277238, "grad_norm": 2.2135322093963623, "learning_rate": 7.774073537227617e-08, "loss": 0.2547, "step": 19088 }, { "epoch": 0.9223075808088129, "grad_norm": 2.413916826248169, "learning_rate": 7.769241919118713e-08, "loss": 0.37, "step": 19089 }, { "epoch": 0.922355896989902, "grad_norm": 3.4975435733795166, "learning_rate": 7.764410301009808e-08, "loss": 0.3463, "step": 19090 }, { "epoch": 0.922404213170991, "grad_norm": 2.8258678913116455, "learning_rate": 7.759578682900903e-08, "loss": 0.1944, "step": 19091 }, { "epoch": 0.92245252935208, "grad_norm": 7.180146217346191, "learning_rate": 7.754747064791998e-08, "loss": 0.2978, "step": 19092 }, { "epoch": 0.922500845533169, "grad_norm": 1.869401216506958, "learning_rate": 7.749915446683093e-08, "loss": 0.2059, "step": 19093 }, { "epoch": 0.9225491617142582, "grad_norm": 2.870068073272705, "learning_rate": 7.74508382857419e-08, "loss": 0.3252, "step": 19094 }, { "epoch": 0.9225974778953472, "grad_norm": 3.8442182540893555, "learning_rate": 7.740252210465283e-08, "loss": 0.2782, "step": 19095 }, { "epoch": 0.9226457940764362, "grad_norm": 4.818033695220947, "learning_rate": 7.73542059235638e-08, "loss": 0.2344, "step": 19096 }, { "epoch": 0.9226941102575252, "grad_norm": 5.876935958862305, "learning_rate": 7.730588974247475e-08, "loss": 0.2936, "step": 19097 }, { "epoch": 0.9227424264386143, "grad_norm": 2.502514600753784, "learning_rate": 7.725757356138571e-08, "loss": 0.2025, "step": 19098 }, { "epoch": 0.9227907426197033, "grad_norm": 2.3747715950012207, "learning_rate": 7.720925738029665e-08, "loss": 0.2738, "step": 19099 }, { "epoch": 0.9228390588007924, "grad_norm": 3.619025707244873, "learning_rate": 7.716094119920761e-08, "loss": 0.3762, "step": 19100 }, { "epoch": 0.9228873749818814, "grad_norm": 2.3167123794555664, "learning_rate": 7.711262501811856e-08, "loss": 0.2905, "step": 19101 }, { "epoch": 0.9229356911629705, "grad_norm": 3.099506139755249, "learning_rate": 7.706430883702953e-08, "loss": 0.2913, "step": 19102 }, { "epoch": 0.9229840073440595, "grad_norm": 2.3994476795196533, "learning_rate": 7.701599265594047e-08, "loss": 0.2776, "step": 19103 }, { "epoch": 0.9230323235251485, "grad_norm": 3.059469699859619, "learning_rate": 7.696767647485142e-08, "loss": 0.3309, "step": 19104 }, { "epoch": 0.9230806397062377, "grad_norm": 3.9633398056030273, "learning_rate": 7.691936029376238e-08, "loss": 0.3963, "step": 19105 }, { "epoch": 0.9231289558873267, "grad_norm": 2.9558699131011963, "learning_rate": 7.687104411267333e-08, "loss": 0.3534, "step": 19106 }, { "epoch": 0.9231772720684157, "grad_norm": 2.6261160373687744, "learning_rate": 7.682272793158428e-08, "loss": 0.3138, "step": 19107 }, { "epoch": 0.9232255882495047, "grad_norm": 2.5248494148254395, "learning_rate": 7.677441175049523e-08, "loss": 0.2827, "step": 19108 }, { "epoch": 0.9232739044305938, "grad_norm": 5.194558143615723, "learning_rate": 7.67260955694062e-08, "loss": 0.352, "step": 19109 }, { "epoch": 0.9233222206116829, "grad_norm": 2.4816181659698486, "learning_rate": 7.667777938831715e-08, "loss": 0.2502, "step": 19110 }, { "epoch": 0.9233705367927719, "grad_norm": 2.8157873153686523, "learning_rate": 7.66294632072281e-08, "loss": 0.3966, "step": 19111 }, { "epoch": 0.923418852973861, "grad_norm": 2.2165541648864746, "learning_rate": 7.658114702613905e-08, "loss": 0.2506, "step": 19112 }, { "epoch": 0.92346716915495, "grad_norm": 2.3026764392852783, "learning_rate": 7.653283084505001e-08, "loss": 0.2714, "step": 19113 }, { "epoch": 0.923515485336039, "grad_norm": 2.420907974243164, "learning_rate": 7.648451466396096e-08, "loss": 0.1815, "step": 19114 }, { "epoch": 0.9235638015171281, "grad_norm": 2.9240047931671143, "learning_rate": 7.643619848287191e-08, "loss": 0.3449, "step": 19115 }, { "epoch": 0.9236121176982172, "grad_norm": 2.716146469116211, "learning_rate": 7.638788230178286e-08, "loss": 0.2212, "step": 19116 }, { "epoch": 0.9236604338793062, "grad_norm": 3.092796564102173, "learning_rate": 7.633956612069381e-08, "loss": 0.3212, "step": 19117 }, { "epoch": 0.9237087500603952, "grad_norm": 2.7617948055267334, "learning_rate": 7.629124993960478e-08, "loss": 0.3897, "step": 19118 }, { "epoch": 0.9237570662414842, "grad_norm": 45.42377853393555, "learning_rate": 7.624293375851571e-08, "loss": 0.2565, "step": 19119 }, { "epoch": 0.9238053824225734, "grad_norm": 9.337677955627441, "learning_rate": 7.619461757742668e-08, "loss": 0.25, "step": 19120 }, { "epoch": 0.9238536986036624, "grad_norm": 2.2579092979431152, "learning_rate": 7.614630139633763e-08, "loss": 0.2934, "step": 19121 }, { "epoch": 0.9239020147847514, "grad_norm": 2.671720504760742, "learning_rate": 7.609798521524859e-08, "loss": 0.2972, "step": 19122 }, { "epoch": 0.9239503309658404, "grad_norm": 4.442725658416748, "learning_rate": 7.604966903415953e-08, "loss": 0.2662, "step": 19123 }, { "epoch": 0.9239986471469295, "grad_norm": 3.254222869873047, "learning_rate": 7.600135285307049e-08, "loss": 0.3238, "step": 19124 }, { "epoch": 0.9240469633280185, "grad_norm": 2.47072172164917, "learning_rate": 7.595303667198144e-08, "loss": 0.3269, "step": 19125 }, { "epoch": 0.9240952795091076, "grad_norm": 2.817279577255249, "learning_rate": 7.59047204908924e-08, "loss": 0.2804, "step": 19126 }, { "epoch": 0.9241435956901967, "grad_norm": 2.613889455795288, "learning_rate": 7.585640430980334e-08, "loss": 0.2184, "step": 19127 }, { "epoch": 0.9241919118712857, "grad_norm": 3.2378554344177246, "learning_rate": 7.580808812871431e-08, "loss": 0.3115, "step": 19128 }, { "epoch": 0.9242402280523747, "grad_norm": 2.4195170402526855, "learning_rate": 7.575977194762526e-08, "loss": 0.2477, "step": 19129 }, { "epoch": 0.9242885442334637, "grad_norm": 3.276357650756836, "learning_rate": 7.571145576653621e-08, "loss": 0.3549, "step": 19130 }, { "epoch": 0.9243368604145529, "grad_norm": 3.581252336502075, "learning_rate": 7.566313958544716e-08, "loss": 0.4002, "step": 19131 }, { "epoch": 0.9243851765956419, "grad_norm": 2.95497465133667, "learning_rate": 7.561482340435811e-08, "loss": 0.3866, "step": 19132 }, { "epoch": 0.9244334927767309, "grad_norm": 1.7687475681304932, "learning_rate": 7.556650722326907e-08, "loss": 0.2537, "step": 19133 }, { "epoch": 0.92448180895782, "grad_norm": 2.9968042373657227, "learning_rate": 7.551819104218002e-08, "loss": 0.4219, "step": 19134 }, { "epoch": 0.924530125138909, "grad_norm": 2.82790470123291, "learning_rate": 7.546987486109097e-08, "loss": 0.3503, "step": 19135 }, { "epoch": 0.9245784413199981, "grad_norm": 3.0077834129333496, "learning_rate": 7.542155868000192e-08, "loss": 0.3424, "step": 19136 }, { "epoch": 0.9246267575010871, "grad_norm": 3.085339307785034, "learning_rate": 7.537324249891289e-08, "loss": 0.305, "step": 19137 }, { "epoch": 0.9246750736821762, "grad_norm": 2.462364435195923, "learning_rate": 7.532492631782384e-08, "loss": 0.2749, "step": 19138 }, { "epoch": 0.9247233898632652, "grad_norm": 3.3621320724487305, "learning_rate": 7.527661013673479e-08, "loss": 0.2864, "step": 19139 }, { "epoch": 0.9247717060443542, "grad_norm": 3.07452392578125, "learning_rate": 7.522829395564574e-08, "loss": 0.3704, "step": 19140 }, { "epoch": 0.9248200222254434, "grad_norm": 2.7442092895507812, "learning_rate": 7.51799777745567e-08, "loss": 0.3245, "step": 19141 }, { "epoch": 0.9248683384065324, "grad_norm": 2.3969318866729736, "learning_rate": 7.513166159346765e-08, "loss": 0.2108, "step": 19142 }, { "epoch": 0.9249166545876214, "grad_norm": 2.038658857345581, "learning_rate": 7.508334541237859e-08, "loss": 0.228, "step": 19143 }, { "epoch": 0.9249649707687104, "grad_norm": 2.5583739280700684, "learning_rate": 7.503502923128956e-08, "loss": 0.3005, "step": 19144 }, { "epoch": 0.9250132869497995, "grad_norm": 3.001601457595825, "learning_rate": 7.49867130502005e-08, "loss": 0.263, "step": 19145 }, { "epoch": 0.9250616031308886, "grad_norm": 3.1543209552764893, "learning_rate": 7.493839686911147e-08, "loss": 0.3581, "step": 19146 }, { "epoch": 0.9251099193119776, "grad_norm": 2.5793228149414062, "learning_rate": 7.489008068802241e-08, "loss": 0.3495, "step": 19147 }, { "epoch": 0.9251582354930666, "grad_norm": 2.480553388595581, "learning_rate": 7.484176450693337e-08, "loss": 0.2719, "step": 19148 }, { "epoch": 0.9252065516741557, "grad_norm": 2.6613707542419434, "learning_rate": 7.479344832584432e-08, "loss": 0.2848, "step": 19149 }, { "epoch": 0.9252548678552447, "grad_norm": 3.253898859024048, "learning_rate": 7.474513214475528e-08, "loss": 0.4432, "step": 19150 }, { "epoch": 0.9253031840363337, "grad_norm": 4.258608818054199, "learning_rate": 7.469681596366622e-08, "loss": 0.3131, "step": 19151 }, { "epoch": 0.9253515002174229, "grad_norm": 29.581623077392578, "learning_rate": 7.464849978257719e-08, "loss": 0.2395, "step": 19152 }, { "epoch": 0.9253998163985119, "grad_norm": 2.1865644454956055, "learning_rate": 7.460018360148814e-08, "loss": 0.24, "step": 19153 }, { "epoch": 0.9254481325796009, "grad_norm": 2.8351306915283203, "learning_rate": 7.45518674203991e-08, "loss": 0.3774, "step": 19154 }, { "epoch": 0.9254964487606899, "grad_norm": 2.715087413787842, "learning_rate": 7.450355123931004e-08, "loss": 0.3237, "step": 19155 }, { "epoch": 0.925544764941779, "grad_norm": 9.402433395385742, "learning_rate": 7.445523505822099e-08, "loss": 0.3756, "step": 19156 }, { "epoch": 0.9255930811228681, "grad_norm": 3.005286931991577, "learning_rate": 7.440691887713195e-08, "loss": 0.405, "step": 19157 }, { "epoch": 0.9256413973039571, "grad_norm": 2.783278226852417, "learning_rate": 7.43586026960429e-08, "loss": 0.2514, "step": 19158 }, { "epoch": 0.9256897134850461, "grad_norm": 3.243915557861328, "learning_rate": 7.431028651495385e-08, "loss": 0.4186, "step": 19159 }, { "epoch": 0.9257380296661352, "grad_norm": 2.601261615753174, "learning_rate": 7.42619703338648e-08, "loss": 0.3397, "step": 19160 }, { "epoch": 0.9257863458472242, "grad_norm": 2.63010311126709, "learning_rate": 7.421365415277577e-08, "loss": 0.1933, "step": 19161 }, { "epoch": 0.9258346620283133, "grad_norm": 4.515179634094238, "learning_rate": 7.416533797168672e-08, "loss": 0.2889, "step": 19162 }, { "epoch": 0.9258829782094024, "grad_norm": 2.4524412155151367, "learning_rate": 7.411702179059767e-08, "loss": 0.3323, "step": 19163 }, { "epoch": 0.9259312943904914, "grad_norm": 1.8204729557037354, "learning_rate": 7.406870560950862e-08, "loss": 0.207, "step": 19164 }, { "epoch": 0.9259796105715804, "grad_norm": 2.440452814102173, "learning_rate": 7.402038942841958e-08, "loss": 0.3703, "step": 19165 }, { "epoch": 0.9260279267526694, "grad_norm": 3.126028537750244, "learning_rate": 7.397207324733053e-08, "loss": 0.3978, "step": 19166 }, { "epoch": 0.9260762429337586, "grad_norm": 3.1593329906463623, "learning_rate": 7.392375706624148e-08, "loss": 0.1964, "step": 19167 }, { "epoch": 0.9261245591148476, "grad_norm": 2.939556121826172, "learning_rate": 7.387544088515243e-08, "loss": 0.2153, "step": 19168 }, { "epoch": 0.9261728752959366, "grad_norm": 9.831035614013672, "learning_rate": 7.382712470406338e-08, "loss": 0.3514, "step": 19169 }, { "epoch": 0.9262211914770256, "grad_norm": 2.636641263961792, "learning_rate": 7.377880852297435e-08, "loss": 0.3119, "step": 19170 }, { "epoch": 0.9262695076581147, "grad_norm": 3.6085143089294434, "learning_rate": 7.373049234188529e-08, "loss": 0.4119, "step": 19171 }, { "epoch": 0.9263178238392038, "grad_norm": 2.077350616455078, "learning_rate": 7.368217616079625e-08, "loss": 0.1794, "step": 19172 }, { "epoch": 0.9263661400202928, "grad_norm": 7.4951491355896, "learning_rate": 7.36338599797072e-08, "loss": 0.3582, "step": 19173 }, { "epoch": 0.9264144562013819, "grad_norm": 2.20603084564209, "learning_rate": 7.358554379861816e-08, "loss": 0.2705, "step": 19174 }, { "epoch": 0.9264627723824709, "grad_norm": 10.223661422729492, "learning_rate": 7.35372276175291e-08, "loss": 0.3325, "step": 19175 }, { "epoch": 0.9265110885635599, "grad_norm": 4.361868381500244, "learning_rate": 7.348891143644006e-08, "loss": 0.4468, "step": 19176 }, { "epoch": 0.9265594047446489, "grad_norm": 2.731147527694702, "learning_rate": 7.344059525535101e-08, "loss": 0.2842, "step": 19177 }, { "epoch": 0.9266077209257381, "grad_norm": 4.28366756439209, "learning_rate": 7.339227907426198e-08, "loss": 0.3571, "step": 19178 }, { "epoch": 0.9266560371068271, "grad_norm": 3.0342845916748047, "learning_rate": 7.334396289317292e-08, "loss": 0.2966, "step": 19179 }, { "epoch": 0.9267043532879161, "grad_norm": 3.1694300174713135, "learning_rate": 7.329564671208387e-08, "loss": 0.1894, "step": 19180 }, { "epoch": 0.9267526694690051, "grad_norm": 2.567791223526001, "learning_rate": 7.324733053099483e-08, "loss": 0.2597, "step": 19181 }, { "epoch": 0.9268009856500942, "grad_norm": 9.547707557678223, "learning_rate": 7.319901434990578e-08, "loss": 0.3767, "step": 19182 }, { "epoch": 0.9268493018311833, "grad_norm": 11.124794006347656, "learning_rate": 7.315069816881673e-08, "loss": 0.3051, "step": 19183 }, { "epoch": 0.9268976180122723, "grad_norm": 4.7031097412109375, "learning_rate": 7.310238198772768e-08, "loss": 0.2159, "step": 19184 }, { "epoch": 0.9269459341933614, "grad_norm": 2.3751773834228516, "learning_rate": 7.305406580663865e-08, "loss": 0.2789, "step": 19185 }, { "epoch": 0.9269942503744504, "grad_norm": 2.1816818714141846, "learning_rate": 7.30057496255496e-08, "loss": 0.2223, "step": 19186 }, { "epoch": 0.9270425665555394, "grad_norm": 2.48905611038208, "learning_rate": 7.295743344446055e-08, "loss": 0.2762, "step": 19187 }, { "epoch": 0.9270908827366285, "grad_norm": 3.4874019622802734, "learning_rate": 7.29091172633715e-08, "loss": 0.4173, "step": 19188 }, { "epoch": 0.9271391989177176, "grad_norm": 2.750986099243164, "learning_rate": 7.286080108228246e-08, "loss": 0.3309, "step": 19189 }, { "epoch": 0.9271875150988066, "grad_norm": 2.8278634548187256, "learning_rate": 7.281248490119341e-08, "loss": 0.3466, "step": 19190 }, { "epoch": 0.9272358312798956, "grad_norm": 4.170119762420654, "learning_rate": 7.276416872010436e-08, "loss": 0.2572, "step": 19191 }, { "epoch": 0.9272841474609846, "grad_norm": 3.1287899017333984, "learning_rate": 7.271585253901531e-08, "loss": 0.2714, "step": 19192 }, { "epoch": 0.9273324636420738, "grad_norm": 3.212691068649292, "learning_rate": 7.266753635792626e-08, "loss": 0.3042, "step": 19193 }, { "epoch": 0.9273807798231628, "grad_norm": 2.38997220993042, "learning_rate": 7.261922017683723e-08, "loss": 0.2031, "step": 19194 }, { "epoch": 0.9274290960042518, "grad_norm": 2.6642873287200928, "learning_rate": 7.257090399574816e-08, "loss": 0.3327, "step": 19195 }, { "epoch": 0.9274774121853409, "grad_norm": 2.3886947631835938, "learning_rate": 7.252258781465913e-08, "loss": 0.254, "step": 19196 }, { "epoch": 0.9275257283664299, "grad_norm": 2.9540772438049316, "learning_rate": 7.247427163357008e-08, "loss": 0.3588, "step": 19197 }, { "epoch": 0.927574044547519, "grad_norm": 2.2871735095977783, "learning_rate": 7.242595545248104e-08, "loss": 0.2495, "step": 19198 }, { "epoch": 0.927622360728608, "grad_norm": 2.291907787322998, "learning_rate": 7.237763927139198e-08, "loss": 0.2723, "step": 19199 }, { "epoch": 0.9276706769096971, "grad_norm": 1.9721410274505615, "learning_rate": 7.232932309030294e-08, "loss": 0.2015, "step": 19200 }, { "epoch": 0.9277189930907861, "grad_norm": 1.912175178527832, "learning_rate": 7.228100690921389e-08, "loss": 0.1961, "step": 19201 }, { "epoch": 0.9277673092718751, "grad_norm": 2.3162388801574707, "learning_rate": 7.223269072812486e-08, "loss": 0.2632, "step": 19202 }, { "epoch": 0.9278156254529641, "grad_norm": 3.9871106147766113, "learning_rate": 7.21843745470358e-08, "loss": 0.3672, "step": 19203 }, { "epoch": 0.9278639416340533, "grad_norm": 3.71743106842041, "learning_rate": 7.213605836594676e-08, "loss": 0.3581, "step": 19204 }, { "epoch": 0.9279122578151423, "grad_norm": 2.4249703884124756, "learning_rate": 7.208774218485771e-08, "loss": 0.1973, "step": 19205 }, { "epoch": 0.9279605739962313, "grad_norm": 4.938290119171143, "learning_rate": 7.203942600376866e-08, "loss": 0.2743, "step": 19206 }, { "epoch": 0.9280088901773204, "grad_norm": 3.5718884468078613, "learning_rate": 7.199110982267961e-08, "loss": 0.4096, "step": 19207 }, { "epoch": 0.9280572063584094, "grad_norm": 3.539179801940918, "learning_rate": 7.194279364159056e-08, "loss": 0.3421, "step": 19208 }, { "epoch": 0.9281055225394985, "grad_norm": 3.8023393154144287, "learning_rate": 7.189447746050152e-08, "loss": 0.4465, "step": 19209 }, { "epoch": 0.9281538387205875, "grad_norm": 2.493664026260376, "learning_rate": 7.184616127941247e-08, "loss": 0.3268, "step": 19210 }, { "epoch": 0.9282021549016766, "grad_norm": 3.7151308059692383, "learning_rate": 7.179784509832342e-08, "loss": 0.3693, "step": 19211 }, { "epoch": 0.9282504710827656, "grad_norm": 2.9686834812164307, "learning_rate": 7.174952891723438e-08, "loss": 0.3246, "step": 19212 }, { "epoch": 0.9282987872638546, "grad_norm": 2.9100136756896973, "learning_rate": 7.170121273614534e-08, "loss": 0.2677, "step": 19213 }, { "epoch": 0.9283471034449438, "grad_norm": 2.352045774459839, "learning_rate": 7.165289655505629e-08, "loss": 0.2467, "step": 19214 }, { "epoch": 0.9283954196260328, "grad_norm": 2.121939182281494, "learning_rate": 7.160458037396724e-08, "loss": 0.2222, "step": 19215 }, { "epoch": 0.9284437358071218, "grad_norm": 1.7907037734985352, "learning_rate": 7.155626419287819e-08, "loss": 0.1766, "step": 19216 }, { "epoch": 0.9284920519882108, "grad_norm": 3.3216209411621094, "learning_rate": 7.150794801178915e-08, "loss": 0.3893, "step": 19217 }, { "epoch": 0.9285403681692999, "grad_norm": 3.1766135692596436, "learning_rate": 7.14596318307001e-08, "loss": 0.2993, "step": 19218 }, { "epoch": 0.928588684350389, "grad_norm": 2.389882802963257, "learning_rate": 7.141131564961104e-08, "loss": 0.2145, "step": 19219 }, { "epoch": 0.928637000531478, "grad_norm": 7.977161884307861, "learning_rate": 7.1362999468522e-08, "loss": 0.2237, "step": 19220 }, { "epoch": 0.928685316712567, "grad_norm": 3.105811834335327, "learning_rate": 7.131468328743296e-08, "loss": 0.3126, "step": 19221 }, { "epoch": 0.9287336328936561, "grad_norm": 3.295633554458618, "learning_rate": 7.126636710634392e-08, "loss": 0.2707, "step": 19222 }, { "epoch": 0.9287819490747451, "grad_norm": 1.952483892440796, "learning_rate": 7.121805092525486e-08, "loss": 0.1958, "step": 19223 }, { "epoch": 0.9288302652558342, "grad_norm": 2.8457658290863037, "learning_rate": 7.116973474416582e-08, "loss": 0.3244, "step": 19224 }, { "epoch": 0.9288785814369233, "grad_norm": 3.43422269821167, "learning_rate": 7.112141856307677e-08, "loss": 0.4092, "step": 19225 }, { "epoch": 0.9289268976180123, "grad_norm": 2.94779634475708, "learning_rate": 7.107310238198774e-08, "loss": 0.2976, "step": 19226 }, { "epoch": 0.9289752137991013, "grad_norm": 7.798415660858154, "learning_rate": 7.102478620089867e-08, "loss": 0.3718, "step": 19227 }, { "epoch": 0.9290235299801903, "grad_norm": 5.656299591064453, "learning_rate": 7.097647001980964e-08, "loss": 0.266, "step": 19228 }, { "epoch": 0.9290718461612794, "grad_norm": 2.166795253753662, "learning_rate": 7.092815383872059e-08, "loss": 0.2022, "step": 19229 }, { "epoch": 0.9291201623423685, "grad_norm": 17.607091903686523, "learning_rate": 7.087983765763155e-08, "loss": 0.2574, "step": 19230 }, { "epoch": 0.9291684785234575, "grad_norm": 2.7531702518463135, "learning_rate": 7.083152147654249e-08, "loss": 0.3414, "step": 19231 }, { "epoch": 0.9292167947045465, "grad_norm": 48.44902420043945, "learning_rate": 7.078320529545344e-08, "loss": 0.256, "step": 19232 }, { "epoch": 0.9292651108856356, "grad_norm": 2.5705933570861816, "learning_rate": 7.07348891143644e-08, "loss": 0.2247, "step": 19233 }, { "epoch": 0.9293134270667246, "grad_norm": 2.330153703689575, "learning_rate": 7.068657293327534e-08, "loss": 0.2897, "step": 19234 }, { "epoch": 0.9293617432478137, "grad_norm": 2.1369996070861816, "learning_rate": 7.06382567521863e-08, "loss": 0.2492, "step": 19235 }, { "epoch": 0.9294100594289028, "grad_norm": 2.4208059310913086, "learning_rate": 7.058994057109725e-08, "loss": 0.2114, "step": 19236 }, { "epoch": 0.9294583756099918, "grad_norm": 2.668243169784546, "learning_rate": 7.054162439000822e-08, "loss": 0.3712, "step": 19237 }, { "epoch": 0.9295066917910808, "grad_norm": 3.1490814685821533, "learning_rate": 7.049330820891915e-08, "loss": 0.2797, "step": 19238 }, { "epoch": 0.9295550079721698, "grad_norm": 1.8886741399765015, "learning_rate": 7.044499202783012e-08, "loss": 0.2122, "step": 19239 }, { "epoch": 0.929603324153259, "grad_norm": 2.991257429122925, "learning_rate": 7.039667584674107e-08, "loss": 0.3788, "step": 19240 }, { "epoch": 0.929651640334348, "grad_norm": 2.5462255477905273, "learning_rate": 7.034835966565203e-08, "loss": 0.2355, "step": 19241 }, { "epoch": 0.929699956515437, "grad_norm": 2.7525742053985596, "learning_rate": 7.030004348456297e-08, "loss": 0.3332, "step": 19242 }, { "epoch": 0.929748272696526, "grad_norm": 1.9645535945892334, "learning_rate": 7.025172730347393e-08, "loss": 0.2607, "step": 19243 }, { "epoch": 0.9297965888776151, "grad_norm": 2.749957799911499, "learning_rate": 7.020341112238488e-08, "loss": 0.2689, "step": 19244 }, { "epoch": 0.9298449050587042, "grad_norm": 3.032223701477051, "learning_rate": 7.015509494129583e-08, "loss": 0.3091, "step": 19245 }, { "epoch": 0.9298932212397932, "grad_norm": 2.47481369972229, "learning_rate": 7.010677876020679e-08, "loss": 0.3275, "step": 19246 }, { "epoch": 0.9299415374208823, "grad_norm": 5.065354347229004, "learning_rate": 7.005846257911774e-08, "loss": 0.2964, "step": 19247 }, { "epoch": 0.9299898536019713, "grad_norm": 3.4778802394866943, "learning_rate": 7.00101463980287e-08, "loss": 0.431, "step": 19248 }, { "epoch": 0.9300381697830603, "grad_norm": 3.09818172454834, "learning_rate": 6.996183021693965e-08, "loss": 0.3894, "step": 19249 }, { "epoch": 0.9300864859641494, "grad_norm": 2.704960584640503, "learning_rate": 6.99135140358506e-08, "loss": 0.28, "step": 19250 }, { "epoch": 0.9301348021452385, "grad_norm": 4.484879493713379, "learning_rate": 6.986519785476155e-08, "loss": 0.4044, "step": 19251 }, { "epoch": 0.9301831183263275, "grad_norm": 3.9433813095092773, "learning_rate": 6.981688167367251e-08, "loss": 0.4174, "step": 19252 }, { "epoch": 0.9302314345074165, "grad_norm": 2.8147621154785156, "learning_rate": 6.976856549258347e-08, "loss": 0.2394, "step": 19253 }, { "epoch": 0.9302797506885055, "grad_norm": 3.0165748596191406, "learning_rate": 6.972024931149442e-08, "loss": 0.4131, "step": 19254 }, { "epoch": 0.9303280668695946, "grad_norm": 2.963993549346924, "learning_rate": 6.967193313040537e-08, "loss": 0.3779, "step": 19255 }, { "epoch": 0.9303763830506837, "grad_norm": 2.1635069847106934, "learning_rate": 6.962361694931632e-08, "loss": 0.28, "step": 19256 }, { "epoch": 0.9304246992317727, "grad_norm": 1.4766846895217896, "learning_rate": 6.957530076822728e-08, "loss": 0.1427, "step": 19257 }, { "epoch": 0.9304730154128618, "grad_norm": 2.408750534057617, "learning_rate": 6.952698458713822e-08, "loss": 0.1692, "step": 19258 }, { "epoch": 0.9305213315939508, "grad_norm": 4.014430522918701, "learning_rate": 6.947866840604918e-08, "loss": 0.3343, "step": 19259 }, { "epoch": 0.9305696477750398, "grad_norm": 3.8405535221099854, "learning_rate": 6.943035222496013e-08, "loss": 0.3197, "step": 19260 }, { "epoch": 0.930617963956129, "grad_norm": 4.189101219177246, "learning_rate": 6.93820360438711e-08, "loss": 0.4173, "step": 19261 }, { "epoch": 0.930666280137218, "grad_norm": 3.774286985397339, "learning_rate": 6.933371986278203e-08, "loss": 0.3528, "step": 19262 }, { "epoch": 0.930714596318307, "grad_norm": 3.2623414993286133, "learning_rate": 6.9285403681693e-08, "loss": 0.2572, "step": 19263 }, { "epoch": 0.930762912499396, "grad_norm": 2.474879026412964, "learning_rate": 6.923708750060395e-08, "loss": 0.3369, "step": 19264 }, { "epoch": 0.930811228680485, "grad_norm": 2.350390672683716, "learning_rate": 6.918877131951491e-08, "loss": 0.2444, "step": 19265 }, { "epoch": 0.9308595448615742, "grad_norm": 1.6307543516159058, "learning_rate": 6.914045513842585e-08, "loss": 0.1783, "step": 19266 }, { "epoch": 0.9309078610426632, "grad_norm": 3.24283766746521, "learning_rate": 6.909213895733681e-08, "loss": 0.4117, "step": 19267 }, { "epoch": 0.9309561772237522, "grad_norm": 3.3474481105804443, "learning_rate": 6.904382277624776e-08, "loss": 0.4195, "step": 19268 }, { "epoch": 0.9310044934048413, "grad_norm": 2.965461492538452, "learning_rate": 6.899550659515871e-08, "loss": 0.3321, "step": 19269 }, { "epoch": 0.9310528095859303, "grad_norm": 2.2080135345458984, "learning_rate": 6.894719041406966e-08, "loss": 0.1871, "step": 19270 }, { "epoch": 0.9311011257670194, "grad_norm": 2.1791794300079346, "learning_rate": 6.889887423298061e-08, "loss": 0.2384, "step": 19271 }, { "epoch": 0.9311494419481084, "grad_norm": 4.113063812255859, "learning_rate": 6.885055805189158e-08, "loss": 0.2757, "step": 19272 }, { "epoch": 0.9311977581291975, "grad_norm": 2.1023566722869873, "learning_rate": 6.880224187080253e-08, "loss": 0.221, "step": 19273 }, { "epoch": 0.9312460743102865, "grad_norm": 2.8902134895324707, "learning_rate": 6.875392568971348e-08, "loss": 0.3102, "step": 19274 }, { "epoch": 0.9312943904913755, "grad_norm": 3.634415864944458, "learning_rate": 6.870560950862443e-08, "loss": 0.2908, "step": 19275 }, { "epoch": 0.9313427066724647, "grad_norm": 3.068899631500244, "learning_rate": 6.865729332753539e-08, "loss": 0.3615, "step": 19276 }, { "epoch": 0.9313910228535537, "grad_norm": 4.42730188369751, "learning_rate": 6.860897714644634e-08, "loss": 0.3092, "step": 19277 }, { "epoch": 0.9314393390346427, "grad_norm": 2.174187183380127, "learning_rate": 6.85606609653573e-08, "loss": 0.2315, "step": 19278 }, { "epoch": 0.9314876552157317, "grad_norm": 2.499858856201172, "learning_rate": 6.851234478426824e-08, "loss": 0.3107, "step": 19279 }, { "epoch": 0.9315359713968208, "grad_norm": 2.0362532138824463, "learning_rate": 6.846402860317921e-08, "loss": 0.2108, "step": 19280 }, { "epoch": 0.9315842875779099, "grad_norm": 2.9922006130218506, "learning_rate": 6.841571242209016e-08, "loss": 0.2182, "step": 19281 }, { "epoch": 0.9316326037589989, "grad_norm": 2.556051254272461, "learning_rate": 6.83673962410011e-08, "loss": 0.3437, "step": 19282 }, { "epoch": 0.931680919940088, "grad_norm": 3.5399529933929443, "learning_rate": 6.831908005991206e-08, "loss": 0.2133, "step": 19283 }, { "epoch": 0.931729236121177, "grad_norm": 2.65580415725708, "learning_rate": 6.827076387882301e-08, "loss": 0.354, "step": 19284 }, { "epoch": 0.931777552302266, "grad_norm": 2.548792839050293, "learning_rate": 6.822244769773397e-08, "loss": 0.2081, "step": 19285 }, { "epoch": 0.931825868483355, "grad_norm": 4.937906742095947, "learning_rate": 6.817413151664491e-08, "loss": 0.2107, "step": 19286 }, { "epoch": 0.9318741846644442, "grad_norm": 2.279109477996826, "learning_rate": 6.812581533555588e-08, "loss": 0.19, "step": 19287 }, { "epoch": 0.9319225008455332, "grad_norm": 1.7887166738510132, "learning_rate": 6.807749915446683e-08, "loss": 0.1808, "step": 19288 }, { "epoch": 0.9319708170266222, "grad_norm": 2.621882200241089, "learning_rate": 6.802918297337779e-08, "loss": 0.2794, "step": 19289 }, { "epoch": 0.9320191332077112, "grad_norm": 2.7791025638580322, "learning_rate": 6.798086679228873e-08, "loss": 0.2241, "step": 19290 }, { "epoch": 0.9320674493888003, "grad_norm": 3.0645973682403564, "learning_rate": 6.793255061119969e-08, "loss": 0.1763, "step": 19291 }, { "epoch": 0.9321157655698894, "grad_norm": 3.397136688232422, "learning_rate": 6.788423443011064e-08, "loss": 0.3576, "step": 19292 }, { "epoch": 0.9321640817509784, "grad_norm": 1.7815088033676147, "learning_rate": 6.78359182490216e-08, "loss": 0.2081, "step": 19293 }, { "epoch": 0.9322123979320674, "grad_norm": 2.757913112640381, "learning_rate": 6.778760206793254e-08, "loss": 0.3261, "step": 19294 }, { "epoch": 0.9322607141131565, "grad_norm": 2.3294708728790283, "learning_rate": 6.773928588684349e-08, "loss": 0.2601, "step": 19295 }, { "epoch": 0.9323090302942455, "grad_norm": 2.7440993785858154, "learning_rate": 6.769096970575446e-08, "loss": 0.2876, "step": 19296 }, { "epoch": 0.9323573464753346, "grad_norm": 2.791710615158081, "learning_rate": 6.764265352466541e-08, "loss": 0.31, "step": 19297 }, { "epoch": 0.9324056626564237, "grad_norm": 2.5202436447143555, "learning_rate": 6.759433734357636e-08, "loss": 0.2865, "step": 19298 }, { "epoch": 0.9324539788375127, "grad_norm": 2.7483487129211426, "learning_rate": 6.754602116248731e-08, "loss": 0.2969, "step": 19299 }, { "epoch": 0.9325022950186017, "grad_norm": 3.425806999206543, "learning_rate": 6.749770498139827e-08, "loss": 0.3941, "step": 19300 }, { "epoch": 0.9325506111996907, "grad_norm": 1.8148576021194458, "learning_rate": 6.744938880030922e-08, "loss": 0.1868, "step": 19301 }, { "epoch": 0.9325989273807799, "grad_norm": 1.674438238143921, "learning_rate": 6.740107261922017e-08, "loss": 0.1839, "step": 19302 }, { "epoch": 0.9326472435618689, "grad_norm": 2.87996506690979, "learning_rate": 6.735275643813112e-08, "loss": 0.229, "step": 19303 }, { "epoch": 0.9326955597429579, "grad_norm": 1.9566608667373657, "learning_rate": 6.730444025704209e-08, "loss": 0.2499, "step": 19304 }, { "epoch": 0.932743875924047, "grad_norm": 9.397282600402832, "learning_rate": 6.725612407595304e-08, "loss": 0.2339, "step": 19305 }, { "epoch": 0.932792192105136, "grad_norm": 5.205350875854492, "learning_rate": 6.720780789486399e-08, "loss": 0.3138, "step": 19306 }, { "epoch": 0.9328405082862251, "grad_norm": 3.3813326358795166, "learning_rate": 6.715949171377494e-08, "loss": 0.3005, "step": 19307 }, { "epoch": 0.9328888244673141, "grad_norm": 3.2976064682006836, "learning_rate": 6.711117553268589e-08, "loss": 0.3703, "step": 19308 }, { "epoch": 0.9329371406484032, "grad_norm": 2.889683485031128, "learning_rate": 6.706285935159685e-08, "loss": 0.2794, "step": 19309 }, { "epoch": 0.9329854568294922, "grad_norm": 2.325526714324951, "learning_rate": 6.701454317050779e-08, "loss": 0.2493, "step": 19310 }, { "epoch": 0.9330337730105812, "grad_norm": 2.19242787361145, "learning_rate": 6.696622698941875e-08, "loss": 0.2572, "step": 19311 }, { "epoch": 0.9330820891916702, "grad_norm": 6.520901679992676, "learning_rate": 6.69179108083297e-08, "loss": 0.2818, "step": 19312 }, { "epoch": 0.9331304053727594, "grad_norm": 2.440835952758789, "learning_rate": 6.686959462724067e-08, "loss": 0.2773, "step": 19313 }, { "epoch": 0.9331787215538484, "grad_norm": 2.564573287963867, "learning_rate": 6.68212784461516e-08, "loss": 0.2738, "step": 19314 }, { "epoch": 0.9332270377349374, "grad_norm": 2.8606014251708984, "learning_rate": 6.677296226506257e-08, "loss": 0.4, "step": 19315 }, { "epoch": 0.9332753539160265, "grad_norm": 5.60567569732666, "learning_rate": 6.672464608397352e-08, "loss": 0.1952, "step": 19316 }, { "epoch": 0.9333236700971155, "grad_norm": 2.034825563430786, "learning_rate": 6.667632990288448e-08, "loss": 0.2125, "step": 19317 }, { "epoch": 0.9333719862782046, "grad_norm": 6.11604118347168, "learning_rate": 6.662801372179542e-08, "loss": 0.2767, "step": 19318 }, { "epoch": 0.9334203024592936, "grad_norm": 4.919561862945557, "learning_rate": 6.657969754070637e-08, "loss": 0.3142, "step": 19319 }, { "epoch": 0.9334686186403827, "grad_norm": 1.8950798511505127, "learning_rate": 6.653138135961733e-08, "loss": 0.1956, "step": 19320 }, { "epoch": 0.9335169348214717, "grad_norm": 2.3022937774658203, "learning_rate": 6.648306517852829e-08, "loss": 0.2173, "step": 19321 }, { "epoch": 0.9335652510025607, "grad_norm": 2.5775821208953857, "learning_rate": 6.643474899743924e-08, "loss": 0.2093, "step": 19322 }, { "epoch": 0.9336135671836499, "grad_norm": 3.001103401184082, "learning_rate": 6.638643281635019e-08, "loss": 0.2674, "step": 19323 }, { "epoch": 0.9336618833647389, "grad_norm": 2.2208001613616943, "learning_rate": 6.633811663526115e-08, "loss": 0.2358, "step": 19324 }, { "epoch": 0.9337101995458279, "grad_norm": 12.291778564453125, "learning_rate": 6.62898004541721e-08, "loss": 0.1626, "step": 19325 }, { "epoch": 0.9337585157269169, "grad_norm": 1.8974775075912476, "learning_rate": 6.624148427308305e-08, "loss": 0.2434, "step": 19326 }, { "epoch": 0.933806831908006, "grad_norm": 2.363096237182617, "learning_rate": 6.6193168091994e-08, "loss": 0.2401, "step": 19327 }, { "epoch": 0.9338551480890951, "grad_norm": 2.360032558441162, "learning_rate": 6.614485191090497e-08, "loss": 0.2546, "step": 19328 }, { "epoch": 0.9339034642701841, "grad_norm": 5.385598182678223, "learning_rate": 6.609653572981592e-08, "loss": 0.3649, "step": 19329 }, { "epoch": 0.9339517804512731, "grad_norm": 2.3125243186950684, "learning_rate": 6.604821954872687e-08, "loss": 0.2802, "step": 19330 }, { "epoch": 0.9340000966323622, "grad_norm": 2.7637619972229004, "learning_rate": 6.599990336763782e-08, "loss": 0.2669, "step": 19331 }, { "epoch": 0.9340484128134512, "grad_norm": 2.271796464920044, "learning_rate": 6.595158718654877e-08, "loss": 0.2799, "step": 19332 }, { "epoch": 0.9340967289945403, "grad_norm": 2.6292388439178467, "learning_rate": 6.590327100545973e-08, "loss": 0.2443, "step": 19333 }, { "epoch": 0.9341450451756294, "grad_norm": 3.515349864959717, "learning_rate": 6.585495482437067e-08, "loss": 0.346, "step": 19334 }, { "epoch": 0.9341933613567184, "grad_norm": 1.6915229558944702, "learning_rate": 6.580663864328163e-08, "loss": 0.1772, "step": 19335 }, { "epoch": 0.9342416775378074, "grad_norm": 9.654169082641602, "learning_rate": 6.575832246219258e-08, "loss": 0.3967, "step": 19336 }, { "epoch": 0.9342899937188964, "grad_norm": 11.06276798248291, "learning_rate": 6.571000628110355e-08, "loss": 0.312, "step": 19337 }, { "epoch": 0.9343383098999855, "grad_norm": 21.145315170288086, "learning_rate": 6.566169010001448e-08, "loss": 0.3191, "step": 19338 }, { "epoch": 0.9343866260810746, "grad_norm": 3.1121816635131836, "learning_rate": 6.561337391892545e-08, "loss": 0.3587, "step": 19339 }, { "epoch": 0.9344349422621636, "grad_norm": 3.911247968673706, "learning_rate": 6.55650577378364e-08, "loss": 0.1946, "step": 19340 }, { "epoch": 0.9344832584432526, "grad_norm": 8.556495666503906, "learning_rate": 6.551674155674736e-08, "loss": 0.2717, "step": 19341 }, { "epoch": 0.9345315746243417, "grad_norm": 2.216062068939209, "learning_rate": 6.54684253756583e-08, "loss": 0.2209, "step": 19342 }, { "epoch": 0.9345798908054307, "grad_norm": 2.280710458755493, "learning_rate": 6.542010919456926e-08, "loss": 0.2988, "step": 19343 }, { "epoch": 0.9346282069865198, "grad_norm": 2.2013611793518066, "learning_rate": 6.537179301348021e-08, "loss": 0.2435, "step": 19344 }, { "epoch": 0.9346765231676089, "grad_norm": 2.3264641761779785, "learning_rate": 6.532347683239116e-08, "loss": 0.2138, "step": 19345 }, { "epoch": 0.9347248393486979, "grad_norm": 3.5969510078430176, "learning_rate": 6.527516065130211e-08, "loss": 0.2749, "step": 19346 }, { "epoch": 0.9347731555297869, "grad_norm": 1.7015784978866577, "learning_rate": 6.522684447021306e-08, "loss": 0.1467, "step": 19347 }, { "epoch": 0.9348214717108759, "grad_norm": 2.2081298828125, "learning_rate": 6.517852828912403e-08, "loss": 0.2213, "step": 19348 }, { "epoch": 0.9348697878919651, "grad_norm": 2.274718761444092, "learning_rate": 6.513021210803498e-08, "loss": 0.2683, "step": 19349 }, { "epoch": 0.9349181040730541, "grad_norm": 3.0506231784820557, "learning_rate": 6.508189592694593e-08, "loss": 0.3441, "step": 19350 }, { "epoch": 0.9349664202541431, "grad_norm": 2.098100423812866, "learning_rate": 6.503357974585688e-08, "loss": 0.2497, "step": 19351 }, { "epoch": 0.9350147364352321, "grad_norm": 2.3801698684692383, "learning_rate": 6.498526356476784e-08, "loss": 0.2823, "step": 19352 }, { "epoch": 0.9350630526163212, "grad_norm": 2.0118162631988525, "learning_rate": 6.49369473836788e-08, "loss": 0.204, "step": 19353 }, { "epoch": 0.9351113687974103, "grad_norm": 2.91263484954834, "learning_rate": 6.488863120258974e-08, "loss": 0.3929, "step": 19354 }, { "epoch": 0.9351596849784993, "grad_norm": 2.6088318824768066, "learning_rate": 6.48403150215007e-08, "loss": 0.2709, "step": 19355 }, { "epoch": 0.9352080011595884, "grad_norm": 4.270007610321045, "learning_rate": 6.479199884041166e-08, "loss": 0.2647, "step": 19356 }, { "epoch": 0.9352563173406774, "grad_norm": 3.113891363143921, "learning_rate": 6.474368265932261e-08, "loss": 0.2957, "step": 19357 }, { "epoch": 0.9353046335217664, "grad_norm": 6.395543098449707, "learning_rate": 6.469536647823355e-08, "loss": 0.3452, "step": 19358 }, { "epoch": 0.9353529497028555, "grad_norm": 4.764474391937256, "learning_rate": 6.464705029714451e-08, "loss": 0.2688, "step": 19359 }, { "epoch": 0.9354012658839446, "grad_norm": 2.06390643119812, "learning_rate": 6.459873411605546e-08, "loss": 0.2094, "step": 19360 }, { "epoch": 0.9354495820650336, "grad_norm": 1.881683349609375, "learning_rate": 6.455041793496642e-08, "loss": 0.1716, "step": 19361 }, { "epoch": 0.9354978982461226, "grad_norm": 6.21505880355835, "learning_rate": 6.450210175387736e-08, "loss": 0.2758, "step": 19362 }, { "epoch": 0.9355462144272116, "grad_norm": 2.884702444076538, "learning_rate": 6.445378557278833e-08, "loss": 0.3074, "step": 19363 }, { "epoch": 0.9355945306083007, "grad_norm": 2.9467809200286865, "learning_rate": 6.440546939169928e-08, "loss": 0.1991, "step": 19364 }, { "epoch": 0.9356428467893898, "grad_norm": 2.7027816772460938, "learning_rate": 6.435715321061024e-08, "loss": 0.3801, "step": 19365 }, { "epoch": 0.9356911629704788, "grad_norm": 2.9649431705474854, "learning_rate": 6.430883702952118e-08, "loss": 0.3832, "step": 19366 }, { "epoch": 0.9357394791515679, "grad_norm": 2.8873684406280518, "learning_rate": 6.426052084843214e-08, "loss": 0.2565, "step": 19367 }, { "epoch": 0.9357877953326569, "grad_norm": 2.557278871536255, "learning_rate": 6.421220466734309e-08, "loss": 0.257, "step": 19368 }, { "epoch": 0.9358361115137459, "grad_norm": 3.5787811279296875, "learning_rate": 6.416388848625406e-08, "loss": 0.414, "step": 19369 }, { "epoch": 0.935884427694835, "grad_norm": 21.69418716430664, "learning_rate": 6.411557230516499e-08, "loss": 0.2139, "step": 19370 }, { "epoch": 0.9359327438759241, "grad_norm": 2.1287074089050293, "learning_rate": 6.406725612407594e-08, "loss": 0.2465, "step": 19371 }, { "epoch": 0.9359810600570131, "grad_norm": 2.1350717544555664, "learning_rate": 6.401893994298691e-08, "loss": 0.1975, "step": 19372 }, { "epoch": 0.9360293762381021, "grad_norm": 2.5009069442749023, "learning_rate": 6.397062376189786e-08, "loss": 0.2292, "step": 19373 }, { "epoch": 0.9360776924191911, "grad_norm": 2.3337979316711426, "learning_rate": 6.392230758080881e-08, "loss": 0.1944, "step": 19374 }, { "epoch": 0.9361260086002803, "grad_norm": 3.5952413082122803, "learning_rate": 6.387399139971976e-08, "loss": 0.252, "step": 19375 }, { "epoch": 0.9361743247813693, "grad_norm": 2.005814552307129, "learning_rate": 6.382567521863072e-08, "loss": 0.2376, "step": 19376 }, { "epoch": 0.9362226409624583, "grad_norm": 2.9297382831573486, "learning_rate": 6.377735903754167e-08, "loss": 0.2689, "step": 19377 }, { "epoch": 0.9362709571435474, "grad_norm": 3.1531338691711426, "learning_rate": 6.372904285645262e-08, "loss": 0.3305, "step": 19378 }, { "epoch": 0.9363192733246364, "grad_norm": 3.001915693283081, "learning_rate": 6.368072667536357e-08, "loss": 0.3214, "step": 19379 }, { "epoch": 0.9363675895057255, "grad_norm": 14.889780044555664, "learning_rate": 6.363241049427454e-08, "loss": 0.3122, "step": 19380 }, { "epoch": 0.9364159056868145, "grad_norm": 3.4321649074554443, "learning_rate": 6.358409431318549e-08, "loss": 0.287, "step": 19381 }, { "epoch": 0.9364642218679036, "grad_norm": 2.421445369720459, "learning_rate": 6.353577813209644e-08, "loss": 0.2416, "step": 19382 }, { "epoch": 0.9365125380489926, "grad_norm": 2.803039312362671, "learning_rate": 6.348746195100739e-08, "loss": 0.2949, "step": 19383 }, { "epoch": 0.9365608542300816, "grad_norm": 2.276337146759033, "learning_rate": 6.343914576991834e-08, "loss": 0.2811, "step": 19384 }, { "epoch": 0.9366091704111708, "grad_norm": 3.5655627250671387, "learning_rate": 6.33908295888293e-08, "loss": 0.3446, "step": 19385 }, { "epoch": 0.9366574865922598, "grad_norm": 2.79366397857666, "learning_rate": 6.334251340774024e-08, "loss": 0.3263, "step": 19386 }, { "epoch": 0.9367058027733488, "grad_norm": 2.686612606048584, "learning_rate": 6.32941972266512e-08, "loss": 0.2249, "step": 19387 }, { "epoch": 0.9367541189544378, "grad_norm": 3.746152400970459, "learning_rate": 6.324588104556215e-08, "loss": 0.2895, "step": 19388 }, { "epoch": 0.9368024351355269, "grad_norm": 2.795618772506714, "learning_rate": 6.319756486447312e-08, "loss": 0.3256, "step": 19389 }, { "epoch": 0.9368507513166159, "grad_norm": 2.8798322677612305, "learning_rate": 6.314924868338406e-08, "loss": 0.3179, "step": 19390 }, { "epoch": 0.936899067497705, "grad_norm": 2.5221099853515625, "learning_rate": 6.310093250229502e-08, "loss": 0.3201, "step": 19391 }, { "epoch": 0.936947383678794, "grad_norm": 3.5922417640686035, "learning_rate": 6.305261632120597e-08, "loss": 0.3295, "step": 19392 }, { "epoch": 0.9369956998598831, "grad_norm": 2.0877115726470947, "learning_rate": 6.300430014011693e-08, "loss": 0.2216, "step": 19393 }, { "epoch": 0.9370440160409721, "grad_norm": 4.269381523132324, "learning_rate": 6.295598395902787e-08, "loss": 0.3544, "step": 19394 }, { "epoch": 0.9370923322220611, "grad_norm": 2.330386161804199, "learning_rate": 6.290766777793882e-08, "loss": 0.303, "step": 19395 }, { "epoch": 0.9371406484031503, "grad_norm": 3.620283842086792, "learning_rate": 6.285935159684979e-08, "loss": 0.3103, "step": 19396 }, { "epoch": 0.9371889645842393, "grad_norm": 2.6180591583251953, "learning_rate": 6.281103541576074e-08, "loss": 0.3186, "step": 19397 }, { "epoch": 0.9372372807653283, "grad_norm": 2.2656009197235107, "learning_rate": 6.276271923467169e-08, "loss": 0.2848, "step": 19398 }, { "epoch": 0.9372855969464173, "grad_norm": 2.211372137069702, "learning_rate": 6.271440305358264e-08, "loss": 0.2493, "step": 19399 }, { "epoch": 0.9373339131275064, "grad_norm": 3.898951292037964, "learning_rate": 6.26660868724936e-08, "loss": 0.3213, "step": 19400 }, { "epoch": 0.9373822293085955, "grad_norm": 2.674907684326172, "learning_rate": 6.261777069140455e-08, "loss": 0.2273, "step": 19401 }, { "epoch": 0.9374305454896845, "grad_norm": 4.982419490814209, "learning_rate": 6.25694545103155e-08, "loss": 0.2911, "step": 19402 }, { "epoch": 0.9374788616707735, "grad_norm": 2.774036407470703, "learning_rate": 6.252113832922645e-08, "loss": 0.2795, "step": 19403 }, { "epoch": 0.9375271778518626, "grad_norm": 3.4513866901397705, "learning_rate": 6.24728221481374e-08, "loss": 0.3023, "step": 19404 }, { "epoch": 0.9375754940329516, "grad_norm": 2.2786145210266113, "learning_rate": 6.242450596704837e-08, "loss": 0.2674, "step": 19405 }, { "epoch": 0.9376238102140407, "grad_norm": 2.9997012615203857, "learning_rate": 6.237618978595932e-08, "loss": 0.287, "step": 19406 }, { "epoch": 0.9376721263951298, "grad_norm": 3.181964874267578, "learning_rate": 6.232787360487027e-08, "loss": 0.3483, "step": 19407 }, { "epoch": 0.9377204425762188, "grad_norm": 2.2298784255981445, "learning_rate": 6.227955742378122e-08, "loss": 0.2317, "step": 19408 }, { "epoch": 0.9377687587573078, "grad_norm": 3.522521495819092, "learning_rate": 6.223124124269218e-08, "loss": 0.1993, "step": 19409 }, { "epoch": 0.9378170749383968, "grad_norm": 2.3578104972839355, "learning_rate": 6.218292506160313e-08, "loss": 0.1843, "step": 19410 }, { "epoch": 0.937865391119486, "grad_norm": 2.589219570159912, "learning_rate": 6.213460888051408e-08, "loss": 0.271, "step": 19411 }, { "epoch": 0.937913707300575, "grad_norm": 2.258017063140869, "learning_rate": 6.208629269942503e-08, "loss": 0.2905, "step": 19412 }, { "epoch": 0.937962023481664, "grad_norm": 2.2094154357910156, "learning_rate": 6.2037976518336e-08, "loss": 0.2087, "step": 19413 }, { "epoch": 0.938010339662753, "grad_norm": 2.9484546184539795, "learning_rate": 6.198966033724693e-08, "loss": 0.3596, "step": 19414 }, { "epoch": 0.9380586558438421, "grad_norm": 2.4480788707733154, "learning_rate": 6.19413441561579e-08, "loss": 0.1792, "step": 19415 }, { "epoch": 0.9381069720249311, "grad_norm": 2.474381923675537, "learning_rate": 6.189302797506885e-08, "loss": 0.3069, "step": 19416 }, { "epoch": 0.9381552882060202, "grad_norm": 3.694444417953491, "learning_rate": 6.18447117939798e-08, "loss": 0.3521, "step": 19417 }, { "epoch": 0.9382036043871093, "grad_norm": 3.0418102741241455, "learning_rate": 6.179639561289075e-08, "loss": 0.3526, "step": 19418 }, { "epoch": 0.9382519205681983, "grad_norm": 2.226929187774658, "learning_rate": 6.174807943180171e-08, "loss": 0.2947, "step": 19419 }, { "epoch": 0.9383002367492873, "grad_norm": 4.682210445404053, "learning_rate": 6.169976325071266e-08, "loss": 0.2564, "step": 19420 }, { "epoch": 0.9383485529303763, "grad_norm": 4.369717121124268, "learning_rate": 6.165144706962361e-08, "loss": 0.2357, "step": 19421 }, { "epoch": 0.9383968691114655, "grad_norm": 11.350380897521973, "learning_rate": 6.160313088853456e-08, "loss": 0.2375, "step": 19422 }, { "epoch": 0.9384451852925545, "grad_norm": 4.645724773406982, "learning_rate": 6.155481470744553e-08, "loss": 0.4049, "step": 19423 }, { "epoch": 0.9384935014736435, "grad_norm": 2.996863842010498, "learning_rate": 6.150649852635648e-08, "loss": 0.289, "step": 19424 }, { "epoch": 0.9385418176547325, "grad_norm": 2.3429160118103027, "learning_rate": 6.145818234526743e-08, "loss": 0.2637, "step": 19425 }, { "epoch": 0.9385901338358216, "grad_norm": 2.7741830348968506, "learning_rate": 6.140986616417838e-08, "loss": 0.3355, "step": 19426 }, { "epoch": 0.9386384500169107, "grad_norm": 3.226263999938965, "learning_rate": 6.136154998308933e-08, "loss": 0.2135, "step": 19427 }, { "epoch": 0.9386867661979997, "grad_norm": 3.4524412155151367, "learning_rate": 6.131323380200028e-08, "loss": 0.3404, "step": 19428 }, { "epoch": 0.9387350823790888, "grad_norm": 2.416076421737671, "learning_rate": 6.126491762091124e-08, "loss": 0.3168, "step": 19429 }, { "epoch": 0.9387833985601778, "grad_norm": 2.7620484828948975, "learning_rate": 6.12166014398222e-08, "loss": 0.3534, "step": 19430 }, { "epoch": 0.9388317147412668, "grad_norm": 2.524468183517456, "learning_rate": 6.116828525873315e-08, "loss": 0.3272, "step": 19431 }, { "epoch": 0.938880030922356, "grad_norm": 2.0695173740386963, "learning_rate": 6.11199690776441e-08, "loss": 0.2168, "step": 19432 }, { "epoch": 0.938928347103445, "grad_norm": 3.694697618484497, "learning_rate": 6.107165289655506e-08, "loss": 0.2235, "step": 19433 }, { "epoch": 0.938976663284534, "grad_norm": 2.0704429149627686, "learning_rate": 6.102333671546601e-08, "loss": 0.2767, "step": 19434 }, { "epoch": 0.939024979465623, "grad_norm": 2.7153995037078857, "learning_rate": 6.097502053437696e-08, "loss": 0.2098, "step": 19435 }, { "epoch": 0.939073295646712, "grad_norm": 2.213466167449951, "learning_rate": 6.092670435328791e-08, "loss": 0.2741, "step": 19436 }, { "epoch": 0.9391216118278012, "grad_norm": 2.0129106044769287, "learning_rate": 6.087838817219888e-08, "loss": 0.2161, "step": 19437 }, { "epoch": 0.9391699280088902, "grad_norm": 2.436952590942383, "learning_rate": 6.083007199110983e-08, "loss": 0.2948, "step": 19438 }, { "epoch": 0.9392182441899792, "grad_norm": 2.360886335372925, "learning_rate": 6.078175581002076e-08, "loss": 0.2874, "step": 19439 }, { "epoch": 0.9392665603710683, "grad_norm": 4.243950843811035, "learning_rate": 6.073343962893173e-08, "loss": 0.33, "step": 19440 }, { "epoch": 0.9393148765521573, "grad_norm": 2.7457022666931152, "learning_rate": 6.068512344784268e-08, "loss": 0.3494, "step": 19441 }, { "epoch": 0.9393631927332463, "grad_norm": 2.0176234245300293, "learning_rate": 6.063680726675363e-08, "loss": 0.2131, "step": 19442 }, { "epoch": 0.9394115089143354, "grad_norm": 2.494352340698242, "learning_rate": 6.058849108566458e-08, "loss": 0.2696, "step": 19443 }, { "epoch": 0.9394598250954245, "grad_norm": 3.7807493209838867, "learning_rate": 6.054017490457554e-08, "loss": 0.1569, "step": 19444 }, { "epoch": 0.9395081412765135, "grad_norm": 3.896233081817627, "learning_rate": 6.049185872348649e-08, "loss": 0.347, "step": 19445 }, { "epoch": 0.9395564574576025, "grad_norm": 2.060879707336426, "learning_rate": 6.044354254239744e-08, "loss": 0.2242, "step": 19446 }, { "epoch": 0.9396047736386915, "grad_norm": 2.854668617248535, "learning_rate": 6.03952263613084e-08, "loss": 0.2307, "step": 19447 }, { "epoch": 0.9396530898197807, "grad_norm": 2.381875991821289, "learning_rate": 6.034691018021936e-08, "loss": 0.2269, "step": 19448 }, { "epoch": 0.9397014060008697, "grad_norm": 2.656860113143921, "learning_rate": 6.029859399913031e-08, "loss": 0.2908, "step": 19449 }, { "epoch": 0.9397497221819587, "grad_norm": 3.8986358642578125, "learning_rate": 6.025027781804126e-08, "loss": 0.3912, "step": 19450 }, { "epoch": 0.9397980383630478, "grad_norm": 2.2362060546875, "learning_rate": 6.020196163695222e-08, "loss": 0.2682, "step": 19451 }, { "epoch": 0.9398463545441368, "grad_norm": 2.6514556407928467, "learning_rate": 6.015364545586316e-08, "loss": 0.2416, "step": 19452 }, { "epoch": 0.9398946707252259, "grad_norm": 2.656754970550537, "learning_rate": 6.010532927477411e-08, "loss": 0.2706, "step": 19453 }, { "epoch": 0.939942986906315, "grad_norm": 3.8814964294433594, "learning_rate": 6.005701309368507e-08, "loss": 0.275, "step": 19454 }, { "epoch": 0.939991303087404, "grad_norm": 2.792717695236206, "learning_rate": 6.000869691259602e-08, "loss": 0.1933, "step": 19455 }, { "epoch": 0.940039619268493, "grad_norm": 2.321420431137085, "learning_rate": 5.996038073150697e-08, "loss": 0.2285, "step": 19456 }, { "epoch": 0.940087935449582, "grad_norm": 5.682835102081299, "learning_rate": 5.991206455041793e-08, "loss": 0.296, "step": 19457 }, { "epoch": 0.9401362516306712, "grad_norm": 8.0428466796875, "learning_rate": 5.986374836932889e-08, "loss": 0.293, "step": 19458 }, { "epoch": 0.9401845678117602, "grad_norm": 3.4755680561065674, "learning_rate": 5.981543218823984e-08, "loss": 0.2334, "step": 19459 }, { "epoch": 0.9402328839928492, "grad_norm": 8.732531547546387, "learning_rate": 5.976711600715079e-08, "loss": 0.3105, "step": 19460 }, { "epoch": 0.9402812001739382, "grad_norm": 2.573634147644043, "learning_rate": 5.971879982606174e-08, "loss": 0.3442, "step": 19461 }, { "epoch": 0.9403295163550273, "grad_norm": 3.0260350704193115, "learning_rate": 5.96704836449727e-08, "loss": 0.2621, "step": 19462 }, { "epoch": 0.9403778325361164, "grad_norm": 2.6478090286254883, "learning_rate": 5.962216746388365e-08, "loss": 0.2803, "step": 19463 }, { "epoch": 0.9404261487172054, "grad_norm": 3.408292293548584, "learning_rate": 5.9573851282794605e-08, "loss": 0.4558, "step": 19464 }, { "epoch": 0.9404744648982944, "grad_norm": 2.271850824356079, "learning_rate": 5.9525535101705556e-08, "loss": 0.191, "step": 19465 }, { "epoch": 0.9405227810793835, "grad_norm": 5.109781742095947, "learning_rate": 5.9477218920616506e-08, "loss": 0.3811, "step": 19466 }, { "epoch": 0.9405710972604725, "grad_norm": 3.1737406253814697, "learning_rate": 5.9428902739527464e-08, "loss": 0.4049, "step": 19467 }, { "epoch": 0.9406194134415615, "grad_norm": 3.1774280071258545, "learning_rate": 5.9380586558438414e-08, "loss": 0.2753, "step": 19468 }, { "epoch": 0.9406677296226507, "grad_norm": 2.213364839553833, "learning_rate": 5.933227037734937e-08, "loss": 0.218, "step": 19469 }, { "epoch": 0.9407160458037397, "grad_norm": 2.828450918197632, "learning_rate": 5.928395419626032e-08, "loss": 0.3616, "step": 19470 }, { "epoch": 0.9407643619848287, "grad_norm": 2.6014935970306396, "learning_rate": 5.923563801517128e-08, "loss": 0.3415, "step": 19471 }, { "epoch": 0.9408126781659177, "grad_norm": 3.6011173725128174, "learning_rate": 5.918732183408223e-08, "loss": 0.3986, "step": 19472 }, { "epoch": 0.9408609943470068, "grad_norm": 4.058624744415283, "learning_rate": 5.9139005652993186e-08, "loss": 0.4873, "step": 19473 }, { "epoch": 0.9409093105280959, "grad_norm": 10.100605010986328, "learning_rate": 5.909068947190414e-08, "loss": 0.2843, "step": 19474 }, { "epoch": 0.9409576267091849, "grad_norm": 1.995627760887146, "learning_rate": 5.9042373290815094e-08, "loss": 0.2043, "step": 19475 }, { "epoch": 0.941005942890274, "grad_norm": 2.9172186851501465, "learning_rate": 5.8994057109726045e-08, "loss": 0.288, "step": 19476 }, { "epoch": 0.941054259071363, "grad_norm": 2.299973487854004, "learning_rate": 5.8945740928636995e-08, "loss": 0.2782, "step": 19477 }, { "epoch": 0.941102575252452, "grad_norm": 2.126847505569458, "learning_rate": 5.8897424747547946e-08, "loss": 0.253, "step": 19478 }, { "epoch": 0.9411508914335411, "grad_norm": 2.8256711959838867, "learning_rate": 5.88491085664589e-08, "loss": 0.3407, "step": 19479 }, { "epoch": 0.9411992076146302, "grad_norm": 2.6487741470336914, "learning_rate": 5.880079238536985e-08, "loss": 0.3287, "step": 19480 }, { "epoch": 0.9412475237957192, "grad_norm": 2.02247953414917, "learning_rate": 5.875247620428081e-08, "loss": 0.2211, "step": 19481 }, { "epoch": 0.9412958399768082, "grad_norm": 2.916729211807251, "learning_rate": 5.870416002319176e-08, "loss": 0.2656, "step": 19482 }, { "epoch": 0.9413441561578972, "grad_norm": 2.9980905055999756, "learning_rate": 5.865584384210272e-08, "loss": 0.3451, "step": 19483 }, { "epoch": 0.9413924723389864, "grad_norm": 4.361861228942871, "learning_rate": 5.860752766101367e-08, "loss": 0.2661, "step": 19484 }, { "epoch": 0.9414407885200754, "grad_norm": 3.027451276779175, "learning_rate": 5.8559211479924626e-08, "loss": 0.3709, "step": 19485 }, { "epoch": 0.9414891047011644, "grad_norm": 3.5987894535064697, "learning_rate": 5.8510895298835576e-08, "loss": 0.4563, "step": 19486 }, { "epoch": 0.9415374208822535, "grad_norm": 4.354104042053223, "learning_rate": 5.8462579117746533e-08, "loss": 0.4354, "step": 19487 }, { "epoch": 0.9415857370633425, "grad_norm": 2.6921823024749756, "learning_rate": 5.8414262936657484e-08, "loss": 0.2634, "step": 19488 }, { "epoch": 0.9416340532444316, "grad_norm": 2.246891736984253, "learning_rate": 5.836594675556844e-08, "loss": 0.2601, "step": 19489 }, { "epoch": 0.9416823694255206, "grad_norm": 2.559884786605835, "learning_rate": 5.8317630574479385e-08, "loss": 0.3563, "step": 19490 }, { "epoch": 0.9417306856066097, "grad_norm": 1.9537702798843384, "learning_rate": 5.826931439339034e-08, "loss": 0.2276, "step": 19491 }, { "epoch": 0.9417790017876987, "grad_norm": 2.4253876209259033, "learning_rate": 5.822099821230129e-08, "loss": 0.2641, "step": 19492 }, { "epoch": 0.9418273179687877, "grad_norm": 2.7953438758850098, "learning_rate": 5.817268203121225e-08, "loss": 0.418, "step": 19493 }, { "epoch": 0.9418756341498767, "grad_norm": 1.9989067316055298, "learning_rate": 5.81243658501232e-08, "loss": 0.2274, "step": 19494 }, { "epoch": 0.9419239503309659, "grad_norm": 2.1655142307281494, "learning_rate": 5.807604966903416e-08, "loss": 0.2869, "step": 19495 }, { "epoch": 0.9419722665120549, "grad_norm": 2.0991055965423584, "learning_rate": 5.802773348794511e-08, "loss": 0.2127, "step": 19496 }, { "epoch": 0.9420205826931439, "grad_norm": 2.6619369983673096, "learning_rate": 5.7979417306856065e-08, "loss": 0.2985, "step": 19497 }, { "epoch": 0.942068898874233, "grad_norm": 2.8905158042907715, "learning_rate": 5.7931101125767015e-08, "loss": 0.345, "step": 19498 }, { "epoch": 0.942117215055322, "grad_norm": 2.621293306350708, "learning_rate": 5.788278494467797e-08, "loss": 0.2668, "step": 19499 }, { "epoch": 0.9421655312364111, "grad_norm": 1.9359880685806274, "learning_rate": 5.783446876358892e-08, "loss": 0.2393, "step": 19500 }, { "epoch": 0.9422138474175001, "grad_norm": 2.3918349742889404, "learning_rate": 5.778615258249988e-08, "loss": 0.252, "step": 19501 }, { "epoch": 0.9422621635985892, "grad_norm": 3.0675301551818848, "learning_rate": 5.773783640141083e-08, "loss": 0.3238, "step": 19502 }, { "epoch": 0.9423104797796782, "grad_norm": 3.3425745964050293, "learning_rate": 5.768952022032178e-08, "loss": 0.3928, "step": 19503 }, { "epoch": 0.9423587959607672, "grad_norm": 2.6506731510162354, "learning_rate": 5.764120403923273e-08, "loss": 0.3002, "step": 19504 }, { "epoch": 0.9424071121418564, "grad_norm": 2.3244900703430176, "learning_rate": 5.759288785814369e-08, "loss": 0.3532, "step": 19505 }, { "epoch": 0.9424554283229454, "grad_norm": 5.50718355178833, "learning_rate": 5.754457167705464e-08, "loss": 0.3881, "step": 19506 }, { "epoch": 0.9425037445040344, "grad_norm": 3.457709550857544, "learning_rate": 5.7496255495965596e-08, "loss": 0.4024, "step": 19507 }, { "epoch": 0.9425520606851234, "grad_norm": 8.921121597290039, "learning_rate": 5.744793931487655e-08, "loss": 0.4379, "step": 19508 }, { "epoch": 0.9426003768662125, "grad_norm": 8.951606750488281, "learning_rate": 5.7399623133787504e-08, "loss": 0.3493, "step": 19509 }, { "epoch": 0.9426486930473016, "grad_norm": 3.7731313705444336, "learning_rate": 5.7351306952698455e-08, "loss": 0.3324, "step": 19510 }, { "epoch": 0.9426970092283906, "grad_norm": 2.1745047569274902, "learning_rate": 5.730299077160941e-08, "loss": 0.2083, "step": 19511 }, { "epoch": 0.9427453254094796, "grad_norm": 2.835641622543335, "learning_rate": 5.725467459052036e-08, "loss": 0.2801, "step": 19512 }, { "epoch": 0.9427936415905687, "grad_norm": 2.9203388690948486, "learning_rate": 5.720635840943132e-08, "loss": 0.3231, "step": 19513 }, { "epoch": 0.9428419577716577, "grad_norm": 2.770164728164673, "learning_rate": 5.715804222834227e-08, "loss": 0.3334, "step": 19514 }, { "epoch": 0.9428902739527468, "grad_norm": 3.338810682296753, "learning_rate": 5.710972604725322e-08, "loss": 0.3249, "step": 19515 }, { "epoch": 0.9429385901338359, "grad_norm": 2.9079010486602783, "learning_rate": 5.706140986616417e-08, "loss": 0.3452, "step": 19516 }, { "epoch": 0.9429869063149249, "grad_norm": 2.822754144668579, "learning_rate": 5.701309368507513e-08, "loss": 0.3818, "step": 19517 }, { "epoch": 0.9430352224960139, "grad_norm": 2.4435718059539795, "learning_rate": 5.696477750398608e-08, "loss": 0.2628, "step": 19518 }, { "epoch": 0.9430835386771029, "grad_norm": 2.98345685005188, "learning_rate": 5.6916461322897036e-08, "loss": 0.407, "step": 19519 }, { "epoch": 0.943131854858192, "grad_norm": 2.5843441486358643, "learning_rate": 5.6868145141807986e-08, "loss": 0.331, "step": 19520 }, { "epoch": 0.9431801710392811, "grad_norm": 1.755852460861206, "learning_rate": 5.6819828960718943e-08, "loss": 0.1911, "step": 19521 }, { "epoch": 0.9432284872203701, "grad_norm": 2.6004326343536377, "learning_rate": 5.6771512779629894e-08, "loss": 0.2289, "step": 19522 }, { "epoch": 0.9432768034014591, "grad_norm": 6.0818281173706055, "learning_rate": 5.672319659854085e-08, "loss": 0.3869, "step": 19523 }, { "epoch": 0.9433251195825482, "grad_norm": 2.8826074600219727, "learning_rate": 5.66748804174518e-08, "loss": 0.286, "step": 19524 }, { "epoch": 0.9433734357636372, "grad_norm": 2.1612584590911865, "learning_rate": 5.662656423636276e-08, "loss": 0.1961, "step": 19525 }, { "epoch": 0.9434217519447263, "grad_norm": 3.0316319465637207, "learning_rate": 5.657824805527371e-08, "loss": 0.337, "step": 19526 }, { "epoch": 0.9434700681258154, "grad_norm": 2.5799667835235596, "learning_rate": 5.6529931874184666e-08, "loss": 0.2542, "step": 19527 }, { "epoch": 0.9435183843069044, "grad_norm": 5.5309529304504395, "learning_rate": 5.648161569309561e-08, "loss": 0.2893, "step": 19528 }, { "epoch": 0.9435667004879934, "grad_norm": 4.038092613220215, "learning_rate": 5.643329951200657e-08, "loss": 0.2443, "step": 19529 }, { "epoch": 0.9436150166690824, "grad_norm": 2.984414577484131, "learning_rate": 5.638498333091752e-08, "loss": 0.3668, "step": 19530 }, { "epoch": 0.9436633328501716, "grad_norm": 2.096210479736328, "learning_rate": 5.6336667149828475e-08, "loss": 0.2434, "step": 19531 }, { "epoch": 0.9437116490312606, "grad_norm": 2.8908133506774902, "learning_rate": 5.6288350968739425e-08, "loss": 0.3165, "step": 19532 }, { "epoch": 0.9437599652123496, "grad_norm": 3.1907498836517334, "learning_rate": 5.624003478765038e-08, "loss": 0.1925, "step": 19533 }, { "epoch": 0.9438082813934386, "grad_norm": 5.490699291229248, "learning_rate": 5.619171860656133e-08, "loss": 0.384, "step": 19534 }, { "epoch": 0.9438565975745277, "grad_norm": 3.030280590057373, "learning_rate": 5.614340242547229e-08, "loss": 0.381, "step": 19535 }, { "epoch": 0.9439049137556168, "grad_norm": 4.329235076904297, "learning_rate": 5.609508624438324e-08, "loss": 0.2658, "step": 19536 }, { "epoch": 0.9439532299367058, "grad_norm": 1.8664323091506958, "learning_rate": 5.60467700632942e-08, "loss": 0.2215, "step": 19537 }, { "epoch": 0.9440015461177949, "grad_norm": 1.5506545305252075, "learning_rate": 5.599845388220515e-08, "loss": 0.1759, "step": 19538 }, { "epoch": 0.9440498622988839, "grad_norm": 1.6527973413467407, "learning_rate": 5.5950137701116105e-08, "loss": 0.1671, "step": 19539 }, { "epoch": 0.9440981784799729, "grad_norm": 2.5114834308624268, "learning_rate": 5.5901821520027056e-08, "loss": 0.2575, "step": 19540 }, { "epoch": 0.944146494661062, "grad_norm": 3.0890166759490967, "learning_rate": 5.5853505338938007e-08, "loss": 0.3212, "step": 19541 }, { "epoch": 0.9441948108421511, "grad_norm": 2.4307570457458496, "learning_rate": 5.580518915784896e-08, "loss": 0.3525, "step": 19542 }, { "epoch": 0.9442431270232401, "grad_norm": 2.111377716064453, "learning_rate": 5.5756872976759914e-08, "loss": 0.232, "step": 19543 }, { "epoch": 0.9442914432043291, "grad_norm": 3.023406982421875, "learning_rate": 5.5708556795670865e-08, "loss": 0.3368, "step": 19544 }, { "epoch": 0.9443397593854181, "grad_norm": 2.7048065662384033, "learning_rate": 5.566024061458182e-08, "loss": 0.3127, "step": 19545 }, { "epoch": 0.9443880755665072, "grad_norm": 1.9079831838607788, "learning_rate": 5.561192443349277e-08, "loss": 0.1713, "step": 19546 }, { "epoch": 0.9444363917475963, "grad_norm": 9.986278533935547, "learning_rate": 5.556360825240373e-08, "loss": 0.3592, "step": 19547 }, { "epoch": 0.9444847079286853, "grad_norm": 2.7238550186157227, "learning_rate": 5.551529207131468e-08, "loss": 0.246, "step": 19548 }, { "epoch": 0.9445330241097744, "grad_norm": 3.47164249420166, "learning_rate": 5.546697589022564e-08, "loss": 0.4624, "step": 19549 }, { "epoch": 0.9445813402908634, "grad_norm": 2.531933546066284, "learning_rate": 5.541865970913659e-08, "loss": 0.1737, "step": 19550 }, { "epoch": 0.9446296564719524, "grad_norm": 2.2647476196289062, "learning_rate": 5.5370343528047545e-08, "loss": 0.2605, "step": 19551 }, { "epoch": 0.9446779726530415, "grad_norm": 2.5396783351898193, "learning_rate": 5.5322027346958495e-08, "loss": 0.1874, "step": 19552 }, { "epoch": 0.9447262888341306, "grad_norm": 7.462188720703125, "learning_rate": 5.5273711165869446e-08, "loss": 0.3937, "step": 19553 }, { "epoch": 0.9447746050152196, "grad_norm": 1.9145619869232178, "learning_rate": 5.5225394984780396e-08, "loss": 0.1855, "step": 19554 }, { "epoch": 0.9448229211963086, "grad_norm": 2.4826228618621826, "learning_rate": 5.5177078803691353e-08, "loss": 0.2756, "step": 19555 }, { "epoch": 0.9448712373773976, "grad_norm": 3.484170913696289, "learning_rate": 5.5128762622602304e-08, "loss": 0.3396, "step": 19556 }, { "epoch": 0.9449195535584868, "grad_norm": 2.574598789215088, "learning_rate": 5.508044644151326e-08, "loss": 0.2336, "step": 19557 }, { "epoch": 0.9449678697395758, "grad_norm": 2.964444160461426, "learning_rate": 5.503213026042421e-08, "loss": 0.3318, "step": 19558 }, { "epoch": 0.9450161859206648, "grad_norm": 2.7114131450653076, "learning_rate": 5.498381407933517e-08, "loss": 0.2574, "step": 19559 }, { "epoch": 0.9450645021017539, "grad_norm": 2.6899449825286865, "learning_rate": 5.493549789824612e-08, "loss": 0.2634, "step": 19560 }, { "epoch": 0.9451128182828429, "grad_norm": 2.3470730781555176, "learning_rate": 5.4887181717157076e-08, "loss": 0.202, "step": 19561 }, { "epoch": 0.945161134463932, "grad_norm": 5.805455684661865, "learning_rate": 5.483886553606803e-08, "loss": 0.3536, "step": 19562 }, { "epoch": 0.945209450645021, "grad_norm": 3.131986141204834, "learning_rate": 5.4790549354978984e-08, "loss": 0.4863, "step": 19563 }, { "epoch": 0.9452577668261101, "grad_norm": 4.1702117919921875, "learning_rate": 5.4742233173889934e-08, "loss": 0.2364, "step": 19564 }, { "epoch": 0.9453060830071991, "grad_norm": 2.067535638809204, "learning_rate": 5.469391699280089e-08, "loss": 0.2256, "step": 19565 }, { "epoch": 0.9453543991882881, "grad_norm": 3.785792589187622, "learning_rate": 5.4645600811711835e-08, "loss": 0.2498, "step": 19566 }, { "epoch": 0.9454027153693773, "grad_norm": 2.6571357250213623, "learning_rate": 5.459728463062279e-08, "loss": 0.2517, "step": 19567 }, { "epoch": 0.9454510315504663, "grad_norm": 2.3055055141448975, "learning_rate": 5.454896844953374e-08, "loss": 0.2676, "step": 19568 }, { "epoch": 0.9454993477315553, "grad_norm": 5.449632167816162, "learning_rate": 5.45006522684447e-08, "loss": 0.3506, "step": 19569 }, { "epoch": 0.9455476639126443, "grad_norm": 2.108612537384033, "learning_rate": 5.445233608735565e-08, "loss": 0.2349, "step": 19570 }, { "epoch": 0.9455959800937334, "grad_norm": 2.0320329666137695, "learning_rate": 5.440401990626661e-08, "loss": 0.2602, "step": 19571 }, { "epoch": 0.9456442962748225, "grad_norm": 2.596278429031372, "learning_rate": 5.435570372517756e-08, "loss": 0.3052, "step": 19572 }, { "epoch": 0.9456926124559115, "grad_norm": 2.7996022701263428, "learning_rate": 5.4307387544088516e-08, "loss": 0.2534, "step": 19573 }, { "epoch": 0.9457409286370005, "grad_norm": 2.5863335132598877, "learning_rate": 5.4259071362999466e-08, "loss": 0.2259, "step": 19574 }, { "epoch": 0.9457892448180896, "grad_norm": 5.067599296569824, "learning_rate": 5.421075518191042e-08, "loss": 0.3045, "step": 19575 }, { "epoch": 0.9458375609991786, "grad_norm": 3.043609619140625, "learning_rate": 5.4162439000821374e-08, "loss": 0.2382, "step": 19576 }, { "epoch": 0.9458858771802676, "grad_norm": 9.832952499389648, "learning_rate": 5.411412281973233e-08, "loss": 0.2402, "step": 19577 }, { "epoch": 0.9459341933613568, "grad_norm": 2.091590404510498, "learning_rate": 5.406580663864328e-08, "loss": 0.2896, "step": 19578 }, { "epoch": 0.9459825095424458, "grad_norm": 3.38800048828125, "learning_rate": 5.401749045755423e-08, "loss": 0.1814, "step": 19579 }, { "epoch": 0.9460308257235348, "grad_norm": 2.840956926345825, "learning_rate": 5.396917427646518e-08, "loss": 0.2895, "step": 19580 }, { "epoch": 0.9460791419046238, "grad_norm": 2.6935906410217285, "learning_rate": 5.392085809537614e-08, "loss": 0.3484, "step": 19581 }, { "epoch": 0.9461274580857129, "grad_norm": 2.4954957962036133, "learning_rate": 5.387254191428709e-08, "loss": 0.2447, "step": 19582 }, { "epoch": 0.946175774266802, "grad_norm": 4.144618034362793, "learning_rate": 5.382422573319805e-08, "loss": 0.2659, "step": 19583 }, { "epoch": 0.946224090447891, "grad_norm": 2.226653814315796, "learning_rate": 5.3775909552109e-08, "loss": 0.2635, "step": 19584 }, { "epoch": 0.94627240662898, "grad_norm": 3.656883955001831, "learning_rate": 5.3727593371019955e-08, "loss": 0.2603, "step": 19585 }, { "epoch": 0.9463207228100691, "grad_norm": 5.94815731048584, "learning_rate": 5.3679277189930905e-08, "loss": 0.4109, "step": 19586 }, { "epoch": 0.9463690389911581, "grad_norm": 2.7362060546875, "learning_rate": 5.363096100884186e-08, "loss": 0.3693, "step": 19587 }, { "epoch": 0.9464173551722472, "grad_norm": 3.3422038555145264, "learning_rate": 5.358264482775281e-08, "loss": 0.3478, "step": 19588 }, { "epoch": 0.9464656713533363, "grad_norm": 2.608487367630005, "learning_rate": 5.353432864666377e-08, "loss": 0.3047, "step": 19589 }, { "epoch": 0.9465139875344253, "grad_norm": 2.5078539848327637, "learning_rate": 5.348601246557472e-08, "loss": 0.2695, "step": 19590 }, { "epoch": 0.9465623037155143, "grad_norm": 3.792962074279785, "learning_rate": 5.343769628448567e-08, "loss": 0.284, "step": 19591 }, { "epoch": 0.9466106198966033, "grad_norm": 1.3741222620010376, "learning_rate": 5.338938010339662e-08, "loss": 0.1622, "step": 19592 }, { "epoch": 0.9466589360776925, "grad_norm": 5.474990367889404, "learning_rate": 5.334106392230758e-08, "loss": 0.2705, "step": 19593 }, { "epoch": 0.9467072522587815, "grad_norm": 2.1645472049713135, "learning_rate": 5.329274774121853e-08, "loss": 0.2611, "step": 19594 }, { "epoch": 0.9467555684398705, "grad_norm": 2.818718671798706, "learning_rate": 5.3244431560129486e-08, "loss": 0.2172, "step": 19595 }, { "epoch": 0.9468038846209595, "grad_norm": 2.766683578491211, "learning_rate": 5.319611537904044e-08, "loss": 0.2608, "step": 19596 }, { "epoch": 0.9468522008020486, "grad_norm": 1.4837034940719604, "learning_rate": 5.3147799197951394e-08, "loss": 0.159, "step": 19597 }, { "epoch": 0.9469005169831377, "grad_norm": 2.576511859893799, "learning_rate": 5.3099483016862344e-08, "loss": 0.3123, "step": 19598 }, { "epoch": 0.9469488331642267, "grad_norm": 2.1075150966644287, "learning_rate": 5.30511668357733e-08, "loss": 0.2279, "step": 19599 }, { "epoch": 0.9469971493453158, "grad_norm": 2.668724775314331, "learning_rate": 5.300285065468425e-08, "loss": 0.2897, "step": 19600 }, { "epoch": 0.9470454655264048, "grad_norm": 4.11907434463501, "learning_rate": 5.295453447359521e-08, "loss": 0.3637, "step": 19601 }, { "epoch": 0.9470937817074938, "grad_norm": 2.432647943496704, "learning_rate": 5.290621829250616e-08, "loss": 0.2324, "step": 19602 }, { "epoch": 0.9471420978885828, "grad_norm": 3.0795488357543945, "learning_rate": 5.285790211141712e-08, "loss": 0.4134, "step": 19603 }, { "epoch": 0.947190414069672, "grad_norm": 2.33896541595459, "learning_rate": 5.280958593032806e-08, "loss": 0.2922, "step": 19604 }, { "epoch": 0.947238730250761, "grad_norm": 5.0972700119018555, "learning_rate": 5.276126974923902e-08, "loss": 0.3271, "step": 19605 }, { "epoch": 0.94728704643185, "grad_norm": 2.3000354766845703, "learning_rate": 5.271295356814997e-08, "loss": 0.2552, "step": 19606 }, { "epoch": 0.947335362612939, "grad_norm": 1.9920368194580078, "learning_rate": 5.2664637387060926e-08, "loss": 0.1698, "step": 19607 }, { "epoch": 0.9473836787940281, "grad_norm": 2.656346321105957, "learning_rate": 5.2616321205971876e-08, "loss": 0.2861, "step": 19608 }, { "epoch": 0.9474319949751172, "grad_norm": 2.39920973777771, "learning_rate": 5.256800502488283e-08, "loss": 0.2344, "step": 19609 }, { "epoch": 0.9474803111562062, "grad_norm": 2.9354867935180664, "learning_rate": 5.2519688843793784e-08, "loss": 0.2328, "step": 19610 }, { "epoch": 0.9475286273372953, "grad_norm": 2.7665793895721436, "learning_rate": 5.247137266270474e-08, "loss": 0.3267, "step": 19611 }, { "epoch": 0.9475769435183843, "grad_norm": 3.301231622695923, "learning_rate": 5.242305648161569e-08, "loss": 0.3367, "step": 19612 }, { "epoch": 0.9476252596994733, "grad_norm": 2.8439252376556396, "learning_rate": 5.237474030052665e-08, "loss": 0.2845, "step": 19613 }, { "epoch": 0.9476735758805624, "grad_norm": 2.667529344558716, "learning_rate": 5.23264241194376e-08, "loss": 0.3401, "step": 19614 }, { "epoch": 0.9477218920616515, "grad_norm": 4.62738561630249, "learning_rate": 5.2278107938348556e-08, "loss": 0.3422, "step": 19615 }, { "epoch": 0.9477702082427405, "grad_norm": 2.5290095806121826, "learning_rate": 5.2229791757259507e-08, "loss": 0.2447, "step": 19616 }, { "epoch": 0.9478185244238295, "grad_norm": 2.2981913089752197, "learning_rate": 5.218147557617046e-08, "loss": 0.231, "step": 19617 }, { "epoch": 0.9478668406049185, "grad_norm": 2.7943077087402344, "learning_rate": 5.213315939508141e-08, "loss": 0.3226, "step": 19618 }, { "epoch": 0.9479151567860077, "grad_norm": 8.322554588317871, "learning_rate": 5.2084843213992365e-08, "loss": 0.3212, "step": 19619 }, { "epoch": 0.9479634729670967, "grad_norm": 2.683272123336792, "learning_rate": 5.2036527032903315e-08, "loss": 0.2781, "step": 19620 }, { "epoch": 0.9480117891481857, "grad_norm": 3.2861242294311523, "learning_rate": 5.198821085181427e-08, "loss": 0.2896, "step": 19621 }, { "epoch": 0.9480601053292748, "grad_norm": 2.776418447494507, "learning_rate": 5.193989467072522e-08, "loss": 0.5115, "step": 19622 }, { "epoch": 0.9481084215103638, "grad_norm": 2.1425156593322754, "learning_rate": 5.189157848963618e-08, "loss": 0.1995, "step": 19623 }, { "epoch": 0.9481567376914529, "grad_norm": 3.3989875316619873, "learning_rate": 5.184326230854713e-08, "loss": 0.3661, "step": 19624 }, { "epoch": 0.948205053872542, "grad_norm": 3.339197874069214, "learning_rate": 5.179494612745809e-08, "loss": 0.3518, "step": 19625 }, { "epoch": 0.948253370053631, "grad_norm": 8.639466285705566, "learning_rate": 5.174662994636904e-08, "loss": 0.3888, "step": 19626 }, { "epoch": 0.94830168623472, "grad_norm": 3.566718816757202, "learning_rate": 5.1698313765279995e-08, "loss": 0.3002, "step": 19627 }, { "epoch": 0.948350002415809, "grad_norm": 3.7704198360443115, "learning_rate": 5.1649997584190946e-08, "loss": 0.3548, "step": 19628 }, { "epoch": 0.948398318596898, "grad_norm": 3.233123779296875, "learning_rate": 5.1601681403101896e-08, "loss": 0.28, "step": 19629 }, { "epoch": 0.9484466347779872, "grad_norm": 2.660233736038208, "learning_rate": 5.155336522201285e-08, "loss": 0.2408, "step": 19630 }, { "epoch": 0.9484949509590762, "grad_norm": 2.2971842288970947, "learning_rate": 5.1505049040923804e-08, "loss": 0.2786, "step": 19631 }, { "epoch": 0.9485432671401652, "grad_norm": 3.8947083950042725, "learning_rate": 5.1456732859834754e-08, "loss": 0.2376, "step": 19632 }, { "epoch": 0.9485915833212543, "grad_norm": 2.8538894653320312, "learning_rate": 5.140841667874571e-08, "loss": 0.3838, "step": 19633 }, { "epoch": 0.9486398995023433, "grad_norm": 3.21283221244812, "learning_rate": 5.136010049765666e-08, "loss": 0.152, "step": 19634 }, { "epoch": 0.9486882156834324, "grad_norm": 2.636564016342163, "learning_rate": 5.131178431656762e-08, "loss": 0.2719, "step": 19635 }, { "epoch": 0.9487365318645214, "grad_norm": 2.4636411666870117, "learning_rate": 5.126346813547857e-08, "loss": 0.2789, "step": 19636 }, { "epoch": 0.9487848480456105, "grad_norm": 3.751059055328369, "learning_rate": 5.121515195438953e-08, "loss": 0.3555, "step": 19637 }, { "epoch": 0.9488331642266995, "grad_norm": 2.345621109008789, "learning_rate": 5.116683577330048e-08, "loss": 0.2707, "step": 19638 }, { "epoch": 0.9488814804077885, "grad_norm": 3.7474164962768555, "learning_rate": 5.1118519592211435e-08, "loss": 0.369, "step": 19639 }, { "epoch": 0.9489297965888777, "grad_norm": 2.9916248321533203, "learning_rate": 5.1070203411122385e-08, "loss": 0.367, "step": 19640 }, { "epoch": 0.9489781127699667, "grad_norm": 3.0443453788757324, "learning_rate": 5.102188723003334e-08, "loss": 0.4051, "step": 19641 }, { "epoch": 0.9490264289510557, "grad_norm": 2.3754467964172363, "learning_rate": 5.0973571048944286e-08, "loss": 0.2403, "step": 19642 }, { "epoch": 0.9490747451321447, "grad_norm": 2.978842258453369, "learning_rate": 5.092525486785524e-08, "loss": 0.2887, "step": 19643 }, { "epoch": 0.9491230613132338, "grad_norm": 3.0201051235198975, "learning_rate": 5.0876938686766194e-08, "loss": 0.3498, "step": 19644 }, { "epoch": 0.9491713774943229, "grad_norm": 2.642177104949951, "learning_rate": 5.082862250567715e-08, "loss": 0.3018, "step": 19645 }, { "epoch": 0.9492196936754119, "grad_norm": 2.5076305866241455, "learning_rate": 5.07803063245881e-08, "loss": 0.2965, "step": 19646 }, { "epoch": 0.949268009856501, "grad_norm": 2.886657238006592, "learning_rate": 5.073199014349906e-08, "loss": 0.3596, "step": 19647 }, { "epoch": 0.94931632603759, "grad_norm": 4.93083381652832, "learning_rate": 5.068367396241001e-08, "loss": 0.2036, "step": 19648 }, { "epoch": 0.949364642218679, "grad_norm": 2.3137285709381104, "learning_rate": 5.0635357781320966e-08, "loss": 0.2783, "step": 19649 }, { "epoch": 0.9494129583997681, "grad_norm": 2.79740047454834, "learning_rate": 5.0587041600231917e-08, "loss": 0.4048, "step": 19650 }, { "epoch": 0.9494612745808572, "grad_norm": 2.3401830196380615, "learning_rate": 5.0538725419142874e-08, "loss": 0.2125, "step": 19651 }, { "epoch": 0.9495095907619462, "grad_norm": 1.9618136882781982, "learning_rate": 5.0490409238053824e-08, "loss": 0.2212, "step": 19652 }, { "epoch": 0.9495579069430352, "grad_norm": 1.9960377216339111, "learning_rate": 5.044209305696478e-08, "loss": 0.2126, "step": 19653 }, { "epoch": 0.9496062231241242, "grad_norm": 2.3761112689971924, "learning_rate": 5.0393776875875725e-08, "loss": 0.3305, "step": 19654 }, { "epoch": 0.9496545393052133, "grad_norm": 17.369844436645508, "learning_rate": 5.0345460694786676e-08, "loss": 0.6015, "step": 19655 }, { "epoch": 0.9497028554863024, "grad_norm": 2.065626859664917, "learning_rate": 5.029714451369763e-08, "loss": 0.2574, "step": 19656 }, { "epoch": 0.9497511716673914, "grad_norm": 2.0190274715423584, "learning_rate": 5.0248828332608583e-08, "loss": 0.2073, "step": 19657 }, { "epoch": 0.9497994878484805, "grad_norm": 17.10382652282715, "learning_rate": 5.020051215151954e-08, "loss": 0.3672, "step": 19658 }, { "epoch": 0.9498478040295695, "grad_norm": 3.4494521617889404, "learning_rate": 5.015219597043049e-08, "loss": 0.2976, "step": 19659 }, { "epoch": 0.9498961202106585, "grad_norm": 3.7233352661132812, "learning_rate": 5.010387978934145e-08, "loss": 0.238, "step": 19660 }, { "epoch": 0.9499444363917476, "grad_norm": 11.903691291809082, "learning_rate": 5.00555636082524e-08, "loss": 0.3237, "step": 19661 }, { "epoch": 0.9499927525728367, "grad_norm": 3.47636079788208, "learning_rate": 5.0007247427163356e-08, "loss": 0.4366, "step": 19662 }, { "epoch": 0.9500410687539257, "grad_norm": 3.0603764057159424, "learning_rate": 4.9958931246074306e-08, "loss": 0.4055, "step": 19663 }, { "epoch": 0.9500893849350147, "grad_norm": 2.2797749042510986, "learning_rate": 4.9910615064985263e-08, "loss": 0.2305, "step": 19664 }, { "epoch": 0.9501377011161037, "grad_norm": 2.9071929454803467, "learning_rate": 4.9862298883896214e-08, "loss": 0.3577, "step": 19665 }, { "epoch": 0.9501860172971929, "grad_norm": 2.3619611263275146, "learning_rate": 4.981398270280717e-08, "loss": 0.1622, "step": 19666 }, { "epoch": 0.9502343334782819, "grad_norm": 2.555469036102295, "learning_rate": 4.9765666521718115e-08, "loss": 0.2749, "step": 19667 }, { "epoch": 0.9502826496593709, "grad_norm": 2.098276138305664, "learning_rate": 4.971735034062907e-08, "loss": 0.2025, "step": 19668 }, { "epoch": 0.95033096584046, "grad_norm": 2.314603090286255, "learning_rate": 4.966903415954002e-08, "loss": 0.1673, "step": 19669 }, { "epoch": 0.950379282021549, "grad_norm": 2.194927453994751, "learning_rate": 4.962071797845098e-08, "loss": 0.2176, "step": 19670 }, { "epoch": 0.9504275982026381, "grad_norm": 2.4501211643218994, "learning_rate": 4.957240179736193e-08, "loss": 0.2188, "step": 19671 }, { "epoch": 0.9504759143837271, "grad_norm": 4.964770317077637, "learning_rate": 4.952408561627289e-08, "loss": 0.3109, "step": 19672 }, { "epoch": 0.9505242305648162, "grad_norm": 4.032841205596924, "learning_rate": 4.947576943518384e-08, "loss": 0.3537, "step": 19673 }, { "epoch": 0.9505725467459052, "grad_norm": 6.22056245803833, "learning_rate": 4.9427453254094795e-08, "loss": 0.3189, "step": 19674 }, { "epoch": 0.9506208629269942, "grad_norm": 1.7995697259902954, "learning_rate": 4.9379137073005746e-08, "loss": 0.1707, "step": 19675 }, { "epoch": 0.9506691791080834, "grad_norm": 2.1770966053009033, "learning_rate": 4.93308208919167e-08, "loss": 0.2373, "step": 19676 }, { "epoch": 0.9507174952891724, "grad_norm": 1.615932822227478, "learning_rate": 4.928250471082765e-08, "loss": 0.1821, "step": 19677 }, { "epoch": 0.9507658114702614, "grad_norm": 2.9277751445770264, "learning_rate": 4.923418852973861e-08, "loss": 0.3012, "step": 19678 }, { "epoch": 0.9508141276513504, "grad_norm": 2.0542266368865967, "learning_rate": 4.918587234864956e-08, "loss": 0.2672, "step": 19679 }, { "epoch": 0.9508624438324395, "grad_norm": 1.8828959465026855, "learning_rate": 4.913755616756051e-08, "loss": 0.2144, "step": 19680 }, { "epoch": 0.9509107600135285, "grad_norm": 1.6886837482452393, "learning_rate": 4.908923998647146e-08, "loss": 0.1741, "step": 19681 }, { "epoch": 0.9509590761946176, "grad_norm": 3.2394161224365234, "learning_rate": 4.904092380538242e-08, "loss": 0.228, "step": 19682 }, { "epoch": 0.9510073923757066, "grad_norm": 2.1040265560150146, "learning_rate": 4.899260762429337e-08, "loss": 0.2759, "step": 19683 }, { "epoch": 0.9510557085567957, "grad_norm": 2.0649285316467285, "learning_rate": 4.8944291443204327e-08, "loss": 0.24, "step": 19684 }, { "epoch": 0.9511040247378847, "grad_norm": 2.9302077293395996, "learning_rate": 4.889597526211528e-08, "loss": 0.2161, "step": 19685 }, { "epoch": 0.9511523409189737, "grad_norm": 6.6275835037231445, "learning_rate": 4.8847659081026234e-08, "loss": 0.2672, "step": 19686 }, { "epoch": 0.9512006571000629, "grad_norm": 1.7598567008972168, "learning_rate": 4.8799342899937185e-08, "loss": 0.1804, "step": 19687 }, { "epoch": 0.9512489732811519, "grad_norm": 2.5995559692382812, "learning_rate": 4.875102671884814e-08, "loss": 0.251, "step": 19688 }, { "epoch": 0.9512972894622409, "grad_norm": 3.441244125366211, "learning_rate": 4.870271053775909e-08, "loss": 0.3495, "step": 19689 }, { "epoch": 0.9513456056433299, "grad_norm": 3.507026433944702, "learning_rate": 4.865439435667005e-08, "loss": 0.3353, "step": 19690 }, { "epoch": 0.951393921824419, "grad_norm": 2.538686513900757, "learning_rate": 4.8606078175581e-08, "loss": 0.3279, "step": 19691 }, { "epoch": 0.9514422380055081, "grad_norm": 3.563844680786133, "learning_rate": 4.855776199449195e-08, "loss": 0.3635, "step": 19692 }, { "epoch": 0.9514905541865971, "grad_norm": 3.9794600009918213, "learning_rate": 4.85094458134029e-08, "loss": 0.3748, "step": 19693 }, { "epoch": 0.9515388703676861, "grad_norm": 3.2260406017303467, "learning_rate": 4.846112963231386e-08, "loss": 0.2552, "step": 19694 }, { "epoch": 0.9515871865487752, "grad_norm": 2.6572670936584473, "learning_rate": 4.841281345122481e-08, "loss": 0.2115, "step": 19695 }, { "epoch": 0.9516355027298642, "grad_norm": 2.6377110481262207, "learning_rate": 4.8364497270135766e-08, "loss": 0.3284, "step": 19696 }, { "epoch": 0.9516838189109533, "grad_norm": 2.901179790496826, "learning_rate": 4.8316181089046716e-08, "loss": 0.3308, "step": 19697 }, { "epoch": 0.9517321350920424, "grad_norm": 2.329852342605591, "learning_rate": 4.8267864907957673e-08, "loss": 0.3456, "step": 19698 }, { "epoch": 0.9517804512731314, "grad_norm": 2.4720709323883057, "learning_rate": 4.8219548726868624e-08, "loss": 0.2012, "step": 19699 }, { "epoch": 0.9518287674542204, "grad_norm": 2.297132730484009, "learning_rate": 4.817123254577958e-08, "loss": 0.3761, "step": 19700 }, { "epoch": 0.9518770836353094, "grad_norm": 1.8300801515579224, "learning_rate": 4.812291636469053e-08, "loss": 0.1959, "step": 19701 }, { "epoch": 0.9519253998163986, "grad_norm": 2.462369203567505, "learning_rate": 4.807460018360149e-08, "loss": 0.281, "step": 19702 }, { "epoch": 0.9519737159974876, "grad_norm": 3.2848877906799316, "learning_rate": 4.802628400251244e-08, "loss": 0.3497, "step": 19703 }, { "epoch": 0.9520220321785766, "grad_norm": 2.882444381713867, "learning_rate": 4.7977967821423396e-08, "loss": 0.2592, "step": 19704 }, { "epoch": 0.9520703483596656, "grad_norm": 58.122886657714844, "learning_rate": 4.792965164033434e-08, "loss": 0.2956, "step": 19705 }, { "epoch": 0.9521186645407547, "grad_norm": 3.049426794052124, "learning_rate": 4.78813354592453e-08, "loss": 0.3903, "step": 19706 }, { "epoch": 0.9521669807218437, "grad_norm": 2.273888111114502, "learning_rate": 4.783301927815625e-08, "loss": 0.2734, "step": 19707 }, { "epoch": 0.9522152969029328, "grad_norm": 4.204732894897461, "learning_rate": 4.7784703097067205e-08, "loss": 0.365, "step": 19708 }, { "epoch": 0.9522636130840219, "grad_norm": 3.8494954109191895, "learning_rate": 4.7736386915978156e-08, "loss": 0.4611, "step": 19709 }, { "epoch": 0.9523119292651109, "grad_norm": 2.500978946685791, "learning_rate": 4.768807073488911e-08, "loss": 0.2253, "step": 19710 }, { "epoch": 0.9523602454461999, "grad_norm": 5.917387008666992, "learning_rate": 4.763975455380006e-08, "loss": 0.3299, "step": 19711 }, { "epoch": 0.9524085616272889, "grad_norm": 3.208153247833252, "learning_rate": 4.759143837271102e-08, "loss": 0.3891, "step": 19712 }, { "epoch": 0.9524568778083781, "grad_norm": 3.988968849182129, "learning_rate": 4.754312219162197e-08, "loss": 0.3118, "step": 19713 }, { "epoch": 0.9525051939894671, "grad_norm": 3.0565080642700195, "learning_rate": 4.749480601053293e-08, "loss": 0.3582, "step": 19714 }, { "epoch": 0.9525535101705561, "grad_norm": 2.184574842453003, "learning_rate": 4.744648982944388e-08, "loss": 0.2199, "step": 19715 }, { "epoch": 0.9526018263516451, "grad_norm": 2.0750606060028076, "learning_rate": 4.7398173648354836e-08, "loss": 0.1965, "step": 19716 }, { "epoch": 0.9526501425327342, "grad_norm": 2.505950927734375, "learning_rate": 4.7349857467265786e-08, "loss": 0.1737, "step": 19717 }, { "epoch": 0.9526984587138233, "grad_norm": 8.360088348388672, "learning_rate": 4.7301541286176737e-08, "loss": 0.3706, "step": 19718 }, { "epoch": 0.9527467748949123, "grad_norm": 3.0662639141082764, "learning_rate": 4.725322510508769e-08, "loss": 0.2598, "step": 19719 }, { "epoch": 0.9527950910760014, "grad_norm": 2.5668816566467285, "learning_rate": 4.7204908923998644e-08, "loss": 0.2853, "step": 19720 }, { "epoch": 0.9528434072570904, "grad_norm": 2.5258333683013916, "learning_rate": 4.7156592742909595e-08, "loss": 0.3492, "step": 19721 }, { "epoch": 0.9528917234381794, "grad_norm": 3.233156681060791, "learning_rate": 4.710827656182055e-08, "loss": 0.2637, "step": 19722 }, { "epoch": 0.9529400396192685, "grad_norm": 8.049478530883789, "learning_rate": 4.70599603807315e-08, "loss": 0.4391, "step": 19723 }, { "epoch": 0.9529883558003576, "grad_norm": 1.7870417833328247, "learning_rate": 4.701164419964246e-08, "loss": 0.2029, "step": 19724 }, { "epoch": 0.9530366719814466, "grad_norm": 2.9683914184570312, "learning_rate": 4.696332801855341e-08, "loss": 0.3485, "step": 19725 }, { "epoch": 0.9530849881625356, "grad_norm": 2.0982508659362793, "learning_rate": 4.691501183746437e-08, "loss": 0.2886, "step": 19726 }, { "epoch": 0.9531333043436246, "grad_norm": 2.754297971725464, "learning_rate": 4.686669565637532e-08, "loss": 0.3416, "step": 19727 }, { "epoch": 0.9531816205247138, "grad_norm": 5.640925407409668, "learning_rate": 4.6818379475286275e-08, "loss": 0.3851, "step": 19728 }, { "epoch": 0.9532299367058028, "grad_norm": 2.3921377658843994, "learning_rate": 4.6770063294197225e-08, "loss": 0.2335, "step": 19729 }, { "epoch": 0.9532782528868918, "grad_norm": 2.305119276046753, "learning_rate": 4.6721747113108176e-08, "loss": 0.2542, "step": 19730 }, { "epoch": 0.9533265690679809, "grad_norm": 8.188312530517578, "learning_rate": 4.6673430932019126e-08, "loss": 0.2169, "step": 19731 }, { "epoch": 0.9533748852490699, "grad_norm": 3.4970016479492188, "learning_rate": 4.6625114750930083e-08, "loss": 0.4107, "step": 19732 }, { "epoch": 0.9534232014301589, "grad_norm": 3.6381585597991943, "learning_rate": 4.6576798569841034e-08, "loss": 0.314, "step": 19733 }, { "epoch": 0.953471517611248, "grad_norm": 4.509352684020996, "learning_rate": 4.652848238875199e-08, "loss": 0.52, "step": 19734 }, { "epoch": 0.9535198337923371, "grad_norm": 3.0808753967285156, "learning_rate": 4.648016620766294e-08, "loss": 0.2033, "step": 19735 }, { "epoch": 0.9535681499734261, "grad_norm": 2.381117343902588, "learning_rate": 4.64318500265739e-08, "loss": 0.33, "step": 19736 }, { "epoch": 0.9536164661545151, "grad_norm": 6.032904148101807, "learning_rate": 4.638353384548485e-08, "loss": 0.1693, "step": 19737 }, { "epoch": 0.9536647823356041, "grad_norm": 3.0258231163024902, "learning_rate": 4.6335217664395806e-08, "loss": 0.3853, "step": 19738 }, { "epoch": 0.9537130985166933, "grad_norm": 2.308971405029297, "learning_rate": 4.628690148330676e-08, "loss": 0.2315, "step": 19739 }, { "epoch": 0.9537614146977823, "grad_norm": 2.806687593460083, "learning_rate": 4.6238585302217714e-08, "loss": 0.3367, "step": 19740 }, { "epoch": 0.9538097308788713, "grad_norm": 2.3486380577087402, "learning_rate": 4.6190269121128665e-08, "loss": 0.285, "step": 19741 }, { "epoch": 0.9538580470599604, "grad_norm": 2.3555667400360107, "learning_rate": 4.614195294003962e-08, "loss": 0.2266, "step": 19742 }, { "epoch": 0.9539063632410494, "grad_norm": 3.286959171295166, "learning_rate": 4.6093636758950566e-08, "loss": 0.4575, "step": 19743 }, { "epoch": 0.9539546794221385, "grad_norm": 3.0708155632019043, "learning_rate": 4.604532057786152e-08, "loss": 0.264, "step": 19744 }, { "epoch": 0.9540029956032275, "grad_norm": 2.2526986598968506, "learning_rate": 4.599700439677247e-08, "loss": 0.2933, "step": 19745 }, { "epoch": 0.9540513117843166, "grad_norm": 3.521519660949707, "learning_rate": 4.594868821568343e-08, "loss": 0.3893, "step": 19746 }, { "epoch": 0.9540996279654056, "grad_norm": 4.496156692504883, "learning_rate": 4.590037203459438e-08, "loss": 0.292, "step": 19747 }, { "epoch": 0.9541479441464946, "grad_norm": 4.638427257537842, "learning_rate": 4.585205585350534e-08, "loss": 0.2051, "step": 19748 }, { "epoch": 0.9541962603275838, "grad_norm": 2.752049207687378, "learning_rate": 4.580373967241629e-08, "loss": 0.275, "step": 19749 }, { "epoch": 0.9542445765086728, "grad_norm": 3.42974591255188, "learning_rate": 4.5755423491327246e-08, "loss": 0.327, "step": 19750 }, { "epoch": 0.9542928926897618, "grad_norm": 5.215369701385498, "learning_rate": 4.5707107310238196e-08, "loss": 0.2952, "step": 19751 }, { "epoch": 0.9543412088708508, "grad_norm": 13.199724197387695, "learning_rate": 4.565879112914915e-08, "loss": 0.2589, "step": 19752 }, { "epoch": 0.9543895250519399, "grad_norm": 2.8490421772003174, "learning_rate": 4.5610474948060104e-08, "loss": 0.2914, "step": 19753 }, { "epoch": 0.954437841233029, "grad_norm": 2.66153621673584, "learning_rate": 4.556215876697106e-08, "loss": 0.3686, "step": 19754 }, { "epoch": 0.954486157414118, "grad_norm": 4.81763219833374, "learning_rate": 4.551384258588201e-08, "loss": 0.2516, "step": 19755 }, { "epoch": 0.954534473595207, "grad_norm": 2.511765241622925, "learning_rate": 4.546552640479296e-08, "loss": 0.3609, "step": 19756 }, { "epoch": 0.9545827897762961, "grad_norm": 3.1320247650146484, "learning_rate": 4.541721022370391e-08, "loss": 0.2689, "step": 19757 }, { "epoch": 0.9546311059573851, "grad_norm": 3.0280282497406006, "learning_rate": 4.536889404261487e-08, "loss": 0.3374, "step": 19758 }, { "epoch": 0.9546794221384741, "grad_norm": 3.6882662773132324, "learning_rate": 4.532057786152582e-08, "loss": 0.3623, "step": 19759 }, { "epoch": 0.9547277383195633, "grad_norm": 3.6356301307678223, "learning_rate": 4.527226168043678e-08, "loss": 0.4802, "step": 19760 }, { "epoch": 0.9547760545006523, "grad_norm": 2.7204513549804688, "learning_rate": 4.522394549934773e-08, "loss": 0.3089, "step": 19761 }, { "epoch": 0.9548243706817413, "grad_norm": 1.75819993019104, "learning_rate": 4.5175629318258685e-08, "loss": 0.1788, "step": 19762 }, { "epoch": 0.9548726868628303, "grad_norm": 2.4328014850616455, "learning_rate": 4.5127313137169635e-08, "loss": 0.204, "step": 19763 }, { "epoch": 0.9549210030439194, "grad_norm": 3.2354180812835693, "learning_rate": 4.507899695608059e-08, "loss": 0.3986, "step": 19764 }, { "epoch": 0.9549693192250085, "grad_norm": 1.9597406387329102, "learning_rate": 4.503068077499154e-08, "loss": 0.1807, "step": 19765 }, { "epoch": 0.9550176354060975, "grad_norm": 4.772862434387207, "learning_rate": 4.49823645939025e-08, "loss": 0.3419, "step": 19766 }, { "epoch": 0.9550659515871865, "grad_norm": 4.792972087860107, "learning_rate": 4.493404841281345e-08, "loss": 0.2686, "step": 19767 }, { "epoch": 0.9551142677682756, "grad_norm": 1.9909158945083618, "learning_rate": 4.48857322317244e-08, "loss": 0.1903, "step": 19768 }, { "epoch": 0.9551625839493646, "grad_norm": 2.336595296859741, "learning_rate": 4.483741605063535e-08, "loss": 0.3424, "step": 19769 }, { "epoch": 0.9552109001304537, "grad_norm": 2.6496081352233887, "learning_rate": 4.478909986954631e-08, "loss": 0.3071, "step": 19770 }, { "epoch": 0.9552592163115428, "grad_norm": 3.55798077583313, "learning_rate": 4.474078368845726e-08, "loss": 0.3028, "step": 19771 }, { "epoch": 0.9553075324926318, "grad_norm": 16.115842819213867, "learning_rate": 4.4692467507368216e-08, "loss": 0.2346, "step": 19772 }, { "epoch": 0.9553558486737208, "grad_norm": 1.796505331993103, "learning_rate": 4.464415132627917e-08, "loss": 0.2227, "step": 19773 }, { "epoch": 0.9554041648548098, "grad_norm": 2.612597942352295, "learning_rate": 4.4595835145190124e-08, "loss": 0.319, "step": 19774 }, { "epoch": 0.955452481035899, "grad_norm": 1.4335824251174927, "learning_rate": 4.4547518964101075e-08, "loss": 0.1415, "step": 19775 }, { "epoch": 0.955500797216988, "grad_norm": 3.0857229232788086, "learning_rate": 4.449920278301203e-08, "loss": 0.3746, "step": 19776 }, { "epoch": 0.955549113398077, "grad_norm": 3.3443849086761475, "learning_rate": 4.445088660192298e-08, "loss": 0.259, "step": 19777 }, { "epoch": 0.955597429579166, "grad_norm": 2.6522819995880127, "learning_rate": 4.440257042083394e-08, "loss": 0.2301, "step": 19778 }, { "epoch": 0.9556457457602551, "grad_norm": 2.56955885887146, "learning_rate": 4.435425423974489e-08, "loss": 0.3083, "step": 19779 }, { "epoch": 0.9556940619413442, "grad_norm": 2.411076545715332, "learning_rate": 4.430593805865585e-08, "loss": 0.2385, "step": 19780 }, { "epoch": 0.9557423781224332, "grad_norm": 3.168621063232422, "learning_rate": 4.425762187756679e-08, "loss": 0.2887, "step": 19781 }, { "epoch": 0.9557906943035223, "grad_norm": 4.689026355743408, "learning_rate": 4.420930569647775e-08, "loss": 0.4437, "step": 19782 }, { "epoch": 0.9558390104846113, "grad_norm": 2.8672142028808594, "learning_rate": 4.41609895153887e-08, "loss": 0.3802, "step": 19783 }, { "epoch": 0.9558873266657003, "grad_norm": 3.799069404602051, "learning_rate": 4.4112673334299656e-08, "loss": 0.3391, "step": 19784 }, { "epoch": 0.9559356428467893, "grad_norm": 4.332901954650879, "learning_rate": 4.4064357153210606e-08, "loss": 0.1295, "step": 19785 }, { "epoch": 0.9559839590278785, "grad_norm": 2.884761095046997, "learning_rate": 4.401604097212156e-08, "loss": 0.352, "step": 19786 }, { "epoch": 0.9560322752089675, "grad_norm": 2.6275203227996826, "learning_rate": 4.3967724791032514e-08, "loss": 0.2704, "step": 19787 }, { "epoch": 0.9560805913900565, "grad_norm": 3.8492650985717773, "learning_rate": 4.391940860994347e-08, "loss": 0.2047, "step": 19788 }, { "epoch": 0.9561289075711455, "grad_norm": 2.9053964614868164, "learning_rate": 4.387109242885442e-08, "loss": 0.4318, "step": 19789 }, { "epoch": 0.9561772237522346, "grad_norm": 2.9589929580688477, "learning_rate": 4.382277624776538e-08, "loss": 0.3105, "step": 19790 }, { "epoch": 0.9562255399333237, "grad_norm": 2.696054458618164, "learning_rate": 4.377446006667633e-08, "loss": 0.3447, "step": 19791 }, { "epoch": 0.9562738561144127, "grad_norm": 2.1451094150543213, "learning_rate": 4.3726143885587286e-08, "loss": 0.2037, "step": 19792 }, { "epoch": 0.9563221722955018, "grad_norm": 2.626059055328369, "learning_rate": 4.367782770449824e-08, "loss": 0.3206, "step": 19793 }, { "epoch": 0.9563704884765908, "grad_norm": 2.648768424987793, "learning_rate": 4.362951152340919e-08, "loss": 0.3563, "step": 19794 }, { "epoch": 0.9564188046576798, "grad_norm": 1.7874928712844849, "learning_rate": 4.358119534232014e-08, "loss": 0.2022, "step": 19795 }, { "epoch": 0.956467120838769, "grad_norm": 2.2080211639404297, "learning_rate": 4.3532879161231095e-08, "loss": 0.2057, "step": 19796 }, { "epoch": 0.956515437019858, "grad_norm": 2.2596426010131836, "learning_rate": 4.3484562980142045e-08, "loss": 0.2151, "step": 19797 }, { "epoch": 0.956563753200947, "grad_norm": 4.117873668670654, "learning_rate": 4.3436246799053e-08, "loss": 0.2855, "step": 19798 }, { "epoch": 0.956612069382036, "grad_norm": 2.326367139816284, "learning_rate": 4.338793061796395e-08, "loss": 0.2031, "step": 19799 }, { "epoch": 0.956660385563125, "grad_norm": 2.3172965049743652, "learning_rate": 4.333961443687491e-08, "loss": 0.2444, "step": 19800 }, { "epoch": 0.9567087017442142, "grad_norm": 4.638236045837402, "learning_rate": 4.329129825578586e-08, "loss": 0.3159, "step": 19801 }, { "epoch": 0.9567570179253032, "grad_norm": 2.087562322616577, "learning_rate": 4.324298207469682e-08, "loss": 0.2421, "step": 19802 }, { "epoch": 0.9568053341063922, "grad_norm": 2.438291072845459, "learning_rate": 4.319466589360777e-08, "loss": 0.28, "step": 19803 }, { "epoch": 0.9568536502874813, "grad_norm": 1.7101070880889893, "learning_rate": 4.3146349712518725e-08, "loss": 0.1727, "step": 19804 }, { "epoch": 0.9569019664685703, "grad_norm": 12.245003700256348, "learning_rate": 4.3098033531429676e-08, "loss": 0.348, "step": 19805 }, { "epoch": 0.9569502826496594, "grad_norm": 2.1423275470733643, "learning_rate": 4.3049717350340626e-08, "loss": 0.2124, "step": 19806 }, { "epoch": 0.9569985988307484, "grad_norm": 3.7445850372314453, "learning_rate": 4.300140116925158e-08, "loss": 0.3501, "step": 19807 }, { "epoch": 0.9570469150118375, "grad_norm": 2.9684338569641113, "learning_rate": 4.2953084988162534e-08, "loss": 0.269, "step": 19808 }, { "epoch": 0.9570952311929265, "grad_norm": 2.299971103668213, "learning_rate": 4.2904768807073485e-08, "loss": 0.2109, "step": 19809 }, { "epoch": 0.9571435473740155, "grad_norm": 30.203310012817383, "learning_rate": 4.285645262598444e-08, "loss": 0.2492, "step": 19810 }, { "epoch": 0.9571918635551045, "grad_norm": 2.7473089694976807, "learning_rate": 4.280813644489539e-08, "loss": 0.2757, "step": 19811 }, { "epoch": 0.9572401797361937, "grad_norm": 3.4638729095458984, "learning_rate": 4.275982026380635e-08, "loss": 0.2407, "step": 19812 }, { "epoch": 0.9572884959172827, "grad_norm": 2.379887104034424, "learning_rate": 4.27115040827173e-08, "loss": 0.2386, "step": 19813 }, { "epoch": 0.9573368120983717, "grad_norm": 2.129653215408325, "learning_rate": 4.266318790162826e-08, "loss": 0.2014, "step": 19814 }, { "epoch": 0.9573851282794608, "grad_norm": 1.5080926418304443, "learning_rate": 4.261487172053921e-08, "loss": 0.1583, "step": 19815 }, { "epoch": 0.9574334444605498, "grad_norm": 2.4652023315429688, "learning_rate": 4.2566555539450165e-08, "loss": 0.2547, "step": 19816 }, { "epoch": 0.9574817606416389, "grad_norm": 2.255500316619873, "learning_rate": 4.2518239358361115e-08, "loss": 0.2303, "step": 19817 }, { "epoch": 0.957530076822728, "grad_norm": 2.807168483734131, "learning_rate": 4.246992317727207e-08, "loss": 0.3114, "step": 19818 }, { "epoch": 0.957578393003817, "grad_norm": 2.1420235633850098, "learning_rate": 4.2421606996183016e-08, "loss": 0.304, "step": 19819 }, { "epoch": 0.957626709184906, "grad_norm": 3.303056240081787, "learning_rate": 4.237329081509397e-08, "loss": 0.3833, "step": 19820 }, { "epoch": 0.957675025365995, "grad_norm": 2.76975417137146, "learning_rate": 4.2324974634004924e-08, "loss": 0.3637, "step": 19821 }, { "epoch": 0.9577233415470842, "grad_norm": 3.1928491592407227, "learning_rate": 4.227665845291588e-08, "loss": 0.3274, "step": 19822 }, { "epoch": 0.9577716577281732, "grad_norm": 1.5264573097229004, "learning_rate": 4.222834227182683e-08, "loss": 0.1414, "step": 19823 }, { "epoch": 0.9578199739092622, "grad_norm": 2.538609504699707, "learning_rate": 4.218002609073779e-08, "loss": 0.1765, "step": 19824 }, { "epoch": 0.9578682900903512, "grad_norm": 3.882007122039795, "learning_rate": 4.213170990964874e-08, "loss": 0.3194, "step": 19825 }, { "epoch": 0.9579166062714403, "grad_norm": 4.676425933837891, "learning_rate": 4.2083393728559696e-08, "loss": 0.3133, "step": 19826 }, { "epoch": 0.9579649224525294, "grad_norm": 2.021134376525879, "learning_rate": 4.203507754747065e-08, "loss": 0.236, "step": 19827 }, { "epoch": 0.9580132386336184, "grad_norm": 2.0558154582977295, "learning_rate": 4.1986761366381604e-08, "loss": 0.252, "step": 19828 }, { "epoch": 0.9580615548147075, "grad_norm": 5.65927791595459, "learning_rate": 4.1938445185292554e-08, "loss": 0.2926, "step": 19829 }, { "epoch": 0.9581098709957965, "grad_norm": 6.819599628448486, "learning_rate": 4.189012900420351e-08, "loss": 0.2728, "step": 19830 }, { "epoch": 0.9581581871768855, "grad_norm": 3.154627561569214, "learning_rate": 4.1841812823114455e-08, "loss": 0.4003, "step": 19831 }, { "epoch": 0.9582065033579746, "grad_norm": 2.487996816635132, "learning_rate": 4.179349664202541e-08, "loss": 0.2739, "step": 19832 }, { "epoch": 0.9582548195390637, "grad_norm": 5.201072692871094, "learning_rate": 4.174518046093636e-08, "loss": 0.3239, "step": 19833 }, { "epoch": 0.9583031357201527, "grad_norm": 2.0246469974517822, "learning_rate": 4.169686427984732e-08, "loss": 0.2112, "step": 19834 }, { "epoch": 0.9583514519012417, "grad_norm": 2.087423801422119, "learning_rate": 4.164854809875827e-08, "loss": 0.2609, "step": 19835 }, { "epoch": 0.9583997680823307, "grad_norm": 3.415786027908325, "learning_rate": 4.160023191766923e-08, "loss": 0.3919, "step": 19836 }, { "epoch": 0.9584480842634198, "grad_norm": 2.662726879119873, "learning_rate": 4.155191573658018e-08, "loss": 0.2488, "step": 19837 }, { "epoch": 0.9584964004445089, "grad_norm": 3.2592062950134277, "learning_rate": 4.1503599555491135e-08, "loss": 0.3999, "step": 19838 }, { "epoch": 0.9585447166255979, "grad_norm": 2.18731689453125, "learning_rate": 4.1455283374402086e-08, "loss": 0.2319, "step": 19839 }, { "epoch": 0.958593032806687, "grad_norm": 2.1945042610168457, "learning_rate": 4.140696719331304e-08, "loss": 0.3041, "step": 19840 }, { "epoch": 0.958641348987776, "grad_norm": 3.118314743041992, "learning_rate": 4.1358651012223994e-08, "loss": 0.3514, "step": 19841 }, { "epoch": 0.958689665168865, "grad_norm": 2.066261053085327, "learning_rate": 4.131033483113495e-08, "loss": 0.2301, "step": 19842 }, { "epoch": 0.9587379813499541, "grad_norm": 2.0674026012420654, "learning_rate": 4.12620186500459e-08, "loss": 0.2945, "step": 19843 }, { "epoch": 0.9587862975310432, "grad_norm": 2.844332218170166, "learning_rate": 4.121370246895685e-08, "loss": 0.2432, "step": 19844 }, { "epoch": 0.9588346137121322, "grad_norm": 2.323765516281128, "learning_rate": 4.11653862878678e-08, "loss": 0.2662, "step": 19845 }, { "epoch": 0.9588829298932212, "grad_norm": 2.7882914543151855, "learning_rate": 4.111707010677876e-08, "loss": 0.3722, "step": 19846 }, { "epoch": 0.9589312460743102, "grad_norm": 1.7486480474472046, "learning_rate": 4.106875392568971e-08, "loss": 0.2301, "step": 19847 }, { "epoch": 0.9589795622553994, "grad_norm": 2.617898464202881, "learning_rate": 4.102043774460067e-08, "loss": 0.352, "step": 19848 }, { "epoch": 0.9590278784364884, "grad_norm": 3.9625730514526367, "learning_rate": 4.097212156351162e-08, "loss": 0.3512, "step": 19849 }, { "epoch": 0.9590761946175774, "grad_norm": 3.155409336090088, "learning_rate": 4.0923805382422575e-08, "loss": 0.522, "step": 19850 }, { "epoch": 0.9591245107986665, "grad_norm": 2.4983177185058594, "learning_rate": 4.0875489201333525e-08, "loss": 0.2888, "step": 19851 }, { "epoch": 0.9591728269797555, "grad_norm": 2.5580897331237793, "learning_rate": 4.082717302024448e-08, "loss": 0.2113, "step": 19852 }, { "epoch": 0.9592211431608446, "grad_norm": 1.986823320388794, "learning_rate": 4.077885683915543e-08, "loss": 0.224, "step": 19853 }, { "epoch": 0.9592694593419336, "grad_norm": 5.650635242462158, "learning_rate": 4.073054065806639e-08, "loss": 0.1957, "step": 19854 }, { "epoch": 0.9593177755230227, "grad_norm": 2.883565664291382, "learning_rate": 4.068222447697734e-08, "loss": 0.2243, "step": 19855 }, { "epoch": 0.9593660917041117, "grad_norm": 1.760701060295105, "learning_rate": 4.06339082958883e-08, "loss": 0.209, "step": 19856 }, { "epoch": 0.9594144078852007, "grad_norm": 3.498746871948242, "learning_rate": 4.058559211479924e-08, "loss": 0.3, "step": 19857 }, { "epoch": 0.9594627240662899, "grad_norm": 3.631376028060913, "learning_rate": 4.053727593371019e-08, "loss": 0.2358, "step": 19858 }, { "epoch": 0.9595110402473789, "grad_norm": 7.056704998016357, "learning_rate": 4.048895975262115e-08, "loss": 0.381, "step": 19859 }, { "epoch": 0.9595593564284679, "grad_norm": 2.904062509536743, "learning_rate": 4.04406435715321e-08, "loss": 0.2464, "step": 19860 }, { "epoch": 0.9596076726095569, "grad_norm": 4.248994827270508, "learning_rate": 4.039232739044306e-08, "loss": 0.3224, "step": 19861 }, { "epoch": 0.959655988790646, "grad_norm": 2.920475959777832, "learning_rate": 4.034401120935401e-08, "loss": 0.2849, "step": 19862 }, { "epoch": 0.9597043049717351, "grad_norm": 3.0767054557800293, "learning_rate": 4.0295695028264964e-08, "loss": 0.4619, "step": 19863 }, { "epoch": 0.9597526211528241, "grad_norm": 1.7380362749099731, "learning_rate": 4.0247378847175915e-08, "loss": 0.2019, "step": 19864 }, { "epoch": 0.9598009373339131, "grad_norm": 3.290875196456909, "learning_rate": 4.019906266608687e-08, "loss": 0.2354, "step": 19865 }, { "epoch": 0.9598492535150022, "grad_norm": 3.4890241622924805, "learning_rate": 4.015074648499783e-08, "loss": 0.417, "step": 19866 }, { "epoch": 0.9598975696960912, "grad_norm": 3.7833025455474854, "learning_rate": 4.010243030390878e-08, "loss": 0.2251, "step": 19867 }, { "epoch": 0.9599458858771802, "grad_norm": 2.197402000427246, "learning_rate": 4.005411412281974e-08, "loss": 0.2128, "step": 19868 }, { "epoch": 0.9599942020582694, "grad_norm": 3.0033059120178223, "learning_rate": 4.000579794173068e-08, "loss": 0.3493, "step": 19869 }, { "epoch": 0.9600425182393584, "grad_norm": 5.756442070007324, "learning_rate": 3.995748176064163e-08, "loss": 0.1661, "step": 19870 }, { "epoch": 0.9600908344204474, "grad_norm": 2.728351354598999, "learning_rate": 3.990916557955259e-08, "loss": 0.3632, "step": 19871 }, { "epoch": 0.9601391506015364, "grad_norm": 2.415630340576172, "learning_rate": 3.986084939846354e-08, "loss": 0.286, "step": 19872 }, { "epoch": 0.9601874667826255, "grad_norm": 2.5701959133148193, "learning_rate": 3.9812533217374496e-08, "loss": 0.2151, "step": 19873 }, { "epoch": 0.9602357829637146, "grad_norm": 2.1379082202911377, "learning_rate": 3.9764217036285446e-08, "loss": 0.2318, "step": 19874 }, { "epoch": 0.9602840991448036, "grad_norm": 2.5565409660339355, "learning_rate": 3.9715900855196404e-08, "loss": 0.3136, "step": 19875 }, { "epoch": 0.9603324153258926, "grad_norm": 3.9966657161712646, "learning_rate": 3.9667584674107354e-08, "loss": 0.3384, "step": 19876 }, { "epoch": 0.9603807315069817, "grad_norm": 2.939795970916748, "learning_rate": 3.961926849301831e-08, "loss": 0.325, "step": 19877 }, { "epoch": 0.9604290476880707, "grad_norm": 3.7324063777923584, "learning_rate": 3.957095231192926e-08, "loss": 0.2454, "step": 19878 }, { "epoch": 0.9604773638691598, "grad_norm": 6.1639723777771, "learning_rate": 3.952263613084022e-08, "loss": 0.3353, "step": 19879 }, { "epoch": 0.9605256800502489, "grad_norm": 3.9391605854034424, "learning_rate": 3.947431994975117e-08, "loss": 0.3302, "step": 19880 }, { "epoch": 0.9605739962313379, "grad_norm": 3.70078706741333, "learning_rate": 3.9426003768662127e-08, "loss": 0.2725, "step": 19881 }, { "epoch": 0.9606223124124269, "grad_norm": 8.458281517028809, "learning_rate": 3.937768758757307e-08, "loss": 0.3595, "step": 19882 }, { "epoch": 0.9606706285935159, "grad_norm": 2.5774521827697754, "learning_rate": 3.932937140648403e-08, "loss": 0.1544, "step": 19883 }, { "epoch": 0.9607189447746051, "grad_norm": 2.3493268489837646, "learning_rate": 3.928105522539498e-08, "loss": 0.25, "step": 19884 }, { "epoch": 0.9607672609556941, "grad_norm": 2.080547571182251, "learning_rate": 3.9232739044305935e-08, "loss": 0.221, "step": 19885 }, { "epoch": 0.9608155771367831, "grad_norm": 2.5000784397125244, "learning_rate": 3.9184422863216886e-08, "loss": 0.2032, "step": 19886 }, { "epoch": 0.9608638933178721, "grad_norm": 2.676146984100342, "learning_rate": 3.913610668212784e-08, "loss": 0.2672, "step": 19887 }, { "epoch": 0.9609122094989612, "grad_norm": 2.5663726329803467, "learning_rate": 3.9087790501038793e-08, "loss": 0.2647, "step": 19888 }, { "epoch": 0.9609605256800503, "grad_norm": 3.458080291748047, "learning_rate": 3.903947431994975e-08, "loss": 0.3291, "step": 19889 }, { "epoch": 0.9610088418611393, "grad_norm": 3.2144148349761963, "learning_rate": 3.89911581388607e-08, "loss": 0.3117, "step": 19890 }, { "epoch": 0.9610571580422284, "grad_norm": 2.199467897415161, "learning_rate": 3.894284195777166e-08, "loss": 0.274, "step": 19891 }, { "epoch": 0.9611054742233174, "grad_norm": 2.3651678562164307, "learning_rate": 3.889452577668261e-08, "loss": 0.2131, "step": 19892 }, { "epoch": 0.9611537904044064, "grad_norm": 1.829979419708252, "learning_rate": 3.8846209595593566e-08, "loss": 0.1932, "step": 19893 }, { "epoch": 0.9612021065854954, "grad_norm": 7.660750865936279, "learning_rate": 3.8797893414504516e-08, "loss": 0.3995, "step": 19894 }, { "epoch": 0.9612504227665846, "grad_norm": 2.7291765213012695, "learning_rate": 3.874957723341547e-08, "loss": 0.1846, "step": 19895 }, { "epoch": 0.9612987389476736, "grad_norm": 3.359971046447754, "learning_rate": 3.870126105232642e-08, "loss": 0.3213, "step": 19896 }, { "epoch": 0.9613470551287626, "grad_norm": 2.8770668506622314, "learning_rate": 3.8652944871237374e-08, "loss": 0.2396, "step": 19897 }, { "epoch": 0.9613953713098516, "grad_norm": 3.4068636894226074, "learning_rate": 3.8604628690148325e-08, "loss": 0.3154, "step": 19898 }, { "epoch": 0.9614436874909407, "grad_norm": 2.889659881591797, "learning_rate": 3.855631250905928e-08, "loss": 0.4207, "step": 19899 }, { "epoch": 0.9614920036720298, "grad_norm": 2.8840181827545166, "learning_rate": 3.850799632797023e-08, "loss": 0.2915, "step": 19900 }, { "epoch": 0.9615403198531188, "grad_norm": 4.125518321990967, "learning_rate": 3.845968014688119e-08, "loss": 0.3197, "step": 19901 }, { "epoch": 0.9615886360342079, "grad_norm": 3.130516767501831, "learning_rate": 3.841136396579214e-08, "loss": 0.3538, "step": 19902 }, { "epoch": 0.9616369522152969, "grad_norm": 5.424555778503418, "learning_rate": 3.83630477847031e-08, "loss": 0.2229, "step": 19903 }, { "epoch": 0.9616852683963859, "grad_norm": 2.3749849796295166, "learning_rate": 3.831473160361405e-08, "loss": 0.295, "step": 19904 }, { "epoch": 0.961733584577475, "grad_norm": 3.0385968685150146, "learning_rate": 3.8266415422525005e-08, "loss": 0.247, "step": 19905 }, { "epoch": 0.9617819007585641, "grad_norm": 3.786109447479248, "learning_rate": 3.8218099241435955e-08, "loss": 0.2082, "step": 19906 }, { "epoch": 0.9618302169396531, "grad_norm": 3.5041472911834717, "learning_rate": 3.8169783060346906e-08, "loss": 0.3096, "step": 19907 }, { "epoch": 0.9618785331207421, "grad_norm": 2.8975632190704346, "learning_rate": 3.8121466879257856e-08, "loss": 0.3154, "step": 19908 }, { "epoch": 0.9619268493018311, "grad_norm": 2.630397081375122, "learning_rate": 3.8073150698168814e-08, "loss": 0.3125, "step": 19909 }, { "epoch": 0.9619751654829203, "grad_norm": 2.7502119541168213, "learning_rate": 3.8024834517079764e-08, "loss": 0.3381, "step": 19910 }, { "epoch": 0.9620234816640093, "grad_norm": 2.6488659381866455, "learning_rate": 3.797651833599072e-08, "loss": 0.2574, "step": 19911 }, { "epoch": 0.9620717978450983, "grad_norm": 4.724020481109619, "learning_rate": 3.792820215490167e-08, "loss": 0.2184, "step": 19912 }, { "epoch": 0.9621201140261874, "grad_norm": 2.7859623432159424, "learning_rate": 3.787988597381263e-08, "loss": 0.3388, "step": 19913 }, { "epoch": 0.9621684302072764, "grad_norm": 2.138444662094116, "learning_rate": 3.783156979272358e-08, "loss": 0.1962, "step": 19914 }, { "epoch": 0.9622167463883655, "grad_norm": 2.7758636474609375, "learning_rate": 3.7783253611634537e-08, "loss": 0.4244, "step": 19915 }, { "epoch": 0.9622650625694545, "grad_norm": 2.915105104446411, "learning_rate": 3.773493743054549e-08, "loss": 0.3768, "step": 19916 }, { "epoch": 0.9623133787505436, "grad_norm": 2.10831356048584, "learning_rate": 3.7686621249456444e-08, "loss": 0.2286, "step": 19917 }, { "epoch": 0.9623616949316326, "grad_norm": 8.100057601928711, "learning_rate": 3.7638305068367395e-08, "loss": 0.338, "step": 19918 }, { "epoch": 0.9624100111127216, "grad_norm": 2.5367627143859863, "learning_rate": 3.758998888727835e-08, "loss": 0.2741, "step": 19919 }, { "epoch": 0.9624583272938106, "grad_norm": 1.7027004957199097, "learning_rate": 3.7541672706189296e-08, "loss": 0.1621, "step": 19920 }, { "epoch": 0.9625066434748998, "grad_norm": 2.471543788909912, "learning_rate": 3.749335652510025e-08, "loss": 0.2765, "step": 19921 }, { "epoch": 0.9625549596559888, "grad_norm": 2.5988569259643555, "learning_rate": 3.7445040344011203e-08, "loss": 0.3749, "step": 19922 }, { "epoch": 0.9626032758370778, "grad_norm": 2.704030752182007, "learning_rate": 3.739672416292216e-08, "loss": 0.3462, "step": 19923 }, { "epoch": 0.9626515920181669, "grad_norm": 2.8207550048828125, "learning_rate": 3.734840798183311e-08, "loss": 0.284, "step": 19924 }, { "epoch": 0.9626999081992559, "grad_norm": 8.216349601745605, "learning_rate": 3.730009180074407e-08, "loss": 0.419, "step": 19925 }, { "epoch": 0.962748224380345, "grad_norm": 2.9945614337921143, "learning_rate": 3.725177561965502e-08, "loss": 0.4457, "step": 19926 }, { "epoch": 0.962796540561434, "grad_norm": 2.8611292839050293, "learning_rate": 3.7203459438565976e-08, "loss": 0.3863, "step": 19927 }, { "epoch": 0.9628448567425231, "grad_norm": 4.972321510314941, "learning_rate": 3.7155143257476926e-08, "loss": 0.3018, "step": 19928 }, { "epoch": 0.9628931729236121, "grad_norm": 2.810267448425293, "learning_rate": 3.7106827076387883e-08, "loss": 0.2774, "step": 19929 }, { "epoch": 0.9629414891047011, "grad_norm": 4.616400718688965, "learning_rate": 3.7058510895298834e-08, "loss": 0.3214, "step": 19930 }, { "epoch": 0.9629898052857903, "grad_norm": 3.1813042163848877, "learning_rate": 3.701019471420979e-08, "loss": 0.274, "step": 19931 }, { "epoch": 0.9630381214668793, "grad_norm": 2.6832780838012695, "learning_rate": 3.696187853312074e-08, "loss": 0.2804, "step": 19932 }, { "epoch": 0.9630864376479683, "grad_norm": 2.315499782562256, "learning_rate": 3.691356235203169e-08, "loss": 0.3378, "step": 19933 }, { "epoch": 0.9631347538290573, "grad_norm": 2.356825351715088, "learning_rate": 3.686524617094264e-08, "loss": 0.2917, "step": 19934 }, { "epoch": 0.9631830700101464, "grad_norm": 2.7155282497406006, "learning_rate": 3.68169299898536e-08, "loss": 0.3363, "step": 19935 }, { "epoch": 0.9632313861912355, "grad_norm": 1.906588077545166, "learning_rate": 3.676861380876455e-08, "loss": 0.3096, "step": 19936 }, { "epoch": 0.9632797023723245, "grad_norm": 3.220691680908203, "learning_rate": 3.672029762767551e-08, "loss": 0.1918, "step": 19937 }, { "epoch": 0.9633280185534135, "grad_norm": 2.6168642044067383, "learning_rate": 3.667198144658646e-08, "loss": 0.3113, "step": 19938 }, { "epoch": 0.9633763347345026, "grad_norm": 3.1096553802490234, "learning_rate": 3.6623665265497415e-08, "loss": 0.338, "step": 19939 }, { "epoch": 0.9634246509155916, "grad_norm": 2.701629638671875, "learning_rate": 3.6575349084408365e-08, "loss": 0.2698, "step": 19940 }, { "epoch": 0.9634729670966807, "grad_norm": 2.344909906387329, "learning_rate": 3.652703290331932e-08, "loss": 0.2383, "step": 19941 }, { "epoch": 0.9635212832777698, "grad_norm": 4.027120113372803, "learning_rate": 3.647871672223027e-08, "loss": 0.3135, "step": 19942 }, { "epoch": 0.9635695994588588, "grad_norm": 2.8176841735839844, "learning_rate": 3.643040054114123e-08, "loss": 0.2914, "step": 19943 }, { "epoch": 0.9636179156399478, "grad_norm": 2.612074613571167, "learning_rate": 3.638208436005218e-08, "loss": 0.3666, "step": 19944 }, { "epoch": 0.9636662318210368, "grad_norm": 1.687281847000122, "learning_rate": 3.633376817896313e-08, "loss": 0.2142, "step": 19945 }, { "epoch": 0.9637145480021259, "grad_norm": 2.9975225925445557, "learning_rate": 3.628545199787408e-08, "loss": 0.3368, "step": 19946 }, { "epoch": 0.963762864183215, "grad_norm": 2.1385862827301025, "learning_rate": 3.623713581678504e-08, "loss": 0.1917, "step": 19947 }, { "epoch": 0.963811180364304, "grad_norm": 2.1698763370513916, "learning_rate": 3.618881963569599e-08, "loss": 0.2757, "step": 19948 }, { "epoch": 0.963859496545393, "grad_norm": 3.5882861614227295, "learning_rate": 3.6140503454606947e-08, "loss": 0.2644, "step": 19949 }, { "epoch": 0.9639078127264821, "grad_norm": 2.4027459621429443, "learning_rate": 3.60921872735179e-08, "loss": 0.2503, "step": 19950 }, { "epoch": 0.9639561289075711, "grad_norm": 17.00968360900879, "learning_rate": 3.6043871092428854e-08, "loss": 0.2835, "step": 19951 }, { "epoch": 0.9640044450886602, "grad_norm": 2.0931458473205566, "learning_rate": 3.5995554911339805e-08, "loss": 0.2646, "step": 19952 }, { "epoch": 0.9640527612697493, "grad_norm": 2.4657132625579834, "learning_rate": 3.594723873025076e-08, "loss": 0.2962, "step": 19953 }, { "epoch": 0.9641010774508383, "grad_norm": 2.5412425994873047, "learning_rate": 3.589892254916171e-08, "loss": 0.2789, "step": 19954 }, { "epoch": 0.9641493936319273, "grad_norm": 4.202724456787109, "learning_rate": 3.585060636807267e-08, "loss": 0.429, "step": 19955 }, { "epoch": 0.9641977098130163, "grad_norm": 2.5195796489715576, "learning_rate": 3.580229018698362e-08, "loss": 0.3418, "step": 19956 }, { "epoch": 0.9642460259941055, "grad_norm": 2.696704387664795, "learning_rate": 3.575397400589458e-08, "loss": 0.3093, "step": 19957 }, { "epoch": 0.9642943421751945, "grad_norm": 3.017385244369507, "learning_rate": 3.570565782480552e-08, "loss": 0.3064, "step": 19958 }, { "epoch": 0.9643426583562835, "grad_norm": 2.9905359745025635, "learning_rate": 3.565734164371648e-08, "loss": 0.3926, "step": 19959 }, { "epoch": 0.9643909745373725, "grad_norm": 2.894922971725464, "learning_rate": 3.560902546262743e-08, "loss": 0.3822, "step": 19960 }, { "epoch": 0.9644392907184616, "grad_norm": 4.594244956970215, "learning_rate": 3.5560709281538386e-08, "loss": 0.1935, "step": 19961 }, { "epoch": 0.9644876068995507, "grad_norm": 1.8742645978927612, "learning_rate": 3.5512393100449336e-08, "loss": 0.2218, "step": 19962 }, { "epoch": 0.9645359230806397, "grad_norm": 3.366105318069458, "learning_rate": 3.5464076919360293e-08, "loss": 0.3784, "step": 19963 }, { "epoch": 0.9645842392617288, "grad_norm": 2.45739483833313, "learning_rate": 3.5415760738271244e-08, "loss": 0.2375, "step": 19964 }, { "epoch": 0.9646325554428178, "grad_norm": 1.908246397972107, "learning_rate": 3.53674445571822e-08, "loss": 0.1859, "step": 19965 }, { "epoch": 0.9646808716239068, "grad_norm": 2.192744731903076, "learning_rate": 3.531912837609315e-08, "loss": 0.1945, "step": 19966 }, { "epoch": 0.964729187804996, "grad_norm": 2.2610034942626953, "learning_rate": 3.527081219500411e-08, "loss": 0.2026, "step": 19967 }, { "epoch": 0.964777503986085, "grad_norm": 2.0426981449127197, "learning_rate": 3.522249601391506e-08, "loss": 0.2273, "step": 19968 }, { "epoch": 0.964825820167174, "grad_norm": 2.0838727951049805, "learning_rate": 3.5174179832826016e-08, "loss": 0.268, "step": 19969 }, { "epoch": 0.964874136348263, "grad_norm": 2.499845266342163, "learning_rate": 3.512586365173697e-08, "loss": 0.2937, "step": 19970 }, { "epoch": 0.964922452529352, "grad_norm": 5.706758499145508, "learning_rate": 3.507754747064792e-08, "loss": 0.2443, "step": 19971 }, { "epoch": 0.9649707687104411, "grad_norm": 2.3883867263793945, "learning_rate": 3.502923128955887e-08, "loss": 0.2411, "step": 19972 }, { "epoch": 0.9650190848915302, "grad_norm": 1.7494581937789917, "learning_rate": 3.4980915108469825e-08, "loss": 0.1904, "step": 19973 }, { "epoch": 0.9650674010726192, "grad_norm": 3.199687957763672, "learning_rate": 3.4932598927380776e-08, "loss": 0.4171, "step": 19974 }, { "epoch": 0.9651157172537083, "grad_norm": 2.207521677017212, "learning_rate": 3.488428274629173e-08, "loss": 0.3027, "step": 19975 }, { "epoch": 0.9651640334347973, "grad_norm": 2.660210371017456, "learning_rate": 3.483596656520268e-08, "loss": 0.2928, "step": 19976 }, { "epoch": 0.9652123496158863, "grad_norm": 3.96404767036438, "learning_rate": 3.478765038411364e-08, "loss": 0.4649, "step": 19977 }, { "epoch": 0.9652606657969754, "grad_norm": 3.1254994869232178, "learning_rate": 3.473933420302459e-08, "loss": 0.4109, "step": 19978 }, { "epoch": 0.9653089819780645, "grad_norm": 2.8463780879974365, "learning_rate": 3.469101802193555e-08, "loss": 0.3112, "step": 19979 }, { "epoch": 0.9653572981591535, "grad_norm": 4.3524346351623535, "learning_rate": 3.46427018408465e-08, "loss": 0.3785, "step": 19980 }, { "epoch": 0.9654056143402425, "grad_norm": 2.343858242034912, "learning_rate": 3.4594385659757456e-08, "loss": 0.2779, "step": 19981 }, { "epoch": 0.9654539305213315, "grad_norm": 14.062790870666504, "learning_rate": 3.4546069478668406e-08, "loss": 0.1592, "step": 19982 }, { "epoch": 0.9655022467024207, "grad_norm": 2.1846351623535156, "learning_rate": 3.4497753297579357e-08, "loss": 0.2095, "step": 19983 }, { "epoch": 0.9655505628835097, "grad_norm": 4.091536045074463, "learning_rate": 3.444943711649031e-08, "loss": 0.3116, "step": 19984 }, { "epoch": 0.9655988790645987, "grad_norm": 1.835278868675232, "learning_rate": 3.4401120935401264e-08, "loss": 0.1956, "step": 19985 }, { "epoch": 0.9656471952456878, "grad_norm": 7.037148475646973, "learning_rate": 3.4352804754312215e-08, "loss": 0.19, "step": 19986 }, { "epoch": 0.9656955114267768, "grad_norm": 2.6245901584625244, "learning_rate": 3.430448857322317e-08, "loss": 0.2979, "step": 19987 }, { "epoch": 0.9657438276078659, "grad_norm": 3.794339418411255, "learning_rate": 3.425617239213412e-08, "loss": 0.3602, "step": 19988 }, { "epoch": 0.965792143788955, "grad_norm": 2.1068503856658936, "learning_rate": 3.420785621104508e-08, "loss": 0.2001, "step": 19989 }, { "epoch": 0.965840459970044, "grad_norm": 1.8989976644515991, "learning_rate": 3.415954002995603e-08, "loss": 0.1461, "step": 19990 }, { "epoch": 0.965888776151133, "grad_norm": 2.071525812149048, "learning_rate": 3.411122384886699e-08, "loss": 0.2829, "step": 19991 }, { "epoch": 0.965937092332222, "grad_norm": 2.6575894355773926, "learning_rate": 3.406290766777794e-08, "loss": 0.2449, "step": 19992 }, { "epoch": 0.9659854085133112, "grad_norm": 2.37373685836792, "learning_rate": 3.4014591486688895e-08, "loss": 0.1952, "step": 19993 }, { "epoch": 0.9660337246944002, "grad_norm": 2.665161371231079, "learning_rate": 3.3966275305599845e-08, "loss": 0.2798, "step": 19994 }, { "epoch": 0.9660820408754892, "grad_norm": 16.156070709228516, "learning_rate": 3.39179591245108e-08, "loss": 0.2467, "step": 19995 }, { "epoch": 0.9661303570565782, "grad_norm": 3.4117703437805176, "learning_rate": 3.3869642943421746e-08, "loss": 0.4577, "step": 19996 }, { "epoch": 0.9661786732376673, "grad_norm": 2.36441707611084, "learning_rate": 3.3821326762332703e-08, "loss": 0.278, "step": 19997 }, { "epoch": 0.9662269894187563, "grad_norm": 2.2606823444366455, "learning_rate": 3.3773010581243654e-08, "loss": 0.2984, "step": 19998 }, { "epoch": 0.9662753055998454, "grad_norm": 5.546999454498291, "learning_rate": 3.372469440015461e-08, "loss": 0.2874, "step": 19999 }, { "epoch": 0.9663236217809344, "grad_norm": 2.4261837005615234, "learning_rate": 3.367637821906556e-08, "loss": 0.2564, "step": 20000 } ], "logging_steps": 1.0, "max_steps": 20697, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 1000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 7.530982694646186e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }