{ "best_metric": null, "best_model_checkpoint": null, "epoch": 1.0, "eval_steps": 500, "global_step": 6750, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00014814814814814815, "grad_norm": 2.1235709190368652, "learning_rate": 4e-05, "loss": 2.8688, "step": 1 }, { "epoch": 0.0002962962962962963, "grad_norm": 3.357743978500366, "learning_rate": 8e-05, "loss": 2.6483, "step": 2 }, { "epoch": 0.00044444444444444447, "grad_norm": 2.011920213699341, "learning_rate": 0.00012, "loss": 2.696, "step": 3 }, { "epoch": 0.0005925925925925926, "grad_norm": 3.150110960006714, "learning_rate": 0.00016, "loss": 2.9999, "step": 4 }, { "epoch": 0.0007407407407407407, "grad_norm": 2.520505905151367, "learning_rate": 0.0002, "loss": 2.0684, "step": 5 }, { "epoch": 0.0008888888888888889, "grad_norm": 1.2569918632507324, "learning_rate": 0.00019997034840622683, "loss": 2.1813, "step": 6 }, { "epoch": 0.001037037037037037, "grad_norm": 1.747957706451416, "learning_rate": 0.0001999406968124537, "loss": 2.0401, "step": 7 }, { "epoch": 0.0011851851851851852, "grad_norm": 1.1225775480270386, "learning_rate": 0.00019991104521868052, "loss": 1.7564, "step": 8 }, { "epoch": 0.0013333333333333333, "grad_norm": 1.16183340549469, "learning_rate": 0.00019988139362490733, "loss": 1.7111, "step": 9 }, { "epoch": 0.0014814814814814814, "grad_norm": 1.3514748811721802, "learning_rate": 0.0001998517420311342, "loss": 1.9612, "step": 10 }, { "epoch": 0.0016296296296296295, "grad_norm": 1.3388046026229858, "learning_rate": 0.00019982209043736102, "loss": 1.6278, "step": 11 }, { "epoch": 0.0017777777777777779, "grad_norm": 1.4107303619384766, "learning_rate": 0.00019979243884358784, "loss": 1.8498, "step": 12 }, { "epoch": 0.001925925925925926, "grad_norm": 1.2822532653808594, "learning_rate": 0.0001997627872498147, "loss": 1.4322, "step": 13 }, { "epoch": 0.002074074074074074, "grad_norm": 1.0885272026062012, "learning_rate": 
0.00019973313565604153, "loss": 1.691, "step": 14 }, { "epoch": 0.0022222222222222222, "grad_norm": 0.9695473313331604, "learning_rate": 0.00019970348406226835, "loss": 1.6351, "step": 15 }, { "epoch": 0.0023703703703703703, "grad_norm": 1.0999016761779785, "learning_rate": 0.0001996738324684952, "loss": 1.4707, "step": 16 }, { "epoch": 0.0025185185185185185, "grad_norm": 1.2312846183776855, "learning_rate": 0.00019964418087472203, "loss": 1.5608, "step": 17 }, { "epoch": 0.0026666666666666666, "grad_norm": 1.980854868888855, "learning_rate": 0.00019961452928094885, "loss": 1.5096, "step": 18 }, { "epoch": 0.0028148148148148147, "grad_norm": 1.4264863729476929, "learning_rate": 0.0001995848776871757, "loss": 1.4313, "step": 19 }, { "epoch": 0.002962962962962963, "grad_norm": 1.5596296787261963, "learning_rate": 0.00019955522609340254, "loss": 1.2828, "step": 20 }, { "epoch": 0.003111111111111111, "grad_norm": 1.3198904991149902, "learning_rate": 0.00019952557449962936, "loss": 1.5241, "step": 21 }, { "epoch": 0.003259259259259259, "grad_norm": 1.263428807258606, "learning_rate": 0.0001994959229058562, "loss": 1.499, "step": 22 }, { "epoch": 0.0034074074074074076, "grad_norm": 1.1441594362258911, "learning_rate": 0.00019946627131208305, "loss": 1.2826, "step": 23 }, { "epoch": 0.0035555555555555557, "grad_norm": 1.7674256563186646, "learning_rate": 0.00019943661971830986, "loss": 1.3383, "step": 24 }, { "epoch": 0.003703703703703704, "grad_norm": 1.1815110445022583, "learning_rate": 0.0001994069681245367, "loss": 1.269, "step": 25 }, { "epoch": 0.003851851851851852, "grad_norm": 1.5254690647125244, "learning_rate": 0.00019937731653076355, "loss": 1.161, "step": 26 }, { "epoch": 0.004, "grad_norm": 1.4483976364135742, "learning_rate": 0.00019934766493699037, "loss": 1.2054, "step": 27 }, { "epoch": 0.004148148148148148, "grad_norm": 1.6680272817611694, "learning_rate": 0.00019931801334321722, "loss": 1.3768, "step": 28 }, { "epoch": 0.004296296296296296, "grad_norm": 
1.6189104318618774, "learning_rate": 0.00019928836174944406, "loss": 1.5359, "step": 29 }, { "epoch": 0.0044444444444444444, "grad_norm": 1.8396241664886475, "learning_rate": 0.00019925871015567088, "loss": 1.2609, "step": 30 }, { "epoch": 0.0045925925925925926, "grad_norm": 1.2982949018478394, "learning_rate": 0.0001992290585618977, "loss": 1.3573, "step": 31 }, { "epoch": 0.004740740740740741, "grad_norm": 1.2802674770355225, "learning_rate": 0.00019919940696812454, "loss": 1.3206, "step": 32 }, { "epoch": 0.004888888888888889, "grad_norm": 1.320926547050476, "learning_rate": 0.00019916975537435138, "loss": 1.313, "step": 33 }, { "epoch": 0.005037037037037037, "grad_norm": 0.9515355825424194, "learning_rate": 0.0001991401037805782, "loss": 1.377, "step": 34 }, { "epoch": 0.005185185185185185, "grad_norm": 1.0064878463745117, "learning_rate": 0.00019911045218680505, "loss": 1.2234, "step": 35 }, { "epoch": 0.005333333333333333, "grad_norm": 0.8821234703063965, "learning_rate": 0.0001990808005930319, "loss": 1.2351, "step": 36 }, { "epoch": 0.005481481481481481, "grad_norm": 1.1091601848602295, "learning_rate": 0.0001990511489992587, "loss": 1.3856, "step": 37 }, { "epoch": 0.005629629629629629, "grad_norm": 1.2138992547988892, "learning_rate": 0.00019902149740548555, "loss": 1.489, "step": 38 }, { "epoch": 0.0057777777777777775, "grad_norm": 1.040979266166687, "learning_rate": 0.0001989918458117124, "loss": 1.2152, "step": 39 }, { "epoch": 0.005925925925925926, "grad_norm": 1.0536178350448608, "learning_rate": 0.0001989621942179392, "loss": 1.2148, "step": 40 }, { "epoch": 0.006074074074074074, "grad_norm": 1.0311094522476196, "learning_rate": 0.00019893254262416606, "loss": 1.3002, "step": 41 }, { "epoch": 0.006222222222222222, "grad_norm": 0.8636873960494995, "learning_rate": 0.0001989028910303929, "loss": 1.2371, "step": 42 }, { "epoch": 0.00637037037037037, "grad_norm": 0.8415317535400391, "learning_rate": 0.00019887323943661972, "loss": 1.4714, "step": 43 }, 
{ "epoch": 0.006518518518518518, "grad_norm": 1.3478702306747437, "learning_rate": 0.00019884358784284656, "loss": 1.5319, "step": 44 }, { "epoch": 0.006666666666666667, "grad_norm": 0.9216668605804443, "learning_rate": 0.0001988139362490734, "loss": 1.3122, "step": 45 }, { "epoch": 0.006814814814814815, "grad_norm": 0.9138181209564209, "learning_rate": 0.00019878428465530023, "loss": 1.0854, "step": 46 }, { "epoch": 0.006962962962962963, "grad_norm": 0.890804648399353, "learning_rate": 0.00019875463306152707, "loss": 1.3739, "step": 47 }, { "epoch": 0.0071111111111111115, "grad_norm": 1.20708167552948, "learning_rate": 0.00019872498146775391, "loss": 1.3702, "step": 48 }, { "epoch": 0.00725925925925926, "grad_norm": 1.4061371088027954, "learning_rate": 0.00019869532987398073, "loss": 1.3931, "step": 49 }, { "epoch": 0.007407407407407408, "grad_norm": 0.9063286781311035, "learning_rate": 0.00019866567828020758, "loss": 1.2466, "step": 50 }, { "epoch": 0.007555555555555556, "grad_norm": 1.1027034521102905, "learning_rate": 0.0001986360266864344, "loss": 1.3566, "step": 51 }, { "epoch": 0.007703703703703704, "grad_norm": 0.9901520609855652, "learning_rate": 0.00019860637509266124, "loss": 1.3878, "step": 52 }, { "epoch": 0.007851851851851851, "grad_norm": 0.9693011045455933, "learning_rate": 0.00019857672349888808, "loss": 1.1488, "step": 53 }, { "epoch": 0.008, "grad_norm": 0.792296826839447, "learning_rate": 0.0001985470719051149, "loss": 1.4231, "step": 54 }, { "epoch": 0.008148148148148147, "grad_norm": 0.9005415439605713, "learning_rate": 0.00019851742031134174, "loss": 1.2315, "step": 55 }, { "epoch": 0.008296296296296296, "grad_norm": 0.7594810128211975, "learning_rate": 0.0001984877687175686, "loss": 1.3575, "step": 56 }, { "epoch": 0.008444444444444444, "grad_norm": 1.562156081199646, "learning_rate": 0.0001984581171237954, "loss": 1.3894, "step": 57 }, { "epoch": 0.008592592592592593, "grad_norm": 0.9959694743156433, "learning_rate": 0.00019842846553002225, 
"loss": 1.3206, "step": 58 }, { "epoch": 0.00874074074074074, "grad_norm": 1.3404779434204102, "learning_rate": 0.0001983988139362491, "loss": 1.2058, "step": 59 }, { "epoch": 0.008888888888888889, "grad_norm": 1.0158309936523438, "learning_rate": 0.0001983691623424759, "loss": 1.1776, "step": 60 }, { "epoch": 0.009037037037037038, "grad_norm": 1.3582422733306885, "learning_rate": 0.00019833951074870276, "loss": 1.4476, "step": 61 }, { "epoch": 0.009185185185185185, "grad_norm": 1.078850269317627, "learning_rate": 0.0001983098591549296, "loss": 1.3258, "step": 62 }, { "epoch": 0.009333333333333334, "grad_norm": 0.9518314599990845, "learning_rate": 0.00019828020756115642, "loss": 1.2733, "step": 63 }, { "epoch": 0.009481481481481481, "grad_norm": NaN, "learning_rate": 0.00019828020756115642, "loss": 1.2108, "step": 64 }, { "epoch": 0.00962962962962963, "grad_norm": 0.8653088212013245, "learning_rate": 0.00019825055596738326, "loss": 1.1989, "step": 65 }, { "epoch": 0.009777777777777778, "grad_norm": 1.1971681118011475, "learning_rate": 0.0001982209043736101, "loss": 1.316, "step": 66 }, { "epoch": 0.009925925925925927, "grad_norm": 0.8550506830215454, "learning_rate": 0.00019819125277983693, "loss": 1.5866, "step": 67 }, { "epoch": 0.010074074074074074, "grad_norm": 0.7203028798103333, "learning_rate": 0.00019816160118606374, "loss": 1.1993, "step": 68 }, { "epoch": 0.010222222222222223, "grad_norm": 1.342175841331482, "learning_rate": 0.0001981319495922906, "loss": 1.4759, "step": 69 }, { "epoch": 0.01037037037037037, "grad_norm": 0.8517811894416809, "learning_rate": 0.00019810229799851743, "loss": 1.3233, "step": 70 }, { "epoch": 0.010518518518518519, "grad_norm": 0.991734504699707, "learning_rate": 0.00019807264640474425, "loss": 1.3343, "step": 71 }, { "epoch": 0.010666666666666666, "grad_norm": 0.9519258737564087, "learning_rate": 0.0001980429948109711, "loss": 1.2763, "step": 72 }, { "epoch": 0.010814814814814815, "grad_norm": 2.1876156330108643, 
"learning_rate": 0.00019801334321719794, "loss": 1.4588, "step": 73 }, { "epoch": 0.010962962962962963, "grad_norm": 1.0274714231491089, "learning_rate": 0.00019798369162342476, "loss": 1.1687, "step": 74 }, { "epoch": 0.011111111111111112, "grad_norm": 0.8644994497299194, "learning_rate": 0.0001979540400296516, "loss": 1.4098, "step": 75 }, { "epoch": 0.011259259259259259, "grad_norm": 0.8542577624320984, "learning_rate": 0.00019792438843587844, "loss": 1.5521, "step": 76 }, { "epoch": 0.011407407407407408, "grad_norm": 0.9984826445579529, "learning_rate": 0.00019789473684210526, "loss": 1.1638, "step": 77 }, { "epoch": 0.011555555555555555, "grad_norm": 0.7212148904800415, "learning_rate": 0.0001978650852483321, "loss": 1.1242, "step": 78 }, { "epoch": 0.011703703703703704, "grad_norm": 1.859447717666626, "learning_rate": 0.00019783543365455895, "loss": 1.3361, "step": 79 }, { "epoch": 0.011851851851851851, "grad_norm": 0.8758236765861511, "learning_rate": 0.00019780578206078577, "loss": 1.0517, "step": 80 }, { "epoch": 0.012, "grad_norm": 1.2347968816757202, "learning_rate": 0.0001977761304670126, "loss": 1.0352, "step": 81 }, { "epoch": 0.012148148148148148, "grad_norm": 0.7254672050476074, "learning_rate": 0.00019774647887323946, "loss": 1.1428, "step": 82 }, { "epoch": 0.012296296296296296, "grad_norm": 1.1427414417266846, "learning_rate": 0.00019771682727946627, "loss": 1.3168, "step": 83 }, { "epoch": 0.012444444444444444, "grad_norm": 0.8925330638885498, "learning_rate": 0.0001976871756856931, "loss": 1.2553, "step": 84 }, { "epoch": 0.012592592592592593, "grad_norm": 0.6908546090126038, "learning_rate": 0.00019765752409191996, "loss": 1.2659, "step": 85 }, { "epoch": 0.01274074074074074, "grad_norm": 0.8214904069900513, "learning_rate": 0.00019762787249814678, "loss": 1.2111, "step": 86 }, { "epoch": 0.012888888888888889, "grad_norm": 0.9786846041679382, "learning_rate": 0.0001975982209043736, "loss": 1.3289, "step": 87 }, { "epoch": 0.013037037037037036, 
"grad_norm": 0.8723524212837219, "learning_rate": 0.00019756856931060047, "loss": 1.5589, "step": 88 }, { "epoch": 0.013185185185185185, "grad_norm": 1.081800937652588, "learning_rate": 0.0001975389177168273, "loss": 1.2323, "step": 89 }, { "epoch": 0.013333333333333334, "grad_norm": 0.8913035988807678, "learning_rate": 0.0001975092661230541, "loss": 1.2034, "step": 90 }, { "epoch": 0.013481481481481481, "grad_norm": 0.7650095820426941, "learning_rate": 0.00019747961452928098, "loss": 1.2324, "step": 91 }, { "epoch": 0.01362962962962963, "grad_norm": 2.0756051540374756, "learning_rate": 0.0001974499629355078, "loss": 1.0702, "step": 92 }, { "epoch": 0.013777777777777778, "grad_norm": 1.5064998865127563, "learning_rate": 0.0001974203113417346, "loss": 1.2101, "step": 93 }, { "epoch": 0.013925925925925927, "grad_norm": 0.9713321924209595, "learning_rate": 0.00019739065974796148, "loss": 1.4464, "step": 94 }, { "epoch": 0.014074074074074074, "grad_norm": 1.0733256340026855, "learning_rate": 0.0001973610081541883, "loss": 1.3696, "step": 95 }, { "epoch": 0.014222222222222223, "grad_norm": 1.0742157697677612, "learning_rate": 0.00019733135656041512, "loss": 1.3056, "step": 96 }, { "epoch": 0.01437037037037037, "grad_norm": 1.2496274709701538, "learning_rate": 0.000197301704966642, "loss": 1.179, "step": 97 }, { "epoch": 0.01451851851851852, "grad_norm": 1.0434441566467285, "learning_rate": 0.0001972720533728688, "loss": 1.0619, "step": 98 }, { "epoch": 0.014666666666666666, "grad_norm": 0.7900615334510803, "learning_rate": 0.00019724240177909562, "loss": 1.0743, "step": 99 }, { "epoch": 0.014814814814814815, "grad_norm": 0.84372878074646, "learning_rate": 0.0001972127501853225, "loss": 1.1668, "step": 100 }, { "epoch": 0.014962962962962963, "grad_norm": 1.3834422826766968, "learning_rate": 0.0001971830985915493, "loss": 1.2704, "step": 101 }, { "epoch": 0.015111111111111112, "grad_norm": 1.077656865119934, "learning_rate": 0.00019715344699777613, "loss": 1.2489, "step": 
102 }, { "epoch": 0.015259259259259259, "grad_norm": 1.0293247699737549, "learning_rate": 0.00019712379540400297, "loss": 1.1533, "step": 103 }, { "epoch": 0.015407407407407408, "grad_norm": 0.8808808922767639, "learning_rate": 0.00019709414381022982, "loss": 1.1044, "step": 104 }, { "epoch": 0.015555555555555555, "grad_norm": 2.607654571533203, "learning_rate": 0.00019706449221645664, "loss": 1.2633, "step": 105 }, { "epoch": 0.015703703703703702, "grad_norm": 1.1236181259155273, "learning_rate": 0.00019703484062268348, "loss": 1.3062, "step": 106 }, { "epoch": 0.015851851851851853, "grad_norm": 0.9974605441093445, "learning_rate": 0.00019700518902891032, "loss": 1.42, "step": 107 }, { "epoch": 0.016, "grad_norm": 0.9278566241264343, "learning_rate": 0.00019697553743513714, "loss": 1.3455, "step": 108 }, { "epoch": 0.016148148148148148, "grad_norm": 1.077166199684143, "learning_rate": 0.00019694588584136399, "loss": 1.2423, "step": 109 }, { "epoch": 0.016296296296296295, "grad_norm": 1.052445411682129, "learning_rate": 0.00019691623424759083, "loss": 1.1738, "step": 110 }, { "epoch": 0.016444444444444446, "grad_norm": 0.9899265170097351, "learning_rate": 0.00019688658265381765, "loss": 1.4544, "step": 111 }, { "epoch": 0.016592592592592593, "grad_norm": 1.0286669731140137, "learning_rate": 0.0001968569310600445, "loss": 1.3348, "step": 112 }, { "epoch": 0.01674074074074074, "grad_norm": 2.1646833419799805, "learning_rate": 0.00019682727946627134, "loss": 1.1908, "step": 113 }, { "epoch": 0.016888888888888887, "grad_norm": 1.1112664937973022, "learning_rate": 0.00019679762787249815, "loss": 1.0064, "step": 114 }, { "epoch": 0.017037037037037038, "grad_norm": 0.9720861315727234, "learning_rate": 0.000196767976278725, "loss": 1.2794, "step": 115 }, { "epoch": 0.017185185185185185, "grad_norm": 1.5623220205307007, "learning_rate": 0.00019673832468495184, "loss": 1.2101, "step": 116 }, { "epoch": 0.017333333333333333, "grad_norm": 1.046308994293213, "learning_rate": 
0.00019670867309117866, "loss": 1.1242, "step": 117 }, { "epoch": 0.01748148148148148, "grad_norm": 1.716864824295044, "learning_rate": 0.00019667902149740548, "loss": 1.1136, "step": 118 }, { "epoch": 0.01762962962962963, "grad_norm": 0.9031343460083008, "learning_rate": 0.00019664936990363232, "loss": 1.0777, "step": 119 }, { "epoch": 0.017777777777777778, "grad_norm": 0.8096110820770264, "learning_rate": 0.00019661971830985917, "loss": 1.2524, "step": 120 }, { "epoch": 0.017925925925925925, "grad_norm": 0.7985750436782837, "learning_rate": 0.00019659006671608598, "loss": 1.3701, "step": 121 }, { "epoch": 0.018074074074074076, "grad_norm": 0.6972211599349976, "learning_rate": 0.00019656041512231283, "loss": 1.2531, "step": 122 }, { "epoch": 0.018222222222222223, "grad_norm": 0.8134813904762268, "learning_rate": 0.00019653076352853967, "loss": 1.3938, "step": 123 }, { "epoch": 0.01837037037037037, "grad_norm": 1.100056767463684, "learning_rate": 0.0001965011119347665, "loss": 1.3501, "step": 124 }, { "epoch": 0.018518518518518517, "grad_norm": 0.6855674982070923, "learning_rate": 0.00019647146034099333, "loss": 1.3988, "step": 125 }, { "epoch": 0.018666666666666668, "grad_norm": 1.3612394332885742, "learning_rate": 0.00019644180874722018, "loss": 1.289, "step": 126 }, { "epoch": 0.018814814814814815, "grad_norm": null, "learning_rate": 0.00019644180874722018, "loss": 1.2401, "step": 127 }, { "epoch": 0.018962962962962963, "grad_norm": 0.6646182537078857, "learning_rate": 0.000196412157153447, "loss": 1.0761, "step": 128 }, { "epoch": 0.01911111111111111, "grad_norm": 1.1864745616912842, "learning_rate": 0.00019638250555967384, "loss": 1.3836, "step": 129 }, { "epoch": 0.01925925925925926, "grad_norm": 1.0745700597763062, "learning_rate": 0.00019635285396590069, "loss": 1.1912, "step": 130 }, { "epoch": 0.019407407407407408, "grad_norm": 1.649613618850708, "learning_rate": 0.0001963232023721275, "loss": 1.3176, "step": 131 }, { "epoch": 0.019555555555555555, 
"grad_norm": 1.0064952373504639, "learning_rate": 0.00019629355077835435, "loss": 1.3275, "step": 132 }, { "epoch": 0.019703703703703702, "grad_norm": 0.9298867583274841, "learning_rate": 0.0001962638991845812, "loss": 1.2191, "step": 133 }, { "epoch": 0.019851851851851853, "grad_norm": 1.6126947402954102, "learning_rate": 0.000196234247590808, "loss": 1.3715, "step": 134 }, { "epoch": 0.02, "grad_norm": 1.1250628232955933, "learning_rate": 0.00019620459599703485, "loss": 1.2638, "step": 135 }, { "epoch": 0.020148148148148148, "grad_norm": 1.1649386882781982, "learning_rate": 0.0001961749444032617, "loss": 1.2886, "step": 136 }, { "epoch": 0.020296296296296295, "grad_norm": 1.7831798791885376, "learning_rate": 0.00019614529280948852, "loss": 1.0661, "step": 137 }, { "epoch": 0.020444444444444446, "grad_norm": 1.2029664516448975, "learning_rate": 0.00019611564121571536, "loss": 1.455, "step": 138 }, { "epoch": 0.020592592592592593, "grad_norm": 0.9483388662338257, "learning_rate": 0.00019608598962194218, "loss": 1.2736, "step": 139 }, { "epoch": 0.02074074074074074, "grad_norm": 0.9334256649017334, "learning_rate": 0.00019605633802816902, "loss": 1.0822, "step": 140 }, { "epoch": 0.020888888888888887, "grad_norm": 0.8102211952209473, "learning_rate": 0.00019602668643439587, "loss": 1.3988, "step": 141 }, { "epoch": 0.021037037037037038, "grad_norm": 0.9568620324134827, "learning_rate": 0.00019599703484062268, "loss": 1.3997, "step": 142 }, { "epoch": 0.021185185185185185, "grad_norm": 1.1323370933532715, "learning_rate": 0.00019596738324684953, "loss": 1.3574, "step": 143 }, { "epoch": 0.021333333333333333, "grad_norm": 0.9975650310516357, "learning_rate": 0.00019593773165307637, "loss": 1.1898, "step": 144 }, { "epoch": 0.02148148148148148, "grad_norm": 0.9566569924354553, "learning_rate": 0.0001959080800593032, "loss": 1.2258, "step": 145 }, { "epoch": 0.02162962962962963, "grad_norm": 0.8966617584228516, "learning_rate": 0.00019587842846553003, "loss": 1.2678, 
"step": 146 }, { "epoch": 0.021777777777777778, "grad_norm": 0.8554395437240601, "learning_rate": 0.00019584877687175688, "loss": 1.1973, "step": 147 }, { "epoch": 0.021925925925925925, "grad_norm": 0.8343006372451782, "learning_rate": 0.0001958191252779837, "loss": 1.4802, "step": 148 }, { "epoch": 0.022074074074074072, "grad_norm": 1.0070141553878784, "learning_rate": 0.00019578947368421054, "loss": 1.1165, "step": 149 }, { "epoch": 0.022222222222222223, "grad_norm": 0.8232426643371582, "learning_rate": 0.00019575982209043739, "loss": 1.268, "step": 150 }, { "epoch": 0.02237037037037037, "grad_norm": 1.051567554473877, "learning_rate": 0.0001957301704966642, "loss": 1.2323, "step": 151 }, { "epoch": 0.022518518518518518, "grad_norm": 2.46091628074646, "learning_rate": 0.00019570051890289105, "loss": 1.2043, "step": 152 }, { "epoch": 0.02266666666666667, "grad_norm": 1.0516281127929688, "learning_rate": 0.0001956708673091179, "loss": 1.1408, "step": 153 }, { "epoch": 0.022814814814814816, "grad_norm": 0.8365291953086853, "learning_rate": 0.0001956412157153447, "loss": 1.2778, "step": 154 }, { "epoch": 0.022962962962962963, "grad_norm": 0.6768907904624939, "learning_rate": 0.00019561156412157153, "loss": 0.8589, "step": 155 }, { "epoch": 0.02311111111111111, "grad_norm": 2.7300822734832764, "learning_rate": 0.00019558191252779837, "loss": 1.5669, "step": 156 }, { "epoch": 0.02325925925925926, "grad_norm": 1.3450238704681396, "learning_rate": 0.00019555226093402521, "loss": 1.3563, "step": 157 }, { "epoch": 0.023407407407407408, "grad_norm": 0.7438297867774963, "learning_rate": 0.00019552260934025203, "loss": 1.1264, "step": 158 }, { "epoch": 0.023555555555555555, "grad_norm": 0.9578141570091248, "learning_rate": 0.00019549295774647888, "loss": 1.3149, "step": 159 }, { "epoch": 0.023703703703703703, "grad_norm": 1.0204038619995117, "learning_rate": 0.00019546330615270572, "loss": 1.394, "step": 160 }, { "epoch": 0.023851851851851853, "grad_norm": 1.0684940814971924, 
"learning_rate": 0.00019543365455893254, "loss": 1.2583, "step": 161 }, { "epoch": 0.024, "grad_norm": 0.7158825993537903, "learning_rate": 0.00019540400296515938, "loss": 1.3733, "step": 162 }, { "epoch": 0.024148148148148148, "grad_norm": 0.8597478866577148, "learning_rate": 0.00019537435137138623, "loss": 1.5522, "step": 163 }, { "epoch": 0.024296296296296295, "grad_norm": 0.7853899598121643, "learning_rate": 0.00019534469977761304, "loss": 1.3182, "step": 164 }, { "epoch": 0.024444444444444446, "grad_norm": 0.938490092754364, "learning_rate": 0.0001953150481838399, "loss": 1.0182, "step": 165 }, { "epoch": 0.024592592592592593, "grad_norm": 1.2965304851531982, "learning_rate": 0.00019528539659006673, "loss": 1.04, "step": 166 }, { "epoch": 0.02474074074074074, "grad_norm": 1.3206931352615356, "learning_rate": 0.00019525574499629355, "loss": 1.1929, "step": 167 }, { "epoch": 0.024888888888888887, "grad_norm": 0.8876661658287048, "learning_rate": 0.0001952260934025204, "loss": 1.1008, "step": 168 }, { "epoch": 0.025037037037037038, "grad_norm": 1.236785888671875, "learning_rate": 0.00019519644180874724, "loss": 1.4694, "step": 169 }, { "epoch": 0.025185185185185185, "grad_norm": 0.8049394488334656, "learning_rate": 0.00019516679021497406, "loss": 1.1324, "step": 170 }, { "epoch": 0.025333333333333333, "grad_norm": 1.1204288005828857, "learning_rate": 0.00019513713862120087, "loss": 1.2757, "step": 171 }, { "epoch": 0.02548148148148148, "grad_norm": 1.8202202320098877, "learning_rate": 0.00019510748702742775, "loss": 1.1399, "step": 172 }, { "epoch": 0.02562962962962963, "grad_norm": 0.9476074576377869, "learning_rate": 0.00019507783543365456, "loss": 1.2111, "step": 173 }, { "epoch": 0.025777777777777778, "grad_norm": 0.9113478660583496, "learning_rate": 0.00019504818383988138, "loss": 1.0864, "step": 174 }, { "epoch": 0.025925925925925925, "grad_norm": 0.8001992106437683, "learning_rate": 0.00019501853224610825, "loss": 1.0861, "step": 175 }, { "epoch": 
0.026074074074074072, "grad_norm": 0.9687372446060181, "learning_rate": 0.00019498888065233507, "loss": 1.0022, "step": 176 }, { "epoch": 0.026222222222222223, "grad_norm": 1.6159669160842896, "learning_rate": 0.0001949592290585619, "loss": 1.4007, "step": 177 }, { "epoch": 0.02637037037037037, "grad_norm": 0.8484355211257935, "learning_rate": 0.00019492957746478876, "loss": 1.3777, "step": 178 }, { "epoch": 0.026518518518518518, "grad_norm": 1.052414894104004, "learning_rate": 0.00019489992587101558, "loss": 1.1446, "step": 179 }, { "epoch": 0.02666666666666667, "grad_norm": 0.8121458292007446, "learning_rate": 0.0001948702742772424, "loss": 1.3327, "step": 180 }, { "epoch": 0.026814814814814816, "grad_norm": 0.797144889831543, "learning_rate": 0.00019484062268346927, "loss": 1.2831, "step": 181 }, { "epoch": 0.026962962962962963, "grad_norm": 1.1285954713821411, "learning_rate": 0.00019481097108969608, "loss": 1.1895, "step": 182 }, { "epoch": 0.02711111111111111, "grad_norm": 0.9754137396812439, "learning_rate": 0.0001947813194959229, "loss": 1.1129, "step": 183 }, { "epoch": 0.02725925925925926, "grad_norm": 1.1544840335845947, "learning_rate": 0.00019475166790214977, "loss": 1.2382, "step": 184 }, { "epoch": 0.027407407407407408, "grad_norm": 0.8209052085876465, "learning_rate": 0.0001947220163083766, "loss": 1.3218, "step": 185 }, { "epoch": 0.027555555555555555, "grad_norm": 0.9780836701393127, "learning_rate": 0.0001946923647146034, "loss": 1.2419, "step": 186 }, { "epoch": 0.027703703703703703, "grad_norm": 0.9858911633491516, "learning_rate": 0.00019466271312083028, "loss": 1.3105, "step": 187 }, { "epoch": 0.027851851851851853, "grad_norm": 1.0222773551940918, "learning_rate": 0.0001946330615270571, "loss": 1.0789, "step": 188 }, { "epoch": 0.028, "grad_norm": 0.8119564652442932, "learning_rate": 0.0001946034099332839, "loss": 1.1691, "step": 189 }, { "epoch": 0.028148148148148148, "grad_norm": 1.0559577941894531, "learning_rate": 0.00019457375833951076, 
"loss": 1.2803, "step": 190 }, { "epoch": 0.028296296296296295, "grad_norm": 0.8176133036613464, "learning_rate": 0.0001945441067457376, "loss": 1.0209, "step": 191 }, { "epoch": 0.028444444444444446, "grad_norm": 0.9877428412437439, "learning_rate": 0.00019451445515196442, "loss": 1.3936, "step": 192 }, { "epoch": 0.028592592592592593, "grad_norm": 1.2009166479110718, "learning_rate": 0.00019448480355819126, "loss": 1.3871, "step": 193 }, { "epoch": 0.02874074074074074, "grad_norm": 0.8686572313308716, "learning_rate": 0.0001944551519644181, "loss": 1.0833, "step": 194 }, { "epoch": 0.028888888888888888, "grad_norm": 1.1291698217391968, "learning_rate": 0.00019442550037064492, "loss": 1.4868, "step": 195 }, { "epoch": 0.02903703703703704, "grad_norm": 0.7083054184913635, "learning_rate": 0.00019439584877687177, "loss": 0.9583, "step": 196 }, { "epoch": 0.029185185185185186, "grad_norm": 0.8085622787475586, "learning_rate": 0.00019436619718309861, "loss": 1.3064, "step": 197 }, { "epoch": 0.029333333333333333, "grad_norm": 0.8528979420661926, "learning_rate": 0.00019433654558932543, "loss": 1.3791, "step": 198 }, { "epoch": 0.02948148148148148, "grad_norm": 0.8347198963165283, "learning_rate": 0.00019430689399555228, "loss": 1.3034, "step": 199 }, { "epoch": 0.02962962962962963, "grad_norm": 1.3489162921905518, "learning_rate": 0.00019427724240177912, "loss": 1.4172, "step": 200 }, { "epoch": 0.029777777777777778, "grad_norm": 0.9368568062782288, "learning_rate": 0.00019424759080800594, "loss": 1.1568, "step": 201 }, { "epoch": 0.029925925925925925, "grad_norm": 0.8276304006576538, "learning_rate": 0.00019421793921423278, "loss": 1.2429, "step": 202 }, { "epoch": 0.030074074074074073, "grad_norm": 0.9591898322105408, "learning_rate": 0.00019418828762045963, "loss": 1.2785, "step": 203 }, { "epoch": 0.030222222222222223, "grad_norm": 1.008541464805603, "learning_rate": 0.00019415863602668644, "loss": 1.5194, "step": 204 }, { "epoch": 0.03037037037037037, 
"grad_norm": 0.7464162707328796, "learning_rate": 0.00019412898443291326, "loss": 1.275, "step": 205 }, { "epoch": 0.030518518518518518, "grad_norm": 1.6648197174072266, "learning_rate": 0.0001940993328391401, "loss": 1.3538, "step": 206 }, { "epoch": 0.030666666666666665, "grad_norm": 1.067299723625183, "learning_rate": 0.00019406968124536695, "loss": 1.2387, "step": 207 }, { "epoch": 0.030814814814814816, "grad_norm": 0.9523988366127014, "learning_rate": 0.00019404002965159377, "loss": 1.4373, "step": 208 }, { "epoch": 0.030962962962962963, "grad_norm": 1.2426611185073853, "learning_rate": 0.0001940103780578206, "loss": 1.3875, "step": 209 }, { "epoch": 0.03111111111111111, "grad_norm": 1.0543694496154785, "learning_rate": 0.00019398072646404746, "loss": 1.2702, "step": 210 }, { "epoch": 0.03125925925925926, "grad_norm": 0.78801429271698, "learning_rate": 0.00019395107487027427, "loss": 1.3753, "step": 211 }, { "epoch": 0.031407407407407405, "grad_norm": 0.9147347807884216, "learning_rate": 0.00019392142327650112, "loss": 1.4496, "step": 212 }, { "epoch": 0.03155555555555556, "grad_norm": 0.9836535453796387, "learning_rate": 0.00019389177168272796, "loss": 1.1552, "step": 213 }, { "epoch": 0.031703703703703706, "grad_norm": 0.9383349418640137, "learning_rate": 0.00019386212008895478, "loss": 1.2849, "step": 214 }, { "epoch": 0.03185185185185185, "grad_norm": 0.8178645968437195, "learning_rate": 0.00019383246849518162, "loss": 1.3611, "step": 215 }, { "epoch": 0.032, "grad_norm": 1.2790336608886719, "learning_rate": 0.00019380281690140847, "loss": 1.544, "step": 216 }, { "epoch": 0.03214814814814815, "grad_norm": 1.6234021186828613, "learning_rate": 0.00019377316530763529, "loss": 1.1774, "step": 217 }, { "epoch": 0.032296296296296295, "grad_norm": 1.1184484958648682, "learning_rate": 0.00019374351371386213, "loss": 1.1779, "step": 218 }, { "epoch": 0.03244444444444444, "grad_norm": 0.7729263305664062, "learning_rate": 0.00019371386212008898, "loss": 1.2238, 
"step": 219 }, { "epoch": 0.03259259259259259, "grad_norm": 1.2453947067260742, "learning_rate": 0.0001936842105263158, "loss": 1.1779, "step": 220 }, { "epoch": 0.032740740740740744, "grad_norm": 1.0809444189071655, "learning_rate": 0.00019365455893254264, "loss": 1.1391, "step": 221 }, { "epoch": 0.03288888888888889, "grad_norm": 2.4422430992126465, "learning_rate": 0.00019362490733876948, "loss": 1.3656, "step": 222 }, { "epoch": 0.03303703703703704, "grad_norm": 1.194951057434082, "learning_rate": 0.0001935952557449963, "loss": 1.1398, "step": 223 }, { "epoch": 0.033185185185185186, "grad_norm": 0.9725684523582458, "learning_rate": 0.00019356560415122314, "loss": 1.2123, "step": 224 }, { "epoch": 0.03333333333333333, "grad_norm": 0.9106444716453552, "learning_rate": 0.00019353595255744996, "loss": 1.1304, "step": 225 }, { "epoch": 0.03348148148148148, "grad_norm": 0.8902103900909424, "learning_rate": 0.0001935063009636768, "loss": 1.3702, "step": 226 }, { "epoch": 0.03362962962962963, "grad_norm": 0.9396015405654907, "learning_rate": 0.00019347664936990365, "loss": 1.6152, "step": 227 }, { "epoch": 0.033777777777777775, "grad_norm": 1.4745655059814453, "learning_rate": 0.00019344699777613047, "loss": 1.1397, "step": 228 }, { "epoch": 0.03392592592592593, "grad_norm": 0.685135543346405, "learning_rate": 0.0001934173461823573, "loss": 1.3612, "step": 229 }, { "epoch": 0.034074074074074076, "grad_norm": 1.0589948892593384, "learning_rate": 0.00019338769458858416, "loss": 1.0377, "step": 230 }, { "epoch": 0.03422222222222222, "grad_norm": 0.82380610704422, "learning_rate": 0.00019335804299481097, "loss": 1.3124, "step": 231 }, { "epoch": 0.03437037037037037, "grad_norm": 0.9715389609336853, "learning_rate": 0.00019332839140103782, "loss": 1.1181, "step": 232 }, { "epoch": 0.03451851851851852, "grad_norm": 0.8186538815498352, "learning_rate": 0.00019329873980726466, "loss": 1.2927, "step": 233 }, { "epoch": 0.034666666666666665, "grad_norm": 1.2029539346694946, 
"learning_rate": 0.00019326908821349148, "loss": 1.4152, "step": 234 }, { "epoch": 0.03481481481481481, "grad_norm": 1.2466408014297485, "learning_rate": 0.00019323943661971832, "loss": 1.1035, "step": 235 }, { "epoch": 0.03496296296296296, "grad_norm": 0.7909786105155945, "learning_rate": 0.00019320978502594517, "loss": 1.1657, "step": 236 }, { "epoch": 0.035111111111111114, "grad_norm": 1.3812874555587769, "learning_rate": 0.00019318013343217199, "loss": 1.2617, "step": 237 }, { "epoch": 0.03525925925925926, "grad_norm": 0.9587541222572327, "learning_rate": 0.00019315048183839883, "loss": 1.3889, "step": 238 }, { "epoch": 0.03540740740740741, "grad_norm": 1.1368465423583984, "learning_rate": 0.00019312083024462567, "loss": 1.0769, "step": 239 }, { "epoch": 0.035555555555555556, "grad_norm": 0.9174418449401855, "learning_rate": 0.0001930911786508525, "loss": 1.3081, "step": 240 }, { "epoch": 0.0357037037037037, "grad_norm": 0.7816482782363892, "learning_rate": 0.0001930615270570793, "loss": 1.142, "step": 241 }, { "epoch": 0.03585185185185185, "grad_norm": 1.1396574974060059, "learning_rate": 0.00019303187546330615, "loss": 1.2102, "step": 242 }, { "epoch": 0.036, "grad_norm": 0.8678709864616394, "learning_rate": 0.000193002223869533, "loss": 1.2445, "step": 243 }, { "epoch": 0.03614814814814815, "grad_norm": 0.7911355495452881, "learning_rate": 0.00019297257227575982, "loss": 0.9574, "step": 244 }, { "epoch": 0.0362962962962963, "grad_norm": 0.7782396078109741, "learning_rate": 0.00019294292068198666, "loss": 1.1546, "step": 245 }, { "epoch": 0.036444444444444446, "grad_norm": 0.9505060315132141, "learning_rate": 0.0001929132690882135, "loss": 1.1287, "step": 246 }, { "epoch": 0.03659259259259259, "grad_norm": 1.238294005393982, "learning_rate": 0.00019288361749444032, "loss": 1.0658, "step": 247 }, { "epoch": 0.03674074074074074, "grad_norm": 0.742830753326416, "learning_rate": 0.00019285396590066717, "loss": 1.1948, "step": 248 }, { "epoch": 
0.03688888888888889, "grad_norm": 0.7183875441551208, "learning_rate": 0.000192824314306894, "loss": 0.9899, "step": 249 }, { "epoch": 0.037037037037037035, "grad_norm": 0.773074209690094, "learning_rate": 0.00019279466271312083, "loss": 1.2474, "step": 250 }, { "epoch": 0.03718518518518518, "grad_norm": 1.0374746322631836, "learning_rate": 0.00019276501111934767, "loss": 1.0381, "step": 251 }, { "epoch": 0.037333333333333336, "grad_norm": 0.8164530396461487, "learning_rate": 0.00019273535952557452, "loss": 1.0381, "step": 252 }, { "epoch": 0.037481481481481484, "grad_norm": 0.9365907311439514, "learning_rate": 0.00019270570793180133, "loss": 1.4101, "step": 253 }, { "epoch": 0.03762962962962963, "grad_norm": 0.8881521224975586, "learning_rate": 0.00019267605633802818, "loss": 1.1966, "step": 254 }, { "epoch": 0.03777777777777778, "grad_norm": 2.7169456481933594, "learning_rate": 0.00019264640474425502, "loss": 1.1894, "step": 255 }, { "epoch": 0.037925925925925925, "grad_norm": 1.2184417247772217, "learning_rate": 0.00019261675315048184, "loss": 1.2353, "step": 256 }, { "epoch": 0.03807407407407407, "grad_norm": 0.9313091039657593, "learning_rate": 0.00019258710155670869, "loss": 1.321, "step": 257 }, { "epoch": 0.03822222222222222, "grad_norm": 0.8754948973655701, "learning_rate": 0.00019255744996293553, "loss": 1.296, "step": 258 }, { "epoch": 0.03837037037037037, "grad_norm": 1.6140278577804565, "learning_rate": 0.00019252779836916235, "loss": 1.2427, "step": 259 }, { "epoch": 0.03851851851851852, "grad_norm": 0.7046688795089722, "learning_rate": 0.00019249814677538916, "loss": 1.2533, "step": 260 }, { "epoch": 0.03866666666666667, "grad_norm": 1.3836346864700317, "learning_rate": 0.00019246849518161604, "loss": 1.0986, "step": 261 }, { "epoch": 0.038814814814814816, "grad_norm": 0.8215917348861694, "learning_rate": 0.00019243884358784285, "loss": 1.0424, "step": 262 }, { "epoch": 0.03896296296296296, "grad_norm": 1.0078061819076538, "learning_rate": 
0.00019240919199406967, "loss": 1.2892, "step": 263 }, { "epoch": 0.03911111111111111, "grad_norm": 1.0687581300735474, "learning_rate": 0.00019237954040029654, "loss": 1.3148, "step": 264 }, { "epoch": 0.03925925925925926, "grad_norm": 0.7134751081466675, "learning_rate": 0.00019234988880652336, "loss": 1.1001, "step": 265 }, { "epoch": 0.039407407407407405, "grad_norm": 0.726123034954071, "learning_rate": 0.00019232023721275018, "loss": 1.3147, "step": 266 }, { "epoch": 0.03955555555555555, "grad_norm": 1.0302845239639282, "learning_rate": 0.00019229058561897705, "loss": 1.3559, "step": 267 }, { "epoch": 0.039703703703703706, "grad_norm": 0.8920096158981323, "learning_rate": 0.00019226093402520387, "loss": 1.4267, "step": 268 }, { "epoch": 0.039851851851851854, "grad_norm": 1.2920289039611816, "learning_rate": 0.00019223128243143068, "loss": 1.2495, "step": 269 }, { "epoch": 0.04, "grad_norm": 0.8217295408248901, "learning_rate": 0.00019220163083765755, "loss": 1.1455, "step": 270 }, { "epoch": 0.04014814814814815, "grad_norm": 0.9490004777908325, "learning_rate": 0.00019217197924388437, "loss": 1.2003, "step": 271 }, { "epoch": 0.040296296296296295, "grad_norm": 0.8513433337211609, "learning_rate": 0.0001921423276501112, "loss": 1.1811, "step": 272 }, { "epoch": 0.04044444444444444, "grad_norm": 1.884893774986267, "learning_rate": 0.00019211267605633806, "loss": 1.2256, "step": 273 }, { "epoch": 0.04059259259259259, "grad_norm": 1.2672890424728394, "learning_rate": 0.00019208302446256488, "loss": 1.4415, "step": 274 }, { "epoch": 0.040740740740740744, "grad_norm": 0.9679176807403564, "learning_rate": 0.0001920533728687917, "loss": 1.2014, "step": 275 }, { "epoch": 0.04088888888888889, "grad_norm": 0.9189028739929199, "learning_rate": 0.00019202372127501854, "loss": 1.3192, "step": 276 }, { "epoch": 0.04103703703703704, "grad_norm": 0.8371793627738953, "learning_rate": 0.00019199406968124538, "loss": 1.2433, "step": 277 }, { "epoch": 0.041185185185185186, 
"grad_norm": 0.7926132678985596, "learning_rate": 0.0001919644180874722, "loss": 0.9882, "step": 278 }, { "epoch": 0.04133333333333333, "grad_norm": 2.006680488586426, "learning_rate": 0.00019193476649369905, "loss": 1.1837, "step": 279 }, { "epoch": 0.04148148148148148, "grad_norm": 1.062570571899414, "learning_rate": 0.0001919051148999259, "loss": 1.2128, "step": 280 }, { "epoch": 0.04162962962962963, "grad_norm": 0.7273450493812561, "learning_rate": 0.0001918754633061527, "loss": 1.3036, "step": 281 }, { "epoch": 0.041777777777777775, "grad_norm": 0.837178111076355, "learning_rate": 0.00019184581171237955, "loss": 1.2514, "step": 282 }, { "epoch": 0.04192592592592593, "grad_norm": 1.1254032850265503, "learning_rate": 0.0001918161601186064, "loss": 1.1421, "step": 283 }, { "epoch": 0.042074074074074076, "grad_norm": 0.915274441242218, "learning_rate": 0.00019178650852483321, "loss": 1.5691, "step": 284 }, { "epoch": 0.042222222222222223, "grad_norm": 4.748012065887451, "learning_rate": 0.00019175685693106006, "loss": 1.3856, "step": 285 }, { "epoch": 0.04237037037037037, "grad_norm": 1.5594621896743774, "learning_rate": 0.0001917272053372869, "loss": 1.3815, "step": 286 }, { "epoch": 0.04251851851851852, "grad_norm": 0.9001341462135315, "learning_rate": 0.00019169755374351372, "loss": 1.2276, "step": 287 }, { "epoch": 0.042666666666666665, "grad_norm": 1.085966944694519, "learning_rate": 0.00019166790214974057, "loss": 1.3204, "step": 288 }, { "epoch": 0.04281481481481481, "grad_norm": 0.9728178381919861, "learning_rate": 0.0001916382505559674, "loss": 1.1833, "step": 289 }, { "epoch": 0.04296296296296296, "grad_norm": 1.1957199573516846, "learning_rate": 0.00019160859896219423, "loss": 1.1297, "step": 290 }, { "epoch": 0.043111111111111114, "grad_norm": 2.2784461975097656, "learning_rate": 0.00019157894736842104, "loss": 1.3982, "step": 291 }, { "epoch": 0.04325925925925926, "grad_norm": 0.998017430305481, "learning_rate": 0.0001915492957746479, "loss": 1.0903, 
"step": 292 }, { "epoch": 0.04340740740740741, "grad_norm": 0.8035924434661865, "learning_rate": 0.00019151964418087473, "loss": 1.3742, "step": 293 }, { "epoch": 0.043555555555555556, "grad_norm": 0.9830217361450195, "learning_rate": 0.00019148999258710155, "loss": 1.1857, "step": 294 }, { "epoch": 0.0437037037037037, "grad_norm": 1.034332513809204, "learning_rate": 0.0001914603409933284, "loss": 1.0875, "step": 295 }, { "epoch": 0.04385185185185185, "grad_norm": 0.9594345688819885, "learning_rate": 0.00019143068939955524, "loss": 1.4199, "step": 296 }, { "epoch": 0.044, "grad_norm": 1.0057716369628906, "learning_rate": 0.00019140103780578206, "loss": 1.3263, "step": 297 }, { "epoch": 0.044148148148148145, "grad_norm": 1.2575480937957764, "learning_rate": 0.0001913713862120089, "loss": 1.3083, "step": 298 }, { "epoch": 0.0442962962962963, "grad_norm": 0.9955440759658813, "learning_rate": 0.00019134173461823575, "loss": 1.3674, "step": 299 }, { "epoch": 0.044444444444444446, "grad_norm": 0.7672873139381409, "learning_rate": 0.00019131208302446256, "loss": 1.2488, "step": 300 }, { "epoch": 0.04459259259259259, "grad_norm": 0.950467050075531, "learning_rate": 0.0001912824314306894, "loss": 1.3096, "step": 301 }, { "epoch": 0.04474074074074074, "grad_norm": 1.3131276369094849, "learning_rate": 0.00019125277983691625, "loss": 1.0787, "step": 302 }, { "epoch": 0.04488888888888889, "grad_norm": 1.0086424350738525, "learning_rate": 0.00019122312824314307, "loss": 1.3342, "step": 303 }, { "epoch": 0.045037037037037035, "grad_norm": 1.195286750793457, "learning_rate": 0.00019119347664936991, "loss": 1.109, "step": 304 }, { "epoch": 0.04518518518518518, "grad_norm": 0.877375602722168, "learning_rate": 0.00019116382505559676, "loss": 1.3081, "step": 305 }, { "epoch": 0.04533333333333334, "grad_norm": 0.8676083087921143, "learning_rate": 0.00019113417346182358, "loss": 1.1603, "step": 306 }, { "epoch": 0.045481481481481484, "grad_norm": 1.1526929140090942, "learning_rate": 
0.00019110452186805042, "loss": 1.1906, "step": 307 }, { "epoch": 0.04562962962962963, "grad_norm": 0.9462252259254456, "learning_rate": 0.00019107487027427726, "loss": 1.0573, "step": 308 }, { "epoch": 0.04577777777777778, "grad_norm": 0.7923760414123535, "learning_rate": 0.00019104521868050408, "loss": 1.2772, "step": 309 }, { "epoch": 0.045925925925925926, "grad_norm": 1.2533591985702515, "learning_rate": 0.00019101556708673093, "loss": 1.3882, "step": 310 }, { "epoch": 0.04607407407407407, "grad_norm": 0.8806333541870117, "learning_rate": 0.00019098591549295774, "loss": 1.0505, "step": 311 }, { "epoch": 0.04622222222222222, "grad_norm": 0.7805909514427185, "learning_rate": 0.0001909562638991846, "loss": 1.4897, "step": 312 }, { "epoch": 0.04637037037037037, "grad_norm": 0.812323272228241, "learning_rate": 0.00019092661230541143, "loss": 1.1407, "step": 313 }, { "epoch": 0.04651851851851852, "grad_norm": 1.0240209102630615, "learning_rate": 0.00019089696071163825, "loss": 1.2754, "step": 314 }, { "epoch": 0.04666666666666667, "grad_norm": 0.7502384781837463, "learning_rate": 0.0001908673091178651, "loss": 1.178, "step": 315 }, { "epoch": 0.046814814814814816, "grad_norm": 0.7480301856994629, "learning_rate": 0.00019083765752409194, "loss": 0.9225, "step": 316 }, { "epoch": 0.04696296296296296, "grad_norm": 0.8027053475379944, "learning_rate": 0.00019080800593031876, "loss": 1.2066, "step": 317 }, { "epoch": 0.04711111111111111, "grad_norm": 1.5919809341430664, "learning_rate": 0.0001907783543365456, "loss": 1.4634, "step": 318 }, { "epoch": 0.04725925925925926, "grad_norm": 0.9461173415184021, "learning_rate": 0.00019074870274277245, "loss": 1.2798, "step": 319 }, { "epoch": 0.047407407407407405, "grad_norm": 0.9098615050315857, "learning_rate": 0.00019071905114899926, "loss": 1.2508, "step": 320 }, { "epoch": 0.04755555555555555, "grad_norm": 2.0828206539154053, "learning_rate": 0.0001906893995552261, "loss": 1.0461, "step": 321 }, { "epoch": 
0.047703703703703706, "grad_norm": 0.975915253162384, "learning_rate": 0.00019065974796145295, "loss": 1.0488, "step": 322 }, { "epoch": 0.047851851851851854, "grad_norm": 0.8300365209579468, "learning_rate": 0.00019063009636767977, "loss": 1.2716, "step": 323 }, { "epoch": 0.048, "grad_norm": 0.8969668745994568, "learning_rate": 0.0001906004447739066, "loss": 1.3268, "step": 324 }, { "epoch": 0.04814814814814815, "grad_norm": 0.7707619071006775, "learning_rate": 0.00019057079318013346, "loss": 1.1056, "step": 325 }, { "epoch": 0.048296296296296296, "grad_norm": 0.941278874874115, "learning_rate": 0.00019054114158636028, "loss": 1.2243, "step": 326 }, { "epoch": 0.04844444444444444, "grad_norm": 2.3351869583129883, "learning_rate": 0.0001905114899925871, "loss": 1.1401, "step": 327 }, { "epoch": 0.04859259259259259, "grad_norm": 0.9738073945045471, "learning_rate": 0.00019048183839881394, "loss": 1.3564, "step": 328 }, { "epoch": 0.048740740740740744, "grad_norm": 1.029203176498413, "learning_rate": 0.00019045218680504078, "loss": 1.2079, "step": 329 }, { "epoch": 0.04888888888888889, "grad_norm": 1.3182631731033325, "learning_rate": 0.0001904225352112676, "loss": 0.9833, "step": 330 }, { "epoch": 0.04903703703703704, "grad_norm": 0.7868254780769348, "learning_rate": 0.00019039288361749444, "loss": 1.0738, "step": 331 }, { "epoch": 0.049185185185185186, "grad_norm": 0.949598491191864, "learning_rate": 0.0001903632320237213, "loss": 1.2144, "step": 332 }, { "epoch": 0.04933333333333333, "grad_norm": 0.8689578175544739, "learning_rate": 0.0001903335804299481, "loss": 0.9466, "step": 333 }, { "epoch": 0.04948148148148148, "grad_norm": 0.6478769183158875, "learning_rate": 0.00019030392883617495, "loss": 1.1753, "step": 334 }, { "epoch": 0.04962962962962963, "grad_norm": 1.025097131729126, "learning_rate": 0.0001902742772424018, "loss": 1.4284, "step": 335 }, { "epoch": 0.049777777777777775, "grad_norm": 1.3263148069381714, "learning_rate": 0.0001902446256486286, 
"loss": 1.1652, "step": 336 }, { "epoch": 0.04992592592592593, "grad_norm": 0.79034423828125, "learning_rate": 0.00019021497405485546, "loss": 0.9998, "step": 337 }, { "epoch": 0.050074074074074076, "grad_norm": 0.8130358457565308, "learning_rate": 0.0001901853224610823, "loss": 1.2194, "step": 338 }, { "epoch": 0.050222222222222224, "grad_norm": 1.3800885677337646, "learning_rate": 0.00019015567086730912, "loss": 1.0348, "step": 339 }, { "epoch": 0.05037037037037037, "grad_norm": 0.9516430497169495, "learning_rate": 0.00019012601927353596, "loss": 1.354, "step": 340 }, { "epoch": 0.05051851851851852, "grad_norm": 0.8971577286720276, "learning_rate": 0.0001900963676797628, "loss": 1.0234, "step": 341 }, { "epoch": 0.050666666666666665, "grad_norm": 0.7143462896347046, "learning_rate": 0.00019006671608598962, "loss": 1.129, "step": 342 }, { "epoch": 0.05081481481481481, "grad_norm": 1.2593697309494019, "learning_rate": 0.00019003706449221647, "loss": 1.1932, "step": 343 }, { "epoch": 0.05096296296296296, "grad_norm": 1.0689709186553955, "learning_rate": 0.0001900074128984433, "loss": 0.9627, "step": 344 }, { "epoch": 0.051111111111111114, "grad_norm": 0.9543367028236389, "learning_rate": 0.00018997776130467013, "loss": 1.155, "step": 345 }, { "epoch": 0.05125925925925926, "grad_norm": 1.1440194845199585, "learning_rate": 0.00018994810971089695, "loss": 1.4789, "step": 346 }, { "epoch": 0.05140740740740741, "grad_norm": 0.8633337020874023, "learning_rate": 0.00018991845811712382, "loss": 1.3178, "step": 347 }, { "epoch": 0.051555555555555556, "grad_norm": 0.8850125670433044, "learning_rate": 0.00018988880652335064, "loss": 1.1834, "step": 348 }, { "epoch": 0.0517037037037037, "grad_norm": 0.9416225552558899, "learning_rate": 0.00018985915492957745, "loss": 1.1306, "step": 349 }, { "epoch": 0.05185185185185185, "grad_norm": 1.0465419292449951, "learning_rate": 0.00018982950333580433, "loss": 0.9964, "step": 350 }, { "epoch": 0.052, "grad_norm": 0.9418424367904663, 
"learning_rate": 0.00018979985174203114, "loss": 1.4032, "step": 351 }, { "epoch": 0.052148148148148145, "grad_norm": 1.0868874788284302, "learning_rate": 0.00018977020014825796, "loss": 1.2295, "step": 352 }, { "epoch": 0.0522962962962963, "grad_norm": 1.3639134168624878, "learning_rate": 0.00018974054855448483, "loss": 1.1505, "step": 353 }, { "epoch": 0.052444444444444446, "grad_norm": 0.7342366576194763, "learning_rate": 0.00018971089696071165, "loss": 1.072, "step": 354 }, { "epoch": 0.052592592592592594, "grad_norm": 1.0624979734420776, "learning_rate": 0.00018968124536693847, "loss": 1.3847, "step": 355 }, { "epoch": 0.05274074074074074, "grad_norm": 1.0965144634246826, "learning_rate": 0.00018965159377316534, "loss": 1.1309, "step": 356 }, { "epoch": 0.05288888888888889, "grad_norm": 2.761899948120117, "learning_rate": 0.00018962194217939216, "loss": 1.396, "step": 357 }, { "epoch": 0.053037037037037035, "grad_norm": 1.303941011428833, "learning_rate": 0.00018959229058561897, "loss": 1.3632, "step": 358 }, { "epoch": 0.05318518518518518, "grad_norm": 0.8362237215042114, "learning_rate": 0.00018956263899184584, "loss": 0.9956, "step": 359 }, { "epoch": 0.05333333333333334, "grad_norm": 0.8556802272796631, "learning_rate": 0.00018953298739807266, "loss": 1.3789, "step": 360 }, { "epoch": 0.053481481481481484, "grad_norm": 0.9617559313774109, "learning_rate": 0.00018950333580429948, "loss": 1.1837, "step": 361 }, { "epoch": 0.05362962962962963, "grad_norm": 1.0323243141174316, "learning_rate": 0.00018947368421052632, "loss": 1.3796, "step": 362 }, { "epoch": 0.05377777777777778, "grad_norm": 1.2637794017791748, "learning_rate": 0.00018944403261675317, "loss": 1.1701, "step": 363 }, { "epoch": 0.053925925925925926, "grad_norm": 0.9685412645339966, "learning_rate": 0.00018941438102297999, "loss": 1.3551, "step": 364 }, { "epoch": 0.05407407407407407, "grad_norm": 0.9208672046661377, "learning_rate": 0.00018938472942920683, "loss": 1.5171, "step": 365 }, { 
"epoch": 0.05422222222222222, "grad_norm": 0.9151871204376221, "learning_rate": 0.00018935507783543367, "loss": 1.1876, "step": 366 }, { "epoch": 0.05437037037037037, "grad_norm": 1.1648972034454346, "learning_rate": 0.0001893254262416605, "loss": 1.2662, "step": 367 }, { "epoch": 0.05451851851851852, "grad_norm": 0.9495143294334412, "learning_rate": 0.00018929577464788734, "loss": 1.2151, "step": 368 }, { "epoch": 0.05466666666666667, "grad_norm": 1.1094683408737183, "learning_rate": 0.00018926612305411418, "loss": 1.2581, "step": 369 }, { "epoch": 0.054814814814814816, "grad_norm": 1.038244605064392, "learning_rate": 0.000189236471460341, "loss": 1.2898, "step": 370 }, { "epoch": 0.05496296296296296, "grad_norm": 1.372955322265625, "learning_rate": 0.00018920681986656784, "loss": 1.2511, "step": 371 }, { "epoch": 0.05511111111111111, "grad_norm": 1.5707471370697021, "learning_rate": 0.0001891771682727947, "loss": 1.4273, "step": 372 }, { "epoch": 0.05525925925925926, "grad_norm": 0.8700966835021973, "learning_rate": 0.0001891475166790215, "loss": 1.2627, "step": 373 }, { "epoch": 0.055407407407407405, "grad_norm": 1.0520998239517212, "learning_rate": 0.00018911786508524835, "loss": 1.1392, "step": 374 }, { "epoch": 0.05555555555555555, "grad_norm": 1.1595327854156494, "learning_rate": 0.0001890882134914752, "loss": 1.2995, "step": 375 }, { "epoch": 0.05570370370370371, "grad_norm": 1.009542465209961, "learning_rate": 0.000189058561897702, "loss": 1.3476, "step": 376 }, { "epoch": 0.055851851851851854, "grad_norm": 1.0016463994979858, "learning_rate": 0.00018902891030392883, "loss": 1.2419, "step": 377 }, { "epoch": 0.056, "grad_norm": 0.843089759349823, "learning_rate": 0.00018899925871015567, "loss": 1.0925, "step": 378 }, { "epoch": 0.05614814814814815, "grad_norm": 1.0765223503112793, "learning_rate": 0.00018896960711638252, "loss": 1.4006, "step": 379 }, { "epoch": 0.056296296296296296, "grad_norm": 0.8268885612487793, "learning_rate": 0.00018893995552260933, 
"loss": 1.2053, "step": 380 }, { "epoch": 0.05644444444444444, "grad_norm": 1.0966840982437134, "learning_rate": 0.00018891030392883618, "loss": 1.0795, "step": 381 }, { "epoch": 0.05659259259259259, "grad_norm": 1.574753999710083, "learning_rate": 0.00018888065233506302, "loss": 1.0723, "step": 382 }, { "epoch": 0.05674074074074074, "grad_norm": 1.1856094598770142, "learning_rate": 0.00018885100074128984, "loss": 0.9495, "step": 383 }, { "epoch": 0.05688888888888889, "grad_norm": 0.9048572778701782, "learning_rate": 0.00018882134914751668, "loss": 1.1942, "step": 384 }, { "epoch": 0.05703703703703704, "grad_norm": 0.7475482225418091, "learning_rate": 0.00018879169755374353, "loss": 1.0352, "step": 385 }, { "epoch": 0.057185185185185186, "grad_norm": 1.051243543624878, "learning_rate": 0.00018876204595997035, "loss": 1.0815, "step": 386 }, { "epoch": 0.05733333333333333, "grad_norm": 0.8005509972572327, "learning_rate": 0.0001887323943661972, "loss": 1.1604, "step": 387 }, { "epoch": 0.05748148148148148, "grad_norm": 0.951156497001648, "learning_rate": 0.00018870274277242404, "loss": 1.2127, "step": 388 }, { "epoch": 0.05762962962962963, "grad_norm": 0.998855471611023, "learning_rate": 0.00018867309117865085, "loss": 1.1835, "step": 389 }, { "epoch": 0.057777777777777775, "grad_norm": 1.2722110748291016, "learning_rate": 0.0001886434395848777, "loss": 1.2818, "step": 390 }, { "epoch": 0.05792592592592593, "grad_norm": 1.2306925058364868, "learning_rate": 0.00018861378799110454, "loss": 1.0018, "step": 391 }, { "epoch": 0.05807407407407408, "grad_norm": 1.022120475769043, "learning_rate": 0.00018858413639733136, "loss": 1.1503, "step": 392 }, { "epoch": 0.058222222222222224, "grad_norm": 0.9722836017608643, "learning_rate": 0.0001885544848035582, "loss": 1.3875, "step": 393 }, { "epoch": 0.05837037037037037, "grad_norm": 1.0112863779067993, "learning_rate": 0.00018852483320978505, "loss": 1.2048, "step": 394 }, { "epoch": 0.05851851851851852, "grad_norm": 
2.09438157081604, "learning_rate": 0.00018849518161601187, "loss": 1.3112, "step": 395 }, { "epoch": 0.058666666666666666, "grad_norm": 1.14156973361969, "learning_rate": 0.0001884655300222387, "loss": 1.2865, "step": 396 }, { "epoch": 0.05881481481481481, "grad_norm": 0.8264205455780029, "learning_rate": 0.00018843587842846553, "loss": 1.0833, "step": 397 }, { "epoch": 0.05896296296296296, "grad_norm": 0.8042988181114197, "learning_rate": 0.00018840622683469237, "loss": 1.2443, "step": 398 }, { "epoch": 0.059111111111111114, "grad_norm": 0.8299493789672852, "learning_rate": 0.00018837657524091922, "loss": 1.1525, "step": 399 }, { "epoch": 0.05925925925925926, "grad_norm": 0.8911139965057373, "learning_rate": 0.00018834692364714603, "loss": 1.1735, "step": 400 }, { "epoch": 0.05940740740740741, "grad_norm": 0.8134204149246216, "learning_rate": 0.00018831727205337288, "loss": 1.0875, "step": 401 }, { "epoch": 0.059555555555555556, "grad_norm": 1.022357702255249, "learning_rate": 0.00018828762045959972, "loss": 1.109, "step": 402 }, { "epoch": 0.0597037037037037, "grad_norm": 1.407626748085022, "learning_rate": 0.00018825796886582654, "loss": 1.232, "step": 403 }, { "epoch": 0.05985185185185185, "grad_norm": 2.914661407470703, "learning_rate": 0.00018822831727205338, "loss": 1.2303, "step": 404 }, { "epoch": 0.06, "grad_norm": 1.400601863861084, "learning_rate": 0.00018819866567828023, "loss": 1.311, "step": 405 }, { "epoch": 0.060148148148148145, "grad_norm": 0.8387115001678467, "learning_rate": 0.00018816901408450705, "loss": 1.2802, "step": 406 }, { "epoch": 0.0602962962962963, "grad_norm": 0.7756436467170715, "learning_rate": 0.0001881393624907339, "loss": 1.2511, "step": 407 }, { "epoch": 0.060444444444444446, "grad_norm": 0.9221392273902893, "learning_rate": 0.00018810971089696073, "loss": 1.0606, "step": 408 }, { "epoch": 0.060592592592592594, "grad_norm": 2.785463571548462, "learning_rate": 0.00018808005930318755, "loss": 1.0115, "step": 409 }, { "epoch": 
0.06074074074074074, "grad_norm": 1.5348591804504395, "learning_rate": 0.0001880504077094144, "loss": 1.2014, "step": 410 }, { "epoch": 0.06088888888888889, "grad_norm": 0.8930820822715759, "learning_rate": 0.00018802075611564124, "loss": 1.1861, "step": 411 }, { "epoch": 0.061037037037037035, "grad_norm": 0.9898925423622131, "learning_rate": 0.00018799110452186806, "loss": 1.2266, "step": 412 }, { "epoch": 0.06118518518518518, "grad_norm": 1.4079804420471191, "learning_rate": 0.00018796145292809488, "loss": 1.0795, "step": 413 }, { "epoch": 0.06133333333333333, "grad_norm": 0.7314396500587463, "learning_rate": 0.00018793180133432172, "loss": 1.2817, "step": 414 }, { "epoch": 0.061481481481481484, "grad_norm": 1.0503140687942505, "learning_rate": 0.00018790214974054856, "loss": 1.3144, "step": 415 }, { "epoch": 0.06162962962962963, "grad_norm": 1.046270728111267, "learning_rate": 0.00018787249814677538, "loss": 1.0213, "step": 416 }, { "epoch": 0.06177777777777778, "grad_norm": 0.6629498600959778, "learning_rate": 0.00018784284655300223, "loss": 1.1684, "step": 417 }, { "epoch": 0.061925925925925926, "grad_norm": 1.0851155519485474, "learning_rate": 0.00018781319495922907, "loss": 1.2505, "step": 418 }, { "epoch": 0.06207407407407407, "grad_norm": 1.239920973777771, "learning_rate": 0.0001877835433654559, "loss": 1.0287, "step": 419 }, { "epoch": 0.06222222222222222, "grad_norm": 0.9222789406776428, "learning_rate": 0.00018775389177168273, "loss": 1.4088, "step": 420 }, { "epoch": 0.06237037037037037, "grad_norm": 0.8636415600776672, "learning_rate": 0.00018772424017790958, "loss": 1.1611, "step": 421 }, { "epoch": 0.06251851851851851, "grad_norm": 1.038588047027588, "learning_rate": 0.0001876945885841364, "loss": 1.2972, "step": 422 }, { "epoch": 0.06266666666666666, "grad_norm": 1.2301207780838013, "learning_rate": 0.00018766493699036324, "loss": 1.2349, "step": 423 }, { "epoch": 0.06281481481481481, "grad_norm": 0.7099899053573608, "learning_rate": 
0.00018763528539659008, "loss": 1.1068, "step": 424 }, { "epoch": 0.06296296296296296, "grad_norm": 1.002394676208496, "learning_rate": 0.0001876056338028169, "loss": 1.3357, "step": 425 }, { "epoch": 0.06311111111111112, "grad_norm": 0.8066929578781128, "learning_rate": 0.00018757598220904375, "loss": 1.6873, "step": 426 }, { "epoch": 0.06325925925925927, "grad_norm": 0.9624059200286865, "learning_rate": 0.0001875463306152706, "loss": 1.137, "step": 427 }, { "epoch": 0.06340740740740741, "grad_norm": 2.6266281604766846, "learning_rate": 0.0001875166790214974, "loss": 1.216, "step": 428 }, { "epoch": 0.06355555555555556, "grad_norm": 1.0235140323638916, "learning_rate": 0.00018748702742772425, "loss": 1.3116, "step": 429 }, { "epoch": 0.0637037037037037, "grad_norm": 1.022839903831482, "learning_rate": 0.0001874573758339511, "loss": 1.3446, "step": 430 }, { "epoch": 0.06385185185185185, "grad_norm": 0.9741849303245544, "learning_rate": 0.00018742772424017791, "loss": 1.0908, "step": 431 }, { "epoch": 0.064, "grad_norm": 0.7452192306518555, "learning_rate": 0.00018739807264640473, "loss": 1.0021, "step": 432 }, { "epoch": 0.06414814814814815, "grad_norm": 0.8392693400382996, "learning_rate": 0.0001873684210526316, "loss": 1.6745, "step": 433 }, { "epoch": 0.0642962962962963, "grad_norm": 0.937454342842102, "learning_rate": 0.00018733876945885842, "loss": 1.3547, "step": 434 }, { "epoch": 0.06444444444444444, "grad_norm": 4.195993423461914, "learning_rate": 0.00018730911786508524, "loss": 1.2984, "step": 435 }, { "epoch": 0.06459259259259259, "grad_norm": 0.9298253059387207, "learning_rate": 0.0001872794662713121, "loss": 1.1374, "step": 436 }, { "epoch": 0.06474074074074074, "grad_norm": 1.1820027828216553, "learning_rate": 0.00018724981467753893, "loss": 1.169, "step": 437 }, { "epoch": 0.06488888888888888, "grad_norm": 0.7461003065109253, "learning_rate": 0.00018722016308376574, "loss": 1.1656, "step": 438 }, { "epoch": 0.06503703703703703, "grad_norm": 
0.9983235001564026, "learning_rate": 0.00018719051148999262, "loss": 1.3256, "step": 439 }, { "epoch": 0.06518518518518518, "grad_norm": 1.3242130279541016, "learning_rate": 0.00018716085989621943, "loss": 1.0237, "step": 440 }, { "epoch": 0.06533333333333333, "grad_norm": 1.371294617652893, "learning_rate": 0.00018713120830244625, "loss": 1.1403, "step": 441 }, { "epoch": 0.06548148148148149, "grad_norm": 1.2605317831039429, "learning_rate": 0.00018710155670867312, "loss": 1.1972, "step": 442 }, { "epoch": 0.06562962962962963, "grad_norm": 1.1461695432662964, "learning_rate": 0.00018707190511489994, "loss": 1.3529, "step": 443 }, { "epoch": 0.06577777777777778, "grad_norm": 0.8331106901168823, "learning_rate": 0.00018704225352112676, "loss": 1.1217, "step": 444 }, { "epoch": 0.06592592592592593, "grad_norm": 1.2743287086486816, "learning_rate": 0.00018701260192735363, "loss": 1.2748, "step": 445 }, { "epoch": 0.06607407407407408, "grad_norm": 1.6404948234558105, "learning_rate": 0.00018698295033358044, "loss": 1.1709, "step": 446 }, { "epoch": 0.06622222222222222, "grad_norm": 1.4575682878494263, "learning_rate": 0.00018695329873980726, "loss": 1.1936, "step": 447 }, { "epoch": 0.06637037037037037, "grad_norm": 1.053147315979004, "learning_rate": 0.0001869236471460341, "loss": 1.0561, "step": 448 }, { "epoch": 0.06651851851851852, "grad_norm": 1.0501112937927246, "learning_rate": 0.00018689399555226095, "loss": 1.1234, "step": 449 }, { "epoch": 0.06666666666666667, "grad_norm": 0.9422358274459839, "learning_rate": 0.00018686434395848777, "loss": 1.1992, "step": 450 }, { "epoch": 0.06681481481481481, "grad_norm": 1.2424194812774658, "learning_rate": 0.0001868346923647146, "loss": 1.1539, "step": 451 }, { "epoch": 0.06696296296296296, "grad_norm": 0.9493004083633423, "learning_rate": 0.00018680504077094146, "loss": 1.0244, "step": 452 }, { "epoch": 0.06711111111111111, "grad_norm": 1.1930363178253174, "learning_rate": 0.00018677538917716827, "loss": 0.9746, "step": 
453 }, { "epoch": 0.06725925925925925, "grad_norm": 0.8539004921913147, "learning_rate": 0.00018674573758339512, "loss": 1.1283, "step": 454 }, { "epoch": 0.0674074074074074, "grad_norm": 1.1996976137161255, "learning_rate": 0.00018671608598962196, "loss": 1.3479, "step": 455 }, { "epoch": 0.06755555555555555, "grad_norm": 1.668487548828125, "learning_rate": 0.00018668643439584878, "loss": 1.1042, "step": 456 }, { "epoch": 0.06770370370370371, "grad_norm": 0.8974586129188538, "learning_rate": 0.00018665678280207563, "loss": 1.2239, "step": 457 }, { "epoch": 0.06785185185185186, "grad_norm": 0.8592154383659363, "learning_rate": 0.00018662713120830247, "loss": 0.924, "step": 458 }, { "epoch": 0.068, "grad_norm": 1.0311380624771118, "learning_rate": 0.0001865974796145293, "loss": 1.1042, "step": 459 }, { "epoch": 0.06814814814814815, "grad_norm": 0.8845215439796448, "learning_rate": 0.00018656782802075613, "loss": 1.0193, "step": 460 }, { "epoch": 0.0682962962962963, "grad_norm": 0.8208663463592529, "learning_rate": 0.00018653817642698298, "loss": 1.0014, "step": 461 }, { "epoch": 0.06844444444444445, "grad_norm": 1.7937710285186768, "learning_rate": 0.0001865085248332098, "loss": 1.0882, "step": 462 }, { "epoch": 0.0685925925925926, "grad_norm": 0.9765856266021729, "learning_rate": 0.0001864788732394366, "loss": 1.1428, "step": 463 }, { "epoch": 0.06874074074074074, "grad_norm": 1.0087882280349731, "learning_rate": 0.00018644922164566348, "loss": 1.1982, "step": 464 }, { "epoch": 0.06888888888888889, "grad_norm": 1.0147068500518799, "learning_rate": 0.0001864195700518903, "loss": 1.1694, "step": 465 }, { "epoch": 0.06903703703703704, "grad_norm": 0.9086894392967224, "learning_rate": 0.00018638991845811712, "loss": 1.1188, "step": 466 }, { "epoch": 0.06918518518518518, "grad_norm": 0.7124015092849731, "learning_rate": 0.00018636026686434396, "loss": 1.1703, "step": 467 }, { "epoch": 0.06933333333333333, "grad_norm": 1.072105884552002, "learning_rate": 
0.0001863306152705708, "loss": 1.2042, "step": 468 }, { "epoch": 0.06948148148148148, "grad_norm": 1.7827683687210083, "learning_rate": 0.00018630096367679762, "loss": 1.1303, "step": 469 }, { "epoch": 0.06962962962962962, "grad_norm": 1.5798730850219727, "learning_rate": 0.00018627131208302447, "loss": 1.0927, "step": 470 }, { "epoch": 0.06977777777777777, "grad_norm": 0.8328156471252441, "learning_rate": 0.0001862416604892513, "loss": 1.1426, "step": 471 }, { "epoch": 0.06992592592592592, "grad_norm": 0.8577338457107544, "learning_rate": 0.00018621200889547813, "loss": 1.2541, "step": 472 }, { "epoch": 0.07007407407407408, "grad_norm": 0.9907087087631226, "learning_rate": 0.00018618235730170497, "loss": 1.2424, "step": 473 }, { "epoch": 0.07022222222222223, "grad_norm": 0.7621241211891174, "learning_rate": 0.00018615270570793182, "loss": 1.1052, "step": 474 }, { "epoch": 0.07037037037037037, "grad_norm": 0.7610787153244019, "learning_rate": 0.00018612305411415864, "loss": 1.1561, "step": 475 }, { "epoch": 0.07051851851851852, "grad_norm": 1.0455390214920044, "learning_rate": 0.00018609340252038548, "loss": 1.1155, "step": 476 }, { "epoch": 0.07066666666666667, "grad_norm": 0.9702383279800415, "learning_rate": 0.00018606375092661233, "loss": 1.5349, "step": 477 }, { "epoch": 0.07081481481481482, "grad_norm": 0.9245896935462952, "learning_rate": 0.00018603409933283914, "loss": 1.629, "step": 478 }, { "epoch": 0.07096296296296296, "grad_norm": 0.9411025643348694, "learning_rate": 0.000186004447739066, "loss": 1.2111, "step": 479 }, { "epoch": 0.07111111111111111, "grad_norm": 3.217911958694458, "learning_rate": 0.00018597479614529283, "loss": 1.4453, "step": 480 }, { "epoch": 0.07125925925925926, "grad_norm": 0.8464000821113586, "learning_rate": 0.00018594514455151965, "loss": 1.131, "step": 481 }, { "epoch": 0.0714074074074074, "grad_norm": 6.36705207824707, "learning_rate": 0.0001859154929577465, "loss": 1.0089, "step": 482 }, { "epoch": 0.07155555555555555, 
"grad_norm": 0.8953663110733032, "learning_rate": 0.0001858858413639733, "loss": 1.2024, "step": 483 }, { "epoch": 0.0717037037037037, "grad_norm": 1.0531768798828125, "learning_rate": 0.00018585618977020015, "loss": 1.4019, "step": 484 }, { "epoch": 0.07185185185185185, "grad_norm": 2.170644998550415, "learning_rate": 0.000185826538176427, "loss": 1.376, "step": 485 }, { "epoch": 0.072, "grad_norm": 0.6142813563346863, "learning_rate": 0.00018579688658265382, "loss": 0.925, "step": 486 }, { "epoch": 0.07214814814814814, "grad_norm": 0.7336257100105286, "learning_rate": 0.00018576723498888066, "loss": 1.1232, "step": 487 }, { "epoch": 0.0722962962962963, "grad_norm": 0.6803194880485535, "learning_rate": 0.0001857375833951075, "loss": 1.1407, "step": 488 }, { "epoch": 0.07244444444444445, "grad_norm": 0.8505472540855408, "learning_rate": 0.00018570793180133432, "loss": 1.318, "step": 489 }, { "epoch": 0.0725925925925926, "grad_norm": 0.8538836240768433, "learning_rate": 0.00018567828020756117, "loss": 1.1678, "step": 490 }, { "epoch": 0.07274074074074074, "grad_norm": 0.8287623524665833, "learning_rate": 0.000185648628613788, "loss": 1.2591, "step": 491 }, { "epoch": 0.07288888888888889, "grad_norm": 1.3350564241409302, "learning_rate": 0.00018561897702001483, "loss": 1.4278, "step": 492 }, { "epoch": 0.07303703703703704, "grad_norm": 0.992910623550415, "learning_rate": 0.00018558932542624167, "loss": 1.2179, "step": 493 }, { "epoch": 0.07318518518518519, "grad_norm": 2.1098670959472656, "learning_rate": 0.00018555967383246852, "loss": 1.311, "step": 494 }, { "epoch": 0.07333333333333333, "grad_norm": 0.7079705595970154, "learning_rate": 0.00018553002223869534, "loss": 1.0287, "step": 495 }, { "epoch": 0.07348148148148148, "grad_norm": 0.9057304859161377, "learning_rate": 0.00018550037064492218, "loss": 1.2524, "step": 496 }, { "epoch": 0.07362962962962963, "grad_norm": 0.936161458492279, "learning_rate": 0.00018547071905114902, "loss": 1.1956, "step": 497 }, { 
"epoch": 0.07377777777777778, "grad_norm": 1.3111516237258911, "learning_rate": 0.00018544106745737584, "loss": 1.2858, "step": 498 }, { "epoch": 0.07392592592592592, "grad_norm": 0.8875294923782349, "learning_rate": 0.00018541141586360266, "loss": 1.0537, "step": 499 }, { "epoch": 0.07407407407407407, "grad_norm": 1.14637291431427, "learning_rate": 0.0001853817642698295, "loss": 1.4708, "step": 500 }, { "epoch": 0.07422222222222222, "grad_norm": 0.8916139006614685, "learning_rate": 0.00018535211267605635, "loss": 1.1583, "step": 501 }, { "epoch": 0.07437037037037036, "grad_norm": 0.798608124256134, "learning_rate": 0.00018532246108228317, "loss": 1.4402, "step": 502 }, { "epoch": 0.07451851851851851, "grad_norm": 0.8326433897018433, "learning_rate": 0.00018529280948851, "loss": 1.3123, "step": 503 }, { "epoch": 0.07466666666666667, "grad_norm": 1.6419179439544678, "learning_rate": 0.00018526315789473685, "loss": 1.1254, "step": 504 }, { "epoch": 0.07481481481481482, "grad_norm": 0.7575996518135071, "learning_rate": 0.00018523350630096367, "loss": 1.3618, "step": 505 }, { "epoch": 0.07496296296296297, "grad_norm": 0.8234692215919495, "learning_rate": 0.00018520385470719052, "loss": 1.1657, "step": 506 }, { "epoch": 0.07511111111111111, "grad_norm": 0.9207050204277039, "learning_rate": 0.00018517420311341736, "loss": 1.4156, "step": 507 }, { "epoch": 0.07525925925925926, "grad_norm": 0.9526640772819519, "learning_rate": 0.00018514455151964418, "loss": 1.2533, "step": 508 }, { "epoch": 0.07540740740740741, "grad_norm": 1.0630484819412231, "learning_rate": 0.00018511489992587102, "loss": 1.187, "step": 509 }, { "epoch": 0.07555555555555556, "grad_norm": 1.4691499471664429, "learning_rate": 0.00018508524833209787, "loss": 1.4744, "step": 510 }, { "epoch": 0.0757037037037037, "grad_norm": 0.8877632021903992, "learning_rate": 0.00018505559673832468, "loss": 1.3136, "step": 511 }, { "epoch": 0.07585185185185185, "grad_norm": 0.9979971051216125, "learning_rate": 
0.00018502594514455153, "loss": 1.0359, "step": 512 }, { "epoch": 0.076, "grad_norm": 0.7808818817138672, "learning_rate": 0.00018499629355077837, "loss": 1.1469, "step": 513 }, { "epoch": 0.07614814814814815, "grad_norm": 0.7668399810791016, "learning_rate": 0.0001849666419570052, "loss": 0.9518, "step": 514 }, { "epoch": 0.07629629629629629, "grad_norm": 0.6751396059989929, "learning_rate": 0.00018493699036323204, "loss": 1.0435, "step": 515 }, { "epoch": 0.07644444444444444, "grad_norm": 2.380500316619873, "learning_rate": 0.00018490733876945888, "loss": 1.2916, "step": 516 }, { "epoch": 0.07659259259259259, "grad_norm": 1.5779838562011719, "learning_rate": 0.0001848776871756857, "loss": 1.1652, "step": 517 }, { "epoch": 0.07674074074074073, "grad_norm": 0.8845908045768738, "learning_rate": 0.00018484803558191251, "loss": 1.3284, "step": 518 }, { "epoch": 0.0768888888888889, "grad_norm": 0.826222836971283, "learning_rate": 0.00018481838398813939, "loss": 0.9343, "step": 519 }, { "epoch": 0.07703703703703704, "grad_norm": 0.7673617601394653, "learning_rate": 0.0001847887323943662, "loss": 1.0863, "step": 520 }, { "epoch": 0.07718518518518519, "grad_norm": 0.8270271420478821, "learning_rate": 0.00018475908080059302, "loss": 1.3003, "step": 521 }, { "epoch": 0.07733333333333334, "grad_norm": 1.4127448797225952, "learning_rate": 0.0001847294292068199, "loss": 1.3734, "step": 522 }, { "epoch": 0.07748148148148148, "grad_norm": 0.8243331909179688, "learning_rate": 0.0001846997776130467, "loss": 1.1697, "step": 523 }, { "epoch": 0.07762962962962963, "grad_norm": 0.7939732670783997, "learning_rate": 0.00018467012601927353, "loss": 1.3484, "step": 524 }, { "epoch": 0.07777777777777778, "grad_norm": 1.120720386505127, "learning_rate": 0.0001846404744255004, "loss": 1.1484, "step": 525 }, { "epoch": 0.07792592592592593, "grad_norm": 0.8500152230262756, "learning_rate": 0.00018461082283172722, "loss": 1.4174, "step": 526 }, { "epoch": 0.07807407407407407, "grad_norm": 
0.9206627011299133, "learning_rate": 0.00018458117123795403, "loss": 1.4041, "step": 527 }, { "epoch": 0.07822222222222222, "grad_norm": 0.8654802441596985, "learning_rate": 0.0001845515196441809, "loss": 1.4201, "step": 528 }, { "epoch": 0.07837037037037037, "grad_norm": 0.9089827537536621, "learning_rate": 0.00018452186805040772, "loss": 1.3494, "step": 529 }, { "epoch": 0.07851851851851852, "grad_norm": 0.851271390914917, "learning_rate": 0.00018449221645663454, "loss": 1.1615, "step": 530 }, { "epoch": 0.07866666666666666, "grad_norm": 0.9235059022903442, "learning_rate": 0.0001844625648628614, "loss": 1.1841, "step": 531 }, { "epoch": 0.07881481481481481, "grad_norm": 1.1651939153671265, "learning_rate": 0.00018443291326908823, "loss": 1.1423, "step": 532 }, { "epoch": 0.07896296296296296, "grad_norm": 1.5201992988586426, "learning_rate": 0.00018440326167531505, "loss": 1.0807, "step": 533 }, { "epoch": 0.0791111111111111, "grad_norm": 0.9012873768806458, "learning_rate": 0.0001843736100815419, "loss": 1.0604, "step": 534 }, { "epoch": 0.07925925925925927, "grad_norm": 0.9370338320732117, "learning_rate": 0.00018434395848776873, "loss": 1.033, "step": 535 }, { "epoch": 0.07940740740740741, "grad_norm": 1.0847934484481812, "learning_rate": 0.00018431430689399555, "loss": 1.2449, "step": 536 }, { "epoch": 0.07955555555555556, "grad_norm": 1.0478984117507935, "learning_rate": 0.0001842846553002224, "loss": 1.2035, "step": 537 }, { "epoch": 0.07970370370370371, "grad_norm": 0.9668722152709961, "learning_rate": 0.00018425500370644924, "loss": 1.1051, "step": 538 }, { "epoch": 0.07985185185185185, "grad_norm": 1.078900933265686, "learning_rate": 0.00018422535211267606, "loss": 1.1831, "step": 539 }, { "epoch": 0.08, "grad_norm": 1.2907154560089111, "learning_rate": 0.0001841957005189029, "loss": 1.2905, "step": 540 }, { "epoch": 0.08014814814814815, "grad_norm": 0.8596047759056091, "learning_rate": 0.00018416604892512975, "loss": 1.0783, "step": 541 }, { "epoch": 
0.0802962962962963, "grad_norm": 1.0501773357391357, "learning_rate": 0.00018413639733135656, "loss": 1.2523, "step": 542 }, { "epoch": 0.08044444444444444, "grad_norm": 1.099936842918396, "learning_rate": 0.0001841067457375834, "loss": 1.0595, "step": 543 }, { "epoch": 0.08059259259259259, "grad_norm": 2.2023308277130127, "learning_rate": 0.00018407709414381025, "loss": 1.1073, "step": 544 }, { "epoch": 0.08074074074074074, "grad_norm": 1.065362572669983, "learning_rate": 0.00018404744255003707, "loss": 1.1908, "step": 545 }, { "epoch": 0.08088888888888889, "grad_norm": 1.262590765953064, "learning_rate": 0.00018401779095626392, "loss": 1.1085, "step": 546 }, { "epoch": 0.08103703703703703, "grad_norm": 2.105292558670044, "learning_rate": 0.00018398813936249076, "loss": 1.1783, "step": 547 }, { "epoch": 0.08118518518518518, "grad_norm": 0.8273268938064575, "learning_rate": 0.00018395848776871758, "loss": 1.3093, "step": 548 }, { "epoch": 0.08133333333333333, "grad_norm": 0.870421826839447, "learning_rate": 0.0001839288361749444, "loss": 1.2127, "step": 549 }, { "epoch": 0.08148148148148149, "grad_norm": 0.9928128123283386, "learning_rate": 0.00018389918458117127, "loss": 1.2102, "step": 550 }, { "epoch": 0.08162962962962964, "grad_norm": 1.2662063837051392, "learning_rate": 0.00018386953298739808, "loss": 1.2983, "step": 551 }, { "epoch": 0.08177777777777778, "grad_norm": 0.717269241809845, "learning_rate": 0.0001838398813936249, "loss": 1.0745, "step": 552 }, { "epoch": 0.08192592592592593, "grad_norm": 0.8523573875427246, "learning_rate": 0.00018381022979985175, "loss": 1.2344, "step": 553 }, { "epoch": 0.08207407407407408, "grad_norm": 0.8732525110244751, "learning_rate": 0.0001837805782060786, "loss": 1.1399, "step": 554 }, { "epoch": 0.08222222222222222, "grad_norm": 0.8165760636329651, "learning_rate": 0.0001837509266123054, "loss": 1.2417, "step": 555 }, { "epoch": 0.08237037037037037, "grad_norm": 0.7446961998939514, "learning_rate": 
0.00018372127501853225, "loss": 1.0925, "step": 556 }, { "epoch": 0.08251851851851852, "grad_norm": 0.8104601502418518, "learning_rate": 0.0001836916234247591, "loss": 1.2704, "step": 557 }, { "epoch": 0.08266666666666667, "grad_norm": 0.7920854091644287, "learning_rate": 0.0001836619718309859, "loss": 1.047, "step": 558 }, { "epoch": 0.08281481481481481, "grad_norm": 1.0525939464569092, "learning_rate": 0.00018363232023721276, "loss": 1.3214, "step": 559 }, { "epoch": 0.08296296296296296, "grad_norm": 0.9023261666297913, "learning_rate": 0.0001836026686434396, "loss": 1.059, "step": 560 }, { "epoch": 0.08311111111111111, "grad_norm": 1.2181390523910522, "learning_rate": 0.00018357301704966642, "loss": 1.1573, "step": 561 }, { "epoch": 0.08325925925925926, "grad_norm": 1.012555480003357, "learning_rate": 0.00018354336545589326, "loss": 1.2668, "step": 562 }, { "epoch": 0.0834074074074074, "grad_norm": 1.155202865600586, "learning_rate": 0.0001835137138621201, "loss": 1.1001, "step": 563 }, { "epoch": 0.08355555555555555, "grad_norm": 0.9464355707168579, "learning_rate": 0.00018348406226834693, "loss": 1.0413, "step": 564 }, { "epoch": 0.0837037037037037, "grad_norm": 0.8857262134552002, "learning_rate": 0.00018345441067457377, "loss": 1.243, "step": 565 }, { "epoch": 0.08385185185185186, "grad_norm": 0.9362701773643494, "learning_rate": 0.00018342475908080061, "loss": 1.2346, "step": 566 }, { "epoch": 0.084, "grad_norm": 0.7799801826477051, "learning_rate": 0.00018339510748702743, "loss": 0.9897, "step": 567 }, { "epoch": 0.08414814814814815, "grad_norm": 0.8452311754226685, "learning_rate": 0.00018336545589325428, "loss": 1.1353, "step": 568 }, { "epoch": 0.0842962962962963, "grad_norm": 0.8381460309028625, "learning_rate": 0.0001833358042994811, "loss": 1.3495, "step": 569 }, { "epoch": 0.08444444444444445, "grad_norm": 0.8649606108665466, "learning_rate": 0.00018330615270570794, "loss": 1.0649, "step": 570 }, { "epoch": 0.0845925925925926, "grad_norm": 
1.4266537427902222, "learning_rate": 0.00018327650111193478, "loss": 1.2434, "step": 571 }, { "epoch": 0.08474074074074074, "grad_norm": 1.5174586772918701, "learning_rate": 0.0001832468495181616, "loss": 0.9991, "step": 572 }, { "epoch": 0.08488888888888889, "grad_norm": 0.9220426082611084, "learning_rate": 0.00018321719792438844, "loss": 1.1843, "step": 573 }, { "epoch": 0.08503703703703704, "grad_norm": 1.0861490964889526, "learning_rate": 0.0001831875463306153, "loss": 1.1983, "step": 574 }, { "epoch": 0.08518518518518518, "grad_norm": 1.193626046180725, "learning_rate": 0.0001831578947368421, "loss": 1.0819, "step": 575 }, { "epoch": 0.08533333333333333, "grad_norm": 1.0166492462158203, "learning_rate": 0.00018312824314306895, "loss": 1.1051, "step": 576 }, { "epoch": 0.08548148148148148, "grad_norm": 1.6874605417251587, "learning_rate": 0.0001830985915492958, "loss": 1.3142, "step": 577 }, { "epoch": 0.08562962962962962, "grad_norm": 0.8959518074989319, "learning_rate": 0.0001830689399555226, "loss": 1.1314, "step": 578 }, { "epoch": 0.08577777777777777, "grad_norm": 0.89152592420578, "learning_rate": 0.00018303928836174946, "loss": 1.1101, "step": 579 }, { "epoch": 0.08592592592592592, "grad_norm": 0.9080824851989746, "learning_rate": 0.0001830096367679763, "loss": 1.3752, "step": 580 }, { "epoch": 0.08607407407407408, "grad_norm": 1.0774247646331787, "learning_rate": 0.00018297998517420312, "loss": 1.1069, "step": 581 }, { "epoch": 0.08622222222222223, "grad_norm": 0.9702334403991699, "learning_rate": 0.00018295033358042996, "loss": 0.9372, "step": 582 }, { "epoch": 0.08637037037037038, "grad_norm": 1.751625895500183, "learning_rate": 0.0001829206819866568, "loss": 1.1275, "step": 583 }, { "epoch": 0.08651851851851852, "grad_norm": 1.5056380033493042, "learning_rate": 0.00018289103039288363, "loss": 1.2005, "step": 584 }, { "epoch": 0.08666666666666667, "grad_norm": 0.9727258682250977, "learning_rate": 0.00018286137879911044, "loss": 1.2625, "step": 585 }, 
{ "epoch": 0.08681481481481482, "grad_norm": 1.2621508836746216, "learning_rate": 0.0001828317272053373, "loss": 1.114, "step": 586 }, { "epoch": 0.08696296296296296, "grad_norm": 1.268631100654602, "learning_rate": 0.00018280207561156413, "loss": 1.4013, "step": 587 }, { "epoch": 0.08711111111111111, "grad_norm": 1.8268654346466064, "learning_rate": 0.00018277242401779095, "loss": 1.251, "step": 588 }, { "epoch": 0.08725925925925926, "grad_norm": 1.0079928636550903, "learning_rate": 0.0001827427724240178, "loss": 1.4428, "step": 589 }, { "epoch": 0.0874074074074074, "grad_norm": 4.245646953582764, "learning_rate": 0.00018271312083024464, "loss": 1.2148, "step": 590 }, { "epoch": 0.08755555555555555, "grad_norm": 0.7979462742805481, "learning_rate": 0.00018268346923647146, "loss": 1.1625, "step": 591 }, { "epoch": 0.0877037037037037, "grad_norm": 0.8092358112335205, "learning_rate": 0.0001826538176426983, "loss": 1.3047, "step": 592 }, { "epoch": 0.08785185185185185, "grad_norm": 1.1461204290390015, "learning_rate": 0.00018262416604892514, "loss": 1.2695, "step": 593 }, { "epoch": 0.088, "grad_norm": 0.840783417224884, "learning_rate": 0.00018259451445515196, "loss": 1.1151, "step": 594 }, { "epoch": 0.08814814814814814, "grad_norm": 1.8119466304779053, "learning_rate": 0.0001825648628613788, "loss": 1.102, "step": 595 }, { "epoch": 0.08829629629629629, "grad_norm": 0.9149336814880371, "learning_rate": 0.00018253521126760565, "loss": 1.3933, "step": 596 }, { "epoch": 0.08844444444444445, "grad_norm": 1.2133201360702515, "learning_rate": 0.00018250555967383247, "loss": 1.5016, "step": 597 }, { "epoch": 0.0885925925925926, "grad_norm": 0.9949682950973511, "learning_rate": 0.0001824759080800593, "loss": 1.0059, "step": 598 }, { "epoch": 0.08874074074074075, "grad_norm": 0.8984311819076538, "learning_rate": 0.00018244625648628616, "loss": 1.3352, "step": 599 }, { "epoch": 0.08888888888888889, "grad_norm": 0.94035804271698, "learning_rate": 0.00018241660489251297, 
"loss": 1.2192, "step": 600 }, { "epoch": 0.08903703703703704, "grad_norm": 1.6803605556488037, "learning_rate": 0.00018238695329873982, "loss": 1.5195, "step": 601 }, { "epoch": 0.08918518518518519, "grad_norm": 0.9661155939102173, "learning_rate": 0.00018235730170496666, "loss": 1.1799, "step": 602 }, { "epoch": 0.08933333333333333, "grad_norm": 0.8699297904968262, "learning_rate": 0.00018232765011119348, "loss": 1.2836, "step": 603 }, { "epoch": 0.08948148148148148, "grad_norm": 1.2877745628356934, "learning_rate": 0.0001822979985174203, "loss": 1.3247, "step": 604 }, { "epoch": 0.08962962962962963, "grad_norm": 0.9368343949317932, "learning_rate": 0.00018226834692364717, "loss": 1.1784, "step": 605 }, { "epoch": 0.08977777777777778, "grad_norm": 1.4999383687973022, "learning_rate": 0.00018223869532987399, "loss": 1.0687, "step": 606 }, { "epoch": 0.08992592592592592, "grad_norm": 1.1466591358184814, "learning_rate": 0.0001822090437361008, "loss": 1.1705, "step": 607 }, { "epoch": 0.09007407407407407, "grad_norm": 0.9255340695381165, "learning_rate": 0.00018217939214232768, "loss": 1.1218, "step": 608 }, { "epoch": 0.09022222222222222, "grad_norm": 0.883299708366394, "learning_rate": 0.0001821497405485545, "loss": 1.1096, "step": 609 }, { "epoch": 0.09037037037037036, "grad_norm": 1.1371686458587646, "learning_rate": 0.0001821200889547813, "loss": 1.0435, "step": 610 }, { "epoch": 0.09051851851851851, "grad_norm": 1.531836748123169, "learning_rate": 0.00018209043736100818, "loss": 0.9625, "step": 611 }, { "epoch": 0.09066666666666667, "grad_norm": 1.1650729179382324, "learning_rate": 0.000182060785767235, "loss": 1.0427, "step": 612 }, { "epoch": 0.09081481481481482, "grad_norm": 0.9275507926940918, "learning_rate": 0.00018203113417346182, "loss": 1.2767, "step": 613 }, { "epoch": 0.09096296296296297, "grad_norm": 2.034724473953247, "learning_rate": 0.0001820014825796887, "loss": 1.068, "step": 614 }, { "epoch": 0.09111111111111111, "grad_norm": 
0.7717260122299194, "learning_rate": 0.0001819718309859155, "loss": 1.072, "step": 615 }, { "epoch": 0.09125925925925926, "grad_norm": 0.9517045021057129, "learning_rate": 0.00018194217939214232, "loss": 0.9528, "step": 616 }, { "epoch": 0.09140740740740741, "grad_norm": 0.9404609799385071, "learning_rate": 0.0001819125277983692, "loss": 1.0691, "step": 617 }, { "epoch": 0.09155555555555556, "grad_norm": 1.0251749753952026, "learning_rate": 0.000181882876204596, "loss": 1.2236, "step": 618 }, { "epoch": 0.0917037037037037, "grad_norm": 0.8374236226081848, "learning_rate": 0.00018185322461082283, "loss": 0.8373, "step": 619 }, { "epoch": 0.09185185185185185, "grad_norm": 2.0672378540039062, "learning_rate": 0.00018182357301704967, "loss": 1.3187, "step": 620 }, { "epoch": 0.092, "grad_norm": 1.762125849723816, "learning_rate": 0.00018179392142327652, "loss": 1.3841, "step": 621 }, { "epoch": 0.09214814814814815, "grad_norm": 0.9544039964675903, "learning_rate": 0.00018176426982950334, "loss": 1.2255, "step": 622 }, { "epoch": 0.09229629629629629, "grad_norm": 1.0860487222671509, "learning_rate": 0.00018173461823573018, "loss": 1.3558, "step": 623 }, { "epoch": 0.09244444444444444, "grad_norm": 1.07772958278656, "learning_rate": 0.00018170496664195702, "loss": 1.3927, "step": 624 }, { "epoch": 0.09259259259259259, "grad_norm": 0.7878835201263428, "learning_rate": 0.00018167531504818384, "loss": 1.2057, "step": 625 }, { "epoch": 0.09274074074074073, "grad_norm": 1.000695824623108, "learning_rate": 0.00018164566345441069, "loss": 1.0641, "step": 626 }, { "epoch": 0.09288888888888888, "grad_norm": 1.0378655195236206, "learning_rate": 0.00018161601186063753, "loss": 1.1221, "step": 627 }, { "epoch": 0.09303703703703704, "grad_norm": 1.1828982830047607, "learning_rate": 0.00018158636026686435, "loss": 1.1044, "step": 628 }, { "epoch": 0.09318518518518519, "grad_norm": 0.841529905796051, "learning_rate": 0.0001815567086730912, "loss": 1.0851, "step": 629 }, { "epoch": 
0.09333333333333334, "grad_norm": 0.9000698924064636, "learning_rate": 0.00018152705707931804, "loss": 1.2912, "step": 630 }, { "epoch": 0.09348148148148148, "grad_norm": 1.4891201257705688, "learning_rate": 0.00018149740548554485, "loss": 1.1035, "step": 631 }, { "epoch": 0.09362962962962963, "grad_norm": 0.7513223886489868, "learning_rate": 0.0001814677538917717, "loss": 1.0536, "step": 632 }, { "epoch": 0.09377777777777778, "grad_norm": 0.941815197467804, "learning_rate": 0.00018143810229799854, "loss": 1.0365, "step": 633 }, { "epoch": 0.09392592592592593, "grad_norm": 1.2023797035217285, "learning_rate": 0.00018140845070422536, "loss": 1.4159, "step": 634 }, { "epoch": 0.09407407407407407, "grad_norm": 1.007938027381897, "learning_rate": 0.00018137879911045218, "loss": 1.357, "step": 635 }, { "epoch": 0.09422222222222222, "grad_norm": 1.2334319353103638, "learning_rate": 0.00018134914751667905, "loss": 0.8031, "step": 636 }, { "epoch": 0.09437037037037037, "grad_norm": 1.6498353481292725, "learning_rate": 0.00018131949592290587, "loss": 1.4231, "step": 637 }, { "epoch": 0.09451851851851852, "grad_norm": 0.720307469367981, "learning_rate": 0.00018128984432913268, "loss": 0.8953, "step": 638 }, { "epoch": 0.09466666666666666, "grad_norm": 1.5557289123535156, "learning_rate": 0.00018126019273535953, "loss": 1.2389, "step": 639 }, { "epoch": 0.09481481481481481, "grad_norm": 1.052612543106079, "learning_rate": 0.00018123054114158637, "loss": 1.2308, "step": 640 }, { "epoch": 0.09496296296296296, "grad_norm": 0.9883959293365479, "learning_rate": 0.0001812008895478132, "loss": 1.1262, "step": 641 }, { "epoch": 0.0951111111111111, "grad_norm": 0.9268585443496704, "learning_rate": 0.00018117123795404003, "loss": 1.2834, "step": 642 }, { "epoch": 0.09525925925925927, "grad_norm": 1.0678269863128662, "learning_rate": 0.00018114158636026688, "loss": 1.1876, "step": 643 }, { "epoch": 0.09540740740740741, "grad_norm": 0.8583171963691711, "learning_rate": 
0.0001811119347664937, "loss": 1.1843, "step": 644 }, { "epoch": 0.09555555555555556, "grad_norm": 1.6480978727340698, "learning_rate": 0.00018108228317272054, "loss": 1.1791, "step": 645 }, { "epoch": 0.09570370370370371, "grad_norm": 0.9286375641822815, "learning_rate": 0.00018105263157894739, "loss": 1.0981, "step": 646 }, { "epoch": 0.09585185185185185, "grad_norm": 1.2206205129623413, "learning_rate": 0.0001810229799851742, "loss": 1.0437, "step": 647 }, { "epoch": 0.096, "grad_norm": 1.1012060642242432, "learning_rate": 0.00018099332839140105, "loss": 1.6352, "step": 648 }, { "epoch": 0.09614814814814815, "grad_norm": 0.9242655038833618, "learning_rate": 0.0001809636767976279, "loss": 1.3694, "step": 649 }, { "epoch": 0.0962962962962963, "grad_norm": 1.2646676301956177, "learning_rate": 0.0001809340252038547, "loss": 1.0467, "step": 650 }, { "epoch": 0.09644444444444444, "grad_norm": 1.1692533493041992, "learning_rate": 0.00018090437361008155, "loss": 1.2235, "step": 651 }, { "epoch": 0.09659259259259259, "grad_norm": 0.8595184087753296, "learning_rate": 0.0001808747220163084, "loss": 1.6275, "step": 652 }, { "epoch": 0.09674074074074074, "grad_norm": 1.010973572731018, "learning_rate": 0.00018084507042253522, "loss": 1.2827, "step": 653 }, { "epoch": 0.09688888888888889, "grad_norm": 0.9244717359542847, "learning_rate": 0.00018081541882876206, "loss": 0.8571, "step": 654 }, { "epoch": 0.09703703703703703, "grad_norm": 0.7265779376029968, "learning_rate": 0.00018078576723498888, "loss": 0.8999, "step": 655 }, { "epoch": 0.09718518518518518, "grad_norm": 0.7790964245796204, "learning_rate": 0.00018075611564121572, "loss": 1.0935, "step": 656 }, { "epoch": 0.09733333333333333, "grad_norm": 0.9619722962379456, "learning_rate": 0.00018072646404744257, "loss": 1.0339, "step": 657 }, { "epoch": 0.09748148148148149, "grad_norm": 1.0757215023040771, "learning_rate": 0.00018069681245366938, "loss": 1.1048, "step": 658 }, { "epoch": 0.09762962962962964, "grad_norm": 
1.017087459564209, "learning_rate": 0.00018066716085989623, "loss": 1.0618, "step": 659 }, { "epoch": 0.09777777777777778, "grad_norm": 1.15996515750885, "learning_rate": 0.00018063750926612307, "loss": 1.4437, "step": 660 }, { "epoch": 0.09792592592592593, "grad_norm": 1.0291752815246582, "learning_rate": 0.0001806078576723499, "loss": 1.3827, "step": 661 }, { "epoch": 0.09807407407407408, "grad_norm": 1.2072279453277588, "learning_rate": 0.00018057820607857673, "loss": 1.263, "step": 662 }, { "epoch": 0.09822222222222222, "grad_norm": 0.8682127594947815, "learning_rate": 0.00018054855448480358, "loss": 1.225, "step": 663 }, { "epoch": 0.09837037037037037, "grad_norm": 1.5121533870697021, "learning_rate": 0.0001805189028910304, "loss": 1.0637, "step": 664 }, { "epoch": 0.09851851851851852, "grad_norm": 1.2017015218734741, "learning_rate": 0.00018048925129725724, "loss": 1.348, "step": 665 }, { "epoch": 0.09866666666666667, "grad_norm": 0.8419830203056335, "learning_rate": 0.00018045959970348408, "loss": 1.0992, "step": 666 }, { "epoch": 0.09881481481481481, "grad_norm": 1.1485445499420166, "learning_rate": 0.0001804299481097109, "loss": 1.0252, "step": 667 }, { "epoch": 0.09896296296296296, "grad_norm": 1.7020227909088135, "learning_rate": 0.00018040029651593775, "loss": 1.1653, "step": 668 }, { "epoch": 0.09911111111111111, "grad_norm": 1.0807623863220215, "learning_rate": 0.0001803706449221646, "loss": 0.9006, "step": 669 }, { "epoch": 0.09925925925925926, "grad_norm": 1.100339651107788, "learning_rate": 0.0001803409933283914, "loss": 1.2011, "step": 670 }, { "epoch": 0.0994074074074074, "grad_norm": 1.3941802978515625, "learning_rate": 0.00018031134173461825, "loss": 1.3815, "step": 671 }, { "epoch": 0.09955555555555555, "grad_norm": 0.8945662379264832, "learning_rate": 0.00018028169014084507, "loss": 1.2971, "step": 672 }, { "epoch": 0.0997037037037037, "grad_norm": 0.9066154360771179, "learning_rate": 0.00018025203854707191, "loss": 1.4333, "step": 673 }, { 
"epoch": 0.09985185185185186, "grad_norm": 1.3932311534881592, "learning_rate": 0.00018022238695329873, "loss": 1.3624, "step": 674 }, { "epoch": 0.1, "grad_norm": 1.272756814956665, "learning_rate": 0.00018019273535952558, "loss": 1.5097, "step": 675 }, { "epoch": 0.10014814814814815, "grad_norm": 1.024742841720581, "learning_rate": 0.00018016308376575242, "loss": 1.1012, "step": 676 }, { "epoch": 0.1002962962962963, "grad_norm": 1.3817062377929688, "learning_rate": 0.00018013343217197924, "loss": 1.1021, "step": 677 }, { "epoch": 0.10044444444444445, "grad_norm": 1.0953186750411987, "learning_rate": 0.00018010378057820608, "loss": 1.2881, "step": 678 }, { "epoch": 0.1005925925925926, "grad_norm": 1.1058462858200073, "learning_rate": 0.00018007412898443293, "loss": 1.1268, "step": 679 }, { "epoch": 0.10074074074074074, "grad_norm": 0.9799386858940125, "learning_rate": 0.00018004447739065974, "loss": 1.0524, "step": 680 }, { "epoch": 0.10088888888888889, "grad_norm": 4.252458572387695, "learning_rate": 0.0001800148257968866, "loss": 1.0699, "step": 681 }, { "epoch": 0.10103703703703704, "grad_norm": 0.836798369884491, "learning_rate": 0.00017998517420311343, "loss": 0.8132, "step": 682 }, { "epoch": 0.10118518518518518, "grad_norm": 1.1359460353851318, "learning_rate": 0.00017995552260934025, "loss": 1.1005, "step": 683 }, { "epoch": 0.10133333333333333, "grad_norm": 0.8894252181053162, "learning_rate": 0.0001799258710155671, "loss": 1.0666, "step": 684 }, { "epoch": 0.10148148148148148, "grad_norm": 1.1909353733062744, "learning_rate": 0.00017989621942179394, "loss": 1.3341, "step": 685 }, { "epoch": 0.10162962962962963, "grad_norm": 1.236089825630188, "learning_rate": 0.00017986656782802076, "loss": 1.0689, "step": 686 }, { "epoch": 0.10177777777777777, "grad_norm": 1.7130799293518066, "learning_rate": 0.0001798369162342476, "loss": 0.9953, "step": 687 }, { "epoch": 0.10192592592592592, "grad_norm": 1.1958650350570679, "learning_rate": 0.00017980726464047445, 
"loss": 1.2324, "step": 688 }, { "epoch": 0.10207407407407408, "grad_norm": 1.11331045627594, "learning_rate": 0.00017977761304670126, "loss": 1.1123, "step": 689 }, { "epoch": 0.10222222222222223, "grad_norm": 1.2462921142578125, "learning_rate": 0.00017974796145292808, "loss": 1.1015, "step": 690 }, { "epoch": 0.10237037037037038, "grad_norm": 1.2919429540634155, "learning_rate": 0.00017971830985915495, "loss": 1.0648, "step": 691 }, { "epoch": 0.10251851851851852, "grad_norm": 1.6416014432907104, "learning_rate": 0.00017968865826538177, "loss": 1.2218, "step": 692 }, { "epoch": 0.10266666666666667, "grad_norm": 1.135933756828308, "learning_rate": 0.0001796590066716086, "loss": 1.2607, "step": 693 }, { "epoch": 0.10281481481481482, "grad_norm": 1.9447219371795654, "learning_rate": 0.00017962935507783546, "loss": 1.0926, "step": 694 }, { "epoch": 0.10296296296296296, "grad_norm": 2.0542564392089844, "learning_rate": 0.00017959970348406228, "loss": 1.1081, "step": 695 }, { "epoch": 0.10311111111111111, "grad_norm": 1.4082337617874146, "learning_rate": 0.0001795700518902891, "loss": 1.3908, "step": 696 }, { "epoch": 0.10325925925925926, "grad_norm": 1.6165364980697632, "learning_rate": 0.00017954040029651596, "loss": 1.2273, "step": 697 }, { "epoch": 0.1034074074074074, "grad_norm": 0.9326266646385193, "learning_rate": 0.00017951074870274278, "loss": 0.9182, "step": 698 }, { "epoch": 0.10355555555555555, "grad_norm": 2.343601942062378, "learning_rate": 0.0001794810971089696, "loss": 0.9752, "step": 699 }, { "epoch": 0.1037037037037037, "grad_norm": 1.147686243057251, "learning_rate": 0.00017945144551519647, "loss": 1.0687, "step": 700 }, { "epoch": 0.10385185185185185, "grad_norm": 1.8303850889205933, "learning_rate": 0.0001794217939214233, "loss": 0.998, "step": 701 }, { "epoch": 0.104, "grad_norm": 1.1002910137176514, "learning_rate": 0.0001793921423276501, "loss": 1.1393, "step": 702 }, { "epoch": 0.10414814814814814, "grad_norm": 1.217954397201538, 
"learning_rate": 0.00017936249073387698, "loss": 1.1456, "step": 703 }, { "epoch": 0.10429629629629629, "grad_norm": 0.9953997135162354, "learning_rate": 0.0001793328391401038, "loss": 1.1105, "step": 704 }, { "epoch": 0.10444444444444445, "grad_norm": 0.8896104097366333, "learning_rate": 0.0001793031875463306, "loss": 1.234, "step": 705 }, { "epoch": 0.1045925925925926, "grad_norm": 1.017242670059204, "learning_rate": 0.00017927353595255746, "loss": 1.0124, "step": 706 }, { "epoch": 0.10474074074074075, "grad_norm": 1.1732721328735352, "learning_rate": 0.0001792438843587843, "loss": 1.109, "step": 707 }, { "epoch": 0.10488888888888889, "grad_norm": 1.5762159824371338, "learning_rate": 0.00017921423276501112, "loss": 1.0034, "step": 708 }, { "epoch": 0.10503703703703704, "grad_norm": 0.991986870765686, "learning_rate": 0.00017918458117123796, "loss": 1.1069, "step": 709 }, { "epoch": 0.10518518518518519, "grad_norm": 0.9856958389282227, "learning_rate": 0.0001791549295774648, "loss": 1.2329, "step": 710 }, { "epoch": 0.10533333333333333, "grad_norm": 1.2896099090576172, "learning_rate": 0.00017912527798369162, "loss": 1.1182, "step": 711 }, { "epoch": 0.10548148148148148, "grad_norm": 1.8374145030975342, "learning_rate": 0.00017909562638991847, "loss": 1.1333, "step": 712 }, { "epoch": 0.10562962962962963, "grad_norm": 1.0625301599502563, "learning_rate": 0.00017906597479614531, "loss": 1.1764, "step": 713 }, { "epoch": 0.10577777777777778, "grad_norm": 1.678533673286438, "learning_rate": 0.00017903632320237213, "loss": 1.1756, "step": 714 }, { "epoch": 0.10592592592592592, "grad_norm": 1.2041304111480713, "learning_rate": 0.00017900667160859898, "loss": 1.1431, "step": 715 }, { "epoch": 0.10607407407407407, "grad_norm": 1.6350512504577637, "learning_rate": 0.00017897702001482582, "loss": 0.9288, "step": 716 }, { "epoch": 0.10622222222222222, "grad_norm": 0.8590714931488037, "learning_rate": 0.00017894736842105264, "loss": 0.9684, "step": 717 }, { "epoch": 
0.10637037037037037, "grad_norm": 1.4090867042541504, "learning_rate": 0.00017891771682727948, "loss": 1.0675, "step": 718 }, { "epoch": 0.10651851851851851, "grad_norm": 0.9573653340339661, "learning_rate": 0.00017888806523350633, "loss": 1.3571, "step": 719 }, { "epoch": 0.10666666666666667, "grad_norm": 1.591746211051941, "learning_rate": 0.00017885841363973314, "loss": 1.2832, "step": 720 }, { "epoch": 0.10681481481481482, "grad_norm": 1.0578999519348145, "learning_rate": 0.00017882876204595996, "loss": 1.3521, "step": 721 }, { "epoch": 0.10696296296296297, "grad_norm": 1.2212194204330444, "learning_rate": 0.00017879911045218683, "loss": 1.5566, "step": 722 }, { "epoch": 0.10711111111111112, "grad_norm": 0.9653564691543579, "learning_rate": 0.00017876945885841365, "loss": 1.3195, "step": 723 }, { "epoch": 0.10725925925925926, "grad_norm": 1.4364689588546753, "learning_rate": 0.00017873980726464047, "loss": 1.2876, "step": 724 }, { "epoch": 0.10740740740740741, "grad_norm": 1.106291651725769, "learning_rate": 0.0001787101556708673, "loss": 1.4856, "step": 725 }, { "epoch": 0.10755555555555556, "grad_norm": 1.1380616426467896, "learning_rate": 0.00017868050407709416, "loss": 1.2512, "step": 726 }, { "epoch": 0.1077037037037037, "grad_norm": 1.0685898065567017, "learning_rate": 0.00017865085248332097, "loss": 1.2495, "step": 727 }, { "epoch": 0.10785185185185185, "grad_norm": 1.0301077365875244, "learning_rate": 0.00017862120088954782, "loss": 1.17, "step": 728 }, { "epoch": 0.108, "grad_norm": 1.0483813285827637, "learning_rate": 0.00017859154929577466, "loss": 0.995, "step": 729 }, { "epoch": 0.10814814814814815, "grad_norm": 0.8973188400268555, "learning_rate": 0.00017856189770200148, "loss": 1.1569, "step": 730 }, { "epoch": 0.1082962962962963, "grad_norm": 0.9379794597625732, "learning_rate": 0.00017853224610822832, "loss": 1.2935, "step": 731 }, { "epoch": 0.10844444444444444, "grad_norm": 1.0541515350341797, "learning_rate": 0.00017850259451445517, "loss": 
1.3934, "step": 732 }, { "epoch": 0.10859259259259259, "grad_norm": 0.9139310121536255, "learning_rate": 0.00017847294292068199, "loss": 0.9618, "step": 733 }, { "epoch": 0.10874074074074074, "grad_norm": 2.54887056350708, "learning_rate": 0.00017844329132690883, "loss": 1.1157, "step": 734 }, { "epoch": 0.10888888888888888, "grad_norm": 1.2888338565826416, "learning_rate": 0.00017841363973313567, "loss": 1.1304, "step": 735 }, { "epoch": 0.10903703703703704, "grad_norm": 1.361585259437561, "learning_rate": 0.0001783839881393625, "loss": 1.116, "step": 736 }, { "epoch": 0.10918518518518519, "grad_norm": 1.1790413856506348, "learning_rate": 0.00017835433654558934, "loss": 0.9679, "step": 737 }, { "epoch": 0.10933333333333334, "grad_norm": 0.9589934349060059, "learning_rate": 0.00017832468495181618, "loss": 1.2085, "step": 738 }, { "epoch": 0.10948148148148149, "grad_norm": 1.2599700689315796, "learning_rate": 0.000178295033358043, "loss": 1.2303, "step": 739 }, { "epoch": 0.10962962962962963, "grad_norm": 2.133882761001587, "learning_rate": 0.00017826538176426984, "loss": 1.2466, "step": 740 }, { "epoch": 0.10977777777777778, "grad_norm": 1.2228134870529175, "learning_rate": 0.00017823573017049666, "loss": 1.4695, "step": 741 }, { "epoch": 0.10992592592592593, "grad_norm": 1.8407241106033325, "learning_rate": 0.0001782060785767235, "loss": 0.9995, "step": 742 }, { "epoch": 0.11007407407407407, "grad_norm": 1.3531813621520996, "learning_rate": 0.00017817642698295035, "loss": 1.2148, "step": 743 }, { "epoch": 0.11022222222222222, "grad_norm": 1.0597742795944214, "learning_rate": 0.00017814677538917717, "loss": 1.2799, "step": 744 }, { "epoch": 0.11037037037037037, "grad_norm": 1.049060583114624, "learning_rate": 0.000178117123795404, "loss": 0.9831, "step": 745 }, { "epoch": 0.11051851851851852, "grad_norm": 1.0642156600952148, "learning_rate": 0.00017808747220163086, "loss": 1.2659, "step": 746 }, { "epoch": 0.11066666666666666, "grad_norm": 1.0166077613830566, 
"learning_rate": 0.00017805782060785767, "loss": 1.1209, "step": 747 }, { "epoch": 0.11081481481481481, "grad_norm": 1.0508230924606323, "learning_rate": 0.00017802816901408452, "loss": 1.1257, "step": 748 }, { "epoch": 0.11096296296296296, "grad_norm": 1.1801953315734863, "learning_rate": 0.00017799851742031136, "loss": 1.2034, "step": 749 }, { "epoch": 0.1111111111111111, "grad_norm": 1.217050552368164, "learning_rate": 0.00017796886582653818, "loss": 1.1783, "step": 750 }, { "epoch": 0.11125925925925927, "grad_norm": 1.0050989389419556, "learning_rate": 0.00017793921423276502, "loss": 0.973, "step": 751 }, { "epoch": 0.11140740740740741, "grad_norm": 1.00999116897583, "learning_rate": 0.00017790956263899187, "loss": 1.0116, "step": 752 }, { "epoch": 0.11155555555555556, "grad_norm": 1.4370405673980713, "learning_rate": 0.00017787991104521869, "loss": 1.1189, "step": 753 }, { "epoch": 0.11170370370370371, "grad_norm": 1.6145302057266235, "learning_rate": 0.00017785025945144553, "loss": 1.2394, "step": 754 }, { "epoch": 0.11185185185185186, "grad_norm": 1.486937165260315, "learning_rate": 0.00017782060785767237, "loss": 1.2607, "step": 755 }, { "epoch": 0.112, "grad_norm": 1.037169098854065, "learning_rate": 0.0001777909562638992, "loss": 1.1249, "step": 756 }, { "epoch": 0.11214814814814815, "grad_norm": 1.543372631072998, "learning_rate": 0.00017776130467012604, "loss": 1.1736, "step": 757 }, { "epoch": 0.1122962962962963, "grad_norm": 1.5751320123672485, "learning_rate": 0.00017773165307635285, "loss": 1.3599, "step": 758 }, { "epoch": 0.11244444444444444, "grad_norm": 0.9869830012321472, "learning_rate": 0.0001777020014825797, "loss": 1.2253, "step": 759 }, { "epoch": 0.11259259259259259, "grad_norm": 1.6567195653915405, "learning_rate": 0.00017767234988880652, "loss": 1.4151, "step": 760 }, { "epoch": 0.11274074074074074, "grad_norm": 1.5292116403579712, "learning_rate": 0.00017764269829503336, "loss": 1.2516, "step": 761 }, { "epoch": 0.11288888888888889, 
"grad_norm": 2.383601188659668, "learning_rate": 0.0001776130467012602, "loss": 1.2525, "step": 762 }, { "epoch": 0.11303703703703703, "grad_norm": 1.6173779964447021, "learning_rate": 0.00017758339510748702, "loss": 1.326, "step": 763 }, { "epoch": 0.11318518518518518, "grad_norm": 1.1858290433883667, "learning_rate": 0.00017755374351371387, "loss": 1.2835, "step": 764 }, { "epoch": 0.11333333333333333, "grad_norm": 1.27888023853302, "learning_rate": 0.0001775240919199407, "loss": 1.4655, "step": 765 }, { "epoch": 0.11348148148148147, "grad_norm": 4.245856761932373, "learning_rate": 0.00017749444032616753, "loss": 1.2797, "step": 766 }, { "epoch": 0.11362962962962964, "grad_norm": 1.5455471277236938, "learning_rate": 0.00017746478873239437, "loss": 1.1448, "step": 767 }, { "epoch": 0.11377777777777778, "grad_norm": 1.727730631828308, "learning_rate": 0.00017743513713862122, "loss": 1.055, "step": 768 }, { "epoch": 0.11392592592592593, "grad_norm": 2.2069005966186523, "learning_rate": 0.00017740548554484803, "loss": 1.1359, "step": 769 }, { "epoch": 0.11407407407407408, "grad_norm": 2.4828689098358154, "learning_rate": 0.00017737583395107488, "loss": 1.1874, "step": 770 }, { "epoch": 0.11422222222222222, "grad_norm": 2.375779867172241, "learning_rate": 0.00017734618235730172, "loss": 1.2403, "step": 771 }, { "epoch": 0.11437037037037037, "grad_norm": 2.210402250289917, "learning_rate": 0.00017731653076352854, "loss": 1.0164, "step": 772 }, { "epoch": 0.11451851851851852, "grad_norm": 1.034450888633728, "learning_rate": 0.00017728687916975538, "loss": 1.0978, "step": 773 }, { "epoch": 0.11466666666666667, "grad_norm": 2.2124218940734863, "learning_rate": 0.00017725722757598223, "loss": 1.1375, "step": 774 }, { "epoch": 0.11481481481481481, "grad_norm": 2.627687931060791, "learning_rate": 0.00017722757598220905, "loss": 1.1139, "step": 775 }, { "epoch": 0.11496296296296296, "grad_norm": 1.6046452522277832, "learning_rate": 0.00017719792438843586, "loss": 1.4081, 
"step": 776 }, { "epoch": 0.11511111111111111, "grad_norm": 2.07920241355896, "learning_rate": 0.00017716827279466274, "loss": 0.9552, "step": 777 }, { "epoch": 0.11525925925925926, "grad_norm": 2.331041097640991, "learning_rate": 0.00017713862120088955, "loss": 1.396, "step": 778 }, { "epoch": 0.1154074074074074, "grad_norm": 4.453252792358398, "learning_rate": 0.00017710896960711637, "loss": 1.0503, "step": 779 }, { "epoch": 0.11555555555555555, "grad_norm": 2.3019750118255615, "learning_rate": 0.00017707931801334324, "loss": 0.9527, "step": 780 }, { "epoch": 0.1157037037037037, "grad_norm": 1.7517541646957397, "learning_rate": 0.00017704966641957006, "loss": 1.3174, "step": 781 }, { "epoch": 0.11585185185185186, "grad_norm": 2.5055603981018066, "learning_rate": 0.00017702001482579688, "loss": 1.0664, "step": 782 }, { "epoch": 0.116, "grad_norm": 1.181646466255188, "learning_rate": 0.00017699036323202375, "loss": 1.2119, "step": 783 }, { "epoch": 0.11614814814814815, "grad_norm": 1.1410211324691772, "learning_rate": 0.00017696071163825057, "loss": 1.232, "step": 784 }, { "epoch": 0.1162962962962963, "grad_norm": 1.6544922590255737, "learning_rate": 0.00017693106004447738, "loss": 1.1537, "step": 785 }, { "epoch": 0.11644444444444445, "grad_norm": 1.8878635168075562, "learning_rate": 0.00017690140845070425, "loss": 1.061, "step": 786 }, { "epoch": 0.1165925925925926, "grad_norm": 1.3190855979919434, "learning_rate": 0.00017687175685693107, "loss": 0.9475, "step": 787 }, { "epoch": 0.11674074074074074, "grad_norm": 1.0858039855957031, "learning_rate": 0.0001768421052631579, "loss": 1.2793, "step": 788 }, { "epoch": 0.11688888888888889, "grad_norm": 1.147354245185852, "learning_rate": 0.00017681245366938476, "loss": 0.9593, "step": 789 }, { "epoch": 0.11703703703703704, "grad_norm": 1.5271817445755005, "learning_rate": 0.00017678280207561158, "loss": 1.0165, "step": 790 }, { "epoch": 0.11718518518518518, "grad_norm": 2.1167054176330566, "learning_rate": 
0.0001767531504818384, "loss": 1.3867, "step": 791 }, { "epoch": 0.11733333333333333, "grad_norm": 1.4731441736221313, "learning_rate": 0.00017672349888806524, "loss": 1.2066, "step": 792 }, { "epoch": 0.11748148148148148, "grad_norm": 0.9553131461143494, "learning_rate": 0.00017669384729429208, "loss": 1.4145, "step": 793 }, { "epoch": 0.11762962962962963, "grad_norm": 6.962506294250488, "learning_rate": 0.0001766641957005189, "loss": 1.2297, "step": 794 }, { "epoch": 0.11777777777777777, "grad_norm": 1.519862174987793, "learning_rate": 0.00017663454410674575, "loss": 1.1655, "step": 795 }, { "epoch": 0.11792592592592592, "grad_norm": 1.0673969984054565, "learning_rate": 0.0001766048925129726, "loss": 1.1978, "step": 796 }, { "epoch": 0.11807407407407407, "grad_norm": 2.4118587970733643, "learning_rate": 0.0001765752409191994, "loss": 1.2152, "step": 797 }, { "epoch": 0.11822222222222223, "grad_norm": 1.513466477394104, "learning_rate": 0.00017654558932542625, "loss": 0.9631, "step": 798 }, { "epoch": 0.11837037037037038, "grad_norm": 1.4019991159439087, "learning_rate": 0.0001765159377316531, "loss": 1.0298, "step": 799 }, { "epoch": 0.11851851851851852, "grad_norm": 1.4815711975097656, "learning_rate": 0.00017648628613787991, "loss": 1.1588, "step": 800 }, { "epoch": 0.11866666666666667, "grad_norm": 2.318873882293701, "learning_rate": 0.00017645663454410676, "loss": 1.4133, "step": 801 }, { "epoch": 0.11881481481481482, "grad_norm": 1.6941622495651245, "learning_rate": 0.0001764269829503336, "loss": 1.1509, "step": 802 }, { "epoch": 0.11896296296296296, "grad_norm": 0.9023726582527161, "learning_rate": 0.00017639733135656042, "loss": 1.1994, "step": 803 }, { "epoch": 0.11911111111111111, "grad_norm": 2.4571475982666016, "learning_rate": 0.00017636767976278727, "loss": 1.0628, "step": 804 }, { "epoch": 0.11925925925925926, "grad_norm": 3.198491334915161, "learning_rate": 0.0001763380281690141, "loss": 1.0631, "step": 805 }, { "epoch": 0.1194074074074074, 
"grad_norm": 1.126410722732544, "learning_rate": 0.00017630837657524093, "loss": 1.2265, "step": 806 }, { "epoch": 0.11955555555555555, "grad_norm": 1.1916186809539795, "learning_rate": 0.00017627872498146774, "loss": 1.1347, "step": 807 }, { "epoch": 0.1197037037037037, "grad_norm": 1.1731302738189697, "learning_rate": 0.00017624907338769462, "loss": 0.9825, "step": 808 }, { "epoch": 0.11985185185185185, "grad_norm": 1.3539538383483887, "learning_rate": 0.00017621942179392143, "loss": 1.2031, "step": 809 }, { "epoch": 0.12, "grad_norm": 2.6558806896209717, "learning_rate": 0.00017618977020014825, "loss": 1.0797, "step": 810 }, { "epoch": 0.12014814814814814, "grad_norm": 2.5530829429626465, "learning_rate": 0.0001761601186063751, "loss": 1.0951, "step": 811 }, { "epoch": 0.12029629629629629, "grad_norm": 1.3450182676315308, "learning_rate": 0.00017613046701260194, "loss": 1.3404, "step": 812 }, { "epoch": 0.12044444444444445, "grad_norm": 1.4826587438583374, "learning_rate": 0.00017610081541882876, "loss": 1.3364, "step": 813 }, { "epoch": 0.1205925925925926, "grad_norm": 1.238110065460205, "learning_rate": 0.0001760711638250556, "loss": 1.2395, "step": 814 }, { "epoch": 0.12074074074074075, "grad_norm": 1.6360697746276855, "learning_rate": 0.00017604151223128245, "loss": 1.3256, "step": 815 }, { "epoch": 0.12088888888888889, "grad_norm": 2.127264976501465, "learning_rate": 0.00017601186063750926, "loss": 1.2677, "step": 816 }, { "epoch": 0.12103703703703704, "grad_norm": 2.0152833461761475, "learning_rate": 0.0001759822090437361, "loss": 1.1359, "step": 817 }, { "epoch": 0.12118518518518519, "grad_norm": 1.3907021284103394, "learning_rate": 0.00017595255744996295, "loss": 1.1803, "step": 818 }, { "epoch": 0.12133333333333333, "grad_norm": 1.9472498893737793, "learning_rate": 0.00017592290585618977, "loss": 1.1863, "step": 819 }, { "epoch": 0.12148148148148148, "grad_norm": 1.522857666015625, "learning_rate": 0.00017589325426241661, "loss": 1.1596, "step": 820 }, 
{ "epoch": 0.12162962962962963, "grad_norm": 1.0325987339019775, "learning_rate": 0.00017586360266864346, "loss": 0.9308, "step": 821 }, { "epoch": 0.12177777777777778, "grad_norm": 1.7727866172790527, "learning_rate": 0.00017583395107487028, "loss": 1.2795, "step": 822 }, { "epoch": 0.12192592592592592, "grad_norm": 1.6282867193222046, "learning_rate": 0.00017580429948109712, "loss": 1.1801, "step": 823 }, { "epoch": 0.12207407407407407, "grad_norm": 1.6989792585372925, "learning_rate": 0.00017577464788732396, "loss": 1.068, "step": 824 }, { "epoch": 0.12222222222222222, "grad_norm": 1.0644278526306152, "learning_rate": 0.00017574499629355078, "loss": 1.0973, "step": 825 }, { "epoch": 0.12237037037037037, "grad_norm": 0.9775753617286682, "learning_rate": 0.00017571534469977763, "loss": 1.2895, "step": 826 }, { "epoch": 0.12251851851851851, "grad_norm": 1.5573720932006836, "learning_rate": 0.00017568569310600444, "loss": 1.0362, "step": 827 }, { "epoch": 0.12266666666666666, "grad_norm": 1.888784408569336, "learning_rate": 0.0001756560415122313, "loss": 1.2377, "step": 828 }, { "epoch": 0.12281481481481482, "grad_norm": 1.6177492141723633, "learning_rate": 0.00017562638991845813, "loss": 1.2052, "step": 829 }, { "epoch": 0.12296296296296297, "grad_norm": 1.8377748727798462, "learning_rate": 0.00017559673832468495, "loss": 1.3072, "step": 830 }, { "epoch": 0.12311111111111112, "grad_norm": 1.541957139968872, "learning_rate": 0.0001755670867309118, "loss": 1.116, "step": 831 }, { "epoch": 0.12325925925925926, "grad_norm": 1.854828953742981, "learning_rate": 0.00017553743513713864, "loss": 1.1264, "step": 832 }, { "epoch": 0.12340740740740741, "grad_norm": 1.5496182441711426, "learning_rate": 0.00017550778354336546, "loss": 1.0248, "step": 833 }, { "epoch": 0.12355555555555556, "grad_norm": 1.3529809713363647, "learning_rate": 0.0001754781319495923, "loss": 1.0269, "step": 834 }, { "epoch": 0.1237037037037037, "grad_norm": 1.4036271572113037, "learning_rate": 
0.00017544848035581915, "loss": 1.1804, "step": 835 }, { "epoch": 0.12385185185185185, "grad_norm": 1.0726540088653564, "learning_rate": 0.00017541882876204596, "loss": 1.1089, "step": 836 }, { "epoch": 0.124, "grad_norm": 1.2027060985565186, "learning_rate": 0.0001753891771682728, "loss": 1.1218, "step": 837 }, { "epoch": 0.12414814814814815, "grad_norm": 1.0356816053390503, "learning_rate": 0.00017535952557449965, "loss": 1.0432, "step": 838 }, { "epoch": 0.1242962962962963, "grad_norm": 1.271623134613037, "learning_rate": 0.00017532987398072647, "loss": 1.246, "step": 839 }, { "epoch": 0.12444444444444444, "grad_norm": 1.7520036697387695, "learning_rate": 0.0001753002223869533, "loss": 1.1882, "step": 840 }, { "epoch": 0.12459259259259259, "grad_norm": 1.689392328262329, "learning_rate": 0.00017527057079318016, "loss": 1.0743, "step": 841 }, { "epoch": 0.12474074074074074, "grad_norm": 4.215272426605225, "learning_rate": 0.00017524091919940698, "loss": 0.983, "step": 842 }, { "epoch": 0.12488888888888888, "grad_norm": 1.8404631614685059, "learning_rate": 0.00017521126760563382, "loss": 1.0592, "step": 843 }, { "epoch": 0.12503703703703703, "grad_norm": 1.393333077430725, "learning_rate": 0.00017518161601186064, "loss": 1.568, "step": 844 }, { "epoch": 0.12518518518518518, "grad_norm": 1.4102659225463867, "learning_rate": 0.00017515196441808748, "loss": 1.1276, "step": 845 }, { "epoch": 0.12533333333333332, "grad_norm": 1.6933660507202148, "learning_rate": 0.0001751223128243143, "loss": 1.2158, "step": 846 }, { "epoch": 0.12548148148148147, "grad_norm": 2.3846960067749023, "learning_rate": 0.00017509266123054114, "loss": 1.1242, "step": 847 }, { "epoch": 0.12562962962962962, "grad_norm": 1.5024141073226929, "learning_rate": 0.000175063009636768, "loss": 1.1528, "step": 848 }, { "epoch": 0.12577777777777777, "grad_norm": 2.4547317028045654, "learning_rate": 0.0001750333580429948, "loss": 1.0205, "step": 849 }, { "epoch": 0.1259259259259259, "grad_norm": 
0.8433824181556702, "learning_rate": 0.00017500370644922165, "loss": 1.0368, "step": 850 }, { "epoch": 0.12607407407407406, "grad_norm": 1.5638988018035889, "learning_rate": 0.0001749740548554485, "loss": 1.2009, "step": 851 }, { "epoch": 0.12622222222222224, "grad_norm": 2.30916690826416, "learning_rate": 0.0001749444032616753, "loss": 1.1026, "step": 852 }, { "epoch": 0.12637037037037038, "grad_norm": 1.8860763311386108, "learning_rate": 0.00017491475166790216, "loss": 0.9593, "step": 853 }, { "epoch": 0.12651851851851853, "grad_norm": 1.499836802482605, "learning_rate": 0.000174885100074129, "loss": 1.0407, "step": 854 }, { "epoch": 0.12666666666666668, "grad_norm": 1.9133578538894653, "learning_rate": 0.00017485544848035582, "loss": 1.2543, "step": 855 }, { "epoch": 0.12681481481481482, "grad_norm": 1.678727149963379, "learning_rate": 0.00017482579688658266, "loss": 1.0556, "step": 856 }, { "epoch": 0.12696296296296297, "grad_norm": 1.3053762912750244, "learning_rate": 0.0001747961452928095, "loss": 1.057, "step": 857 }, { "epoch": 0.12711111111111112, "grad_norm": 1.5819666385650635, "learning_rate": 0.00017476649369903632, "loss": 1.3151, "step": 858 }, { "epoch": 0.12725925925925927, "grad_norm": 1.497564435005188, "learning_rate": 0.00017473684210526317, "loss": 1.1288, "step": 859 }, { "epoch": 0.1274074074074074, "grad_norm": 1.2003209590911865, "learning_rate": 0.00017470719051149, "loss": 1.2003, "step": 860 }, { "epoch": 0.12755555555555556, "grad_norm": 1.4014747142791748, "learning_rate": 0.00017467753891771683, "loss": 1.0234, "step": 861 }, { "epoch": 0.1277037037037037, "grad_norm": 1.9688879251480103, "learning_rate": 0.00017464788732394365, "loss": 1.0934, "step": 862 }, { "epoch": 0.12785185185185186, "grad_norm": 1.6965707540512085, "learning_rate": 0.00017461823573017052, "loss": 1.2557, "step": 863 }, { "epoch": 0.128, "grad_norm": 2.1013545989990234, "learning_rate": 0.00017458858413639734, "loss": 1.0513, "step": 864 }, { "epoch": 
0.12814814814814815, "grad_norm": 1.8798348903656006, "learning_rate": 0.00017455893254262415, "loss": 1.3548, "step": 865 }, { "epoch": 0.1282962962962963, "grad_norm": 2.6154160499572754, "learning_rate": 0.00017452928094885103, "loss": 1.0156, "step": 866 }, { "epoch": 0.12844444444444444, "grad_norm": 1.7234320640563965, "learning_rate": 0.00017449962935507784, "loss": 1.0047, "step": 867 }, { "epoch": 0.1285925925925926, "grad_norm": 2.064502716064453, "learning_rate": 0.00017446997776130466, "loss": 0.9285, "step": 868 }, { "epoch": 0.12874074074074074, "grad_norm": 1.2932170629501343, "learning_rate": 0.00017444032616753153, "loss": 1.3768, "step": 869 }, { "epoch": 0.1288888888888889, "grad_norm": 1.4662041664123535, "learning_rate": 0.00017441067457375835, "loss": 1.3052, "step": 870 }, { "epoch": 0.12903703703703703, "grad_norm": 2.8357980251312256, "learning_rate": 0.00017438102297998517, "loss": 1.0833, "step": 871 }, { "epoch": 0.12918518518518518, "grad_norm": 1.3351575136184692, "learning_rate": 0.00017435137138621204, "loss": 1.29, "step": 872 }, { "epoch": 0.12933333333333333, "grad_norm": 0.9936057925224304, "learning_rate": 0.00017432171979243886, "loss": 1.1774, "step": 873 }, { "epoch": 0.12948148148148148, "grad_norm": 1.349957823753357, "learning_rate": 0.00017429206819866567, "loss": 1.0534, "step": 874 }, { "epoch": 0.12962962962962962, "grad_norm": 2.102526903152466, "learning_rate": 0.00017426241660489254, "loss": 1.1639, "step": 875 }, { "epoch": 0.12977777777777777, "grad_norm": 2.4069101810455322, "learning_rate": 0.00017423276501111936, "loss": 1.2314, "step": 876 }, { "epoch": 0.12992592592592592, "grad_norm": 2.2288312911987305, "learning_rate": 0.00017420311341734618, "loss": 1.3554, "step": 877 }, { "epoch": 0.13007407407407406, "grad_norm": 1.6635648012161255, "learning_rate": 0.00017417346182357302, "loss": 1.3372, "step": 878 }, { "epoch": 0.1302222222222222, "grad_norm": 2.2414519786834717, "learning_rate": 
0.00017414381022979987, "loss": 1.067, "step": 879 }, { "epoch": 0.13037037037037036, "grad_norm": 3.241548538208008, "learning_rate": 0.00017411415863602669, "loss": 1.0293, "step": 880 }, { "epoch": 0.1305185185185185, "grad_norm": 1.651780605316162, "learning_rate": 0.00017408450704225353, "loss": 1.0491, "step": 881 }, { "epoch": 0.13066666666666665, "grad_norm": 1.4641205072402954, "learning_rate": 0.00017405485544848037, "loss": 1.1071, "step": 882 }, { "epoch": 0.13081481481481483, "grad_norm": 1.5232881307601929, "learning_rate": 0.0001740252038547072, "loss": 1.1261, "step": 883 }, { "epoch": 0.13096296296296298, "grad_norm": 1.9777206182479858, "learning_rate": 0.00017399555226093404, "loss": 1.1046, "step": 884 }, { "epoch": 0.13111111111111112, "grad_norm": 1.5309432744979858, "learning_rate": 0.00017396590066716088, "loss": 1.1768, "step": 885 }, { "epoch": 0.13125925925925927, "grad_norm": 1.988309621810913, "learning_rate": 0.0001739362490733877, "loss": 1.1637, "step": 886 }, { "epoch": 0.13140740740740742, "grad_norm": 1.3992940187454224, "learning_rate": 0.00017390659747961454, "loss": 1.1298, "step": 887 }, { "epoch": 0.13155555555555556, "grad_norm": 1.1663585901260376, "learning_rate": 0.0001738769458858414, "loss": 1.1667, "step": 888 }, { "epoch": 0.1317037037037037, "grad_norm": 1.6041878461837769, "learning_rate": 0.0001738472942920682, "loss": 1.2966, "step": 889 }, { "epoch": 0.13185185185185186, "grad_norm": 1.4323391914367676, "learning_rate": 0.00017381764269829505, "loss": 1.4035, "step": 890 }, { "epoch": 0.132, "grad_norm": 1.9315614700317383, "learning_rate": 0.0001737879911045219, "loss": 1.2564, "step": 891 }, { "epoch": 0.13214814814814815, "grad_norm": 1.3660720586776733, "learning_rate": 0.0001737583395107487, "loss": 1.0422, "step": 892 }, { "epoch": 0.1322962962962963, "grad_norm": 1.206908941268921, "learning_rate": 0.00017372868791697553, "loss": 1.1346, "step": 893 }, { "epoch": 0.13244444444444445, "grad_norm": 
2.7856850624084473, "learning_rate": 0.0001736990363232024, "loss": 1.1058, "step": 894 }, { "epoch": 0.1325925925925926, "grad_norm": 1.9803142547607422, "learning_rate": 0.00017366938472942922, "loss": 1.1389, "step": 895 }, { "epoch": 0.13274074074074074, "grad_norm": 2.0166566371917725, "learning_rate": 0.00017363973313565603, "loss": 0.9425, "step": 896 }, { "epoch": 0.1328888888888889, "grad_norm": 1.5326697826385498, "learning_rate": 0.00017361008154188288, "loss": 1.3628, "step": 897 }, { "epoch": 0.13303703703703704, "grad_norm": 1.4127566814422607, "learning_rate": 0.00017358042994810972, "loss": 1.1228, "step": 898 }, { "epoch": 0.13318518518518518, "grad_norm": 1.0187325477600098, "learning_rate": 0.00017355077835433654, "loss": 1.0937, "step": 899 }, { "epoch": 0.13333333333333333, "grad_norm": 1.1690553426742554, "learning_rate": 0.00017352112676056338, "loss": 0.9765, "step": 900 }, { "epoch": 0.13348148148148148, "grad_norm": 2.1607093811035156, "learning_rate": 0.00017349147516679023, "loss": 1.089, "step": 901 }, { "epoch": 0.13362962962962963, "grad_norm": 1.9213364124298096, "learning_rate": 0.00017346182357301705, "loss": 1.2745, "step": 902 }, { "epoch": 0.13377777777777777, "grad_norm": 2.478525161743164, "learning_rate": 0.0001734321719792439, "loss": 0.9527, "step": 903 }, { "epoch": 0.13392592592592592, "grad_norm": 1.3539713621139526, "learning_rate": 0.00017340252038547074, "loss": 1.3241, "step": 904 }, { "epoch": 0.13407407407407407, "grad_norm": 1.1803101301193237, "learning_rate": 0.00017337286879169755, "loss": 1.1488, "step": 905 }, { "epoch": 0.13422222222222221, "grad_norm": 1.7030516862869263, "learning_rate": 0.0001733432171979244, "loss": 1.5023, "step": 906 }, { "epoch": 0.13437037037037036, "grad_norm": 1.156157374382019, "learning_rate": 0.00017331356560415124, "loss": 1.15, "step": 907 }, { "epoch": 0.1345185185185185, "grad_norm": 1.3928496837615967, "learning_rate": 0.00017328391401037806, "loss": 1.0722, "step": 908 }, 
{ "epoch": 0.13466666666666666, "grad_norm": 1.8104082345962524, "learning_rate": 0.0001732542624166049, "loss": 1.1799, "step": 909 }, { "epoch": 0.1348148148148148, "grad_norm": 1.854732871055603, "learning_rate": 0.00017322461082283175, "loss": 0.84, "step": 910 }, { "epoch": 0.13496296296296295, "grad_norm": 1.7013317346572876, "learning_rate": 0.00017319495922905857, "loss": 1.2079, "step": 911 }, { "epoch": 0.1351111111111111, "grad_norm": 1.8916053771972656, "learning_rate": 0.0001731653076352854, "loss": 1.1653, "step": 912 }, { "epoch": 0.13525925925925925, "grad_norm": 1.6114717721939087, "learning_rate": 0.00017313565604151223, "loss": 1.1645, "step": 913 }, { "epoch": 0.13540740740740742, "grad_norm": 4.8172478675842285, "learning_rate": 0.00017310600444773907, "loss": 1.1203, "step": 914 }, { "epoch": 0.13555555555555557, "grad_norm": 2.4642679691314697, "learning_rate": 0.00017307635285396592, "loss": 1.3937, "step": 915 }, { "epoch": 0.13570370370370372, "grad_norm": 1.3399279117584229, "learning_rate": 0.00017304670126019273, "loss": 0.9886, "step": 916 }, { "epoch": 0.13585185185185186, "grad_norm": 1.9045206308364868, "learning_rate": 0.00017301704966641958, "loss": 0.9348, "step": 917 }, { "epoch": 0.136, "grad_norm": 1.8310236930847168, "learning_rate": 0.00017298739807264642, "loss": 1.1142, "step": 918 }, { "epoch": 0.13614814814814816, "grad_norm": 1.48021399974823, "learning_rate": 0.00017295774647887324, "loss": 1.0913, "step": 919 }, { "epoch": 0.1362962962962963, "grad_norm": 1.868577003479004, "learning_rate": 0.00017292809488510008, "loss": 1.0717, "step": 920 }, { "epoch": 0.13644444444444445, "grad_norm": 3.953855276107788, "learning_rate": 0.00017289844329132693, "loss": 1.1454, "step": 921 }, { "epoch": 0.1365925925925926, "grad_norm": 2.0253727436065674, "learning_rate": 0.00017286879169755375, "loss": 1.1603, "step": 922 }, { "epoch": 0.13674074074074075, "grad_norm": 1.7948057651519775, "learning_rate": 0.0001728391401037806, 
"loss": 0.854, "step": 923 }, { "epoch": 0.1368888888888889, "grad_norm": 1.3979237079620361, "learning_rate": 0.00017280948851000743, "loss": 1.297, "step": 924 }, { "epoch": 0.13703703703703704, "grad_norm": 2.3212087154388428, "learning_rate": 0.00017277983691623425, "loss": 1.1407, "step": 925 }, { "epoch": 0.1371851851851852, "grad_norm": 1.4935330152511597, "learning_rate": 0.0001727501853224611, "loss": 1.0982, "step": 926 }, { "epoch": 0.13733333333333334, "grad_norm": 1.5073734521865845, "learning_rate": 0.00017272053372868794, "loss": 1.2125, "step": 927 }, { "epoch": 0.13748148148148148, "grad_norm": 1.8712886571884155, "learning_rate": 0.00017269088213491476, "loss": 0.9262, "step": 928 }, { "epoch": 0.13762962962962963, "grad_norm": 1.6642515659332275, "learning_rate": 0.0001726612305411416, "loss": 1.2041, "step": 929 }, { "epoch": 0.13777777777777778, "grad_norm": 1.1925135850906372, "learning_rate": 0.00017263157894736842, "loss": 0.8896, "step": 930 }, { "epoch": 0.13792592592592592, "grad_norm": 5.4231038093566895, "learning_rate": 0.00017260192735359526, "loss": 1.15, "step": 931 }, { "epoch": 0.13807407407407407, "grad_norm": 1.3681412935256958, "learning_rate": 0.00017257227575982208, "loss": 1.1461, "step": 932 }, { "epoch": 0.13822222222222222, "grad_norm": 1.0649001598358154, "learning_rate": 0.00017254262416604893, "loss": 1.1943, "step": 933 }, { "epoch": 0.13837037037037037, "grad_norm": 1.8697491884231567, "learning_rate": 0.00017251297257227577, "loss": 0.9972, "step": 934 }, { "epoch": 0.1385185185185185, "grad_norm": 1.898934006690979, "learning_rate": 0.0001724833209785026, "loss": 1.2559, "step": 935 }, { "epoch": 0.13866666666666666, "grad_norm": 2.1527063846588135, "learning_rate": 0.00017245366938472943, "loss": 1.4038, "step": 936 }, { "epoch": 0.1388148148148148, "grad_norm": 2.2986197471618652, "learning_rate": 0.00017242401779095628, "loss": 1.1437, "step": 937 }, { "epoch": 0.13896296296296295, "grad_norm": 
2.1097805500030518, "learning_rate": 0.0001723943661971831, "loss": 0.8753, "step": 938 }, { "epoch": 0.1391111111111111, "grad_norm": 2.792574167251587, "learning_rate": 0.00017236471460340994, "loss": 1.2432, "step": 939 }, { "epoch": 0.13925925925925925, "grad_norm": 1.5301166772842407, "learning_rate": 0.00017233506300963678, "loss": 1.2332, "step": 940 }, { "epoch": 0.1394074074074074, "grad_norm": 2.1605138778686523, "learning_rate": 0.0001723054114158636, "loss": 1.0934, "step": 941 }, { "epoch": 0.13955555555555554, "grad_norm": 7.574090003967285, "learning_rate": 0.00017227575982209045, "loss": 1.1196, "step": 942 }, { "epoch": 0.1397037037037037, "grad_norm": 1.6092199087142944, "learning_rate": 0.0001722461082283173, "loss": 1.047, "step": 943 }, { "epoch": 0.13985185185185184, "grad_norm": 2.296266794204712, "learning_rate": 0.0001722164566345441, "loss": 1.1549, "step": 944 }, { "epoch": 0.14, "grad_norm": 1.8765658140182495, "learning_rate": 0.00017218680504077095, "loss": 1.1803, "step": 945 }, { "epoch": 0.14014814814814816, "grad_norm": 2.1477468013763428, "learning_rate": 0.0001721571534469978, "loss": 1.2277, "step": 946 }, { "epoch": 0.1402962962962963, "grad_norm": 1.460718035697937, "learning_rate": 0.0001721275018532246, "loss": 1.3237, "step": 947 }, { "epoch": 0.14044444444444446, "grad_norm": 1.4240020513534546, "learning_rate": 0.00017209785025945143, "loss": 1.0038, "step": 948 }, { "epoch": 0.1405925925925926, "grad_norm": 1.8419314622879028, "learning_rate": 0.0001720681986656783, "loss": 1.3067, "step": 949 }, { "epoch": 0.14074074074074075, "grad_norm": 1.5516164302825928, "learning_rate": 0.00017203854707190512, "loss": 1.0963, "step": 950 }, { "epoch": 0.1408888888888889, "grad_norm": 2.192901134490967, "learning_rate": 0.00017200889547813194, "loss": 1.022, "step": 951 }, { "epoch": 0.14103703703703704, "grad_norm": 2.8840384483337402, "learning_rate": 0.0001719792438843588, "loss": 1.1251, "step": 952 }, { "epoch": 
0.1411851851851852, "grad_norm": 2.4674930572509766, "learning_rate": 0.00017194959229058563, "loss": 0.8607, "step": 953 }, { "epoch": 0.14133333333333334, "grad_norm": 1.5810648202896118, "learning_rate": 0.00017191994069681244, "loss": 1.0845, "step": 954 }, { "epoch": 0.14148148148148149, "grad_norm": 2.392319440841675, "learning_rate": 0.00017189028910303931, "loss": 1.1497, "step": 955 }, { "epoch": 0.14162962962962963, "grad_norm": 2.7800850868225098, "learning_rate": 0.00017186063750926613, "loss": 1.0349, "step": 956 }, { "epoch": 0.14177777777777778, "grad_norm": 2.4474740028381348, "learning_rate": 0.00017183098591549295, "loss": 1.1785, "step": 957 }, { "epoch": 0.14192592592592593, "grad_norm": 1.2785381078720093, "learning_rate": 0.00017180133432171982, "loss": 1.1121, "step": 958 }, { "epoch": 0.14207407407407407, "grad_norm": 1.6383781433105469, "learning_rate": 0.00017177168272794664, "loss": 1.1978, "step": 959 }, { "epoch": 0.14222222222222222, "grad_norm": 2.8146605491638184, "learning_rate": 0.00017174203113417346, "loss": 1.0505, "step": 960 }, { "epoch": 0.14237037037037037, "grad_norm": 3.455076217651367, "learning_rate": 0.00017171237954040033, "loss": 1.1984, "step": 961 }, { "epoch": 0.14251851851851852, "grad_norm": 2.968580484390259, "learning_rate": 0.00017168272794662714, "loss": 1.2749, "step": 962 }, { "epoch": 0.14266666666666666, "grad_norm": 1.3419718742370605, "learning_rate": 0.00017165307635285396, "loss": 1.1617, "step": 963 }, { "epoch": 0.1428148148148148, "grad_norm": 4.6509904861450195, "learning_rate": 0.0001716234247590808, "loss": 1.0516, "step": 964 }, { "epoch": 0.14296296296296296, "grad_norm": 1.687054991722107, "learning_rate": 0.00017159377316530765, "loss": 1.1101, "step": 965 }, { "epoch": 0.1431111111111111, "grad_norm": 5.888388633728027, "learning_rate": 0.00017156412157153447, "loss": 1.2839, "step": 966 }, { "epoch": 0.14325925925925925, "grad_norm": 1.7805489301681519, "learning_rate": 
0.0001715344699777613, "loss": 1.0585, "step": 967 }, { "epoch": 0.1434074074074074, "grad_norm": 1.6146360635757446, "learning_rate": 0.00017150481838398816, "loss": 1.0834, "step": 968 }, { "epoch": 0.14355555555555555, "grad_norm": 1.9422056674957275, "learning_rate": 0.00017147516679021497, "loss": 0.9914, "step": 969 }, { "epoch": 0.1437037037037037, "grad_norm": 1.6852895021438599, "learning_rate": 0.00017144551519644182, "loss": 1.1439, "step": 970 }, { "epoch": 0.14385185185185184, "grad_norm": 1.960229516029358, "learning_rate": 0.00017141586360266866, "loss": 1.1256, "step": 971 }, { "epoch": 0.144, "grad_norm": 2.058276414871216, "learning_rate": 0.00017138621200889548, "loss": 1.0833, "step": 972 }, { "epoch": 0.14414814814814814, "grad_norm": 3.783475160598755, "learning_rate": 0.00017135656041512233, "loss": 1.1396, "step": 973 }, { "epoch": 0.14429629629629628, "grad_norm": 2.62729811668396, "learning_rate": 0.00017132690882134917, "loss": 0.9766, "step": 974 }, { "epoch": 0.14444444444444443, "grad_norm": 2.9100377559661865, "learning_rate": 0.000171297257227576, "loss": 1.1975, "step": 975 }, { "epoch": 0.1445925925925926, "grad_norm": 2.2580909729003906, "learning_rate": 0.00017126760563380283, "loss": 1.0595, "step": 976 }, { "epoch": 0.14474074074074075, "grad_norm": 1.8175936937332153, "learning_rate": 0.00017123795404002968, "loss": 1.1501, "step": 977 }, { "epoch": 0.1448888888888889, "grad_norm": 1.8463125228881836, "learning_rate": 0.0001712083024462565, "loss": 1.1216, "step": 978 }, { "epoch": 0.14503703703703705, "grad_norm": 7.846401214599609, "learning_rate": 0.0001711786508524833, "loss": 1.0921, "step": 979 }, { "epoch": 0.1451851851851852, "grad_norm": 2.4334380626678467, "learning_rate": 0.00017114899925871018, "loss": 1.1639, "step": 980 }, { "epoch": 0.14533333333333334, "grad_norm": 5.559380531311035, "learning_rate": 0.000171119347664937, "loss": 1.0065, "step": 981 }, { "epoch": 0.1454814814814815, "grad_norm": 
2.4271509647369385, "learning_rate": 0.00017108969607116382, "loss": 1.065, "step": 982 }, { "epoch": 0.14562962962962964, "grad_norm": 1.9057291746139526, "learning_rate": 0.00017106004447739066, "loss": 1.2109, "step": 983 }, { "epoch": 0.14577777777777778, "grad_norm": 1.6439474821090698, "learning_rate": 0.0001710303928836175, "loss": 1.0766, "step": 984 }, { "epoch": 0.14592592592592593, "grad_norm": 3.028550863265991, "learning_rate": 0.00017100074128984432, "loss": 1.2883, "step": 985 }, { "epoch": 0.14607407407407408, "grad_norm": 1.0586735010147095, "learning_rate": 0.00017097108969607117, "loss": 1.2154, "step": 986 }, { "epoch": 0.14622222222222223, "grad_norm": 1.668674111366272, "learning_rate": 0.000170941438102298, "loss": 1.0683, "step": 987 }, { "epoch": 0.14637037037037037, "grad_norm": 2.949061870574951, "learning_rate": 0.00017091178650852483, "loss": 1.2461, "step": 988 }, { "epoch": 0.14651851851851852, "grad_norm": 1.5150032043457031, "learning_rate": 0.00017088213491475167, "loss": 1.148, "step": 989 }, { "epoch": 0.14666666666666667, "grad_norm": 9.12592601776123, "learning_rate": 0.00017085248332097852, "loss": 1.2257, "step": 990 }, { "epoch": 0.14681481481481481, "grad_norm": 2.1272830963134766, "learning_rate": 0.00017082283172720534, "loss": 1.1114, "step": 991 }, { "epoch": 0.14696296296296296, "grad_norm": 2.5358645915985107, "learning_rate": 0.00017079318013343218, "loss": 1.1169, "step": 992 }, { "epoch": 0.1471111111111111, "grad_norm": 1.4151437282562256, "learning_rate": 0.00017076352853965902, "loss": 1.093, "step": 993 }, { "epoch": 0.14725925925925926, "grad_norm": 4.572996616363525, "learning_rate": 0.00017073387694588584, "loss": 1.2401, "step": 994 }, { "epoch": 0.1474074074074074, "grad_norm": 1.4581624269485474, "learning_rate": 0.0001707042253521127, "loss": 1.3273, "step": 995 }, { "epoch": 0.14755555555555555, "grad_norm": 2.1116039752960205, "learning_rate": 0.00017067457375833953, "loss": 1.2053, "step": 996 }, { 
"epoch": 0.1477037037037037, "grad_norm": 2.616701602935791, "learning_rate": 0.00017064492216456635, "loss": 1.0642, "step": 997 }, { "epoch": 0.14785185185185185, "grad_norm": 1.2618801593780518, "learning_rate": 0.0001706152705707932, "loss": 1.5356, "step": 998 }, { "epoch": 0.148, "grad_norm": 2.029723882675171, "learning_rate": 0.00017058561897702, "loss": 0.9674, "step": 999 }, { "epoch": 0.14814814814814814, "grad_norm": 1.6765875816345215, "learning_rate": 0.00017055596738324685, "loss": 1.1802, "step": 1000 }, { "epoch": 0.1482962962962963, "grad_norm": 3.5849623680114746, "learning_rate": 0.0001705263157894737, "loss": 1.1245, "step": 1001 }, { "epoch": 0.14844444444444443, "grad_norm": 2.4030394554138184, "learning_rate": 0.00017049666419570052, "loss": 1.2715, "step": 1002 }, { "epoch": 0.14859259259259258, "grad_norm": 5.763272285461426, "learning_rate": 0.00017046701260192736, "loss": 1.2968, "step": 1003 }, { "epoch": 0.14874074074074073, "grad_norm": 3.2482750415802, "learning_rate": 0.0001704373610081542, "loss": 1.1823, "step": 1004 }, { "epoch": 0.14888888888888888, "grad_norm": 1.9993109703063965, "learning_rate": 0.00017040770941438102, "loss": 1.1634, "step": 1005 }, { "epoch": 0.14903703703703702, "grad_norm": 3.8828346729278564, "learning_rate": 0.00017037805782060787, "loss": 1.4147, "step": 1006 }, { "epoch": 0.1491851851851852, "grad_norm": 1.2229200601577759, "learning_rate": 0.0001703484062268347, "loss": 1.2015, "step": 1007 }, { "epoch": 0.14933333333333335, "grad_norm": 2.444016456604004, "learning_rate": 0.00017031875463306153, "loss": 1.0146, "step": 1008 }, { "epoch": 0.1494814814814815, "grad_norm": 2.006105661392212, "learning_rate": 0.00017028910303928837, "loss": 1.3469, "step": 1009 }, { "epoch": 0.14962962962962964, "grad_norm": 2.110853433609009, "learning_rate": 0.00017025945144551522, "loss": 1.0897, "step": 1010 }, { "epoch": 0.1497777777777778, "grad_norm": 3.2394096851348877, "learning_rate": 0.00017022979985174204, 
"loss": 1.2767, "step": 1011 }, { "epoch": 0.14992592592592593, "grad_norm": 2.3130080699920654, "learning_rate": 0.00017020014825796888, "loss": 1.071, "step": 1012 }, { "epoch": 0.15007407407407408, "grad_norm": 1.756558895111084, "learning_rate": 0.00017017049666419572, "loss": 1.0203, "step": 1013 }, { "epoch": 0.15022222222222223, "grad_norm": 1.4772218465805054, "learning_rate": 0.00017014084507042254, "loss": 1.1404, "step": 1014 }, { "epoch": 0.15037037037037038, "grad_norm": 2.8919644355773926, "learning_rate": 0.00017011119347664939, "loss": 1.1955, "step": 1015 }, { "epoch": 0.15051851851851852, "grad_norm": 1.9548122882843018, "learning_rate": 0.0001700815418828762, "loss": 1.2325, "step": 1016 }, { "epoch": 0.15066666666666667, "grad_norm": 1.2817782163619995, "learning_rate": 0.00017005189028910305, "loss": 1.2533, "step": 1017 }, { "epoch": 0.15081481481481482, "grad_norm": 1.1539685726165771, "learning_rate": 0.00017002223869532987, "loss": 1.1358, "step": 1018 }, { "epoch": 0.15096296296296297, "grad_norm": 2.1873302459716797, "learning_rate": 0.0001699925871015567, "loss": 1.1667, "step": 1019 }, { "epoch": 0.1511111111111111, "grad_norm": 2.369619131088257, "learning_rate": 0.00016996293550778355, "loss": 1.1854, "step": 1020 }, { "epoch": 0.15125925925925926, "grad_norm": 2.369476556777954, "learning_rate": 0.00016993328391401037, "loss": 1.2106, "step": 1021 }, { "epoch": 0.1514074074074074, "grad_norm": 3.2865421772003174, "learning_rate": 0.00016990363232023722, "loss": 1.0389, "step": 1022 }, { "epoch": 0.15155555555555555, "grad_norm": 1.9383407831192017, "learning_rate": 0.00016987398072646406, "loss": 1.0633, "step": 1023 }, { "epoch": 0.1517037037037037, "grad_norm": 2.9727392196655273, "learning_rate": 0.00016984432913269088, "loss": 1.0315, "step": 1024 }, { "epoch": 0.15185185185185185, "grad_norm": 1.2354108095169067, "learning_rate": 0.00016981467753891772, "loss": 1.1532, "step": 1025 }, { "epoch": 0.152, "grad_norm": 
1.7172785997390747, "learning_rate": 0.00016978502594514457, "loss": 0.7329, "step": 1026 }, { "epoch": 0.15214814814814814, "grad_norm": 2.638615846633911, "learning_rate": 0.00016975537435137138, "loss": 1.1379, "step": 1027 }, { "epoch": 0.1522962962962963, "grad_norm": 2.0213356018066406, "learning_rate": 0.00016972572275759823, "loss": 1.1679, "step": 1028 }, { "epoch": 0.15244444444444444, "grad_norm": 2.057471990585327, "learning_rate": 0.00016969607116382507, "loss": 1.0414, "step": 1029 }, { "epoch": 0.15259259259259259, "grad_norm": 1.5801711082458496, "learning_rate": 0.0001696664195700519, "loss": 1.3041, "step": 1030 }, { "epoch": 0.15274074074074073, "grad_norm": 2.0575385093688965, "learning_rate": 0.00016963676797627873, "loss": 1.0981, "step": 1031 }, { "epoch": 0.15288888888888888, "grad_norm": 1.9751660823822021, "learning_rate": 0.00016960711638250558, "loss": 0.9827, "step": 1032 }, { "epoch": 0.15303703703703703, "grad_norm": 1.5350358486175537, "learning_rate": 0.0001695774647887324, "loss": 1.1458, "step": 1033 }, { "epoch": 0.15318518518518517, "grad_norm": 1.4879770278930664, "learning_rate": 0.00016954781319495921, "loss": 1.1881, "step": 1034 }, { "epoch": 0.15333333333333332, "grad_norm": 3.2379486560821533, "learning_rate": 0.00016951816160118609, "loss": 1.1286, "step": 1035 }, { "epoch": 0.15348148148148147, "grad_norm": 5.281479358673096, "learning_rate": 0.0001694885100074129, "loss": 1.1006, "step": 1036 }, { "epoch": 0.15362962962962962, "grad_norm": 1.7295136451721191, "learning_rate": 0.00016945885841363972, "loss": 1.1644, "step": 1037 }, { "epoch": 0.1537777777777778, "grad_norm": 1.8465901613235474, "learning_rate": 0.0001694292068198666, "loss": 1.1519, "step": 1038 }, { "epoch": 0.15392592592592594, "grad_norm": 6.92486047744751, "learning_rate": 0.0001693995552260934, "loss": 1.142, "step": 1039 }, { "epoch": 0.15407407407407409, "grad_norm": 1.9682059288024902, "learning_rate": 0.00016936990363232023, "loss": 0.9864, 
"step": 1040 }, { "epoch": 0.15422222222222223, "grad_norm": 1.4835504293441772, "learning_rate": 0.0001693402520385471, "loss": 1.0888, "step": 1041 }, { "epoch": 0.15437037037037038, "grad_norm": 1.9731804132461548, "learning_rate": 0.00016931060044477392, "loss": 1.4113, "step": 1042 }, { "epoch": 0.15451851851851853, "grad_norm": 1.9451696872711182, "learning_rate": 0.00016928094885100073, "loss": 1.0769, "step": 1043 }, { "epoch": 0.15466666666666667, "grad_norm": 2.1988463401794434, "learning_rate": 0.0001692512972572276, "loss": 1.0705, "step": 1044 }, { "epoch": 0.15481481481481482, "grad_norm": 4.193809509277344, "learning_rate": 0.00016922164566345442, "loss": 1.3807, "step": 1045 }, { "epoch": 0.15496296296296297, "grad_norm": 4.9835710525512695, "learning_rate": 0.00016919199406968124, "loss": 1.2299, "step": 1046 }, { "epoch": 0.15511111111111112, "grad_norm": 1.6631501913070679, "learning_rate": 0.0001691623424759081, "loss": 1.0437, "step": 1047 }, { "epoch": 0.15525925925925926, "grad_norm": 2.734771728515625, "learning_rate": 0.00016913269088213493, "loss": 1.2403, "step": 1048 }, { "epoch": 0.1554074074074074, "grad_norm": 1.514737844467163, "learning_rate": 0.00016910303928836175, "loss": 1.2392, "step": 1049 }, { "epoch": 0.15555555555555556, "grad_norm": 1.6169673204421997, "learning_rate": 0.0001690733876945886, "loss": 1.0107, "step": 1050 }, { "epoch": 0.1557037037037037, "grad_norm": 1.7885040044784546, "learning_rate": 0.00016904373610081543, "loss": 1.2452, "step": 1051 }, { "epoch": 0.15585185185185185, "grad_norm": 1.7115554809570312, "learning_rate": 0.00016901408450704225, "loss": 1.1554, "step": 1052 }, { "epoch": 0.156, "grad_norm": 1.8804570436477661, "learning_rate": 0.0001689844329132691, "loss": 1.2028, "step": 1053 }, { "epoch": 0.15614814814814815, "grad_norm": 2.993739128112793, "learning_rate": 0.00016895478131949594, "loss": 1.1684, "step": 1054 }, { "epoch": 0.1562962962962963, "grad_norm": 2.713289499282837, 
"learning_rate": 0.00016892512972572276, "loss": 0.8836, "step": 1055 }, { "epoch": 0.15644444444444444, "grad_norm": 2.5271599292755127, "learning_rate": 0.0001688954781319496, "loss": 1.4292, "step": 1056 }, { "epoch": 0.1565925925925926, "grad_norm": 4.2666425704956055, "learning_rate": 0.00016886582653817645, "loss": 1.3867, "step": 1057 }, { "epoch": 0.15674074074074074, "grad_norm": 1.8929147720336914, "learning_rate": 0.00016883617494440326, "loss": 1.2823, "step": 1058 }, { "epoch": 0.15688888888888888, "grad_norm": 2.1356215476989746, "learning_rate": 0.0001688065233506301, "loss": 1.3934, "step": 1059 }, { "epoch": 0.15703703703703703, "grad_norm": 2.8732924461364746, "learning_rate": 0.00016877687175685695, "loss": 1.3736, "step": 1060 }, { "epoch": 0.15718518518518518, "grad_norm": 2.7370901107788086, "learning_rate": 0.00016874722016308377, "loss": 0.995, "step": 1061 }, { "epoch": 0.15733333333333333, "grad_norm": 1.6540776491165161, "learning_rate": 0.00016871756856931061, "loss": 0.9875, "step": 1062 }, { "epoch": 0.15748148148148147, "grad_norm": 2.5256638526916504, "learning_rate": 0.00016868791697553746, "loss": 0.9672, "step": 1063 }, { "epoch": 0.15762962962962962, "grad_norm": 1.825677752494812, "learning_rate": 0.00016865826538176428, "loss": 1.1755, "step": 1064 }, { "epoch": 0.15777777777777777, "grad_norm": 3.3257060050964355, "learning_rate": 0.0001686286137879911, "loss": 1.1019, "step": 1065 }, { "epoch": 0.15792592592592591, "grad_norm": 3.0843424797058105, "learning_rate": 0.00016859896219421797, "loss": 1.2329, "step": 1066 }, { "epoch": 0.15807407407407406, "grad_norm": null, "learning_rate": 0.00016859896219421797, "loss": 1.0131, "step": 1067 }, { "epoch": 0.1582222222222222, "grad_norm": 3.285895347595215, "learning_rate": 0.00016856931060044478, "loss": 1.1322, "step": 1068 }, { "epoch": 0.15837037037037038, "grad_norm": 2.788299322128296, "learning_rate": 0.0001685396590066716, "loss": 1.1749, "step": 1069 }, { "epoch": 
0.15851851851851853, "grad_norm": 2.9314417839050293, "learning_rate": 0.00016851000741289844, "loss": 0.9868, "step": 1070 }, { "epoch": 0.15866666666666668, "grad_norm": 8.959638595581055, "learning_rate": 0.0001684803558191253, "loss": 1.1358, "step": 1071 }, { "epoch": 0.15881481481481483, "grad_norm": 3.2001211643218994, "learning_rate": 0.0001684507042253521, "loss": 1.2816, "step": 1072 }, { "epoch": 0.15896296296296297, "grad_norm": 8.979604721069336, "learning_rate": 0.00016842105263157895, "loss": 1.2513, "step": 1073 }, { "epoch": 0.15911111111111112, "grad_norm": 3.510807752609253, "learning_rate": 0.0001683914010378058, "loss": 1.2859, "step": 1074 }, { "epoch": 0.15925925925925927, "grad_norm": 4.32289981842041, "learning_rate": 0.0001683617494440326, "loss": 1.0888, "step": 1075 }, { "epoch": 0.15940740740740741, "grad_norm": 2.1474578380584717, "learning_rate": 0.00016833209785025946, "loss": 1.0386, "step": 1076 }, { "epoch": 0.15955555555555556, "grad_norm": 3.062962293624878, "learning_rate": 0.0001683024462564863, "loss": 1.0207, "step": 1077 }, { "epoch": 0.1597037037037037, "grad_norm": 3.040384292602539, "learning_rate": 0.00016827279466271312, "loss": 1.1263, "step": 1078 }, { "epoch": 0.15985185185185186, "grad_norm": 2.453765630722046, "learning_rate": 0.00016824314306893996, "loss": 1.3174, "step": 1079 }, { "epoch": 0.16, "grad_norm": 4.926877498626709, "learning_rate": 0.0001682134914751668, "loss": 0.9633, "step": 1080 }, { "epoch": 0.16014814814814815, "grad_norm": 3.5459671020507812, "learning_rate": 0.00016818383988139363, "loss": 1.4504, "step": 1081 }, { "epoch": 0.1602962962962963, "grad_norm": 1.3094695806503296, "learning_rate": 0.00016815418828762047, "loss": 1.2689, "step": 1082 }, { "epoch": 0.16044444444444445, "grad_norm": 1.9539918899536133, "learning_rate": 0.00016812453669384731, "loss": 0.8696, "step": 1083 }, { "epoch": 0.1605925925925926, "grad_norm": 4.95810079574585, "learning_rate": 0.00016809488510007413, "loss": 
1.026, "step": 1084 }, { "epoch": 0.16074074074074074, "grad_norm": 2.032973051071167, "learning_rate": 0.00016806523350630098, "loss": 0.8564, "step": 1085 }, { "epoch": 0.1608888888888889, "grad_norm": 4.3425068855285645, "learning_rate": 0.00016803558191252782, "loss": 1.1461, "step": 1086 }, { "epoch": 0.16103703703703703, "grad_norm": 2.2817065715789795, "learning_rate": 0.00016800593031875464, "loss": 1.112, "step": 1087 }, { "epoch": 0.16118518518518518, "grad_norm": 1.5987143516540527, "learning_rate": 0.00016797627872498148, "loss": 1.007, "step": 1088 }, { "epoch": 0.16133333333333333, "grad_norm": 1.582229495048523, "learning_rate": 0.0001679466271312083, "loss": 1.1057, "step": 1089 }, { "epoch": 0.16148148148148148, "grad_norm": 1.8761428594589233, "learning_rate": 0.00016791697553743514, "loss": 1.2869, "step": 1090 }, { "epoch": 0.16162962962962962, "grad_norm": 4.022858619689941, "learning_rate": 0.000167887323943662, "loss": 1.2291, "step": 1091 }, { "epoch": 0.16177777777777777, "grad_norm": 1.9940602779388428, "learning_rate": 0.0001678576723498888, "loss": 1.12, "step": 1092 }, { "epoch": 0.16192592592592592, "grad_norm": 2.525167942047119, "learning_rate": 0.00016782802075611565, "loss": 1.2224, "step": 1093 }, { "epoch": 0.16207407407407406, "grad_norm": 2.735330104827881, "learning_rate": 0.0001677983691623425, "loss": 1.1975, "step": 1094 }, { "epoch": 0.1622222222222222, "grad_norm": 2.2123138904571533, "learning_rate": 0.0001677687175685693, "loss": 1.0923, "step": 1095 }, { "epoch": 0.16237037037037036, "grad_norm": 4.129847049713135, "learning_rate": 0.00016773906597479616, "loss": 1.3661, "step": 1096 }, { "epoch": 0.1625185185185185, "grad_norm": 3.07590389251709, "learning_rate": 0.000167709414381023, "loss": 1.0667, "step": 1097 }, { "epoch": 0.16266666666666665, "grad_norm": 3.407912015914917, "learning_rate": 0.00016767976278724982, "loss": 0.9218, "step": 1098 }, { "epoch": 0.1628148148148148, "grad_norm": 4.002742290496826, 
"learning_rate": 0.00016765011119347666, "loss": 1.0098, "step": 1099 }, { "epoch": 0.16296296296296298, "grad_norm": 1.2676118612289429, "learning_rate": 0.0001676204595997035, "loss": 1.0311, "step": 1100 }, { "epoch": 0.16311111111111112, "grad_norm": 3.6772754192352295, "learning_rate": 0.00016759080800593032, "loss": 1.0986, "step": 1101 }, { "epoch": 0.16325925925925927, "grad_norm": 3.532989501953125, "learning_rate": 0.00016756115641215717, "loss": 1.1303, "step": 1102 }, { "epoch": 0.16340740740740742, "grad_norm": 2.35569429397583, "learning_rate": 0.000167531504818384, "loss": 1.1121, "step": 1103 }, { "epoch": 0.16355555555555557, "grad_norm": 1.7124059200286865, "learning_rate": 0.00016750185322461083, "loss": 1.1505, "step": 1104 }, { "epoch": 0.1637037037037037, "grad_norm": 2.458972692489624, "learning_rate": 0.00016747220163083765, "loss": 1.1398, "step": 1105 }, { "epoch": 0.16385185185185186, "grad_norm": 3.610698699951172, "learning_rate": 0.0001674425500370645, "loss": 1.3289, "step": 1106 }, { "epoch": 0.164, "grad_norm": 2.0191452503204346, "learning_rate": 0.00016741289844329134, "loss": 1.1516, "step": 1107 }, { "epoch": 0.16414814814814815, "grad_norm": 1.9639180898666382, "learning_rate": 0.00016738324684951815, "loss": 0.986, "step": 1108 }, { "epoch": 0.1642962962962963, "grad_norm": 1.9284876585006714, "learning_rate": 0.000167353595255745, "loss": 0.8722, "step": 1109 }, { "epoch": 0.16444444444444445, "grad_norm": 2.1016082763671875, "learning_rate": 0.00016732394366197184, "loss": 1.1622, "step": 1110 }, { "epoch": 0.1645925925925926, "grad_norm": 2.410661220550537, "learning_rate": 0.00016729429206819866, "loss": 1.2704, "step": 1111 }, { "epoch": 0.16474074074074074, "grad_norm": 4.063007354736328, "learning_rate": 0.0001672646404744255, "loss": 1.0752, "step": 1112 }, { "epoch": 0.1648888888888889, "grad_norm": 2.9490597248077393, "learning_rate": 0.00016723498888065235, "loss": 0.9154, "step": 1113 }, { "epoch": 
0.16503703703703704, "grad_norm": 2.2835164070129395, "learning_rate": 0.00016720533728687917, "loss": 0.9618, "step": 1114 }, { "epoch": 0.16518518518518518, "grad_norm": 1.3010778427124023, "learning_rate": 0.000167175685693106, "loss": 1.0763, "step": 1115 }, { "epoch": 0.16533333333333333, "grad_norm": 2.7012317180633545, "learning_rate": 0.00016714603409933286, "loss": 1.2822, "step": 1116 }, { "epoch": 0.16548148148148148, "grad_norm": 3.55961275100708, "learning_rate": 0.00016711638250555967, "loss": 1.3006, "step": 1117 }, { "epoch": 0.16562962962962963, "grad_norm": 4.551784992218018, "learning_rate": 0.00016708673091178652, "loss": 1.3546, "step": 1118 }, { "epoch": 0.16577777777777777, "grad_norm": 2.7167952060699463, "learning_rate": 0.00016705707931801336, "loss": 1.2642, "step": 1119 }, { "epoch": 0.16592592592592592, "grad_norm": 3.6416263580322266, "learning_rate": 0.00016702742772424018, "loss": 1.2168, "step": 1120 }, { "epoch": 0.16607407407407407, "grad_norm": 2.924973964691162, "learning_rate": 0.000166997776130467, "loss": 0.9433, "step": 1121 }, { "epoch": 0.16622222222222222, "grad_norm": 1.412368893623352, "learning_rate": 0.00016696812453669387, "loss": 0.9324, "step": 1122 }, { "epoch": 0.16637037037037036, "grad_norm": 1.3757219314575195, "learning_rate": 0.00016693847294292069, "loss": 1.1399, "step": 1123 }, { "epoch": 0.1665185185185185, "grad_norm": 1.7917617559432983, "learning_rate": 0.0001669088213491475, "loss": 1.3018, "step": 1124 }, { "epoch": 0.16666666666666666, "grad_norm": 3.3100757598876953, "learning_rate": 0.00016687916975537438, "loss": 1.0173, "step": 1125 }, { "epoch": 0.1668148148148148, "grad_norm": 1.7143326997756958, "learning_rate": 0.0001668495181616012, "loss": 1.0514, "step": 1126 }, { "epoch": 0.16696296296296295, "grad_norm": 3.0766828060150146, "learning_rate": 0.000166819866567828, "loss": 1.0553, "step": 1127 }, { "epoch": 0.1671111111111111, "grad_norm": 1.6713535785675049, "learning_rate": 
0.00016679021497405488, "loss": 1.0935, "step": 1128 }, { "epoch": 0.16725925925925925, "grad_norm": 2.5680131912231445, "learning_rate": 0.0001667605633802817, "loss": 1.096, "step": 1129 }, { "epoch": 0.1674074074074074, "grad_norm": 2.3472912311553955, "learning_rate": 0.00016673091178650852, "loss": 1.0766, "step": 1130 }, { "epoch": 0.16755555555555557, "grad_norm": 2.3274173736572266, "learning_rate": 0.0001667012601927354, "loss": 1.2308, "step": 1131 }, { "epoch": 0.16770370370370372, "grad_norm": 2.9745113849639893, "learning_rate": 0.0001666716085989622, "loss": 1.1308, "step": 1132 }, { "epoch": 0.16785185185185186, "grad_norm": 1.898444652557373, "learning_rate": 0.00016664195700518902, "loss": 1.1928, "step": 1133 }, { "epoch": 0.168, "grad_norm": 3.0440452098846436, "learning_rate": 0.0001666123054114159, "loss": 1.0433, "step": 1134 }, { "epoch": 0.16814814814814816, "grad_norm": 1.7012965679168701, "learning_rate": 0.0001665826538176427, "loss": 1.2063, "step": 1135 }, { "epoch": 0.1682962962962963, "grad_norm": 2.831171989440918, "learning_rate": 0.00016655300222386953, "loss": 0.8884, "step": 1136 }, { "epoch": 0.16844444444444445, "grad_norm": 1.5138641595840454, "learning_rate": 0.00016652335063009637, "loss": 1.255, "step": 1137 }, { "epoch": 0.1685925925925926, "grad_norm": 2.0016908645629883, "learning_rate": 0.00016649369903632322, "loss": 1.2003, "step": 1138 }, { "epoch": 0.16874074074074075, "grad_norm": 1.6556096076965332, "learning_rate": 0.00016646404744255003, "loss": 1.0039, "step": 1139 }, { "epoch": 0.1688888888888889, "grad_norm": 1.9967530965805054, "learning_rate": 0.00016643439584877688, "loss": 0.9737, "step": 1140 }, { "epoch": 0.16903703703703704, "grad_norm": 4.667789936065674, "learning_rate": 0.00016640474425500372, "loss": 1.4546, "step": 1141 }, { "epoch": 0.1691851851851852, "grad_norm": 2.0882341861724854, "learning_rate": 0.00016637509266123054, "loss": 0.9068, "step": 1142 }, { "epoch": 0.16933333333333334, 
"grad_norm": 2.6171019077301025, "learning_rate": 0.00016634544106745739, "loss": 1.3465, "step": 1143 }, { "epoch": 0.16948148148148148, "grad_norm": 2.0520472526550293, "learning_rate": 0.00016631578947368423, "loss": 1.1366, "step": 1144 }, { "epoch": 0.16962962962962963, "grad_norm": 2.103170156478882, "learning_rate": 0.00016628613787991105, "loss": 1.226, "step": 1145 }, { "epoch": 0.16977777777777778, "grad_norm": 13.238292694091797, "learning_rate": 0.0001662564862861379, "loss": 1.0785, "step": 1146 }, { "epoch": 0.16992592592592592, "grad_norm": 2.532947301864624, "learning_rate": 0.00016622683469236474, "loss": 1.0802, "step": 1147 }, { "epoch": 0.17007407407407407, "grad_norm": 1.3687976598739624, "learning_rate": 0.00016619718309859155, "loss": 1.0488, "step": 1148 }, { "epoch": 0.17022222222222222, "grad_norm": 2.4959707260131836, "learning_rate": 0.0001661675315048184, "loss": 1.1523, "step": 1149 }, { "epoch": 0.17037037037037037, "grad_norm": 1.4208955764770508, "learning_rate": 0.00016613787991104524, "loss": 1.2076, "step": 1150 }, { "epoch": 0.1705185185185185, "grad_norm": 3.3279201984405518, "learning_rate": 0.00016610822831727206, "loss": 0.9391, "step": 1151 }, { "epoch": 0.17066666666666666, "grad_norm": 1.680005669593811, "learning_rate": 0.00016607857672349888, "loss": 1.3179, "step": 1152 }, { "epoch": 0.1708148148148148, "grad_norm": 3.0134124755859375, "learning_rate": 0.00016604892512972575, "loss": 1.6055, "step": 1153 }, { "epoch": 0.17096296296296296, "grad_norm": 1.9432357549667358, "learning_rate": 0.00016601927353595257, "loss": 1.0273, "step": 1154 }, { "epoch": 0.1711111111111111, "grad_norm": 2.375060796737671, "learning_rate": 0.00016598962194217938, "loss": 1.3216, "step": 1155 }, { "epoch": 0.17125925925925925, "grad_norm": 2.5080406665802, "learning_rate": 0.00016595997034840623, "loss": 1.1713, "step": 1156 }, { "epoch": 0.1714074074074074, "grad_norm": 13.426639556884766, "learning_rate": 0.00016593031875463307, "loss": 
1.0199, "step": 1157 }, { "epoch": 0.17155555555555554, "grad_norm": 3.609325885772705, "learning_rate": 0.0001659006671608599, "loss": 0.9444, "step": 1158 }, { "epoch": 0.1717037037037037, "grad_norm": 2.1851866245269775, "learning_rate": 0.00016587101556708673, "loss": 1.103, "step": 1159 }, { "epoch": 0.17185185185185184, "grad_norm": 3.400322437286377, "learning_rate": 0.00016584136397331358, "loss": 1.1331, "step": 1160 }, { "epoch": 0.172, "grad_norm": 2.3655338287353516, "learning_rate": 0.0001658117123795404, "loss": 1.2707, "step": 1161 }, { "epoch": 0.17214814814814816, "grad_norm": 1.8025535345077515, "learning_rate": 0.00016578206078576724, "loss": 1.1592, "step": 1162 }, { "epoch": 0.1722962962962963, "grad_norm": 2.439143180847168, "learning_rate": 0.00016575240919199409, "loss": 1.066, "step": 1163 }, { "epoch": 0.17244444444444446, "grad_norm": 3.3526995182037354, "learning_rate": 0.0001657227575982209, "loss": 1.1781, "step": 1164 }, { "epoch": 0.1725925925925926, "grad_norm": 3.699305772781372, "learning_rate": 0.00016569310600444775, "loss": 1.2586, "step": 1165 }, { "epoch": 0.17274074074074075, "grad_norm": 1.3605479001998901, "learning_rate": 0.0001656634544106746, "loss": 0.9209, "step": 1166 }, { "epoch": 0.1728888888888889, "grad_norm": 2.2111520767211914, "learning_rate": 0.0001656338028169014, "loss": 1.1226, "step": 1167 }, { "epoch": 0.17303703703703704, "grad_norm": 2.446354866027832, "learning_rate": 0.00016560415122312825, "loss": 1.1782, "step": 1168 }, { "epoch": 0.1731851851851852, "grad_norm": 1.759831190109253, "learning_rate": 0.0001655744996293551, "loss": 1.3204, "step": 1169 }, { "epoch": 0.17333333333333334, "grad_norm": 1.6880930662155151, "learning_rate": 0.00016554484803558192, "loss": 0.9032, "step": 1170 }, { "epoch": 0.1734814814814815, "grad_norm": 1.861935019493103, "learning_rate": 0.00016551519644180876, "loss": 1.2485, "step": 1171 }, { "epoch": 0.17362962962962963, "grad_norm": 3.1562302112579346, 
"learning_rate": 0.0001654855448480356, "loss": 1.294, "step": 1172 }, { "epoch": 0.17377777777777778, "grad_norm": 1.7831099033355713, "learning_rate": 0.00016545589325426242, "loss": 1.0631, "step": 1173 }, { "epoch": 0.17392592592592593, "grad_norm": 2.3080148696899414, "learning_rate": 0.00016542624166048927, "loss": 1.04, "step": 1174 }, { "epoch": 0.17407407407407408, "grad_norm": 2.459798574447632, "learning_rate": 0.00016539659006671608, "loss": 1.2923, "step": 1175 }, { "epoch": 0.17422222222222222, "grad_norm": 2.508446455001831, "learning_rate": 0.00016536693847294293, "loss": 1.0659, "step": 1176 }, { "epoch": 0.17437037037037037, "grad_norm": 2.1636300086975098, "learning_rate": 0.00016533728687916977, "loss": 1.1215, "step": 1177 }, { "epoch": 0.17451851851851852, "grad_norm": 4.602794647216797, "learning_rate": 0.0001653076352853966, "loss": 1.3148, "step": 1178 }, { "epoch": 0.17466666666666666, "grad_norm": 1.9078799486160278, "learning_rate": 0.00016527798369162343, "loss": 0.9364, "step": 1179 }, { "epoch": 0.1748148148148148, "grad_norm": 5.264694690704346, "learning_rate": 0.00016524833209785028, "loss": 1.2738, "step": 1180 }, { "epoch": 0.17496296296296296, "grad_norm": 2.355531692504883, "learning_rate": 0.0001652186805040771, "loss": 1.2169, "step": 1181 }, { "epoch": 0.1751111111111111, "grad_norm": 1.698150396347046, "learning_rate": 0.00016518902891030394, "loss": 1.0232, "step": 1182 }, { "epoch": 0.17525925925925925, "grad_norm": 1.9937596321105957, "learning_rate": 0.00016515937731653078, "loss": 1.1532, "step": 1183 }, { "epoch": 0.1754074074074074, "grad_norm": 1.392584204673767, "learning_rate": 0.0001651297257227576, "loss": 1.1634, "step": 1184 }, { "epoch": 0.17555555555555555, "grad_norm": 2.6987364292144775, "learning_rate": 0.00016510007412898445, "loss": 1.418, "step": 1185 }, { "epoch": 0.1757037037037037, "grad_norm": 2.445852041244507, "learning_rate": 0.0001650704225352113, "loss": 0.9981, "step": 1186 }, { "epoch": 
0.17585185185185184, "grad_norm": 2.5761144161224365, "learning_rate": 0.0001650407709414381, "loss": 1.0874, "step": 1187 }, { "epoch": 0.176, "grad_norm": 1.2105910778045654, "learning_rate": 0.00016501111934766495, "loss": 1.0968, "step": 1188 }, { "epoch": 0.17614814814814814, "grad_norm": 2.399811029434204, "learning_rate": 0.00016498146775389177, "loss": 1.3, "step": 1189 }, { "epoch": 0.17629629629629628, "grad_norm": 2.0828731060028076, "learning_rate": 0.00016495181616011861, "loss": 1.2034, "step": 1190 }, { "epoch": 0.17644444444444443, "grad_norm": 1.969633936882019, "learning_rate": 0.00016492216456634543, "loss": 0.9231, "step": 1191 }, { "epoch": 0.17659259259259258, "grad_norm": 2.430616617202759, "learning_rate": 0.00016489251297257228, "loss": 1.2606, "step": 1192 }, { "epoch": 0.17674074074074075, "grad_norm": 4.084666728973389, "learning_rate": 0.00016486286137879912, "loss": 0.9322, "step": 1193 }, { "epoch": 0.1768888888888889, "grad_norm": 2.4692203998565674, "learning_rate": 0.00016483320978502594, "loss": 1.2591, "step": 1194 }, { "epoch": 0.17703703703703705, "grad_norm": 2.2353594303131104, "learning_rate": 0.00016480355819125278, "loss": 1.3139, "step": 1195 }, { "epoch": 0.1771851851851852, "grad_norm": 2.550607919692993, "learning_rate": 0.00016477390659747963, "loss": 1.1424, "step": 1196 }, { "epoch": 0.17733333333333334, "grad_norm": 1.8636503219604492, "learning_rate": 0.00016474425500370644, "loss": 0.9783, "step": 1197 }, { "epoch": 0.1774814814814815, "grad_norm": 3.616098165512085, "learning_rate": 0.0001647146034099333, "loss": 1.3557, "step": 1198 }, { "epoch": 0.17762962962962964, "grad_norm": 5.341982364654541, "learning_rate": 0.00016468495181616013, "loss": 0.9768, "step": 1199 }, { "epoch": 0.17777777777777778, "grad_norm": 1.3313606977462769, "learning_rate": 0.00016465530022238695, "loss": 1.1152, "step": 1200 }, { "epoch": 0.17792592592592593, "grad_norm": 3.144308567047119, "learning_rate": 0.0001646256486286138, 
"loss": 1.1852, "step": 1201 }, { "epoch": 0.17807407407407408, "grad_norm": 1.5455279350280762, "learning_rate": 0.00016459599703484064, "loss": 0.9879, "step": 1202 }, { "epoch": 0.17822222222222223, "grad_norm": 4.991955280303955, "learning_rate": 0.00016456634544106746, "loss": 1.1951, "step": 1203 }, { "epoch": 0.17837037037037037, "grad_norm": 2.8212974071502686, "learning_rate": 0.0001645366938472943, "loss": 1.1659, "step": 1204 }, { "epoch": 0.17851851851851852, "grad_norm": 3.9513165950775146, "learning_rate": 0.00016450704225352115, "loss": 1.3337, "step": 1205 }, { "epoch": 0.17866666666666667, "grad_norm": 3.328399181365967, "learning_rate": 0.00016447739065974796, "loss": 1.2649, "step": 1206 }, { "epoch": 0.17881481481481482, "grad_norm": 2.1954457759857178, "learning_rate": 0.00016444773906597478, "loss": 1.3181, "step": 1207 }, { "epoch": 0.17896296296296296, "grad_norm": 1.7748527526855469, "learning_rate": 0.00016441808747220165, "loss": 1.11, "step": 1208 }, { "epoch": 0.1791111111111111, "grad_norm": 2.2563819885253906, "learning_rate": 0.00016438843587842847, "loss": 1.0285, "step": 1209 }, { "epoch": 0.17925925925925926, "grad_norm": 5.612164497375488, "learning_rate": 0.0001643587842846553, "loss": 1.2267, "step": 1210 }, { "epoch": 0.1794074074074074, "grad_norm": 2.1129345893859863, "learning_rate": 0.00016432913269088216, "loss": 1.1477, "step": 1211 }, { "epoch": 0.17955555555555555, "grad_norm": 2.182945489883423, "learning_rate": 0.00016429948109710898, "loss": 1.1477, "step": 1212 }, { "epoch": 0.1797037037037037, "grad_norm": 2.216829299926758, "learning_rate": 0.0001642698295033358, "loss": 1.1976, "step": 1213 }, { "epoch": 0.17985185185185185, "grad_norm": 2.9617536067962646, "learning_rate": 0.00016424017790956266, "loss": 1.141, "step": 1214 }, { "epoch": 0.18, "grad_norm": 1.9161547422409058, "learning_rate": 0.00016421052631578948, "loss": 1.0007, "step": 1215 }, { "epoch": 0.18014814814814814, "grad_norm": 3.606161594390869, 
"learning_rate": 0.0001641808747220163, "loss": 1.0588, "step": 1216 }, { "epoch": 0.1802962962962963, "grad_norm": 2.439342737197876, "learning_rate": 0.00016415122312824317, "loss": 1.1524, "step": 1217 }, { "epoch": 0.18044444444444444, "grad_norm": 1.9793710708618164, "learning_rate": 0.00016412157153447, "loss": 0.9695, "step": 1218 }, { "epoch": 0.18059259259259258, "grad_norm": 1.7666544914245605, "learning_rate": 0.0001640919199406968, "loss": 1.3083, "step": 1219 }, { "epoch": 0.18074074074074073, "grad_norm": 2.4542181491851807, "learning_rate": 0.00016406226834692368, "loss": 1.1153, "step": 1220 }, { "epoch": 0.18088888888888888, "grad_norm": 3.1888298988342285, "learning_rate": 0.0001640326167531505, "loss": 0.944, "step": 1221 }, { "epoch": 0.18103703703703702, "grad_norm": 2.3188817501068115, "learning_rate": 0.0001640029651593773, "loss": 1.1371, "step": 1222 }, { "epoch": 0.18118518518518517, "grad_norm": 2.448969841003418, "learning_rate": 0.00016397331356560416, "loss": 1.1699, "step": 1223 }, { "epoch": 0.18133333333333335, "grad_norm": 3.9037697315216064, "learning_rate": 0.000163943661971831, "loss": 1.139, "step": 1224 }, { "epoch": 0.1814814814814815, "grad_norm": 2.6014204025268555, "learning_rate": 0.00016391401037805782, "loss": 1.2577, "step": 1225 }, { "epoch": 0.18162962962962964, "grad_norm": 25.857332229614258, "learning_rate": 0.00016388435878428466, "loss": 0.9306, "step": 1226 }, { "epoch": 0.1817777777777778, "grad_norm": 2.0890629291534424, "learning_rate": 0.0001638547071905115, "loss": 1.3022, "step": 1227 }, { "epoch": 0.18192592592592594, "grad_norm": 1.969598412513733, "learning_rate": 0.00016382505559673832, "loss": 1.1961, "step": 1228 }, { "epoch": 0.18207407407407408, "grad_norm": 1.7499252557754517, "learning_rate": 0.00016379540400296517, "loss": 0.9372, "step": 1229 }, { "epoch": 0.18222222222222223, "grad_norm": 2.8854031562805176, "learning_rate": 0.000163765752409192, "loss": 1.1053, "step": 1230 }, { "epoch": 
0.18237037037037038, "grad_norm": 4.263045310974121, "learning_rate": 0.00016373610081541883, "loss": 0.9321, "step": 1231 }, { "epoch": 0.18251851851851852, "grad_norm": 1.8082984685897827, "learning_rate": 0.00016370644922164568, "loss": 1.125, "step": 1232 }, { "epoch": 0.18266666666666667, "grad_norm": 13.007062911987305, "learning_rate": 0.00016367679762787252, "loss": 1.3134, "step": 1233 }, { "epoch": 0.18281481481481482, "grad_norm": 1.9080790281295776, "learning_rate": 0.00016364714603409934, "loss": 1.0975, "step": 1234 }, { "epoch": 0.18296296296296297, "grad_norm": 1.7352561950683594, "learning_rate": 0.00016361749444032618, "loss": 1.2799, "step": 1235 }, { "epoch": 0.1831111111111111, "grad_norm": 2.157141923904419, "learning_rate": 0.00016358784284655303, "loss": 1.2574, "step": 1236 }, { "epoch": 0.18325925925925926, "grad_norm": 2.416133403778076, "learning_rate": 0.00016355819125277984, "loss": 1.1067, "step": 1237 }, { "epoch": 0.1834074074074074, "grad_norm": 1.6308764219284058, "learning_rate": 0.00016352853965900666, "loss": 0.9436, "step": 1238 }, { "epoch": 0.18355555555555556, "grad_norm": 3.1649699211120605, "learning_rate": 0.00016349888806523353, "loss": 0.9486, "step": 1239 }, { "epoch": 0.1837037037037037, "grad_norm": 2.2168164253234863, "learning_rate": 0.00016346923647146035, "loss": 1.1415, "step": 1240 }, { "epoch": 0.18385185185185185, "grad_norm": 2.6989848613739014, "learning_rate": 0.00016343958487768717, "loss": 1.04, "step": 1241 }, { "epoch": 0.184, "grad_norm": 2.743046760559082, "learning_rate": 0.000163409933283914, "loss": 1.2542, "step": 1242 }, { "epoch": 0.18414814814814814, "grad_norm": 2.6614725589752197, "learning_rate": 0.00016338028169014086, "loss": 1.2448, "step": 1243 }, { "epoch": 0.1842962962962963, "grad_norm": 2.142125129699707, "learning_rate": 0.00016335063009636767, "loss": 1.0992, "step": 1244 }, { "epoch": 0.18444444444444444, "grad_norm": 4.400914669036865, "learning_rate": 0.00016332097850259452, 
"loss": 1.1357, "step": 1245 }, { "epoch": 0.18459259259259259, "grad_norm": 2.421738862991333, "learning_rate": 0.00016329132690882136, "loss": 1.3442, "step": 1246 }, { "epoch": 0.18474074074074073, "grad_norm": 3.050323486328125, "learning_rate": 0.00016326167531504818, "loss": 1.173, "step": 1247 }, { "epoch": 0.18488888888888888, "grad_norm": 1.4971282482147217, "learning_rate": 0.00016323202372127502, "loss": 1.137, "step": 1248 }, { "epoch": 0.18503703703703703, "grad_norm": 1.1616339683532715, "learning_rate": 0.00016320237212750187, "loss": 0.8902, "step": 1249 }, { "epoch": 0.18518518518518517, "grad_norm": 1.8049408197402954, "learning_rate": 0.00016317272053372869, "loss": 1.138, "step": 1250 }, { "epoch": 0.18533333333333332, "grad_norm": 1.7517163753509521, "learning_rate": 0.00016314306893995553, "loss": 1.0648, "step": 1251 }, { "epoch": 0.18548148148148147, "grad_norm": 3.4030308723449707, "learning_rate": 0.00016311341734618237, "loss": 0.9477, "step": 1252 }, { "epoch": 0.18562962962962962, "grad_norm": 2.556241512298584, "learning_rate": 0.0001630837657524092, "loss": 1.2402, "step": 1253 }, { "epoch": 0.18577777777777776, "grad_norm": 2.832058906555176, "learning_rate": 0.00016305411415863604, "loss": 1.1, "step": 1254 }, { "epoch": 0.18592592592592594, "grad_norm": 1.4326435327529907, "learning_rate": 0.00016302446256486288, "loss": 1.4222, "step": 1255 }, { "epoch": 0.1860740740740741, "grad_norm": 1.8035448789596558, "learning_rate": 0.0001629948109710897, "loss": 1.1173, "step": 1256 }, { "epoch": 0.18622222222222223, "grad_norm": 2.3016655445098877, "learning_rate": 0.00016296515937731654, "loss": 1.3771, "step": 1257 }, { "epoch": 0.18637037037037038, "grad_norm": 3.838371515274048, "learning_rate": 0.0001629355077835434, "loss": 1.1765, "step": 1258 }, { "epoch": 0.18651851851851853, "grad_norm": 2.1241445541381836, "learning_rate": 0.0001629058561897702, "loss": 1.1228, "step": 1259 }, { "epoch": 0.18666666666666668, "grad_norm": 
2.992121934890747, "learning_rate": 0.00016287620459599705, "loss": 1.4603, "step": 1260 }, { "epoch": 0.18681481481481482, "grad_norm": 2.247199296951294, "learning_rate": 0.00016284655300222387, "loss": 1.2536, "step": 1261 }, { "epoch": 0.18696296296296297, "grad_norm": 4.452000141143799, "learning_rate": 0.0001628169014084507, "loss": 1.2157, "step": 1262 }, { "epoch": 0.18711111111111112, "grad_norm": 2.1935882568359375, "learning_rate": 0.00016278724981467756, "loss": 1.2344, "step": 1263 }, { "epoch": 0.18725925925925926, "grad_norm": 1.2970445156097412, "learning_rate": 0.00016275759822090437, "loss": 1.0382, "step": 1264 }, { "epoch": 0.1874074074074074, "grad_norm": 2.174247980117798, "learning_rate": 0.00016272794662713122, "loss": 1.0524, "step": 1265 }, { "epoch": 0.18755555555555556, "grad_norm": 1.9508861303329468, "learning_rate": 0.00016269829503335806, "loss": 1.3446, "step": 1266 }, { "epoch": 0.1877037037037037, "grad_norm": 2.7138915061950684, "learning_rate": 0.00016266864343958488, "loss": 1.3338, "step": 1267 }, { "epoch": 0.18785185185185185, "grad_norm": 7.125894546508789, "learning_rate": 0.00016263899184581172, "loss": 1.0892, "step": 1268 }, { "epoch": 0.188, "grad_norm": 1.7355868816375732, "learning_rate": 0.00016260934025203857, "loss": 1.2031, "step": 1269 }, { "epoch": 0.18814814814814815, "grad_norm": 1.1919690370559692, "learning_rate": 0.00016257968865826539, "loss": 0.981, "step": 1270 }, { "epoch": 0.1882962962962963, "grad_norm": 1.8908437490463257, "learning_rate": 0.00016255003706449223, "loss": 1.1244, "step": 1271 }, { "epoch": 0.18844444444444444, "grad_norm": 1.9054700136184692, "learning_rate": 0.00016252038547071907, "loss": 1.1896, "step": 1272 }, { "epoch": 0.1885925925925926, "grad_norm": 2.030076742172241, "learning_rate": 0.0001624907338769459, "loss": 1.0656, "step": 1273 }, { "epoch": 0.18874074074074074, "grad_norm": 2.07568097114563, "learning_rate": 0.00016246108228317274, "loss": 1.2487, "step": 1274 }, { 
"epoch": 0.18888888888888888, "grad_norm": 1.4379876852035522, "learning_rate": 0.00016243143068939955, "loss": 1.1438, "step": 1275 }, { "epoch": 0.18903703703703703, "grad_norm": 3.3925724029541016, "learning_rate": 0.0001624017790956264, "loss": 1.482, "step": 1276 }, { "epoch": 0.18918518518518518, "grad_norm": 1.1773624420166016, "learning_rate": 0.00016237212750185322, "loss": 1.1702, "step": 1277 }, { "epoch": 0.18933333333333333, "grad_norm": 2.5804686546325684, "learning_rate": 0.00016234247590808006, "loss": 1.189, "step": 1278 }, { "epoch": 0.18948148148148147, "grad_norm": 2.311694860458374, "learning_rate": 0.0001623128243143069, "loss": 1.1001, "step": 1279 }, { "epoch": 0.18962962962962962, "grad_norm": 3.100853681564331, "learning_rate": 0.00016228317272053372, "loss": 1.4143, "step": 1280 }, { "epoch": 0.18977777777777777, "grad_norm": 1.9254438877105713, "learning_rate": 0.00016225352112676057, "loss": 1.0859, "step": 1281 }, { "epoch": 0.18992592592592591, "grad_norm": 2.69869065284729, "learning_rate": 0.0001622238695329874, "loss": 1.2862, "step": 1282 }, { "epoch": 0.19007407407407406, "grad_norm": 3.5101282596588135, "learning_rate": 0.00016219421793921423, "loss": 1.004, "step": 1283 }, { "epoch": 0.1902222222222222, "grad_norm": 1.9631364345550537, "learning_rate": 0.00016216456634544107, "loss": 1.2547, "step": 1284 }, { "epoch": 0.19037037037037038, "grad_norm": 4.261918067932129, "learning_rate": 0.00016213491475166792, "loss": 1.3082, "step": 1285 }, { "epoch": 0.19051851851851853, "grad_norm": 3.2264537811279297, "learning_rate": 0.00016210526315789473, "loss": 1.3426, "step": 1286 }, { "epoch": 0.19066666666666668, "grad_norm": 3.6949462890625, "learning_rate": 0.00016207561156412158, "loss": 1.2305, "step": 1287 }, { "epoch": 0.19081481481481483, "grad_norm": 3.31636118888855, "learning_rate": 0.00016204595997034842, "loss": 1.1111, "step": 1288 }, { "epoch": 0.19096296296296297, "grad_norm": 1.7275923490524292, "learning_rate": 
0.00016201630837657524, "loss": 1.1604, "step": 1289 }, { "epoch": 0.19111111111111112, "grad_norm": 1.5811316967010498, "learning_rate": 0.00016198665678280208, "loss": 1.0016, "step": 1290 }, { "epoch": 0.19125925925925927, "grad_norm": 2.3568472862243652, "learning_rate": 0.00016195700518902893, "loss": 1.1028, "step": 1291 }, { "epoch": 0.19140740740740741, "grad_norm": 2.253685235977173, "learning_rate": 0.00016192735359525575, "loss": 1.1587, "step": 1292 }, { "epoch": 0.19155555555555556, "grad_norm": 3.685878038406372, "learning_rate": 0.0001618977020014826, "loss": 1.315, "step": 1293 }, { "epoch": 0.1917037037037037, "grad_norm": 2.7187671661376953, "learning_rate": 0.00016186805040770944, "loss": 1.0894, "step": 1294 }, { "epoch": 0.19185185185185186, "grad_norm": 4.728074550628662, "learning_rate": 0.00016183839881393625, "loss": 0.9953, "step": 1295 }, { "epoch": 0.192, "grad_norm": 1.6997319459915161, "learning_rate": 0.00016180874722016307, "loss": 1.3237, "step": 1296 }, { "epoch": 0.19214814814814815, "grad_norm": 2.6872713565826416, "learning_rate": 0.00016177909562638994, "loss": 1.0687, "step": 1297 }, { "epoch": 0.1922962962962963, "grad_norm": 1.523756504058838, "learning_rate": 0.00016174944403261676, "loss": 1.1545, "step": 1298 }, { "epoch": 0.19244444444444445, "grad_norm": 2.7069854736328125, "learning_rate": 0.00016171979243884358, "loss": 1.035, "step": 1299 }, { "epoch": 0.1925925925925926, "grad_norm": 1.2767950296401978, "learning_rate": 0.00016169014084507045, "loss": 1.2159, "step": 1300 }, { "epoch": 0.19274074074074074, "grad_norm": 1.6256358623504639, "learning_rate": 0.00016166048925129727, "loss": 1.2178, "step": 1301 }, { "epoch": 0.1928888888888889, "grad_norm": 3.1067140102386475, "learning_rate": 0.00016163083765752408, "loss": 1.1665, "step": 1302 }, { "epoch": 0.19303703703703703, "grad_norm": 2.419325113296509, "learning_rate": 0.00016160118606375095, "loss": 1.0093, "step": 1303 }, { "epoch": 0.19318518518518518, 
"grad_norm": 1.551223635673523, "learning_rate": 0.00016157153446997777, "loss": 1.1413, "step": 1304 }, { "epoch": 0.19333333333333333, "grad_norm": 2.0792694091796875, "learning_rate": 0.0001615418828762046, "loss": 1.3, "step": 1305 }, { "epoch": 0.19348148148148148, "grad_norm": 1.577027440071106, "learning_rate": 0.00016151223128243146, "loss": 1.0933, "step": 1306 }, { "epoch": 0.19362962962962962, "grad_norm": 1.7525880336761475, "learning_rate": 0.00016148257968865828, "loss": 1.3021, "step": 1307 }, { "epoch": 0.19377777777777777, "grad_norm": 3.2240004539489746, "learning_rate": 0.0001614529280948851, "loss": 1.2007, "step": 1308 }, { "epoch": 0.19392592592592592, "grad_norm": 2.8471946716308594, "learning_rate": 0.00016142327650111194, "loss": 1.2284, "step": 1309 }, { "epoch": 0.19407407407407407, "grad_norm": 2.295551300048828, "learning_rate": 0.00016139362490733878, "loss": 1.099, "step": 1310 }, { "epoch": 0.1942222222222222, "grad_norm": 4.0938720703125, "learning_rate": 0.0001613639733135656, "loss": 0.8849, "step": 1311 }, { "epoch": 0.19437037037037036, "grad_norm": 1.349643349647522, "learning_rate": 0.00016133432171979245, "loss": 0.9514, "step": 1312 }, { "epoch": 0.1945185185185185, "grad_norm": 2.4366378784179688, "learning_rate": 0.0001613046701260193, "loss": 1.3515, "step": 1313 }, { "epoch": 0.19466666666666665, "grad_norm": 1.8238210678100586, "learning_rate": 0.0001612750185322461, "loss": 1.0687, "step": 1314 }, { "epoch": 0.1948148148148148, "grad_norm": 3.347797393798828, "learning_rate": 0.00016124536693847295, "loss": 1.2437, "step": 1315 }, { "epoch": 0.19496296296296298, "grad_norm": 2.944796562194824, "learning_rate": 0.0001612157153446998, "loss": 0.9848, "step": 1316 }, { "epoch": 0.19511111111111112, "grad_norm": 1.7708215713500977, "learning_rate": 0.00016118606375092661, "loss": 1.0974, "step": 1317 }, { "epoch": 0.19525925925925927, "grad_norm": 2.0808119773864746, "learning_rate": 0.00016115641215715346, "loss": 1.1477, 
"step": 1318 }, { "epoch": 0.19540740740740742, "grad_norm": 3.2274317741394043, "learning_rate": 0.0001611267605633803, "loss": 1.209, "step": 1319 }, { "epoch": 0.19555555555555557, "grad_norm": 2.4879109859466553, "learning_rate": 0.00016109710896960712, "loss": 1.0825, "step": 1320 }, { "epoch": 0.1957037037037037, "grad_norm": 4.362395286560059, "learning_rate": 0.00016106745737583396, "loss": 1.0334, "step": 1321 }, { "epoch": 0.19585185185185186, "grad_norm": 2.8710215091705322, "learning_rate": 0.0001610378057820608, "loss": 1.3673, "step": 1322 }, { "epoch": 0.196, "grad_norm": 1.7256768941879272, "learning_rate": 0.00016100815418828763, "loss": 1.1933, "step": 1323 }, { "epoch": 0.19614814814814815, "grad_norm": 1.6461589336395264, "learning_rate": 0.00016097850259451444, "loss": 1.0943, "step": 1324 }, { "epoch": 0.1962962962962963, "grad_norm": 2.648347854614258, "learning_rate": 0.00016094885100074132, "loss": 1.1845, "step": 1325 }, { "epoch": 0.19644444444444445, "grad_norm": 1.4237217903137207, "learning_rate": 0.00016091919940696813, "loss": 0.8626, "step": 1326 }, { "epoch": 0.1965925925925926, "grad_norm": 2.1890766620635986, "learning_rate": 0.00016088954781319495, "loss": 0.9657, "step": 1327 }, { "epoch": 0.19674074074074074, "grad_norm": 1.728020191192627, "learning_rate": 0.0001608598962194218, "loss": 1.2267, "step": 1328 }, { "epoch": 0.1968888888888889, "grad_norm": 1.2080868482589722, "learning_rate": 0.00016083024462564864, "loss": 0.9941, "step": 1329 }, { "epoch": 0.19703703703703704, "grad_norm": 2.4535627365112305, "learning_rate": 0.00016080059303187546, "loss": 1.2543, "step": 1330 }, { "epoch": 0.19718518518518519, "grad_norm": 2.3592395782470703, "learning_rate": 0.0001607709414381023, "loss": 1.0558, "step": 1331 }, { "epoch": 0.19733333333333333, "grad_norm": 2.684448480606079, "learning_rate": 0.00016074128984432915, "loss": 0.9364, "step": 1332 }, { "epoch": 0.19748148148148148, "grad_norm": 5.671911716461182, 
"learning_rate": 0.00016071163825055596, "loss": 1.1495, "step": 1333 }, { "epoch": 0.19762962962962963, "grad_norm": 1.8713393211364746, "learning_rate": 0.0001606819866567828, "loss": 0.9452, "step": 1334 }, { "epoch": 0.19777777777777777, "grad_norm": 1.614362359046936, "learning_rate": 0.00016065233506300965, "loss": 1.1946, "step": 1335 }, { "epoch": 0.19792592592592592, "grad_norm": 3.343510866165161, "learning_rate": 0.00016062268346923647, "loss": 1.3064, "step": 1336 }, { "epoch": 0.19807407407407407, "grad_norm": 2.7058181762695312, "learning_rate": 0.00016059303187546331, "loss": 1.11, "step": 1337 }, { "epoch": 0.19822222222222222, "grad_norm": 2.640108823776245, "learning_rate": 0.00016056338028169016, "loss": 1.2595, "step": 1338 }, { "epoch": 0.19837037037037036, "grad_norm": 3.210209369659424, "learning_rate": 0.00016053372868791698, "loss": 1.0694, "step": 1339 }, { "epoch": 0.1985185185185185, "grad_norm": 2.481147527694702, "learning_rate": 0.00016050407709414382, "loss": 1.1474, "step": 1340 }, { "epoch": 0.19866666666666666, "grad_norm": 1.3905922174453735, "learning_rate": 0.00016047442550037066, "loss": 1.2432, "step": 1341 }, { "epoch": 0.1988148148148148, "grad_norm": 1.8446251153945923, "learning_rate": 0.00016044477390659748, "loss": 1.6521, "step": 1342 }, { "epoch": 0.19896296296296295, "grad_norm": 3.1401638984680176, "learning_rate": 0.00016041512231282433, "loss": 1.0923, "step": 1343 }, { "epoch": 0.1991111111111111, "grad_norm": 2.527815341949463, "learning_rate": 0.00016038547071905117, "loss": 1.0247, "step": 1344 }, { "epoch": 0.19925925925925925, "grad_norm": 5.10316801071167, "learning_rate": 0.000160355819125278, "loss": 1.1991, "step": 1345 }, { "epoch": 0.1994074074074074, "grad_norm": 1.7162216901779175, "learning_rate": 0.00016032616753150483, "loss": 1.2135, "step": 1346 }, { "epoch": 0.19955555555555557, "grad_norm": 1.5862842798233032, "learning_rate": 0.00016029651593773165, "loss": 1.0167, "step": 1347 }, { "epoch": 
0.19970370370370372, "grad_norm": 1.537105679512024, "learning_rate": 0.0001602668643439585, "loss": 1.2985, "step": 1348 }, { "epoch": 0.19985185185185186, "grad_norm": 3.7510077953338623, "learning_rate": 0.00016023721275018534, "loss": 1.1062, "step": 1349 }, { "epoch": 0.2, "grad_norm": 1.5134105682373047, "learning_rate": 0.00016020756115641216, "loss": 1.2994, "step": 1350 }, { "epoch": 0.20014814814814816, "grad_norm": 2.0335676670074463, "learning_rate": 0.000160177909562639, "loss": 1.0951, "step": 1351 }, { "epoch": 0.2002962962962963, "grad_norm": 1.9692561626434326, "learning_rate": 0.00016014825796886584, "loss": 1.2001, "step": 1352 }, { "epoch": 0.20044444444444445, "grad_norm": 5.022037029266357, "learning_rate": 0.00016011860637509266, "loss": 1.1642, "step": 1353 }, { "epoch": 0.2005925925925926, "grad_norm": 2.4920811653137207, "learning_rate": 0.0001600889547813195, "loss": 1.3397, "step": 1354 }, { "epoch": 0.20074074074074075, "grad_norm": 1.7635655403137207, "learning_rate": 0.00016005930318754635, "loss": 1.0308, "step": 1355 }, { "epoch": 0.2008888888888889, "grad_norm": 1.5410326719284058, "learning_rate": 0.00016002965159377317, "loss": 1.1857, "step": 1356 }, { "epoch": 0.20103703703703704, "grad_norm": 2.2208480834960938, "learning_rate": 0.00016, "loss": 1.1641, "step": 1357 }, { "epoch": 0.2011851851851852, "grad_norm": 1.3878949880599976, "learning_rate": 0.00015997034840622686, "loss": 1.1996, "step": 1358 }, { "epoch": 0.20133333333333334, "grad_norm": 3.8697714805603027, "learning_rate": 0.00015994069681245367, "loss": 1.2625, "step": 1359 }, { "epoch": 0.20148148148148148, "grad_norm": 2.1626200675964355, "learning_rate": 0.00015991104521868052, "loss": 1.1042, "step": 1360 }, { "epoch": 0.20162962962962963, "grad_norm": 1.4656286239624023, "learning_rate": 0.00015988139362490734, "loss": 1.0454, "step": 1361 }, { "epoch": 0.20177777777777778, "grad_norm": 2.388040065765381, "learning_rate": 0.00015985174203113418, "loss": 
0.9111, "step": 1362 }, { "epoch": 0.20192592592592593, "grad_norm": 1.7776933908462524, "learning_rate": 0.000159822090437361, "loss": 1.2934, "step": 1363 }, { "epoch": 0.20207407407407407, "grad_norm": 3.2282278537750244, "learning_rate": 0.00015979243884358784, "loss": 0.9072, "step": 1364 }, { "epoch": 0.20222222222222222, "grad_norm": 2.2581164836883545, "learning_rate": 0.0001597627872498147, "loss": 1.235, "step": 1365 }, { "epoch": 0.20237037037037037, "grad_norm": 2.0510082244873047, "learning_rate": 0.0001597331356560415, "loss": 1.1715, "step": 1366 }, { "epoch": 0.20251851851851851, "grad_norm": 1.0332187414169312, "learning_rate": 0.00015970348406226835, "loss": 1.0326, "step": 1367 }, { "epoch": 0.20266666666666666, "grad_norm": 1.8021240234375, "learning_rate": 0.0001596738324684952, "loss": 1.267, "step": 1368 }, { "epoch": 0.2028148148148148, "grad_norm": 4.383606910705566, "learning_rate": 0.000159644180874722, "loss": 1.2257, "step": 1369 }, { "epoch": 0.20296296296296296, "grad_norm": 1.646316409111023, "learning_rate": 0.00015961452928094886, "loss": 1.0715, "step": 1370 }, { "epoch": 0.2031111111111111, "grad_norm": 2.4021363258361816, "learning_rate": 0.0001595848776871757, "loss": 1.1131, "step": 1371 }, { "epoch": 0.20325925925925925, "grad_norm": 3.759568214416504, "learning_rate": 0.00015955522609340252, "loss": 0.971, "step": 1372 }, { "epoch": 0.2034074074074074, "grad_norm": 1.7920905351638794, "learning_rate": 0.00015952557449962936, "loss": 1.2104, "step": 1373 }, { "epoch": 0.20355555555555555, "grad_norm": 1.354028344154358, "learning_rate": 0.0001594959229058562, "loss": 1.1061, "step": 1374 }, { "epoch": 0.2037037037037037, "grad_norm": 2.5420985221862793, "learning_rate": 0.00015946627131208302, "loss": 1.1685, "step": 1375 }, { "epoch": 0.20385185185185184, "grad_norm": 2.520581007003784, "learning_rate": 0.00015943661971830987, "loss": 1.286, "step": 1376 }, { "epoch": 0.204, "grad_norm": 3.1659348011016846, "learning_rate": 
0.0001594069681245367, "loss": 1.423, "step": 1377 }, { "epoch": 0.20414814814814816, "grad_norm": 1.2454274892807007, "learning_rate": 0.00015937731653076353, "loss": 1.06, "step": 1378 }, { "epoch": 0.2042962962962963, "grad_norm": 4.370316505432129, "learning_rate": 0.00015934766493699037, "loss": 1.1003, "step": 1379 }, { "epoch": 0.20444444444444446, "grad_norm": 1.9933221340179443, "learning_rate": 0.00015931801334321722, "loss": 1.201, "step": 1380 }, { "epoch": 0.2045925925925926, "grad_norm": 3.2642831802368164, "learning_rate": 0.00015928836174944404, "loss": 0.9817, "step": 1381 }, { "epoch": 0.20474074074074075, "grad_norm": 1.2640935182571411, "learning_rate": 0.00015925871015567085, "loss": 1.1943, "step": 1382 }, { "epoch": 0.2048888888888889, "grad_norm": 1.4914500713348389, "learning_rate": 0.00015922905856189773, "loss": 1.0585, "step": 1383 }, { "epoch": 0.20503703703703705, "grad_norm": 1.990502119064331, "learning_rate": 0.00015919940696812454, "loss": 1.217, "step": 1384 }, { "epoch": 0.2051851851851852, "grad_norm": 1.5243146419525146, "learning_rate": 0.00015916975537435136, "loss": 1.3086, "step": 1385 }, { "epoch": 0.20533333333333334, "grad_norm": 1.9028431177139282, "learning_rate": 0.00015914010378057823, "loss": 1.1409, "step": 1386 }, { "epoch": 0.2054814814814815, "grad_norm": 2.9515159130096436, "learning_rate": 0.00015911045218680505, "loss": 1.1486, "step": 1387 }, { "epoch": 0.20562962962962963, "grad_norm": 1.4709255695343018, "learning_rate": 0.00015908080059303187, "loss": 1.1272, "step": 1388 }, { "epoch": 0.20577777777777778, "grad_norm": 1.290727972984314, "learning_rate": 0.00015905114899925874, "loss": 1.2133, "step": 1389 }, { "epoch": 0.20592592592592593, "grad_norm": 1.8945701122283936, "learning_rate": 0.00015902149740548556, "loss": 1.2185, "step": 1390 }, { "epoch": 0.20607407407407408, "grad_norm": 1.6242437362670898, "learning_rate": 0.00015899184581171237, "loss": 1.2213, "step": 1391 }, { "epoch": 
0.20622222222222222, "grad_norm": 2.466501474380493, "learning_rate": 0.00015896219421793924, "loss": 0.9557, "step": 1392 }, { "epoch": 0.20637037037037037, "grad_norm": 2.058912992477417, "learning_rate": 0.00015893254262416606, "loss": 1.1384, "step": 1393 }, { "epoch": 0.20651851851851852, "grad_norm": 1.4795424938201904, "learning_rate": 0.00015890289103039288, "loss": 1.0766, "step": 1394 }, { "epoch": 0.20666666666666667, "grad_norm": 1.2131915092468262, "learning_rate": 0.00015887323943661972, "loss": 1.1701, "step": 1395 }, { "epoch": 0.2068148148148148, "grad_norm": 1.1304845809936523, "learning_rate": 0.00015884358784284657, "loss": 0.9965, "step": 1396 }, { "epoch": 0.20696296296296296, "grad_norm": 2.0751266479492188, "learning_rate": 0.00015881393624907338, "loss": 1.1225, "step": 1397 }, { "epoch": 0.2071111111111111, "grad_norm": 2.2350399494171143, "learning_rate": 0.00015878428465530023, "loss": 1.029, "step": 1398 }, { "epoch": 0.20725925925925925, "grad_norm": 1.6366523504257202, "learning_rate": 0.00015875463306152707, "loss": 1.0351, "step": 1399 }, { "epoch": 0.2074074074074074, "grad_norm": 1.5569686889648438, "learning_rate": 0.0001587249814677539, "loss": 1.2932, "step": 1400 }, { "epoch": 0.20755555555555555, "grad_norm": 6.361331939697266, "learning_rate": 0.00015869532987398074, "loss": 1.086, "step": 1401 }, { "epoch": 0.2077037037037037, "grad_norm": 1.6677700281143188, "learning_rate": 0.00015866567828020758, "loss": 1.0963, "step": 1402 }, { "epoch": 0.20785185185185184, "grad_norm": 1.473945140838623, "learning_rate": 0.0001586360266864344, "loss": 1.1528, "step": 1403 }, { "epoch": 0.208, "grad_norm": 1.6613620519638062, "learning_rate": 0.00015860637509266124, "loss": 0.9257, "step": 1404 }, { "epoch": 0.20814814814814814, "grad_norm": 5.979618549346924, "learning_rate": 0.00015857672349888809, "loss": 1.2903, "step": 1405 }, { "epoch": 0.20829629629629628, "grad_norm": 2.915433168411255, "learning_rate": 0.0001585470719051149, 
"loss": 1.1357, "step": 1406 }, { "epoch": 0.20844444444444443, "grad_norm": 2.8678345680236816, "learning_rate": 0.00015851742031134175, "loss": 1.1544, "step": 1407 }, { "epoch": 0.20859259259259258, "grad_norm": 2.497988224029541, "learning_rate": 0.0001584877687175686, "loss": 1.1563, "step": 1408 }, { "epoch": 0.20874074074074075, "grad_norm": 1.2685167789459229, "learning_rate": 0.0001584581171237954, "loss": 1.0737, "step": 1409 }, { "epoch": 0.2088888888888889, "grad_norm": 1.5436787605285645, "learning_rate": 0.00015842846553002223, "loss": 1.0743, "step": 1410 }, { "epoch": 0.20903703703703705, "grad_norm": 1.4325158596038818, "learning_rate": 0.0001583988139362491, "loss": 1.1472, "step": 1411 }, { "epoch": 0.2091851851851852, "grad_norm": 1.1780500411987305, "learning_rate": 0.00015836916234247592, "loss": 1.0339, "step": 1412 }, { "epoch": 0.20933333333333334, "grad_norm": 1.5698156356811523, "learning_rate": 0.00015833951074870273, "loss": 1.1295, "step": 1413 }, { "epoch": 0.2094814814814815, "grad_norm": 2.26485538482666, "learning_rate": 0.00015830985915492958, "loss": 1.2991, "step": 1414 }, { "epoch": 0.20962962962962964, "grad_norm": 2.012516498565674, "learning_rate": 0.00015828020756115642, "loss": 1.014, "step": 1415 }, { "epoch": 0.20977777777777779, "grad_norm": 1.6176815032958984, "learning_rate": 0.00015825055596738324, "loss": 1.2393, "step": 1416 }, { "epoch": 0.20992592592592593, "grad_norm": 1.2089728116989136, "learning_rate": 0.00015822090437361008, "loss": 1.0259, "step": 1417 }, { "epoch": 0.21007407407407408, "grad_norm": 1.3265610933303833, "learning_rate": 0.00015819125277983693, "loss": 1.0523, "step": 1418 }, { "epoch": 0.21022222222222223, "grad_norm": 1.7452551126480103, "learning_rate": 0.00015816160118606375, "loss": 0.9849, "step": 1419 }, { "epoch": 0.21037037037037037, "grad_norm": 1.428599238395691, "learning_rate": 0.0001581319495922906, "loss": 1.1105, "step": 1420 }, { "epoch": 0.21051851851851852, "grad_norm": 
1.663043737411499, "learning_rate": 0.00015810229799851744, "loss": 1.5132, "step": 1421 }, { "epoch": 0.21066666666666667, "grad_norm": 1.9859318733215332, "learning_rate": 0.00015807264640474425, "loss": 1.208, "step": 1422 }, { "epoch": 0.21081481481481482, "grad_norm": 1.3973498344421387, "learning_rate": 0.0001580429948109711, "loss": 0.9525, "step": 1423 }, { "epoch": 0.21096296296296296, "grad_norm": 1.3018242120742798, "learning_rate": 0.00015801334321719794, "loss": 1.2355, "step": 1424 }, { "epoch": 0.2111111111111111, "grad_norm": 1.5467562675476074, "learning_rate": 0.00015798369162342476, "loss": 1.2726, "step": 1425 }, { "epoch": 0.21125925925925926, "grad_norm": 2.138482093811035, "learning_rate": 0.0001579540400296516, "loss": 1.2641, "step": 1426 }, { "epoch": 0.2114074074074074, "grad_norm": 1.714917778968811, "learning_rate": 0.00015792438843587845, "loss": 0.9664, "step": 1427 }, { "epoch": 0.21155555555555555, "grad_norm": 1.4636965990066528, "learning_rate": 0.00015789473684210527, "loss": 1.3803, "step": 1428 }, { "epoch": 0.2117037037037037, "grad_norm": 1.4125659465789795, "learning_rate": 0.0001578650852483321, "loss": 1.0361, "step": 1429 }, { "epoch": 0.21185185185185185, "grad_norm": 3.2594969272613525, "learning_rate": 0.00015783543365455895, "loss": 1.115, "step": 1430 }, { "epoch": 0.212, "grad_norm": 1.4381506443023682, "learning_rate": 0.00015780578206078577, "loss": 1.1786, "step": 1431 }, { "epoch": 0.21214814814814814, "grad_norm": 1.2792737483978271, "learning_rate": 0.00015777613046701262, "loss": 0.9593, "step": 1432 }, { "epoch": 0.2122962962962963, "grad_norm": 1.6736551523208618, "learning_rate": 0.00015774647887323943, "loss": 1.1135, "step": 1433 }, { "epoch": 0.21244444444444444, "grad_norm": 1.3635329008102417, "learning_rate": 0.00015771682727946628, "loss": 0.9715, "step": 1434 }, { "epoch": 0.21259259259259258, "grad_norm": 1.551611065864563, "learning_rate": 0.00015768717568569312, "loss": 1.0314, "step": 1435 }, { 
"epoch": 0.21274074074074073, "grad_norm": 2.0477828979492188, "learning_rate": 0.00015765752409191994, "loss": 1.3357, "step": 1436 }, { "epoch": 0.21288888888888888, "grad_norm": 1.5487900972366333, "learning_rate": 0.00015762787249814678, "loss": 1.0907, "step": 1437 }, { "epoch": 0.21303703703703702, "grad_norm": 5.054993629455566, "learning_rate": 0.00015759822090437363, "loss": 1.0958, "step": 1438 }, { "epoch": 0.21318518518518517, "grad_norm": 1.6109741926193237, "learning_rate": 0.00015756856931060045, "loss": 1.0971, "step": 1439 }, { "epoch": 0.21333333333333335, "grad_norm": 1.518358588218689, "learning_rate": 0.0001575389177168273, "loss": 1.1, "step": 1440 }, { "epoch": 0.2134814814814815, "grad_norm": 1.6584968566894531, "learning_rate": 0.00015750926612305413, "loss": 1.2818, "step": 1441 }, { "epoch": 0.21362962962962964, "grad_norm": 3.345515727996826, "learning_rate": 0.00015747961452928095, "loss": 1.2602, "step": 1442 }, { "epoch": 0.2137777777777778, "grad_norm": 2.380998373031616, "learning_rate": 0.0001574499629355078, "loss": 1.0518, "step": 1443 }, { "epoch": 0.21392592592592594, "grad_norm": 2.689133882522583, "learning_rate": 0.00015742031134173464, "loss": 1.1277, "step": 1444 }, { "epoch": 0.21407407407407408, "grad_norm": 1.4815983772277832, "learning_rate": 0.00015739065974796146, "loss": 1.1333, "step": 1445 }, { "epoch": 0.21422222222222223, "grad_norm": 1.9596880674362183, "learning_rate": 0.0001573610081541883, "loss": 1.2099, "step": 1446 }, { "epoch": 0.21437037037037038, "grad_norm": 1.3206571340560913, "learning_rate": 0.00015733135656041512, "loss": 1.0269, "step": 1447 }, { "epoch": 0.21451851851851853, "grad_norm": 3.195244073867798, "learning_rate": 0.00015730170496664196, "loss": 1.0816, "step": 1448 }, { "epoch": 0.21466666666666667, "grad_norm": 2.082491874694824, "learning_rate": 0.00015727205337286878, "loss": 1.081, "step": 1449 }, { "epoch": 0.21481481481481482, "grad_norm": 1.4490270614624023, "learning_rate": 
0.00015724240177909563, "loss": 1.0375, "step": 1450 }, { "epoch": 0.21496296296296297, "grad_norm": 1.2920150756835938, "learning_rate": 0.00015721275018532247, "loss": 1.1174, "step": 1451 }, { "epoch": 0.21511111111111111, "grad_norm": 3.3416709899902344, "learning_rate": 0.0001571830985915493, "loss": 1.2194, "step": 1452 }, { "epoch": 0.21525925925925926, "grad_norm": 1.8580683469772339, "learning_rate": 0.00015715344699777613, "loss": 1.0884, "step": 1453 }, { "epoch": 0.2154074074074074, "grad_norm": 1.8695363998413086, "learning_rate": 0.00015712379540400298, "loss": 1.0645, "step": 1454 }, { "epoch": 0.21555555555555556, "grad_norm": 1.5951566696166992, "learning_rate": 0.0001570941438102298, "loss": 1.0675, "step": 1455 }, { "epoch": 0.2157037037037037, "grad_norm": 1.4753663539886475, "learning_rate": 0.00015706449221645664, "loss": 1.1141, "step": 1456 }, { "epoch": 0.21585185185185185, "grad_norm": 1.8130995035171509, "learning_rate": 0.00015703484062268348, "loss": 1.0426, "step": 1457 }, { "epoch": 0.216, "grad_norm": 1.3725109100341797, "learning_rate": 0.0001570051890289103, "loss": 1.1511, "step": 1458 }, { "epoch": 0.21614814814814814, "grad_norm": 1.8450452089309692, "learning_rate": 0.00015697553743513715, "loss": 0.9767, "step": 1459 }, { "epoch": 0.2162962962962963, "grad_norm": 1.2268054485321045, "learning_rate": 0.000156945885841364, "loss": 0.8711, "step": 1460 }, { "epoch": 0.21644444444444444, "grad_norm": 1.552537441253662, "learning_rate": 0.0001569162342475908, "loss": 1.0715, "step": 1461 }, { "epoch": 0.2165925925925926, "grad_norm": 5.605630397796631, "learning_rate": 0.00015688658265381765, "loss": 1.2085, "step": 1462 }, { "epoch": 0.21674074074074073, "grad_norm": 2.071743965148926, "learning_rate": 0.0001568569310600445, "loss": 1.1515, "step": 1463 }, { "epoch": 0.21688888888888888, "grad_norm": 2.2309176921844482, "learning_rate": 0.0001568272794662713, "loss": 1.1824, "step": 1464 }, { "epoch": 0.21703703703703703, 
"grad_norm": 2.1204428672790527, "learning_rate": 0.00015679762787249816, "loss": 0.9924, "step": 1465 }, { "epoch": 0.21718518518518518, "grad_norm": 1.2242668867111206, "learning_rate": 0.000156767976278725, "loss": 0.9135, "step": 1466 }, { "epoch": 0.21733333333333332, "grad_norm": 3.142942428588867, "learning_rate": 0.00015673832468495182, "loss": 1.2583, "step": 1467 }, { "epoch": 0.21748148148148147, "grad_norm": 2.464092969894409, "learning_rate": 0.00015670867309117864, "loss": 1.0696, "step": 1468 }, { "epoch": 0.21762962962962962, "grad_norm": 1.8219599723815918, "learning_rate": 0.0001566790214974055, "loss": 1.0506, "step": 1469 }, { "epoch": 0.21777777777777776, "grad_norm": 1.3650825023651123, "learning_rate": 0.00015664936990363233, "loss": 1.217, "step": 1470 }, { "epoch": 0.21792592592592594, "grad_norm": 1.90289306640625, "learning_rate": 0.00015661971830985914, "loss": 1.3279, "step": 1471 }, { "epoch": 0.2180740740740741, "grad_norm": 1.2393357753753662, "learning_rate": 0.00015659006671608601, "loss": 1.1024, "step": 1472 }, { "epoch": 0.21822222222222223, "grad_norm": 1.8062958717346191, "learning_rate": 0.00015656041512231283, "loss": 1.1021, "step": 1473 }, { "epoch": 0.21837037037037038, "grad_norm": 1.8752361536026, "learning_rate": 0.00015653076352853965, "loss": 1.4975, "step": 1474 }, { "epoch": 0.21851851851851853, "grad_norm": 1.5685244798660278, "learning_rate": 0.00015650111193476652, "loss": 0.9332, "step": 1475 }, { "epoch": 0.21866666666666668, "grad_norm": 1.5438872575759888, "learning_rate": 0.00015647146034099334, "loss": 1.2319, "step": 1476 }, { "epoch": 0.21881481481481482, "grad_norm": 1.8651984930038452, "learning_rate": 0.00015644180874722016, "loss": 1.1674, "step": 1477 }, { "epoch": 0.21896296296296297, "grad_norm": 1.3855433464050293, "learning_rate": 0.00015641215715344703, "loss": 1.0712, "step": 1478 }, { "epoch": 0.21911111111111112, "grad_norm": 1.3934948444366455, "learning_rate": 0.00015638250555967384, 
"loss": 1.0684, "step": 1479 }, { "epoch": 0.21925925925925926, "grad_norm": 1.8999346494674683, "learning_rate": 0.00015635285396590066, "loss": 0.9289, "step": 1480 }, { "epoch": 0.2194074074074074, "grad_norm": 1.4764175415039062, "learning_rate": 0.0001563232023721275, "loss": 1.1297, "step": 1481 }, { "epoch": 0.21955555555555556, "grad_norm": 1.6788074970245361, "learning_rate": 0.00015629355077835435, "loss": 0.8778, "step": 1482 }, { "epoch": 0.2197037037037037, "grad_norm": 1.6045212745666504, "learning_rate": 0.00015626389918458117, "loss": 1.1411, "step": 1483 }, { "epoch": 0.21985185185185185, "grad_norm": 1.5411138534545898, "learning_rate": 0.000156234247590808, "loss": 1.1933, "step": 1484 }, { "epoch": 0.22, "grad_norm": 1.9160683155059814, "learning_rate": 0.00015620459599703486, "loss": 0.9902, "step": 1485 }, { "epoch": 0.22014814814814815, "grad_norm": 1.3305299282073975, "learning_rate": 0.00015617494440326167, "loss": 0.8694, "step": 1486 }, { "epoch": 0.2202962962962963, "grad_norm": 2.571554660797119, "learning_rate": 0.00015614529280948852, "loss": 1.1888, "step": 1487 }, { "epoch": 0.22044444444444444, "grad_norm": 2.218079090118408, "learning_rate": 0.00015611564121571536, "loss": 1.2693, "step": 1488 }, { "epoch": 0.2205925925925926, "grad_norm": 1.1321967840194702, "learning_rate": 0.00015608598962194218, "loss": 1.2222, "step": 1489 }, { "epoch": 0.22074074074074074, "grad_norm": 1.371054768562317, "learning_rate": 0.00015605633802816903, "loss": 1.1179, "step": 1490 }, { "epoch": 0.22088888888888888, "grad_norm": 1.216324806213379, "learning_rate": 0.00015602668643439587, "loss": 1.1176, "step": 1491 }, { "epoch": 0.22103703703703703, "grad_norm": 1.2088987827301025, "learning_rate": 0.0001559970348406227, "loss": 1.2248, "step": 1492 }, { "epoch": 0.22118518518518518, "grad_norm": 1.882351279258728, "learning_rate": 0.00015596738324684953, "loss": 1.3442, "step": 1493 }, { "epoch": 0.22133333333333333, "grad_norm": 
1.4370276927947998, "learning_rate": 0.00015593773165307638, "loss": 1.3169, "step": 1494 }, { "epoch": 0.22148148148148147, "grad_norm": 2.124148368835449, "learning_rate": 0.0001559080800593032, "loss": 1.2403, "step": 1495 }, { "epoch": 0.22162962962962962, "grad_norm": 1.5456814765930176, "learning_rate": 0.00015587842846553, "loss": 1.2922, "step": 1496 }, { "epoch": 0.22177777777777777, "grad_norm": 1.3349412679672241, "learning_rate": 0.00015584877687175688, "loss": 1.351, "step": 1497 }, { "epoch": 0.22192592592592592, "grad_norm": 1.2368528842926025, "learning_rate": 0.0001558191252779837, "loss": 1.2629, "step": 1498 }, { "epoch": 0.22207407407407406, "grad_norm": 1.2499349117279053, "learning_rate": 0.00015578947368421052, "loss": 1.2562, "step": 1499 }, { "epoch": 0.2222222222222222, "grad_norm": 1.4824923276901245, "learning_rate": 0.0001557598220904374, "loss": 1.1449, "step": 1500 }, { "epoch": 0.22237037037037036, "grad_norm": 1.833436131477356, "learning_rate": 0.0001557301704966642, "loss": 1.1581, "step": 1501 }, { "epoch": 0.22251851851851853, "grad_norm": 1.676798701286316, "learning_rate": 0.00015570051890289102, "loss": 1.1682, "step": 1502 }, { "epoch": 0.22266666666666668, "grad_norm": 2.183213949203491, "learning_rate": 0.00015567086730911787, "loss": 0.9933, "step": 1503 }, { "epoch": 0.22281481481481483, "grad_norm": 1.5602302551269531, "learning_rate": 0.0001556412157153447, "loss": 1.1624, "step": 1504 }, { "epoch": 0.22296296296296297, "grad_norm": 1.3274348974227905, "learning_rate": 0.00015561156412157153, "loss": 0.9692, "step": 1505 }, { "epoch": 0.22311111111111112, "grad_norm": 1.28597891330719, "learning_rate": 0.00015558191252779837, "loss": 1.0527, "step": 1506 }, { "epoch": 0.22325925925925927, "grad_norm": 1.9549756050109863, "learning_rate": 0.00015555226093402522, "loss": 1.0626, "step": 1507 }, { "epoch": 0.22340740740740742, "grad_norm": 1.334690809249878, "learning_rate": 0.00015552260934025204, "loss": 1.1926, "step": 
1508 }, { "epoch": 0.22355555555555556, "grad_norm": 1.8309886455535889, "learning_rate": 0.00015549295774647888, "loss": 1.1651, "step": 1509 }, { "epoch": 0.2237037037037037, "grad_norm": 1.4169001579284668, "learning_rate": 0.00015546330615270572, "loss": 1.1697, "step": 1510 }, { "epoch": 0.22385185185185186, "grad_norm": 1.2612583637237549, "learning_rate": 0.00015543365455893254, "loss": 1.0604, "step": 1511 }, { "epoch": 0.224, "grad_norm": 1.4787007570266724, "learning_rate": 0.0001554040029651594, "loss": 1.1787, "step": 1512 }, { "epoch": 0.22414814814814815, "grad_norm": 2.134244918823242, "learning_rate": 0.00015537435137138623, "loss": 1.0592, "step": 1513 }, { "epoch": 0.2242962962962963, "grad_norm": 2.822035312652588, "learning_rate": 0.00015534469977761305, "loss": 1.1476, "step": 1514 }, { "epoch": 0.22444444444444445, "grad_norm": 1.7734057903289795, "learning_rate": 0.0001553150481838399, "loss": 1.039, "step": 1515 }, { "epoch": 0.2245925925925926, "grad_norm": 1.3871924877166748, "learning_rate": 0.00015528539659006674, "loss": 1.0588, "step": 1516 }, { "epoch": 0.22474074074074074, "grad_norm": 1.3794752359390259, "learning_rate": 0.00015525574499629355, "loss": 1.254, "step": 1517 }, { "epoch": 0.2248888888888889, "grad_norm": 1.6303880214691162, "learning_rate": 0.0001552260934025204, "loss": 1.5117, "step": 1518 }, { "epoch": 0.22503703703703704, "grad_norm": 2.4256293773651123, "learning_rate": 0.00015519644180874722, "loss": 1.1298, "step": 1519 }, { "epoch": 0.22518518518518518, "grad_norm": 2.3574140071868896, "learning_rate": 0.00015516679021497406, "loss": 1.2046, "step": 1520 }, { "epoch": 0.22533333333333333, "grad_norm": 1.2977325916290283, "learning_rate": 0.0001551371386212009, "loss": 1.0207, "step": 1521 }, { "epoch": 0.22548148148148148, "grad_norm": 2.1307456493377686, "learning_rate": 0.00015510748702742772, "loss": 1.2669, "step": 1522 }, { "epoch": 0.22562962962962962, "grad_norm": 2.4574077129364014, "learning_rate": 
0.00015507783543365457, "loss": 1.2679, "step": 1523 }, { "epoch": 0.22577777777777777, "grad_norm": 1.542917251586914, "learning_rate": 0.0001550481838398814, "loss": 1.1871, "step": 1524 }, { "epoch": 0.22592592592592592, "grad_norm": 1.4039915800094604, "learning_rate": 0.00015501853224610823, "loss": 0.9045, "step": 1525 }, { "epoch": 0.22607407407407407, "grad_norm": 4.266626834869385, "learning_rate": 0.00015498888065233507, "loss": 1.4225, "step": 1526 }, { "epoch": 0.2262222222222222, "grad_norm": 2.024946451187134, "learning_rate": 0.00015495922905856192, "loss": 1.1265, "step": 1527 }, { "epoch": 0.22637037037037036, "grad_norm": 3.2204794883728027, "learning_rate": 0.00015492957746478874, "loss": 1.4952, "step": 1528 }, { "epoch": 0.2265185185185185, "grad_norm": 2.093524932861328, "learning_rate": 0.00015489992587101558, "loss": 1.2728, "step": 1529 }, { "epoch": 0.22666666666666666, "grad_norm": 2.0702030658721924, "learning_rate": 0.00015487027427724242, "loss": 1.047, "step": 1530 }, { "epoch": 0.2268148148148148, "grad_norm": 1.324537754058838, "learning_rate": 0.00015484062268346924, "loss": 1.3573, "step": 1531 }, { "epoch": 0.22696296296296295, "grad_norm": 3.2942769527435303, "learning_rate": 0.00015481097108969609, "loss": 1.0047, "step": 1532 }, { "epoch": 0.22711111111111112, "grad_norm": 1.4537463188171387, "learning_rate": 0.0001547813194959229, "loss": 1.1266, "step": 1533 }, { "epoch": 0.22725925925925927, "grad_norm": 1.2477037906646729, "learning_rate": 0.00015475166790214975, "loss": 1.1559, "step": 1534 }, { "epoch": 0.22740740740740742, "grad_norm": 2.4413416385650635, "learning_rate": 0.00015472201630837657, "loss": 0.8902, "step": 1535 }, { "epoch": 0.22755555555555557, "grad_norm": 1.791354775428772, "learning_rate": 0.0001546923647146034, "loss": 1.1871, "step": 1536 }, { "epoch": 0.2277037037037037, "grad_norm": 4.12494421005249, "learning_rate": 0.00015466271312083025, "loss": 1.1432, "step": 1537 }, { "epoch": 
0.22785185185185186, "grad_norm": 1.4111377000808716, "learning_rate": 0.00015463306152705707, "loss": 1.1645, "step": 1538 }, { "epoch": 0.228, "grad_norm": 1.289906620979309, "learning_rate": 0.00015460340993328392, "loss": 1.1256, "step": 1539 }, { "epoch": 0.22814814814814816, "grad_norm": 1.2543591260910034, "learning_rate": 0.00015457375833951076, "loss": 1.1597, "step": 1540 }, { "epoch": 0.2282962962962963, "grad_norm": 1.2581937313079834, "learning_rate": 0.00015454410674573758, "loss": 1.0717, "step": 1541 }, { "epoch": 0.22844444444444445, "grad_norm": 2.8613216876983643, "learning_rate": 0.00015451445515196442, "loss": 1.1478, "step": 1542 }, { "epoch": 0.2285925925925926, "grad_norm": 1.1726185083389282, "learning_rate": 0.00015448480355819127, "loss": 1.1574, "step": 1543 }, { "epoch": 0.22874074074074074, "grad_norm": 1.0736782550811768, "learning_rate": 0.00015445515196441808, "loss": 0.7646, "step": 1544 }, { "epoch": 0.2288888888888889, "grad_norm": 1.8339978456497192, "learning_rate": 0.00015442550037064493, "loss": 1.0807, "step": 1545 }, { "epoch": 0.22903703703703704, "grad_norm": 1.7938997745513916, "learning_rate": 0.00015439584877687177, "loss": 1.4722, "step": 1546 }, { "epoch": 0.2291851851851852, "grad_norm": 1.0827606916427612, "learning_rate": 0.0001543661971830986, "loss": 1.0839, "step": 1547 }, { "epoch": 0.22933333333333333, "grad_norm": 1.4679458141326904, "learning_rate": 0.00015433654558932543, "loss": 1.0397, "step": 1548 }, { "epoch": 0.22948148148148148, "grad_norm": 1.7668004035949707, "learning_rate": 0.00015430689399555228, "loss": 0.9854, "step": 1549 }, { "epoch": 0.22962962962962963, "grad_norm": 1.282181739807129, "learning_rate": 0.0001542772424017791, "loss": 1.2771, "step": 1550 }, { "epoch": 0.22977777777777778, "grad_norm": 1.6717363595962524, "learning_rate": 0.00015424759080800594, "loss": 1.0952, "step": 1551 }, { "epoch": 0.22992592592592592, "grad_norm": 1.4578113555908203, "learning_rate": 
0.00015421793921423279, "loss": 1.1101, "step": 1552 }, { "epoch": 0.23007407407407407, "grad_norm": 1.8867779970169067, "learning_rate": 0.0001541882876204596, "loss": 1.0367, "step": 1553 }, { "epoch": 0.23022222222222222, "grad_norm": 1.3532156944274902, "learning_rate": 0.00015415863602668642, "loss": 0.972, "step": 1554 }, { "epoch": 0.23037037037037036, "grad_norm": 1.3631467819213867, "learning_rate": 0.0001541289844329133, "loss": 0.9205, "step": 1555 }, { "epoch": 0.2305185185185185, "grad_norm": 2.7105939388275146, "learning_rate": 0.0001540993328391401, "loss": 1.2536, "step": 1556 }, { "epoch": 0.23066666666666666, "grad_norm": 1.349776029586792, "learning_rate": 0.00015406968124536693, "loss": 1.3044, "step": 1557 }, { "epoch": 0.2308148148148148, "grad_norm": 1.2606697082519531, "learning_rate": 0.0001540400296515938, "loss": 1.2882, "step": 1558 }, { "epoch": 0.23096296296296295, "grad_norm": 4.026727199554443, "learning_rate": 0.00015401037805782062, "loss": 0.9589, "step": 1559 }, { "epoch": 0.2311111111111111, "grad_norm": 2.5214037895202637, "learning_rate": 0.00015398072646404743, "loss": 1.2341, "step": 1560 }, { "epoch": 0.23125925925925925, "grad_norm": 1.9056528806686401, "learning_rate": 0.0001539510748702743, "loss": 0.7476, "step": 1561 }, { "epoch": 0.2314074074074074, "grad_norm": 5.390804767608643, "learning_rate": 0.00015392142327650112, "loss": 1.2445, "step": 1562 }, { "epoch": 0.23155555555555554, "grad_norm": 1.888152837753296, "learning_rate": 0.00015389177168272794, "loss": 0.9465, "step": 1563 }, { "epoch": 0.23170370370370372, "grad_norm": 2.0070877075195312, "learning_rate": 0.0001538621200889548, "loss": 1.1051, "step": 1564 }, { "epoch": 0.23185185185185186, "grad_norm": 1.2344915866851807, "learning_rate": 0.00015383246849518163, "loss": 0.9859, "step": 1565 }, { "epoch": 0.232, "grad_norm": 1.742415428161621, "learning_rate": 0.00015380281690140845, "loss": 1.1889, "step": 1566 }, { "epoch": 0.23214814814814816, 
"grad_norm": 3.2890334129333496, "learning_rate": 0.0001537731653076353, "loss": 1.3544, "step": 1567 }, { "epoch": 0.2322962962962963, "grad_norm": 1.8093900680541992, "learning_rate": 0.00015374351371386213, "loss": 1.1094, "step": 1568 }, { "epoch": 0.23244444444444445, "grad_norm": 2.2620792388916016, "learning_rate": 0.00015371386212008895, "loss": 1.1049, "step": 1569 }, { "epoch": 0.2325925925925926, "grad_norm": 1.7136729955673218, "learning_rate": 0.0001536842105263158, "loss": 1.1256, "step": 1570 }, { "epoch": 0.23274074074074075, "grad_norm": 1.5654417276382446, "learning_rate": 0.00015365455893254264, "loss": 1.1673, "step": 1571 }, { "epoch": 0.2328888888888889, "grad_norm": 1.0836280584335327, "learning_rate": 0.00015362490733876946, "loss": 1.2182, "step": 1572 }, { "epoch": 0.23303703703703704, "grad_norm": 1.9667060375213623, "learning_rate": 0.0001535952557449963, "loss": 1.1324, "step": 1573 }, { "epoch": 0.2331851851851852, "grad_norm": 1.7171205282211304, "learning_rate": 0.00015356560415122315, "loss": 1.1833, "step": 1574 }, { "epoch": 0.23333333333333334, "grad_norm": 1.2634750604629517, "learning_rate": 0.00015353595255744996, "loss": 1.2652, "step": 1575 }, { "epoch": 0.23348148148148148, "grad_norm": 1.0828866958618164, "learning_rate": 0.0001535063009636768, "loss": 0.8981, "step": 1576 }, { "epoch": 0.23362962962962963, "grad_norm": 1.451007604598999, "learning_rate": 0.00015347664936990365, "loss": 1.0465, "step": 1577 }, { "epoch": 0.23377777777777778, "grad_norm": 1.5012891292572021, "learning_rate": 0.00015344699777613047, "loss": 1.3538, "step": 1578 }, { "epoch": 0.23392592592592593, "grad_norm": 1.3202357292175293, "learning_rate": 0.00015341734618235731, "loss": 1.192, "step": 1579 }, { "epoch": 0.23407407407407407, "grad_norm": 2.689044952392578, "learning_rate": 0.00015338769458858416, "loss": 1.127, "step": 1580 }, { "epoch": 0.23422222222222222, "grad_norm": 1.9422969818115234, "learning_rate": 0.00015335804299481098, 
"loss": 0.9591, "step": 1581 }, { "epoch": 0.23437037037037037, "grad_norm": 1.6667768955230713, "learning_rate": 0.0001533283914010378, "loss": 1.227, "step": 1582 }, { "epoch": 0.23451851851851852, "grad_norm": 2.025862216949463, "learning_rate": 0.00015329873980726467, "loss": 1.1708, "step": 1583 }, { "epoch": 0.23466666666666666, "grad_norm": 3.5528149604797363, "learning_rate": 0.00015326908821349148, "loss": 1.2254, "step": 1584 }, { "epoch": 0.2348148148148148, "grad_norm": 1.451198935508728, "learning_rate": 0.0001532394366197183, "loss": 1.1741, "step": 1585 }, { "epoch": 0.23496296296296296, "grad_norm": 1.3051122426986694, "learning_rate": 0.00015320978502594517, "loss": 1.0806, "step": 1586 }, { "epoch": 0.2351111111111111, "grad_norm": 1.3965210914611816, "learning_rate": 0.000153180133432172, "loss": 1.1295, "step": 1587 }, { "epoch": 0.23525925925925925, "grad_norm": 1.2600857019424438, "learning_rate": 0.0001531504818383988, "loss": 0.9167, "step": 1588 }, { "epoch": 0.2354074074074074, "grad_norm": 2.6228692531585693, "learning_rate": 0.00015312083024462565, "loss": 1.394, "step": 1589 }, { "epoch": 0.23555555555555555, "grad_norm": 1.4789807796478271, "learning_rate": 0.0001530911786508525, "loss": 1.2326, "step": 1590 }, { "epoch": 0.2357037037037037, "grad_norm": 1.1614006757736206, "learning_rate": 0.0001530615270570793, "loss": 0.8976, "step": 1591 }, { "epoch": 0.23585185185185184, "grad_norm": 3.0136983394622803, "learning_rate": 0.00015303187546330616, "loss": 1.0333, "step": 1592 }, { "epoch": 0.236, "grad_norm": 1.777445912361145, "learning_rate": 0.000153002223869533, "loss": 1.3849, "step": 1593 }, { "epoch": 0.23614814814814813, "grad_norm": 2.2834534645080566, "learning_rate": 0.00015297257227575982, "loss": 1.5408, "step": 1594 }, { "epoch": 0.2362962962962963, "grad_norm": 1.2909154891967773, "learning_rate": 0.00015294292068198666, "loss": 1.1083, "step": 1595 }, { "epoch": 0.23644444444444446, "grad_norm": 1.6266371011734009, 
"learning_rate": 0.0001529132690882135, "loss": 1.2947, "step": 1596 }, { "epoch": 0.2365925925925926, "grad_norm": 1.3940317630767822, "learning_rate": 0.00015288361749444033, "loss": 0.9727, "step": 1597 }, { "epoch": 0.23674074074074075, "grad_norm": 1.4263591766357422, "learning_rate": 0.00015285396590066717, "loss": 1.0053, "step": 1598 }, { "epoch": 0.2368888888888889, "grad_norm": 1.289556622505188, "learning_rate": 0.00015282431430689401, "loss": 1.1829, "step": 1599 }, { "epoch": 0.23703703703703705, "grad_norm": 1.2458724975585938, "learning_rate": 0.00015279466271312083, "loss": 0.9879, "step": 1600 }, { "epoch": 0.2371851851851852, "grad_norm": 0.9497737288475037, "learning_rate": 0.00015276501111934768, "loss": 1.1464, "step": 1601 }, { "epoch": 0.23733333333333334, "grad_norm": 1.2145698070526123, "learning_rate": 0.00015273535952557452, "loss": 1.176, "step": 1602 }, { "epoch": 0.2374814814814815, "grad_norm": 1.0378276109695435, "learning_rate": 0.00015270570793180134, "loss": 1.053, "step": 1603 }, { "epoch": 0.23762962962962964, "grad_norm": 1.327704668045044, "learning_rate": 0.00015267605633802818, "loss": 1.2492, "step": 1604 }, { "epoch": 0.23777777777777778, "grad_norm": 1.621195912361145, "learning_rate": 0.000152646404744255, "loss": 1.2555, "step": 1605 }, { "epoch": 0.23792592592592593, "grad_norm": 1.1727124452590942, "learning_rate": 0.00015261675315048184, "loss": 1.0607, "step": 1606 }, { "epoch": 0.23807407407407408, "grad_norm": 2.3401219844818115, "learning_rate": 0.0001525871015567087, "loss": 1.1388, "step": 1607 }, { "epoch": 0.23822222222222222, "grad_norm": 1.2623165845870972, "learning_rate": 0.0001525574499629355, "loss": 0.8555, "step": 1608 }, { "epoch": 0.23837037037037037, "grad_norm": 1.048319935798645, "learning_rate": 0.00015252779836916235, "loss": 1.0111, "step": 1609 }, { "epoch": 0.23851851851851852, "grad_norm": 2.189643383026123, "learning_rate": 0.0001524981467753892, "loss": 1.0949, "step": 1610 }, { "epoch": 
0.23866666666666667, "grad_norm": 3.7101433277130127, "learning_rate": 0.000152468495181616, "loss": 1.3376, "step": 1611 }, { "epoch": 0.2388148148148148, "grad_norm": 1.1837656497955322, "learning_rate": 0.00015243884358784286, "loss": 0.9051, "step": 1612 }, { "epoch": 0.23896296296296296, "grad_norm": 1.5767087936401367, "learning_rate": 0.0001524091919940697, "loss": 1.1235, "step": 1613 }, { "epoch": 0.2391111111111111, "grad_norm": 1.9962878227233887, "learning_rate": 0.00015237954040029652, "loss": 1.2182, "step": 1614 }, { "epoch": 0.23925925925925925, "grad_norm": 1.3688435554504395, "learning_rate": 0.00015234988880652336, "loss": 0.973, "step": 1615 }, { "epoch": 0.2394074074074074, "grad_norm": 1.3422621488571167, "learning_rate": 0.0001523202372127502, "loss": 1.266, "step": 1616 }, { "epoch": 0.23955555555555555, "grad_norm": 1.719692587852478, "learning_rate": 0.00015229058561897702, "loss": 1.3915, "step": 1617 }, { "epoch": 0.2397037037037037, "grad_norm": 1.1138745546340942, "learning_rate": 0.00015226093402520387, "loss": 0.8391, "step": 1618 }, { "epoch": 0.23985185185185184, "grad_norm": 1.1409568786621094, "learning_rate": 0.0001522312824314307, "loss": 1.0422, "step": 1619 }, { "epoch": 0.24, "grad_norm": 1.4203941822052002, "learning_rate": 0.00015220163083765753, "loss": 0.9852, "step": 1620 }, { "epoch": 0.24014814814814814, "grad_norm": 1.5709267854690552, "learning_rate": 0.00015217197924388435, "loss": 1.1323, "step": 1621 }, { "epoch": 0.24029629629629629, "grad_norm": 1.2581415176391602, "learning_rate": 0.0001521423276501112, "loss": 1.2246, "step": 1622 }, { "epoch": 0.24044444444444443, "grad_norm": 1.197487711906433, "learning_rate": 0.00015211267605633804, "loss": 1.2816, "step": 1623 }, { "epoch": 0.24059259259259258, "grad_norm": 1.5059443712234497, "learning_rate": 0.00015208302446256485, "loss": 1.2273, "step": 1624 }, { "epoch": 0.24074074074074073, "grad_norm": 1.7525125741958618, "learning_rate": 0.0001520533728687917, 
"loss": 0.9537, "step": 1625 }, { "epoch": 0.2408888888888889, "grad_norm": 1.6179605722427368, "learning_rate": 0.00015202372127501854, "loss": 1.276, "step": 1626 }, { "epoch": 0.24103703703703705, "grad_norm": 1.1902782917022705, "learning_rate": 0.00015199406968124536, "loss": 0.9585, "step": 1627 }, { "epoch": 0.2411851851851852, "grad_norm": 1.3122388124465942, "learning_rate": 0.0001519644180874722, "loss": 1.3009, "step": 1628 }, { "epoch": 0.24133333333333334, "grad_norm": 1.5132875442504883, "learning_rate": 0.00015193476649369905, "loss": 1.1741, "step": 1629 }, { "epoch": 0.2414814814814815, "grad_norm": 1.6310935020446777, "learning_rate": 0.00015190511489992587, "loss": 1.3513, "step": 1630 }, { "epoch": 0.24162962962962964, "grad_norm": 1.2689038515090942, "learning_rate": 0.0001518754633061527, "loss": 1.022, "step": 1631 }, { "epoch": 0.24177777777777779, "grad_norm": 6.270352363586426, "learning_rate": 0.00015184581171237956, "loss": 1.2905, "step": 1632 }, { "epoch": 0.24192592592592593, "grad_norm": 4.507718086242676, "learning_rate": 0.00015181616011860637, "loss": 1.1989, "step": 1633 }, { "epoch": 0.24207407407407408, "grad_norm": 1.1099902391433716, "learning_rate": 0.00015178650852483322, "loss": 1.2158, "step": 1634 }, { "epoch": 0.24222222222222223, "grad_norm": 2.4325876235961914, "learning_rate": 0.00015175685693106006, "loss": 0.954, "step": 1635 }, { "epoch": 0.24237037037037037, "grad_norm": 3.1827144622802734, "learning_rate": 0.00015172720533728688, "loss": 1.3827, "step": 1636 }, { "epoch": 0.24251851851851852, "grad_norm": 1.328933835029602, "learning_rate": 0.00015169755374351372, "loss": 1.0837, "step": 1637 }, { "epoch": 0.24266666666666667, "grad_norm": 1.7154287099838257, "learning_rate": 0.00015166790214974057, "loss": 1.1321, "step": 1638 }, { "epoch": 0.24281481481481482, "grad_norm": 2.116061210632324, "learning_rate": 0.00015163825055596739, "loss": 1.0563, "step": 1639 }, { "epoch": 0.24296296296296296, "grad_norm": 
1.2728734016418457, "learning_rate": 0.0001516085989621942, "loss": 1.1292, "step": 1640 }, { "epoch": 0.2431111111111111, "grad_norm": 1.740496039390564, "learning_rate": 0.00015157894736842108, "loss": 1.144, "step": 1641 }, { "epoch": 0.24325925925925926, "grad_norm": 1.2164138555526733, "learning_rate": 0.0001515492957746479, "loss": 0.9607, "step": 1642 }, { "epoch": 0.2434074074074074, "grad_norm": 1.3772945404052734, "learning_rate": 0.0001515196441808747, "loss": 1.204, "step": 1643 }, { "epoch": 0.24355555555555555, "grad_norm": 2.048625946044922, "learning_rate": 0.00015148999258710158, "loss": 1.2636, "step": 1644 }, { "epoch": 0.2437037037037037, "grad_norm": 1.2583869695663452, "learning_rate": 0.0001514603409933284, "loss": 0.97, "step": 1645 }, { "epoch": 0.24385185185185185, "grad_norm": 1.6000150442123413, "learning_rate": 0.00015143068939955522, "loss": 1.3946, "step": 1646 }, { "epoch": 0.244, "grad_norm": 1.0762135982513428, "learning_rate": 0.0001514010378057821, "loss": 1.0259, "step": 1647 }, { "epoch": 0.24414814814814814, "grad_norm": 1.103447675704956, "learning_rate": 0.0001513713862120089, "loss": 1.0098, "step": 1648 }, { "epoch": 0.2442962962962963, "grad_norm": 1.3879259824752808, "learning_rate": 0.00015134173461823572, "loss": 0.9891, "step": 1649 }, { "epoch": 0.24444444444444444, "grad_norm": 1.2222763299942017, "learning_rate": 0.0001513120830244626, "loss": 1.0233, "step": 1650 }, { "epoch": 0.24459259259259258, "grad_norm": 2.4225425720214844, "learning_rate": 0.0001512824314306894, "loss": 1.0827, "step": 1651 }, { "epoch": 0.24474074074074073, "grad_norm": 1.1073728799819946, "learning_rate": 0.00015125277983691623, "loss": 1.1398, "step": 1652 }, { "epoch": 0.24488888888888888, "grad_norm": 1.8750205039978027, "learning_rate": 0.00015122312824314307, "loss": 1.0011, "step": 1653 }, { "epoch": 0.24503703703703703, "grad_norm": 1.8179675340652466, "learning_rate": 0.00015119347664936992, "loss": 1.2671, "step": 1654 }, { 
"epoch": 0.24518518518518517, "grad_norm": 11.022109031677246, "learning_rate": 0.00015116382505559673, "loss": 1.124, "step": 1655 }, { "epoch": 0.24533333333333332, "grad_norm": 1.687408208847046, "learning_rate": 0.00015113417346182358, "loss": 1.1028, "step": 1656 }, { "epoch": 0.2454814814814815, "grad_norm": 1.3397771120071411, "learning_rate": 0.00015110452186805042, "loss": 1.2922, "step": 1657 }, { "epoch": 0.24562962962962964, "grad_norm": 1.9326858520507812, "learning_rate": 0.00015107487027427724, "loss": 1.0396, "step": 1658 }, { "epoch": 0.2457777777777778, "grad_norm": 2.765787124633789, "learning_rate": 0.00015104521868050409, "loss": 1.1338, "step": 1659 }, { "epoch": 0.24592592592592594, "grad_norm": 1.0958811044692993, "learning_rate": 0.00015101556708673093, "loss": 0.9947, "step": 1660 }, { "epoch": 0.24607407407407408, "grad_norm": 2.3440728187561035, "learning_rate": 0.00015098591549295775, "loss": 1.2989, "step": 1661 }, { "epoch": 0.24622222222222223, "grad_norm": 1.3007993698120117, "learning_rate": 0.0001509562638991846, "loss": 1.357, "step": 1662 }, { "epoch": 0.24637037037037038, "grad_norm": 1.1814420223236084, "learning_rate": 0.00015092661230541144, "loss": 1.417, "step": 1663 }, { "epoch": 0.24651851851851853, "grad_norm": 1.2551305294036865, "learning_rate": 0.00015089696071163825, "loss": 1.2175, "step": 1664 }, { "epoch": 0.24666666666666667, "grad_norm": 1.9871037006378174, "learning_rate": 0.0001508673091178651, "loss": 1.1472, "step": 1665 }, { "epoch": 0.24681481481481482, "grad_norm": 1.1334728002548218, "learning_rate": 0.00015083765752409194, "loss": 1.0093, "step": 1666 }, { "epoch": 0.24696296296296297, "grad_norm": 1.8311874866485596, "learning_rate": 0.00015080800593031876, "loss": 1.1317, "step": 1667 }, { "epoch": 0.24711111111111111, "grad_norm": 1.4071077108383179, "learning_rate": 0.0001507783543365456, "loss": 1.0525, "step": 1668 }, { "epoch": 0.24725925925925926, "grad_norm": 1.2422730922698975, 
"learning_rate": 0.00015074870274277245, "loss": 1.0748, "step": 1669 }, { "epoch": 0.2474074074074074, "grad_norm": 1.0326001644134521, "learning_rate": 0.00015071905114899927, "loss": 1.2629, "step": 1670 }, { "epoch": 0.24755555555555556, "grad_norm": 1.173397183418274, "learning_rate": 0.00015068939955522608, "loss": 1.2473, "step": 1671 }, { "epoch": 0.2477037037037037, "grad_norm": 0.9199334979057312, "learning_rate": 0.00015065974796145296, "loss": 0.9084, "step": 1672 }, { "epoch": 0.24785185185185185, "grad_norm": 2.1282355785369873, "learning_rate": 0.00015063009636767977, "loss": 1.0612, "step": 1673 }, { "epoch": 0.248, "grad_norm": 1.5971500873565674, "learning_rate": 0.0001506004447739066, "loss": 1.1243, "step": 1674 }, { "epoch": 0.24814814814814815, "grad_norm": 1.3204591274261475, "learning_rate": 0.00015057079318013343, "loss": 1.1751, "step": 1675 }, { "epoch": 0.2482962962962963, "grad_norm": 1.365966558456421, "learning_rate": 0.00015054114158636028, "loss": 1.2581, "step": 1676 }, { "epoch": 0.24844444444444444, "grad_norm": 2.5307705402374268, "learning_rate": 0.0001505114899925871, "loss": 1.0367, "step": 1677 }, { "epoch": 0.2485925925925926, "grad_norm": 1.1115132570266724, "learning_rate": 0.00015048183839881394, "loss": 1.0591, "step": 1678 }, { "epoch": 0.24874074074074073, "grad_norm": 1.1425161361694336, "learning_rate": 0.00015045218680504079, "loss": 0.9711, "step": 1679 }, { "epoch": 0.24888888888888888, "grad_norm": 0.990850031375885, "learning_rate": 0.0001504225352112676, "loss": 1.0807, "step": 1680 }, { "epoch": 0.24903703703703703, "grad_norm": 1.6098432540893555, "learning_rate": 0.00015039288361749445, "loss": 0.9325, "step": 1681 }, { "epoch": 0.24918518518518518, "grad_norm": 2.343644857406616, "learning_rate": 0.0001503632320237213, "loss": 1.0391, "step": 1682 }, { "epoch": 0.24933333333333332, "grad_norm": 1.5114959478378296, "learning_rate": 0.0001503335804299481, "loss": 1.3867, "step": 1683 }, { "epoch": 
0.24948148148148147, "grad_norm": 1.31886887550354, "learning_rate": 0.00015030392883617495, "loss": 1.1278, "step": 1684 }, { "epoch": 0.24962962962962962, "grad_norm": 1.9786417484283447, "learning_rate": 0.0001502742772424018, "loss": 1.4276, "step": 1685 }, { "epoch": 0.24977777777777777, "grad_norm": 1.586955189704895, "learning_rate": 0.00015024462564862861, "loss": 1.1962, "step": 1686 }, { "epoch": 0.2499259259259259, "grad_norm": 1.065201759338379, "learning_rate": 0.00015021497405485546, "loss": 1.0395, "step": 1687 }, { "epoch": 0.25007407407407406, "grad_norm": 2.2416906356811523, "learning_rate": 0.0001501853224610823, "loss": 0.9971, "step": 1688 }, { "epoch": 0.25022222222222223, "grad_norm": 1.2670091390609741, "learning_rate": 0.00015015567086730912, "loss": 1.0083, "step": 1689 }, { "epoch": 0.25037037037037035, "grad_norm": 1.256080150604248, "learning_rate": 0.00015012601927353597, "loss": 1.0631, "step": 1690 }, { "epoch": 0.25051851851851853, "grad_norm": 1.2797483205795288, "learning_rate": 0.00015009636767976278, "loss": 1.0362, "step": 1691 }, { "epoch": 0.25066666666666665, "grad_norm": 1.3483778238296509, "learning_rate": 0.00015006671608598963, "loss": 1.0629, "step": 1692 }, { "epoch": 0.2508148148148148, "grad_norm": 1.6776374578475952, "learning_rate": 0.00015003706449221647, "loss": 0.9733, "step": 1693 }, { "epoch": 0.25096296296296294, "grad_norm": 1.5913230180740356, "learning_rate": 0.0001500074128984433, "loss": 1.2711, "step": 1694 }, { "epoch": 0.2511111111111111, "grad_norm": 1.493897557258606, "learning_rate": 0.00014997776130467013, "loss": 1.2509, "step": 1695 }, { "epoch": 0.25125925925925924, "grad_norm": 1.1311514377593994, "learning_rate": 0.00014994810971089698, "loss": 1.0818, "step": 1696 }, { "epoch": 0.2514074074074074, "grad_norm": 1.299396276473999, "learning_rate": 0.0001499184581171238, "loss": 1.0392, "step": 1697 }, { "epoch": 0.25155555555555553, "grad_norm": 1.3660831451416016, "learning_rate": 
0.00014988880652335064, "loss": 1.0738, "step": 1698 }, { "epoch": 0.2517037037037037, "grad_norm": 2.1091654300689697, "learning_rate": 0.00014985915492957748, "loss": 1.1295, "step": 1699 }, { "epoch": 0.2518518518518518, "grad_norm": 1.4837119579315186, "learning_rate": 0.0001498295033358043, "loss": 1.3783, "step": 1700 }, { "epoch": 0.252, "grad_norm": 1.9091229438781738, "learning_rate": 0.00014979985174203115, "loss": 1.0157, "step": 1701 }, { "epoch": 0.2521481481481481, "grad_norm": 1.6074457168579102, "learning_rate": 0.000149770200148258, "loss": 1.1333, "step": 1702 }, { "epoch": 0.2522962962962963, "grad_norm": 1.9816997051239014, "learning_rate": 0.0001497405485544848, "loss": 1.1686, "step": 1703 }, { "epoch": 0.25244444444444447, "grad_norm": 1.0871987342834473, "learning_rate": 0.00014971089696071165, "loss": 1.1539, "step": 1704 }, { "epoch": 0.2525925925925926, "grad_norm": 1.8652485609054565, "learning_rate": 0.00014968124536693847, "loss": 1.2676, "step": 1705 }, { "epoch": 0.25274074074074077, "grad_norm": 1.4037657976150513, "learning_rate": 0.00014965159377316531, "loss": 0.8503, "step": 1706 }, { "epoch": 0.2528888888888889, "grad_norm": 1.6149505376815796, "learning_rate": 0.00014962194217939216, "loss": 1.3148, "step": 1707 }, { "epoch": 0.25303703703703706, "grad_norm": 2.128265142440796, "learning_rate": 0.00014959229058561898, "loss": 1.3584, "step": 1708 }, { "epoch": 0.2531851851851852, "grad_norm": 3.4360711574554443, "learning_rate": 0.00014956263899184582, "loss": 1.2946, "step": 1709 }, { "epoch": 0.25333333333333335, "grad_norm": 1.0533114671707153, "learning_rate": 0.00014953298739807264, "loss": 0.9652, "step": 1710 }, { "epoch": 0.2534814814814815, "grad_norm": 2.195936441421509, "learning_rate": 0.00014950333580429948, "loss": 1.0955, "step": 1711 }, { "epoch": 0.25362962962962965, "grad_norm": 1.2903773784637451, "learning_rate": 0.00014947368421052633, "loss": 1.1884, "step": 1712 }, { "epoch": 0.25377777777777777, 
"grad_norm": 2.6290183067321777, "learning_rate": 0.00014944403261675314, "loss": 1.0086, "step": 1713 }, { "epoch": 0.25392592592592594, "grad_norm": 1.4115326404571533, "learning_rate": 0.00014941438102298, "loss": 0.9618, "step": 1714 }, { "epoch": 0.25407407407407406, "grad_norm": 2.979445219039917, "learning_rate": 0.00014938472942920683, "loss": 1.3909, "step": 1715 }, { "epoch": 0.25422222222222224, "grad_norm": 1.03138267993927, "learning_rate": 0.00014935507783543365, "loss": 1.0494, "step": 1716 }, { "epoch": 0.25437037037037036, "grad_norm": 1.688761591911316, "learning_rate": 0.0001493254262416605, "loss": 1.0728, "step": 1717 }, { "epoch": 0.25451851851851853, "grad_norm": 1.0532830953598022, "learning_rate": 0.00014929577464788734, "loss": 1.0128, "step": 1718 }, { "epoch": 0.25466666666666665, "grad_norm": 2.199781656265259, "learning_rate": 0.00014926612305411416, "loss": 1.0623, "step": 1719 }, { "epoch": 0.2548148148148148, "grad_norm": 1.4258421659469604, "learning_rate": 0.000149236471460341, "loss": 1.1596, "step": 1720 }, { "epoch": 0.25496296296296295, "grad_norm": 0.9015241265296936, "learning_rate": 0.00014920681986656785, "loss": 1.1419, "step": 1721 }, { "epoch": 0.2551111111111111, "grad_norm": 1.3522428274154663, "learning_rate": 0.00014917716827279466, "loss": 1.2893, "step": 1722 }, { "epoch": 0.25525925925925924, "grad_norm": 1.257901906967163, "learning_rate": 0.0001491475166790215, "loss": 1.0008, "step": 1723 }, { "epoch": 0.2554074074074074, "grad_norm": 1.484906554222107, "learning_rate": 0.00014911786508524835, "loss": 1.2028, "step": 1724 }, { "epoch": 0.25555555555555554, "grad_norm": 1.1615350246429443, "learning_rate": 0.00014908821349147517, "loss": 0.8992, "step": 1725 }, { "epoch": 0.2557037037037037, "grad_norm": 2.3160672187805176, "learning_rate": 0.000149058561897702, "loss": 1.2325, "step": 1726 }, { "epoch": 0.25585185185185183, "grad_norm": 3.3724308013916016, "learning_rate": 0.00014902891030392886, "loss": 
1.1052, "step": 1727 }, { "epoch": 0.256, "grad_norm": 1.7354025840759277, "learning_rate": 0.00014899925871015568, "loss": 1.4236, "step": 1728 }, { "epoch": 0.2561481481481481, "grad_norm": 1.625180721282959, "learning_rate": 0.0001489696071163825, "loss": 1.2491, "step": 1729 }, { "epoch": 0.2562962962962963, "grad_norm": 2.191547155380249, "learning_rate": 0.00014893995552260936, "loss": 1.0228, "step": 1730 }, { "epoch": 0.2564444444444444, "grad_norm": 1.3447610139846802, "learning_rate": 0.00014891030392883618, "loss": 1.0485, "step": 1731 }, { "epoch": 0.2565925925925926, "grad_norm": 1.1098294258117676, "learning_rate": 0.000148880652335063, "loss": 0.9501, "step": 1732 }, { "epoch": 0.2567407407407407, "grad_norm": 1.87017822265625, "learning_rate": 0.00014885100074128987, "loss": 1.0179, "step": 1733 }, { "epoch": 0.2568888888888889, "grad_norm": 2.0555732250213623, "learning_rate": 0.0001488213491475167, "loss": 0.9778, "step": 1734 }, { "epoch": 0.25703703703703706, "grad_norm": 0.9215679168701172, "learning_rate": 0.0001487916975537435, "loss": 0.9877, "step": 1735 }, { "epoch": 0.2571851851851852, "grad_norm": 2.807542562484741, "learning_rate": 0.00014876204595997038, "loss": 1.2137, "step": 1736 }, { "epoch": 0.25733333333333336, "grad_norm": 1.4949638843536377, "learning_rate": 0.0001487323943661972, "loss": 1.3621, "step": 1737 }, { "epoch": 0.2574814814814815, "grad_norm": 1.8658875226974487, "learning_rate": 0.000148702742772424, "loss": 0.9667, "step": 1738 }, { "epoch": 0.25762962962962965, "grad_norm": 1.7364095449447632, "learning_rate": 0.00014867309117865086, "loss": 1.1931, "step": 1739 }, { "epoch": 0.2577777777777778, "grad_norm": 1.4463858604431152, "learning_rate": 0.0001486434395848777, "loss": 0.9939, "step": 1740 }, { "epoch": 0.25792592592592595, "grad_norm": 1.1945314407348633, "learning_rate": 0.00014861378799110452, "loss": 1.1426, "step": 1741 }, { "epoch": 0.25807407407407407, "grad_norm": 1.6998885869979858, 
"learning_rate": 0.00014858413639733136, "loss": 1.2283, "step": 1742 }, { "epoch": 0.25822222222222224, "grad_norm": 1.1630172729492188, "learning_rate": 0.0001485544848035582, "loss": 0.9967, "step": 1743 }, { "epoch": 0.25837037037037036, "grad_norm": 2.042039155960083, "learning_rate": 0.00014852483320978502, "loss": 1.1009, "step": 1744 }, { "epoch": 0.25851851851851854, "grad_norm": 1.3501461744308472, "learning_rate": 0.00014849518161601187, "loss": 1.5284, "step": 1745 }, { "epoch": 0.25866666666666666, "grad_norm": 1.1120213270187378, "learning_rate": 0.0001484655300222387, "loss": 0.8859, "step": 1746 }, { "epoch": 0.25881481481481483, "grad_norm": 1.3494079113006592, "learning_rate": 0.00014843587842846553, "loss": 1.2178, "step": 1747 }, { "epoch": 0.25896296296296295, "grad_norm": 1.2317906618118286, "learning_rate": 0.00014840622683469238, "loss": 1.0457, "step": 1748 }, { "epoch": 0.2591111111111111, "grad_norm": 1.3726729154586792, "learning_rate": 0.00014837657524091922, "loss": 0.9816, "step": 1749 }, { "epoch": 0.25925925925925924, "grad_norm": 1.264622688293457, "learning_rate": 0.00014834692364714604, "loss": 1.1851, "step": 1750 }, { "epoch": 0.2594074074074074, "grad_norm": 1.4444366693496704, "learning_rate": 0.00014831727205337288, "loss": 1.1572, "step": 1751 }, { "epoch": 0.25955555555555554, "grad_norm": 1.4414944648742676, "learning_rate": 0.00014828762045959973, "loss": 1.2479, "step": 1752 }, { "epoch": 0.2597037037037037, "grad_norm": 2.290572166442871, "learning_rate": 0.00014825796886582654, "loss": 1.0049, "step": 1753 }, { "epoch": 0.25985185185185183, "grad_norm": 1.5054208040237427, "learning_rate": 0.0001482283172720534, "loss": 1.1638, "step": 1754 }, { "epoch": 0.26, "grad_norm": 1.7715121507644653, "learning_rate": 0.00014819866567828023, "loss": 1.165, "step": 1755 }, { "epoch": 0.26014814814814813, "grad_norm": 1.6338270902633667, "learning_rate": 0.00014816901408450705, "loss": 1.2188, "step": 1756 }, { "epoch": 
0.2602962962962963, "grad_norm": 1.2027145624160767, "learning_rate": 0.00014813936249073387, "loss": 1.0042, "step": 1757 }, { "epoch": 0.2604444444444444, "grad_norm": 1.4290188550949097, "learning_rate": 0.00014810971089696074, "loss": 1.0109, "step": 1758 }, { "epoch": 0.2605925925925926, "grad_norm": 1.5197229385375977, "learning_rate": 0.00014808005930318756, "loss": 1.1907, "step": 1759 }, { "epoch": 0.2607407407407407, "grad_norm": 1.4386405944824219, "learning_rate": 0.00014805040770941437, "loss": 1.0953, "step": 1760 }, { "epoch": 0.2608888888888889, "grad_norm": 1.638106107711792, "learning_rate": 0.00014802075611564122, "loss": 1.1874, "step": 1761 }, { "epoch": 0.261037037037037, "grad_norm": 1.4412510395050049, "learning_rate": 0.00014799110452186806, "loss": 0.8044, "step": 1762 }, { "epoch": 0.2611851851851852, "grad_norm": 0.9868293404579163, "learning_rate": 0.00014796145292809488, "loss": 0.8231, "step": 1763 }, { "epoch": 0.2613333333333333, "grad_norm": 1.248249888420105, "learning_rate": 0.00014793180133432172, "loss": 0.9181, "step": 1764 }, { "epoch": 0.2614814814814815, "grad_norm": 1.0567678213119507, "learning_rate": 0.00014790214974054857, "loss": 1.0008, "step": 1765 }, { "epoch": 0.26162962962962966, "grad_norm": 2.1666345596313477, "learning_rate": 0.00014787249814677539, "loss": 1.0072, "step": 1766 }, { "epoch": 0.2617777777777778, "grad_norm": 1.361720085144043, "learning_rate": 0.00014784284655300223, "loss": 1.1215, "step": 1767 }, { "epoch": 0.26192592592592595, "grad_norm": 1.513120174407959, "learning_rate": 0.00014781319495922907, "loss": 1.1039, "step": 1768 }, { "epoch": 0.26207407407407407, "grad_norm": 1.3253802061080933, "learning_rate": 0.0001477835433654559, "loss": 1.0483, "step": 1769 }, { "epoch": 0.26222222222222225, "grad_norm": 1.3114228248596191, "learning_rate": 0.00014775389177168274, "loss": 0.8184, "step": 1770 }, { "epoch": 0.26237037037037036, "grad_norm": 1.4233734607696533, "learning_rate": 
0.00014772424017790958, "loss": 1.2382, "step": 1771 }, { "epoch": 0.26251851851851854, "grad_norm": 1.3835053443908691, "learning_rate": 0.0001476945885841364, "loss": 1.0324, "step": 1772 }, { "epoch": 0.26266666666666666, "grad_norm": 1.469097375869751, "learning_rate": 0.00014766493699036324, "loss": 1.3254, "step": 1773 }, { "epoch": 0.26281481481481483, "grad_norm": 2.058704137802124, "learning_rate": 0.0001476352853965901, "loss": 1.0667, "step": 1774 }, { "epoch": 0.26296296296296295, "grad_norm": 1.145105004310608, "learning_rate": 0.0001476056338028169, "loss": 1.0848, "step": 1775 }, { "epoch": 0.26311111111111113, "grad_norm": 0.9493306875228882, "learning_rate": 0.00014757598220904375, "loss": 0.8814, "step": 1776 }, { "epoch": 0.26325925925925925, "grad_norm": 1.336127519607544, "learning_rate": 0.00014754633061527057, "loss": 0.9214, "step": 1777 }, { "epoch": 0.2634074074074074, "grad_norm": 1.348656177520752, "learning_rate": 0.0001475166790214974, "loss": 1.179, "step": 1778 }, { "epoch": 0.26355555555555554, "grad_norm": 1.284738302230835, "learning_rate": 0.00014748702742772426, "loss": 1.1527, "step": 1779 }, { "epoch": 0.2637037037037037, "grad_norm": 1.421317219734192, "learning_rate": 0.00014745737583395107, "loss": 1.2755, "step": 1780 }, { "epoch": 0.26385185185185184, "grad_norm": 1.293117880821228, "learning_rate": 0.00014742772424017792, "loss": 0.9949, "step": 1781 }, { "epoch": 0.264, "grad_norm": 1.6930663585662842, "learning_rate": 0.00014739807264640476, "loss": 1.6129, "step": 1782 }, { "epoch": 0.26414814814814813, "grad_norm": 1.8110852241516113, "learning_rate": 0.00014736842105263158, "loss": 1.1731, "step": 1783 }, { "epoch": 0.2642962962962963, "grad_norm": 1.2088935375213623, "learning_rate": 0.00014733876945885842, "loss": 0.9807, "step": 1784 }, { "epoch": 0.2644444444444444, "grad_norm": 1.3096468448638916, "learning_rate": 0.00014730911786508527, "loss": 1.2637, "step": 1785 }, { "epoch": 0.2645925925925926, 
"grad_norm": 1.1938130855560303, "learning_rate": 0.00014727946627131209, "loss": 0.904, "step": 1786 }, { "epoch": 0.2647407407407407, "grad_norm": 2.0407581329345703, "learning_rate": 0.00014724981467753893, "loss": 1.0874, "step": 1787 }, { "epoch": 0.2648888888888889, "grad_norm": 1.9213590621948242, "learning_rate": 0.00014722016308376577, "loss": 0.9534, "step": 1788 }, { "epoch": 0.265037037037037, "grad_norm": 1.3738536834716797, "learning_rate": 0.0001471905114899926, "loss": 1.2112, "step": 1789 }, { "epoch": 0.2651851851851852, "grad_norm": 1.7585114240646362, "learning_rate": 0.00014716085989621944, "loss": 1.1066, "step": 1790 }, { "epoch": 0.2653333333333333, "grad_norm": 1.6683770418167114, "learning_rate": 0.00014713120830244625, "loss": 1.1725, "step": 1791 }, { "epoch": 0.2654814814814815, "grad_norm": 1.3317421674728394, "learning_rate": 0.0001471015567086731, "loss": 1.2168, "step": 1792 }, { "epoch": 0.2656296296296296, "grad_norm": 1.2089072465896606, "learning_rate": 0.00014707190511489994, "loss": 1.3568, "step": 1793 }, { "epoch": 0.2657777777777778, "grad_norm": 1.3464851379394531, "learning_rate": 0.00014704225352112676, "loss": 1.2184, "step": 1794 }, { "epoch": 0.2659259259259259, "grad_norm": 1.285253643989563, "learning_rate": 0.0001470126019273536, "loss": 1.0801, "step": 1795 }, { "epoch": 0.2660740740740741, "grad_norm": 1.6146527528762817, "learning_rate": 0.00014698295033358042, "loss": 1.0191, "step": 1796 }, { "epoch": 0.26622222222222225, "grad_norm": 1.29169762134552, "learning_rate": 0.00014695329873980727, "loss": 0.9172, "step": 1797 }, { "epoch": 0.26637037037037037, "grad_norm": 1.3831039667129517, "learning_rate": 0.0001469236471460341, "loss": 1.1838, "step": 1798 }, { "epoch": 0.26651851851851854, "grad_norm": 1.3449994325637817, "learning_rate": 0.00014689399555226093, "loss": 1.0419, "step": 1799 }, { "epoch": 0.26666666666666666, "grad_norm": 2.0927140712738037, "learning_rate": 0.00014686434395848777, "loss": 
1.117, "step": 1800 }, { "epoch": 0.26681481481481484, "grad_norm": 1.8944083452224731, "learning_rate": 0.00014683469236471462, "loss": 1.0172, "step": 1801 }, { "epoch": 0.26696296296296296, "grad_norm": 1.5123599767684937, "learning_rate": 0.00014680504077094143, "loss": 1.2678, "step": 1802 }, { "epoch": 0.26711111111111113, "grad_norm": 1.9740763902664185, "learning_rate": 0.00014677538917716828, "loss": 1.1469, "step": 1803 }, { "epoch": 0.26725925925925925, "grad_norm": 1.1890026330947876, "learning_rate": 0.00014674573758339512, "loss": 0.8493, "step": 1804 }, { "epoch": 0.2674074074074074, "grad_norm": 1.6631288528442383, "learning_rate": 0.00014671608598962194, "loss": 1.3172, "step": 1805 }, { "epoch": 0.26755555555555555, "grad_norm": 1.374815583229065, "learning_rate": 0.00014668643439584878, "loss": 1.1752, "step": 1806 }, { "epoch": 0.2677037037037037, "grad_norm": 1.6949161291122437, "learning_rate": 0.00014665678280207563, "loss": 1.4217, "step": 1807 }, { "epoch": 0.26785185185185184, "grad_norm": 1.3431637287139893, "learning_rate": 0.00014662713120830245, "loss": 1.4953, "step": 1808 }, { "epoch": 0.268, "grad_norm": 1.5021950006484985, "learning_rate": 0.0001465974796145293, "loss": 1.0382, "step": 1809 }, { "epoch": 0.26814814814814814, "grad_norm": 2.2632665634155273, "learning_rate": 0.00014656782802075614, "loss": 1.255, "step": 1810 }, { "epoch": 0.2682962962962963, "grad_norm": 1.290856122970581, "learning_rate": 0.00014653817642698295, "loss": 1.1311, "step": 1811 }, { "epoch": 0.26844444444444443, "grad_norm": 1.3417847156524658, "learning_rate": 0.00014650852483320977, "loss": 1.0782, "step": 1812 }, { "epoch": 0.2685925925925926, "grad_norm": 2.0849289894104004, "learning_rate": 0.00014647887323943664, "loss": 1.1883, "step": 1813 }, { "epoch": 0.2687407407407407, "grad_norm": 1.8363046646118164, "learning_rate": 0.00014644922164566346, "loss": 1.1608, "step": 1814 }, { "epoch": 0.2688888888888889, "grad_norm": 2.068054676055908, 
"learning_rate": 0.00014641957005189028, "loss": 1.1914, "step": 1815 }, { "epoch": 0.269037037037037, "grad_norm": 1.2311770915985107, "learning_rate": 0.00014638991845811715, "loss": 1.1122, "step": 1816 }, { "epoch": 0.2691851851851852, "grad_norm": 1.6212143898010254, "learning_rate": 0.00014636026686434397, "loss": 1.2723, "step": 1817 }, { "epoch": 0.2693333333333333, "grad_norm": 2.1663825511932373, "learning_rate": 0.00014633061527057078, "loss": 1.226, "step": 1818 }, { "epoch": 0.2694814814814815, "grad_norm": 1.633810043334961, "learning_rate": 0.00014630096367679765, "loss": 1.1611, "step": 1819 }, { "epoch": 0.2696296296296296, "grad_norm": 2.2274973392486572, "learning_rate": 0.00014627131208302447, "loss": 1.0705, "step": 1820 }, { "epoch": 0.2697777777777778, "grad_norm": 1.201061487197876, "learning_rate": 0.0001462416604892513, "loss": 1.0724, "step": 1821 }, { "epoch": 0.2699259259259259, "grad_norm": 1.2622113227844238, "learning_rate": 0.00014621200889547816, "loss": 1.3015, "step": 1822 }, { "epoch": 0.2700740740740741, "grad_norm": 1.0760676860809326, "learning_rate": 0.00014618235730170498, "loss": 1.1231, "step": 1823 }, { "epoch": 0.2702222222222222, "grad_norm": 1.277963638305664, "learning_rate": 0.0001461527057079318, "loss": 1.1237, "step": 1824 }, { "epoch": 0.27037037037037037, "grad_norm": 1.220734715461731, "learning_rate": 0.00014612305411415864, "loss": 0.9188, "step": 1825 }, { "epoch": 0.2705185185185185, "grad_norm": 1.4577242136001587, "learning_rate": 0.00014609340252038548, "loss": 1.1207, "step": 1826 }, { "epoch": 0.27066666666666667, "grad_norm": 1.287782073020935, "learning_rate": 0.0001460637509266123, "loss": 1.0625, "step": 1827 }, { "epoch": 0.27081481481481484, "grad_norm": 1.487561821937561, "learning_rate": 0.00014603409933283915, "loss": 1.0836, "step": 1828 }, { "epoch": 0.27096296296296296, "grad_norm": 1.497262716293335, "learning_rate": 0.000146004447739066, "loss": 1.162, "step": 1829 }, { "epoch": 
0.27111111111111114, "grad_norm": 1.4409953355789185, "learning_rate": 0.0001459747961452928, "loss": 0.9221, "step": 1830 }, { "epoch": 0.27125925925925926, "grad_norm": 1.2185614109039307, "learning_rate": 0.00014594514455151965, "loss": 1.2039, "step": 1831 }, { "epoch": 0.27140740740740743, "grad_norm": 1.4382011890411377, "learning_rate": 0.0001459154929577465, "loss": 1.1022, "step": 1832 }, { "epoch": 0.27155555555555555, "grad_norm": 1.8076633214950562, "learning_rate": 0.00014588584136397331, "loss": 1.0575, "step": 1833 }, { "epoch": 0.2717037037037037, "grad_norm": 1.1807422637939453, "learning_rate": 0.00014585618977020016, "loss": 1.1686, "step": 1834 }, { "epoch": 0.27185185185185184, "grad_norm": 1.8012399673461914, "learning_rate": 0.000145826538176427, "loss": 1.1746, "step": 1835 }, { "epoch": 0.272, "grad_norm": 3.0142524242401123, "learning_rate": 0.00014579688658265382, "loss": 1.1084, "step": 1836 }, { "epoch": 0.27214814814814814, "grad_norm": 1.2734559774398804, "learning_rate": 0.00014576723498888066, "loss": 0.9371, "step": 1837 }, { "epoch": 0.2722962962962963, "grad_norm": 2.423719882965088, "learning_rate": 0.0001457375833951075, "loss": 0.9749, "step": 1838 }, { "epoch": 0.27244444444444443, "grad_norm": 1.2738714218139648, "learning_rate": 0.00014570793180133433, "loss": 1.1978, "step": 1839 }, { "epoch": 0.2725925925925926, "grad_norm": 1.620475172996521, "learning_rate": 0.00014567828020756117, "loss": 1.0713, "step": 1840 }, { "epoch": 0.27274074074074073, "grad_norm": 1.6548757553100586, "learning_rate": 0.00014564862861378802, "loss": 1.3495, "step": 1841 }, { "epoch": 0.2728888888888889, "grad_norm": 1.3922020196914673, "learning_rate": 0.00014561897702001483, "loss": 1.0026, "step": 1842 }, { "epoch": 0.273037037037037, "grad_norm": 1.3025039434432983, "learning_rate": 0.00014558932542624165, "loss": 0.8775, "step": 1843 }, { "epoch": 0.2731851851851852, "grad_norm": 1.4768999814987183, "learning_rate": 0.00014555967383246852, 
"loss": 0.9688, "step": 1844 }, { "epoch": 0.2733333333333333, "grad_norm": 1.5161889791488647, "learning_rate": 0.00014553002223869534, "loss": 1.0882, "step": 1845 }, { "epoch": 0.2734814814814815, "grad_norm": 2.0973105430603027, "learning_rate": 0.00014550037064492216, "loss": 0.9726, "step": 1846 }, { "epoch": 0.2736296296296296, "grad_norm": 1.1360061168670654, "learning_rate": 0.000145470719051149, "loss": 0.9396, "step": 1847 }, { "epoch": 0.2737777777777778, "grad_norm": 2.1403005123138428, "learning_rate": 0.00014544106745737585, "loss": 1.0215, "step": 1848 }, { "epoch": 0.2739259259259259, "grad_norm": 2.610644817352295, "learning_rate": 0.00014541141586360266, "loss": 1.035, "step": 1849 }, { "epoch": 0.2740740740740741, "grad_norm": 1.1601502895355225, "learning_rate": 0.0001453817642698295, "loss": 0.8839, "step": 1850 }, { "epoch": 0.2742222222222222, "grad_norm": 1.4491620063781738, "learning_rate": 0.00014535211267605635, "loss": 1.0072, "step": 1851 }, { "epoch": 0.2743703703703704, "grad_norm": 2.7844150066375732, "learning_rate": 0.00014532246108228317, "loss": 1.0882, "step": 1852 }, { "epoch": 0.2745185185185185, "grad_norm": 1.5077942609786987, "learning_rate": 0.00014529280948851, "loss": 1.0161, "step": 1853 }, { "epoch": 0.27466666666666667, "grad_norm": 1.31179940700531, "learning_rate": 0.00014526315789473686, "loss": 1.1895, "step": 1854 }, { "epoch": 0.2748148148148148, "grad_norm": 2.904644012451172, "learning_rate": 0.00014523350630096368, "loss": 1.1659, "step": 1855 }, { "epoch": 0.27496296296296296, "grad_norm": 2.673091411590576, "learning_rate": 0.00014520385470719052, "loss": 1.1377, "step": 1856 }, { "epoch": 0.2751111111111111, "grad_norm": 1.848264455795288, "learning_rate": 0.00014517420311341736, "loss": 1.1158, "step": 1857 }, { "epoch": 0.27525925925925926, "grad_norm": 1.3115473985671997, "learning_rate": 0.00014514455151964418, "loss": 1.1346, "step": 1858 }, { "epoch": 0.27540740740740743, "grad_norm": 
1.5505434274673462, "learning_rate": 0.00014511489992587103, "loss": 1.3049, "step": 1859 }, { "epoch": 0.27555555555555555, "grad_norm": 1.2869677543640137, "learning_rate": 0.00014508524833209787, "loss": 0.9032, "step": 1860 }, { "epoch": 0.27570370370370373, "grad_norm": 3.9332666397094727, "learning_rate": 0.0001450555967383247, "loss": 1.351, "step": 1861 }, { "epoch": 0.27585185185185185, "grad_norm": 1.4140563011169434, "learning_rate": 0.00014502594514455153, "loss": 1.3197, "step": 1862 }, { "epoch": 0.276, "grad_norm": 1.3230453729629517, "learning_rate": 0.00014499629355077835, "loss": 1.0612, "step": 1863 }, { "epoch": 0.27614814814814814, "grad_norm": 1.4923207759857178, "learning_rate": 0.0001449666419570052, "loss": 0.9324, "step": 1864 }, { "epoch": 0.2762962962962963, "grad_norm": 1.35109281539917, "learning_rate": 0.00014493699036323204, "loss": 0.9941, "step": 1865 }, { "epoch": 0.27644444444444444, "grad_norm": 1.4291627407073975, "learning_rate": 0.00014490733876945886, "loss": 0.8801, "step": 1866 }, { "epoch": 0.2765925925925926, "grad_norm": 1.297644853591919, "learning_rate": 0.0001448776871756857, "loss": 0.9117, "step": 1867 }, { "epoch": 0.27674074074074073, "grad_norm": 1.4469108581542969, "learning_rate": 0.00014484803558191254, "loss": 1.1232, "step": 1868 }, { "epoch": 0.2768888888888889, "grad_norm": 2.2614119052886963, "learning_rate": 0.00014481838398813936, "loss": 1.4292, "step": 1869 }, { "epoch": 0.277037037037037, "grad_norm": 2.2086384296417236, "learning_rate": 0.0001447887323943662, "loss": 1.4552, "step": 1870 }, { "epoch": 0.2771851851851852, "grad_norm": 1.3284968137741089, "learning_rate": 0.00014475908080059305, "loss": 1.2932, "step": 1871 }, { "epoch": 0.2773333333333333, "grad_norm": 1.6751161813735962, "learning_rate": 0.00014472942920681987, "loss": 1.1819, "step": 1872 }, { "epoch": 0.2774814814814815, "grad_norm": 1.5120177268981934, "learning_rate": 0.0001446997776130467, "loss": 1.2286, "step": 1873 }, { 
"epoch": 0.2776296296296296, "grad_norm": 1.416845679283142, "learning_rate": 0.00014467012601927356, "loss": 1.0614, "step": 1874 }, { "epoch": 0.2777777777777778, "grad_norm": 1.9702194929122925, "learning_rate": 0.00014464047442550037, "loss": 1.0974, "step": 1875 }, { "epoch": 0.2779259259259259, "grad_norm": 1.1982643604278564, "learning_rate": 0.00014461082283172722, "loss": 1.3637, "step": 1876 }, { "epoch": 0.2780740740740741, "grad_norm": 1.339460849761963, "learning_rate": 0.00014458117123795404, "loss": 1.0512, "step": 1877 }, { "epoch": 0.2782222222222222, "grad_norm": 1.8185973167419434, "learning_rate": 0.00014455151964418088, "loss": 1.3702, "step": 1878 }, { "epoch": 0.2783703703703704, "grad_norm": 1.7184706926345825, "learning_rate": 0.00014452186805040773, "loss": 1.1731, "step": 1879 }, { "epoch": 0.2785185185185185, "grad_norm": 1.1334104537963867, "learning_rate": 0.00014449221645663454, "loss": 1.0635, "step": 1880 }, { "epoch": 0.2786666666666667, "grad_norm": 1.3710155487060547, "learning_rate": 0.0001444625648628614, "loss": 0.8815, "step": 1881 }, { "epoch": 0.2788148148148148, "grad_norm": 3.35996150970459, "learning_rate": 0.0001444329132690882, "loss": 1.1999, "step": 1882 }, { "epoch": 0.27896296296296297, "grad_norm": 1.0989127159118652, "learning_rate": 0.00014440326167531505, "loss": 1.0851, "step": 1883 }, { "epoch": 0.2791111111111111, "grad_norm": 1.127285122871399, "learning_rate": 0.0001443736100815419, "loss": 1.0653, "step": 1884 }, { "epoch": 0.27925925925925926, "grad_norm": 1.6573400497436523, "learning_rate": 0.0001443439584877687, "loss": 1.1363, "step": 1885 }, { "epoch": 0.2794074074074074, "grad_norm": 1.572654128074646, "learning_rate": 0.00014431430689399556, "loss": 1.0745, "step": 1886 }, { "epoch": 0.27955555555555556, "grad_norm": 2.3293049335479736, "learning_rate": 0.0001442846553002224, "loss": 1.1124, "step": 1887 }, { "epoch": 0.2797037037037037, "grad_norm": 2.035825729370117, "learning_rate": 
0.00014425500370644922, "loss": 1.3554, "step": 1888 }, { "epoch": 0.27985185185185185, "grad_norm": 2.500138998031616, "learning_rate": 0.00014422535211267606, "loss": 1.1704, "step": 1889 }, { "epoch": 0.28, "grad_norm": 1.3954805135726929, "learning_rate": 0.0001441957005189029, "loss": 1.1847, "step": 1890 }, { "epoch": 0.28014814814814815, "grad_norm": 1.9010459184646606, "learning_rate": 0.00014416604892512972, "loss": 1.3681, "step": 1891 }, { "epoch": 0.2802962962962963, "grad_norm": 1.322282314300537, "learning_rate": 0.00014413639733135657, "loss": 1.2975, "step": 1892 }, { "epoch": 0.28044444444444444, "grad_norm": 1.429682970046997, "learning_rate": 0.0001441067457375834, "loss": 1.0909, "step": 1893 }, { "epoch": 0.2805925925925926, "grad_norm": 2.190866231918335, "learning_rate": 0.00014407709414381023, "loss": 1.3857, "step": 1894 }, { "epoch": 0.28074074074074074, "grad_norm": 1.7194695472717285, "learning_rate": 0.00014404744255003707, "loss": 0.9431, "step": 1895 }, { "epoch": 0.2808888888888889, "grad_norm": 4.119308948516846, "learning_rate": 0.00014401779095626392, "loss": 1.2931, "step": 1896 }, { "epoch": 0.28103703703703703, "grad_norm": 1.2322664260864258, "learning_rate": 0.00014398813936249074, "loss": 0.9592, "step": 1897 }, { "epoch": 0.2811851851851852, "grad_norm": 1.7712771892547607, "learning_rate": 0.00014395848776871755, "loss": 0.9421, "step": 1898 }, { "epoch": 0.2813333333333333, "grad_norm": 1.3226734399795532, "learning_rate": 0.00014392883617494442, "loss": 0.9837, "step": 1899 }, { "epoch": 0.2814814814814815, "grad_norm": 2.522213935852051, "learning_rate": 0.00014389918458117124, "loss": 1.2813, "step": 1900 }, { "epoch": 0.2816296296296296, "grad_norm": 1.8988999128341675, "learning_rate": 0.00014386953298739806, "loss": 1.1405, "step": 1901 }, { "epoch": 0.2817777777777778, "grad_norm": 1.8001806735992432, "learning_rate": 0.00014383988139362493, "loss": 1.1198, "step": 1902 }, { "epoch": 0.2819259259259259, 
"grad_norm": 1.7499761581420898, "learning_rate": 0.00014381022979985175, "loss": 1.0926, "step": 1903 }, { "epoch": 0.2820740740740741, "grad_norm": 1.7617902755737305, "learning_rate": 0.00014378057820607857, "loss": 1.0601, "step": 1904 }, { "epoch": 0.2822222222222222, "grad_norm": 1.4161577224731445, "learning_rate": 0.00014375092661230544, "loss": 0.9908, "step": 1905 }, { "epoch": 0.2823703703703704, "grad_norm": 1.7820091247558594, "learning_rate": 0.00014372127501853225, "loss": 1.1655, "step": 1906 }, { "epoch": 0.2825185185185185, "grad_norm": 1.7779147624969482, "learning_rate": 0.00014369162342475907, "loss": 1.0416, "step": 1907 }, { "epoch": 0.2826666666666667, "grad_norm": 1.3166837692260742, "learning_rate": 0.00014366197183098594, "loss": 1.1624, "step": 1908 }, { "epoch": 0.2828148148148148, "grad_norm": 1.822948932647705, "learning_rate": 0.00014363232023721276, "loss": 1.1519, "step": 1909 }, { "epoch": 0.28296296296296297, "grad_norm": 2.3033623695373535, "learning_rate": 0.00014360266864343958, "loss": 1.1689, "step": 1910 }, { "epoch": 0.2831111111111111, "grad_norm": 1.3716576099395752, "learning_rate": 0.00014357301704966642, "loss": 1.0579, "step": 1911 }, { "epoch": 0.28325925925925927, "grad_norm": 1.3765064477920532, "learning_rate": 0.00014354336545589327, "loss": 1.0688, "step": 1912 }, { "epoch": 0.2834074074074074, "grad_norm": 2.1557159423828125, "learning_rate": 0.00014351371386212008, "loss": 1.0384, "step": 1913 }, { "epoch": 0.28355555555555556, "grad_norm": 2.223402976989746, "learning_rate": 0.00014348406226834693, "loss": 1.2898, "step": 1914 }, { "epoch": 0.2837037037037037, "grad_norm": 1.851702094078064, "learning_rate": 0.00014345441067457377, "loss": 1.0041, "step": 1915 }, { "epoch": 0.28385185185185186, "grad_norm": 4.681601524353027, "learning_rate": 0.0001434247590808006, "loss": 1.2566, "step": 1916 }, { "epoch": 0.284, "grad_norm": 4.030296325683594, "learning_rate": 0.00014339510748702744, "loss": 0.9683, 
"step": 1917 }, { "epoch": 0.28414814814814815, "grad_norm": 2.739288330078125, "learning_rate": 0.00014336545589325428, "loss": 1.2889, "step": 1918 }, { "epoch": 0.28429629629629627, "grad_norm": 1.4121339321136475, "learning_rate": 0.0001433358042994811, "loss": 1.1335, "step": 1919 }, { "epoch": 0.28444444444444444, "grad_norm": 3.6412529945373535, "learning_rate": 0.00014330615270570794, "loss": 1.001, "step": 1920 }, { "epoch": 0.2845925925925926, "grad_norm": 1.610365390777588, "learning_rate": 0.00014327650111193479, "loss": 1.1877, "step": 1921 }, { "epoch": 0.28474074074074074, "grad_norm": 6.543050289154053, "learning_rate": 0.0001432468495181616, "loss": 1.2976, "step": 1922 }, { "epoch": 0.2848888888888889, "grad_norm": 2.442750930786133, "learning_rate": 0.00014321719792438845, "loss": 1.2311, "step": 1923 }, { "epoch": 0.28503703703703703, "grad_norm": 1.7650247812271118, "learning_rate": 0.0001431875463306153, "loss": 1.0885, "step": 1924 }, { "epoch": 0.2851851851851852, "grad_norm": 1.3626705408096313, "learning_rate": 0.0001431578947368421, "loss": 1.168, "step": 1925 }, { "epoch": 0.2853333333333333, "grad_norm": 1.6422728300094604, "learning_rate": 0.00014312824314306895, "loss": 1.0842, "step": 1926 }, { "epoch": 0.2854814814814815, "grad_norm": 3.8954384326934814, "learning_rate": 0.0001430985915492958, "loss": 1.0006, "step": 1927 }, { "epoch": 0.2856296296296296, "grad_norm": 2.413081645965576, "learning_rate": 0.00014306893995552262, "loss": 1.2681, "step": 1928 }, { "epoch": 0.2857777777777778, "grad_norm": 1.2404167652130127, "learning_rate": 0.00014303928836174943, "loss": 1.0716, "step": 1929 }, { "epoch": 0.2859259259259259, "grad_norm": 2.429436683654785, "learning_rate": 0.0001430096367679763, "loss": 1.242, "step": 1930 }, { "epoch": 0.2860740740740741, "grad_norm": 1.9265131950378418, "learning_rate": 0.00014297998517420312, "loss": 1.376, "step": 1931 }, { "epoch": 0.2862222222222222, "grad_norm": 1.5621486902236938, 
"learning_rate": 0.00014295033358042994, "loss": 1.0938, "step": 1932 }, { "epoch": 0.2863703703703704, "grad_norm": 1.2274481058120728, "learning_rate": 0.00014292068198665678, "loss": 1.1254, "step": 1933 }, { "epoch": 0.2865185185185185, "grad_norm": 1.5727094411849976, "learning_rate": 0.00014289103039288363, "loss": 1.2272, "step": 1934 }, { "epoch": 0.2866666666666667, "grad_norm": 1.5376226902008057, "learning_rate": 0.00014286137879911045, "loss": 1.2636, "step": 1935 }, { "epoch": 0.2868148148148148, "grad_norm": 1.1617416143417358, "learning_rate": 0.0001428317272053373, "loss": 1.0181, "step": 1936 }, { "epoch": 0.286962962962963, "grad_norm": 2.308255672454834, "learning_rate": 0.00014280207561156413, "loss": 0.9628, "step": 1937 }, { "epoch": 0.2871111111111111, "grad_norm": 1.201524019241333, "learning_rate": 0.00014277242401779095, "loss": 0.8777, "step": 1938 }, { "epoch": 0.28725925925925927, "grad_norm": 1.079357385635376, "learning_rate": 0.0001427427724240178, "loss": 1.1408, "step": 1939 }, { "epoch": 0.2874074074074074, "grad_norm": 0.8939654231071472, "learning_rate": 0.00014271312083024464, "loss": 0.9137, "step": 1940 }, { "epoch": 0.28755555555555556, "grad_norm": 2.082834005355835, "learning_rate": 0.00014268346923647146, "loss": 1.1254, "step": 1941 }, { "epoch": 0.2877037037037037, "grad_norm": 1.8582286834716797, "learning_rate": 0.0001426538176426983, "loss": 1.2128, "step": 1942 }, { "epoch": 0.28785185185185186, "grad_norm": 1.6522938013076782, "learning_rate": 0.00014262416604892515, "loss": 1.3354, "step": 1943 }, { "epoch": 0.288, "grad_norm": 1.6389734745025635, "learning_rate": 0.00014259451445515196, "loss": 0.9965, "step": 1944 }, { "epoch": 0.28814814814814815, "grad_norm": 1.6375473737716675, "learning_rate": 0.0001425648628613788, "loss": 1.0244, "step": 1945 }, { "epoch": 0.2882962962962963, "grad_norm": 1.497673511505127, "learning_rate": 0.00014253521126760565, "loss": 1.1187, "step": 1946 }, { "epoch": 
0.28844444444444445, "grad_norm": 1.0881043672561646, "learning_rate": 0.00014250555967383247, "loss": 1.065, "step": 1947 }, { "epoch": 0.28859259259259257, "grad_norm": 1.5061964988708496, "learning_rate": 0.00014247590808005932, "loss": 1.0024, "step": 1948 }, { "epoch": 0.28874074074074074, "grad_norm": 1.6110973358154297, "learning_rate": 0.00014244625648628613, "loss": 1.3192, "step": 1949 }, { "epoch": 0.28888888888888886, "grad_norm": 1.792970895767212, "learning_rate": 0.00014241660489251298, "loss": 0.9602, "step": 1950 }, { "epoch": 0.28903703703703704, "grad_norm": 1.2316087484359741, "learning_rate": 0.00014238695329873982, "loss": 1.0634, "step": 1951 }, { "epoch": 0.2891851851851852, "grad_norm": 6.28628396987915, "learning_rate": 0.00014235730170496664, "loss": 0.9428, "step": 1952 }, { "epoch": 0.28933333333333333, "grad_norm": 3.0054290294647217, "learning_rate": 0.00014232765011119348, "loss": 1.0454, "step": 1953 }, { "epoch": 0.2894814814814815, "grad_norm": 1.8051791191101074, "learning_rate": 0.00014229799851742033, "loss": 1.3311, "step": 1954 }, { "epoch": 0.2896296296296296, "grad_norm": 1.9095637798309326, "learning_rate": 0.00014226834692364715, "loss": 1.2088, "step": 1955 }, { "epoch": 0.2897777777777778, "grad_norm": 2.037728786468506, "learning_rate": 0.000142238695329874, "loss": 1.238, "step": 1956 }, { "epoch": 0.2899259259259259, "grad_norm": 1.1061418056488037, "learning_rate": 0.00014220904373610083, "loss": 1.0668, "step": 1957 }, { "epoch": 0.2900740740740741, "grad_norm": 1.9614605903625488, "learning_rate": 0.00014217939214232765, "loss": 1.1359, "step": 1958 }, { "epoch": 0.2902222222222222, "grad_norm": 3.108888626098633, "learning_rate": 0.0001421497405485545, "loss": 1.0206, "step": 1959 }, { "epoch": 0.2903703703703704, "grad_norm": 2.745737314224243, "learning_rate": 0.00014212008895478134, "loss": 1.1219, "step": 1960 }, { "epoch": 0.2905185185185185, "grad_norm": 1.4380501508712769, "learning_rate": 
0.00014209043736100816, "loss": 1.0199, "step": 1961 }, { "epoch": 0.2906666666666667, "grad_norm": 1.7835073471069336, "learning_rate": 0.000142060785767235, "loss": 1.2319, "step": 1962 }, { "epoch": 0.2908148148148148, "grad_norm": 1.7886954545974731, "learning_rate": 0.00014203113417346182, "loss": 0.9066, "step": 1963 }, { "epoch": 0.290962962962963, "grad_norm": 2.375223159790039, "learning_rate": 0.00014200148257968866, "loss": 1.1687, "step": 1964 }, { "epoch": 0.2911111111111111, "grad_norm": 1.4279139041900635, "learning_rate": 0.0001419718309859155, "loss": 1.2445, "step": 1965 }, { "epoch": 0.2912592592592593, "grad_norm": 1.6330987215042114, "learning_rate": 0.00014194217939214233, "loss": 1.1252, "step": 1966 }, { "epoch": 0.2914074074074074, "grad_norm": 1.4857168197631836, "learning_rate": 0.00014191252779836917, "loss": 1.2034, "step": 1967 }, { "epoch": 0.29155555555555557, "grad_norm": 2.2815330028533936, "learning_rate": 0.000141882876204596, "loss": 1.0196, "step": 1968 }, { "epoch": 0.2917037037037037, "grad_norm": 2.0011632442474365, "learning_rate": 0.00014185322461082283, "loss": 1.2011, "step": 1969 }, { "epoch": 0.29185185185185186, "grad_norm": 2.2272956371307373, "learning_rate": 0.00014182357301704968, "loss": 1.3563, "step": 1970 }, { "epoch": 0.292, "grad_norm": 1.3123246431350708, "learning_rate": 0.0001417939214232765, "loss": 1.1925, "step": 1971 }, { "epoch": 0.29214814814814816, "grad_norm": 1.6946793794631958, "learning_rate": 0.00014176426982950334, "loss": 1.0575, "step": 1972 }, { "epoch": 0.2922962962962963, "grad_norm": 1.524677038192749, "learning_rate": 0.00014173461823573018, "loss": 1.2523, "step": 1973 }, { "epoch": 0.29244444444444445, "grad_norm": 1.8048510551452637, "learning_rate": 0.000141704966641957, "loss": 1.0267, "step": 1974 }, { "epoch": 0.29259259259259257, "grad_norm": 3.08186936378479, "learning_rate": 0.00014167531504818384, "loss": 0.9579, "step": 1975 }, { "epoch": 0.29274074074074075, "grad_norm": 
1.672932744026184, "learning_rate": 0.0001416456634544107, "loss": 1.0669, "step": 1976 }, { "epoch": 0.29288888888888887, "grad_norm": 1.2979950904846191, "learning_rate": 0.0001416160118606375, "loss": 0.8932, "step": 1977 }, { "epoch": 0.29303703703703704, "grad_norm": 1.031038761138916, "learning_rate": 0.00014158636026686435, "loss": 0.9491, "step": 1978 }, { "epoch": 0.29318518518518516, "grad_norm": 2.2748239040374756, "learning_rate": 0.0001415567086730912, "loss": 1.1151, "step": 1979 }, { "epoch": 0.29333333333333333, "grad_norm": 1.350414752960205, "learning_rate": 0.000141527057079318, "loss": 1.2051, "step": 1980 }, { "epoch": 0.29348148148148145, "grad_norm": 2.0504868030548096, "learning_rate": 0.00014149740548554486, "loss": 1.1326, "step": 1981 }, { "epoch": 0.29362962962962963, "grad_norm": 1.304865837097168, "learning_rate": 0.0001414677538917717, "loss": 1.0071, "step": 1982 }, { "epoch": 0.2937777777777778, "grad_norm": 2.612804412841797, "learning_rate": 0.00014143810229799852, "loss": 1.0072, "step": 1983 }, { "epoch": 0.2939259259259259, "grad_norm": 1.4876537322998047, "learning_rate": 0.00014140845070422534, "loss": 1.189, "step": 1984 }, { "epoch": 0.2940740740740741, "grad_norm": 1.6770356893539429, "learning_rate": 0.0001413787991104522, "loss": 1.1421, "step": 1985 }, { "epoch": 0.2942222222222222, "grad_norm": 1.357831597328186, "learning_rate": 0.00014134914751667903, "loss": 1.022, "step": 1986 }, { "epoch": 0.2943703703703704, "grad_norm": 1.505112648010254, "learning_rate": 0.00014131949592290584, "loss": 1.1427, "step": 1987 }, { "epoch": 0.2945185185185185, "grad_norm": 1.8086297512054443, "learning_rate": 0.00014128984432913271, "loss": 1.0027, "step": 1988 }, { "epoch": 0.2946666666666667, "grad_norm": 1.528651237487793, "learning_rate": 0.00014126019273535953, "loss": 1.2825, "step": 1989 }, { "epoch": 0.2948148148148148, "grad_norm": 2.0352816581726074, "learning_rate": 0.00014123054114158635, "loss": 1.1326, "step": 1990 }, 
{ "epoch": 0.294962962962963, "grad_norm": 1.5756343603134155, "learning_rate": 0.00014120088954781322, "loss": 0.9532, "step": 1991 }, { "epoch": 0.2951111111111111, "grad_norm": 1.6788939237594604, "learning_rate": 0.00014117123795404004, "loss": 1.1934, "step": 1992 }, { "epoch": 0.2952592592592593, "grad_norm": 2.541499376296997, "learning_rate": 0.00014114158636026686, "loss": 0.9294, "step": 1993 }, { "epoch": 0.2954074074074074, "grad_norm": 1.2268800735473633, "learning_rate": 0.00014111193476649373, "loss": 1.0078, "step": 1994 }, { "epoch": 0.29555555555555557, "grad_norm": 1.4305241107940674, "learning_rate": 0.00014108228317272054, "loss": 1.1887, "step": 1995 }, { "epoch": 0.2957037037037037, "grad_norm": 1.9692851305007935, "learning_rate": 0.00014105263157894736, "loss": 1.2435, "step": 1996 }, { "epoch": 0.29585185185185187, "grad_norm": 3.043774366378784, "learning_rate": 0.0001410229799851742, "loss": 1.1618, "step": 1997 }, { "epoch": 0.296, "grad_norm": 1.3506648540496826, "learning_rate": 0.00014099332839140105, "loss": 1.2005, "step": 1998 }, { "epoch": 0.29614814814814816, "grad_norm": 1.5097509622573853, "learning_rate": 0.00014096367679762787, "loss": 1.2464, "step": 1999 }, { "epoch": 0.2962962962962963, "grad_norm": 2.3342812061309814, "learning_rate": 0.0001409340252038547, "loss": 1.095, "step": 2000 }, { "epoch": 0.29644444444444445, "grad_norm": 5.315145969390869, "learning_rate": 0.00014090437361008156, "loss": 1.1415, "step": 2001 }, { "epoch": 0.2965925925925926, "grad_norm": 1.8004692792892456, "learning_rate": 0.00014087472201630837, "loss": 1.3195, "step": 2002 }, { "epoch": 0.29674074074074075, "grad_norm": 1.706508994102478, "learning_rate": 0.00014084507042253522, "loss": 0.98, "step": 2003 }, { "epoch": 0.29688888888888887, "grad_norm": 1.7100145816802979, "learning_rate": 0.00014081541882876206, "loss": 1.1068, "step": 2004 }, { "epoch": 0.29703703703703704, "grad_norm": 1.4298951625823975, "learning_rate": 
0.00014078576723498888, "loss": 1.0667, "step": 2005 }, { "epoch": 0.29718518518518516, "grad_norm": 2.0300867557525635, "learning_rate": 0.00014075611564121573, "loss": 1.1282, "step": 2006 }, { "epoch": 0.29733333333333334, "grad_norm": 2.285794258117676, "learning_rate": 0.00014072646404744257, "loss": 1.2465, "step": 2007 }, { "epoch": 0.29748148148148146, "grad_norm": 2.2818078994750977, "learning_rate": 0.0001406968124536694, "loss": 1.0622, "step": 2008 }, { "epoch": 0.29762962962962963, "grad_norm": 1.829458236694336, "learning_rate": 0.00014066716085989623, "loss": 1.3267, "step": 2009 }, { "epoch": 0.29777777777777775, "grad_norm": 2.451023817062378, "learning_rate": 0.00014063750926612308, "loss": 1.1424, "step": 2010 }, { "epoch": 0.2979259259259259, "grad_norm": 1.8001643419265747, "learning_rate": 0.0001406078576723499, "loss": 1.0965, "step": 2011 }, { "epoch": 0.29807407407407405, "grad_norm": 2.1611523628234863, "learning_rate": 0.00014057820607857674, "loss": 1.4567, "step": 2012 }, { "epoch": 0.2982222222222222, "grad_norm": 2.005830764770508, "learning_rate": 0.00014054855448480358, "loss": 1.179, "step": 2013 }, { "epoch": 0.2983703703703704, "grad_norm": 1.6628178358078003, "learning_rate": 0.0001405189028910304, "loss": 1.0446, "step": 2014 }, { "epoch": 0.2985185185185185, "grad_norm": 1.9607075452804565, "learning_rate": 0.00014048925129725722, "loss": 0.9936, "step": 2015 }, { "epoch": 0.2986666666666667, "grad_norm": 1.6360664367675781, "learning_rate": 0.0001404595997034841, "loss": 1.2731, "step": 2016 }, { "epoch": 0.2988148148148148, "grad_norm": 1.4931821823120117, "learning_rate": 0.0001404299481097109, "loss": 1.4205, "step": 2017 }, { "epoch": 0.298962962962963, "grad_norm": 2.0647025108337402, "learning_rate": 0.00014040029651593772, "loss": 0.9611, "step": 2018 }, { "epoch": 0.2991111111111111, "grad_norm": 3.6834490299224854, "learning_rate": 0.00014037064492216457, "loss": 1.118, "step": 2019 }, { "epoch": 0.2992592592592593, 
"grad_norm": 2.8627822399139404, "learning_rate": 0.0001403409933283914, "loss": 1.1278, "step": 2020 }, { "epoch": 0.2994074074074074, "grad_norm": 1.3225929737091064, "learning_rate": 0.00014031134173461823, "loss": 0.8782, "step": 2021 }, { "epoch": 0.2995555555555556, "grad_norm": 3.9964535236358643, "learning_rate": 0.00014028169014084507, "loss": 1.0422, "step": 2022 }, { "epoch": 0.2997037037037037, "grad_norm": 5.868827819824219, "learning_rate": 0.00014025203854707192, "loss": 1.0829, "step": 2023 }, { "epoch": 0.29985185185185187, "grad_norm": 1.5820839405059814, "learning_rate": 0.00014022238695329874, "loss": 1.285, "step": 2024 }, { "epoch": 0.3, "grad_norm": 2.345027208328247, "learning_rate": 0.00014019273535952558, "loss": 1.1556, "step": 2025 }, { "epoch": 0.30014814814814816, "grad_norm": 1.3832290172576904, "learning_rate": 0.00014016308376575242, "loss": 1.2876, "step": 2026 }, { "epoch": 0.3002962962962963, "grad_norm": 2.850512742996216, "learning_rate": 0.00014013343217197924, "loss": 1.1713, "step": 2027 }, { "epoch": 0.30044444444444446, "grad_norm": 2.576414108276367, "learning_rate": 0.00014010378057820609, "loss": 1.1124, "step": 2028 }, { "epoch": 0.3005925925925926, "grad_norm": 2.3272831439971924, "learning_rate": 0.00014007412898443293, "loss": 1.2297, "step": 2029 }, { "epoch": 0.30074074074074075, "grad_norm": 2.0849647521972656, "learning_rate": 0.00014004447739065975, "loss": 1.2055, "step": 2030 }, { "epoch": 0.3008888888888889, "grad_norm": 1.5891973972320557, "learning_rate": 0.0001400148257968866, "loss": 1.2693, "step": 2031 }, { "epoch": 0.30103703703703705, "grad_norm": 1.419865369796753, "learning_rate": 0.00013998517420311344, "loss": 1.1698, "step": 2032 }, { "epoch": 0.30118518518518517, "grad_norm": 1.8656340837478638, "learning_rate": 0.00013995552260934025, "loss": 1.1178, "step": 2033 }, { "epoch": 0.30133333333333334, "grad_norm": 1.6104474067687988, "learning_rate": 0.0001399258710155671, "loss": 1.1444, "step": 
2034 }, { "epoch": 0.30148148148148146, "grad_norm": 1.2462067604064941, "learning_rate": 0.00013989621942179392, "loss": 1.0419, "step": 2035 }, { "epoch": 0.30162962962962964, "grad_norm": 1.6338249444961548, "learning_rate": 0.00013986656782802076, "loss": 0.9944, "step": 2036 }, { "epoch": 0.30177777777777776, "grad_norm": 1.7983711957931519, "learning_rate": 0.0001398369162342476, "loss": 1.2706, "step": 2037 }, { "epoch": 0.30192592592592593, "grad_norm": 1.923130989074707, "learning_rate": 0.00013980726464047442, "loss": 1.21, "step": 2038 }, { "epoch": 0.30207407407407405, "grad_norm": 1.6995714902877808, "learning_rate": 0.00013977761304670127, "loss": 1.2053, "step": 2039 }, { "epoch": 0.3022222222222222, "grad_norm": 1.351297378540039, "learning_rate": 0.0001397479614529281, "loss": 0.9145, "step": 2040 }, { "epoch": 0.30237037037037034, "grad_norm": 1.4165565967559814, "learning_rate": 0.00013971830985915493, "loss": 1.2775, "step": 2041 }, { "epoch": 0.3025185185185185, "grad_norm": 1.3446037769317627, "learning_rate": 0.00013968865826538177, "loss": 1.0615, "step": 2042 }, { "epoch": 0.30266666666666664, "grad_norm": 3.5239148139953613, "learning_rate": 0.00013965900667160862, "loss": 1.2237, "step": 2043 }, { "epoch": 0.3028148148148148, "grad_norm": 2.3545305728912354, "learning_rate": 0.00013962935507783544, "loss": 1.0055, "step": 2044 }, { "epoch": 0.302962962962963, "grad_norm": 0.9781142473220825, "learning_rate": 0.00013959970348406228, "loss": 1.0277, "step": 2045 }, { "epoch": 0.3031111111111111, "grad_norm": 2.4959397315979004, "learning_rate": 0.00013957005189028912, "loss": 1.252, "step": 2046 }, { "epoch": 0.3032592592592593, "grad_norm": 2.1696665287017822, "learning_rate": 0.00013954040029651594, "loss": 1.1637, "step": 2047 }, { "epoch": 0.3034074074074074, "grad_norm": 1.444602608680725, "learning_rate": 0.00013951074870274279, "loss": 0.8602, "step": 2048 }, { "epoch": 0.3035555555555556, "grad_norm": 2.0049591064453125, 
"learning_rate": 0.0001394810971089696, "loss": 1.1302, "step": 2049 }, { "epoch": 0.3037037037037037, "grad_norm": 6.134702682495117, "learning_rate": 0.00013945144551519645, "loss": 1.1219, "step": 2050 }, { "epoch": 0.3038518518518519, "grad_norm": 1.8419837951660156, "learning_rate": 0.0001394217939214233, "loss": 0.9795, "step": 2051 }, { "epoch": 0.304, "grad_norm": 2.378662109375, "learning_rate": 0.0001393921423276501, "loss": 1.1208, "step": 2052 }, { "epoch": 0.30414814814814817, "grad_norm": 1.314644694328308, "learning_rate": 0.00013936249073387695, "loss": 1.2562, "step": 2053 }, { "epoch": 0.3042962962962963, "grad_norm": 1.2006711959838867, "learning_rate": 0.00013933283914010377, "loss": 1.1376, "step": 2054 }, { "epoch": 0.30444444444444446, "grad_norm": 1.1629737615585327, "learning_rate": 0.00013930318754633062, "loss": 1.1162, "step": 2055 }, { "epoch": 0.3045925925925926, "grad_norm": 1.5775014162063599, "learning_rate": 0.00013927353595255746, "loss": 1.2141, "step": 2056 }, { "epoch": 0.30474074074074076, "grad_norm": 1.3674976825714111, "learning_rate": 0.00013924388435878428, "loss": 1.0952, "step": 2057 }, { "epoch": 0.3048888888888889, "grad_norm": 1.9454426765441895, "learning_rate": 0.00013921423276501112, "loss": 1.1577, "step": 2058 }, { "epoch": 0.30503703703703705, "grad_norm": 1.3205631971359253, "learning_rate": 0.00013918458117123797, "loss": 1.0524, "step": 2059 }, { "epoch": 0.30518518518518517, "grad_norm": 1.596271276473999, "learning_rate": 0.00013915492957746478, "loss": 1.252, "step": 2060 }, { "epoch": 0.30533333333333335, "grad_norm": 1.1906851530075073, "learning_rate": 0.00013912527798369163, "loss": 1.0535, "step": 2061 }, { "epoch": 0.30548148148148146, "grad_norm": 1.4791088104248047, "learning_rate": 0.00013909562638991847, "loss": 1.1838, "step": 2062 }, { "epoch": 0.30562962962962964, "grad_norm": 1.7153657674789429, "learning_rate": 0.0001390659747961453, "loss": 1.1325, "step": 2063 }, { "epoch": 
0.30577777777777776, "grad_norm": 2.7629482746124268, "learning_rate": 0.00013903632320237213, "loss": 0.9909, "step": 2064 }, { "epoch": 0.30592592592592593, "grad_norm": 1.4287002086639404, "learning_rate": 0.00013900667160859898, "loss": 1.1571, "step": 2065 }, { "epoch": 0.30607407407407405, "grad_norm": 1.3045237064361572, "learning_rate": 0.0001389770200148258, "loss": 1.1581, "step": 2066 }, { "epoch": 0.30622222222222223, "grad_norm": 1.5022578239440918, "learning_rate": 0.00013894736842105264, "loss": 1.2141, "step": 2067 }, { "epoch": 0.30637037037037035, "grad_norm": 7.063483238220215, "learning_rate": 0.00013891771682727949, "loss": 1.3515, "step": 2068 }, { "epoch": 0.3065185185185185, "grad_norm": 1.825252652168274, "learning_rate": 0.0001388880652335063, "loss": 1.1497, "step": 2069 }, { "epoch": 0.30666666666666664, "grad_norm": 0.9816583395004272, "learning_rate": 0.00013885841363973312, "loss": 1.0345, "step": 2070 }, { "epoch": 0.3068148148148148, "grad_norm": 1.6378679275512695, "learning_rate": 0.00013882876204596, "loss": 1.2512, "step": 2071 }, { "epoch": 0.30696296296296294, "grad_norm": 1.7705243825912476, "learning_rate": 0.0001387991104521868, "loss": 0.924, "step": 2072 }, { "epoch": 0.3071111111111111, "grad_norm": 1.4243345260620117, "learning_rate": 0.00013876945885841363, "loss": 1.0156, "step": 2073 }, { "epoch": 0.30725925925925923, "grad_norm": 1.9182519912719727, "learning_rate": 0.0001387398072646405, "loss": 1.176, "step": 2074 }, { "epoch": 0.3074074074074074, "grad_norm": 1.8517405986785889, "learning_rate": 0.00013871015567086732, "loss": 1.3266, "step": 2075 }, { "epoch": 0.3075555555555556, "grad_norm": 2.758709192276001, "learning_rate": 0.00013868050407709413, "loss": 0.9802, "step": 2076 }, { "epoch": 0.3077037037037037, "grad_norm": 1.551541805267334, "learning_rate": 0.000138650852483321, "loss": 1.1485, "step": 2077 }, { "epoch": 0.3078518518518519, "grad_norm": 1.4198012351989746, "learning_rate": 
0.00013862120088954782, "loss": 1.3013, "step": 2078 }, { "epoch": 0.308, "grad_norm": 1.623287558555603, "learning_rate": 0.00013859154929577464, "loss": 1.0242, "step": 2079 }, { "epoch": 0.30814814814814817, "grad_norm": 1.7934820652008057, "learning_rate": 0.0001385618977020015, "loss": 0.9132, "step": 2080 }, { "epoch": 0.3082962962962963, "grad_norm": 1.3298826217651367, "learning_rate": 0.00013853224610822833, "loss": 0.9939, "step": 2081 }, { "epoch": 0.30844444444444447, "grad_norm": 1.3171889781951904, "learning_rate": 0.00013850259451445515, "loss": 1.4316, "step": 2082 }, { "epoch": 0.3085925925925926, "grad_norm": 1.2780592441558838, "learning_rate": 0.000138472942920682, "loss": 1.2374, "step": 2083 }, { "epoch": 0.30874074074074076, "grad_norm": 0.9781845211982727, "learning_rate": 0.00013844329132690883, "loss": 0.8591, "step": 2084 }, { "epoch": 0.3088888888888889, "grad_norm": 1.3549484014511108, "learning_rate": 0.00013841363973313565, "loss": 1.0935, "step": 2085 }, { "epoch": 0.30903703703703705, "grad_norm": 1.1546287536621094, "learning_rate": 0.0001383839881393625, "loss": 1.1185, "step": 2086 }, { "epoch": 0.3091851851851852, "grad_norm": 4.052056312561035, "learning_rate": 0.00013835433654558934, "loss": 0.9417, "step": 2087 }, { "epoch": 0.30933333333333335, "grad_norm": 3.001648426055908, "learning_rate": 0.00013832468495181616, "loss": 1.2758, "step": 2088 }, { "epoch": 0.30948148148148147, "grad_norm": 2.10091233253479, "learning_rate": 0.000138295033358043, "loss": 1.3074, "step": 2089 }, { "epoch": 0.30962962962962964, "grad_norm": 1.6234939098358154, "learning_rate": 0.00013826538176426985, "loss": 1.2339, "step": 2090 }, { "epoch": 0.30977777777777776, "grad_norm": 1.1966112852096558, "learning_rate": 0.00013823573017049666, "loss": 1.1366, "step": 2091 }, { "epoch": 0.30992592592592594, "grad_norm": 1.2139739990234375, "learning_rate": 0.0001382060785767235, "loss": 1.3649, "step": 2092 }, { "epoch": 0.31007407407407406, 
"grad_norm": 1.45710289478302, "learning_rate": 0.00013817642698295035, "loss": 1.0141, "step": 2093 }, { "epoch": 0.31022222222222223, "grad_norm": 1.7804147005081177, "learning_rate": 0.00013814677538917717, "loss": 1.206, "step": 2094 }, { "epoch": 0.31037037037037035, "grad_norm": 2.8588223457336426, "learning_rate": 0.00013811712379540401, "loss": 1.2005, "step": 2095 }, { "epoch": 0.3105185185185185, "grad_norm": 1.061313271522522, "learning_rate": 0.00013808747220163086, "loss": 0.8415, "step": 2096 }, { "epoch": 0.31066666666666665, "grad_norm": 1.0050193071365356, "learning_rate": 0.00013805782060785768, "loss": 0.8893, "step": 2097 }, { "epoch": 0.3108148148148148, "grad_norm": 1.303781270980835, "learning_rate": 0.00013802816901408452, "loss": 1.1208, "step": 2098 }, { "epoch": 0.31096296296296294, "grad_norm": 0.9755362868309021, "learning_rate": 0.00013799851742031137, "loss": 1.0903, "step": 2099 }, { "epoch": 0.3111111111111111, "grad_norm": 1.2339026927947998, "learning_rate": 0.00013796886582653818, "loss": 1.1237, "step": 2100 }, { "epoch": 0.31125925925925924, "grad_norm": 1.6841607093811035, "learning_rate": 0.000137939214232765, "loss": 0.9327, "step": 2101 }, { "epoch": 0.3114074074074074, "grad_norm": 1.405722737312317, "learning_rate": 0.00013790956263899187, "loss": 1.2344, "step": 2102 }, { "epoch": 0.31155555555555553, "grad_norm": 2.11942195892334, "learning_rate": 0.0001378799110452187, "loss": 0.86, "step": 2103 }, { "epoch": 0.3117037037037037, "grad_norm": 1.6581157445907593, "learning_rate": 0.0001378502594514455, "loss": 1.3143, "step": 2104 }, { "epoch": 0.3118518518518518, "grad_norm": 2.0134646892547607, "learning_rate": 0.00013782060785767235, "loss": 1.2018, "step": 2105 }, { "epoch": 0.312, "grad_norm": 2.4110283851623535, "learning_rate": 0.0001377909562638992, "loss": 1.3338, "step": 2106 }, { "epoch": 0.3121481481481482, "grad_norm": 1.8445804119110107, "learning_rate": 0.000137761304670126, "loss": 1.0447, "step": 2107 }, 
{ "epoch": 0.3122962962962963, "grad_norm": 2.610743761062622, "learning_rate": 0.00013773165307635286, "loss": 1.073, "step": 2108 }, { "epoch": 0.31244444444444447, "grad_norm": 1.6694021224975586, "learning_rate": 0.0001377020014825797, "loss": 1.0696, "step": 2109 }, { "epoch": 0.3125925925925926, "grad_norm": 1.3696857690811157, "learning_rate": 0.00013767234988880652, "loss": 1.0803, "step": 2110 }, { "epoch": 0.31274074074074076, "grad_norm": 1.8750032186508179, "learning_rate": 0.00013764269829503336, "loss": 1.2241, "step": 2111 }, { "epoch": 0.3128888888888889, "grad_norm": 2.962296962738037, "learning_rate": 0.0001376130467012602, "loss": 1.1358, "step": 2112 }, { "epoch": 0.31303703703703706, "grad_norm": 1.7400325536727905, "learning_rate": 0.00013758339510748703, "loss": 1.2508, "step": 2113 }, { "epoch": 0.3131851851851852, "grad_norm": 5.648115158081055, "learning_rate": 0.00013755374351371387, "loss": 1.2557, "step": 2114 }, { "epoch": 0.31333333333333335, "grad_norm": 2.5589797496795654, "learning_rate": 0.00013752409191994071, "loss": 1.1127, "step": 2115 }, { "epoch": 0.31348148148148147, "grad_norm": 2.085777997970581, "learning_rate": 0.00013749444032616753, "loss": 1.245, "step": 2116 }, { "epoch": 0.31362962962962965, "grad_norm": 2.188732147216797, "learning_rate": 0.00013746478873239438, "loss": 1.1694, "step": 2117 }, { "epoch": 0.31377777777777777, "grad_norm": 1.11812162399292, "learning_rate": 0.00013743513713862122, "loss": 1.323, "step": 2118 }, { "epoch": 0.31392592592592594, "grad_norm": 1.5886762142181396, "learning_rate": 0.00013740548554484804, "loss": 1.0427, "step": 2119 }, { "epoch": 0.31407407407407406, "grad_norm": 1.855367660522461, "learning_rate": 0.00013737583395107488, "loss": 0.9953, "step": 2120 }, { "epoch": 0.31422222222222224, "grad_norm": 1.6558690071105957, "learning_rate": 0.00013734618235730173, "loss": 1.0263, "step": 2121 }, { "epoch": 0.31437037037037036, "grad_norm": 0.9354804754257202, "learning_rate": 
0.00013731653076352854, "loss": 1.4811, "step": 2122 }, { "epoch": 0.31451851851851853, "grad_norm": 2.6394095420837402, "learning_rate": 0.0001372868791697554, "loss": 0.9087, "step": 2123 }, { "epoch": 0.31466666666666665, "grad_norm": 1.5338950157165527, "learning_rate": 0.0001372572275759822, "loss": 1.0878, "step": 2124 }, { "epoch": 0.3148148148148148, "grad_norm": 2.82747220993042, "learning_rate": 0.00013722757598220905, "loss": 1.2008, "step": 2125 }, { "epoch": 0.31496296296296294, "grad_norm": 1.4896637201309204, "learning_rate": 0.0001371979243884359, "loss": 1.1101, "step": 2126 }, { "epoch": 0.3151111111111111, "grad_norm": 3.165152072906494, "learning_rate": 0.0001371682727946627, "loss": 1.2841, "step": 2127 }, { "epoch": 0.31525925925925924, "grad_norm": 2.199152946472168, "learning_rate": 0.00013713862120088956, "loss": 1.3937, "step": 2128 }, { "epoch": 0.3154074074074074, "grad_norm": 5.559042930603027, "learning_rate": 0.0001371089696071164, "loss": 1.2933, "step": 2129 }, { "epoch": 0.31555555555555553, "grad_norm": 10.026233673095703, "learning_rate": 0.00013707931801334322, "loss": 1.1713, "step": 2130 }, { "epoch": 0.3157037037037037, "grad_norm": 1.5633232593536377, "learning_rate": 0.00013704966641957006, "loss": 1.054, "step": 2131 }, { "epoch": 0.31585185185185183, "grad_norm": 1.8309271335601807, "learning_rate": 0.0001370200148257969, "loss": 1.1635, "step": 2132 }, { "epoch": 0.316, "grad_norm": 1.3764691352844238, "learning_rate": 0.00013699036323202372, "loss": 1.0757, "step": 2133 }, { "epoch": 0.3161481481481481, "grad_norm": 1.3178585767745972, "learning_rate": 0.00013696071163825057, "loss": 0.8379, "step": 2134 }, { "epoch": 0.3162962962962963, "grad_norm": 1.8320229053497314, "learning_rate": 0.0001369310600444774, "loss": 1.0575, "step": 2135 }, { "epoch": 0.3164444444444444, "grad_norm": 2.8101818561553955, "learning_rate": 0.00013690140845070423, "loss": 1.1453, "step": 2136 }, { "epoch": 0.3165925925925926, "grad_norm": 
2.0384163856506348, "learning_rate": 0.00013687175685693108, "loss": 1.0169, "step": 2137 }, { "epoch": 0.31674074074074077, "grad_norm": 1.8953146934509277, "learning_rate": 0.0001368421052631579, "loss": 1.2889, "step": 2138 }, { "epoch": 0.3168888888888889, "grad_norm": 2.657672643661499, "learning_rate": 0.00013681245366938474, "loss": 1.1288, "step": 2139 }, { "epoch": 0.31703703703703706, "grad_norm": 4.052217960357666, "learning_rate": 0.00013678280207561155, "loss": 1.2616, "step": 2140 }, { "epoch": 0.3171851851851852, "grad_norm": 1.951884388923645, "learning_rate": 0.0001367531504818384, "loss": 1.0237, "step": 2141 }, { "epoch": 0.31733333333333336, "grad_norm": 2.485708236694336, "learning_rate": 0.00013672349888806524, "loss": 1.0757, "step": 2142 }, { "epoch": 0.3174814814814815, "grad_norm": 3.166104555130005, "learning_rate": 0.00013669384729429206, "loss": 1.0823, "step": 2143 }, { "epoch": 0.31762962962962965, "grad_norm": 4.034739017486572, "learning_rate": 0.0001366641957005189, "loss": 1.1093, "step": 2144 }, { "epoch": 0.31777777777777777, "grad_norm": 2.1679728031158447, "learning_rate": 0.00013663454410674575, "loss": 1.0586, "step": 2145 }, { "epoch": 0.31792592592592595, "grad_norm": 2.8911707401275635, "learning_rate": 0.00013660489251297257, "loss": 1.1043, "step": 2146 }, { "epoch": 0.31807407407407406, "grad_norm": 1.3390415906906128, "learning_rate": 0.0001365752409191994, "loss": 0.9645, "step": 2147 }, { "epoch": 0.31822222222222224, "grad_norm": 2.3080697059631348, "learning_rate": 0.00013654558932542626, "loss": 1.234, "step": 2148 }, { "epoch": 0.31837037037037036, "grad_norm": 1.975243091583252, "learning_rate": 0.00013651593773165307, "loss": 0.9722, "step": 2149 }, { "epoch": 0.31851851851851853, "grad_norm": 1.9367717504501343, "learning_rate": 0.00013648628613787992, "loss": 0.8212, "step": 2150 }, { "epoch": 0.31866666666666665, "grad_norm": 1.9093810319900513, "learning_rate": 0.00013645663454410676, "loss": 1.0883, 
"step": 2151 }, { "epoch": 0.31881481481481483, "grad_norm": 1.5951952934265137, "learning_rate": 0.00013642698295033358, "loss": 1.1901, "step": 2152 }, { "epoch": 0.31896296296296295, "grad_norm": 2.7034664154052734, "learning_rate": 0.00013639733135656042, "loss": 1.2348, "step": 2153 }, { "epoch": 0.3191111111111111, "grad_norm": 1.6457303762435913, "learning_rate": 0.00013636767976278727, "loss": 1.3959, "step": 2154 }, { "epoch": 0.31925925925925924, "grad_norm": 1.4474924802780151, "learning_rate": 0.00013633802816901409, "loss": 1.0256, "step": 2155 }, { "epoch": 0.3194074074074074, "grad_norm": 4.5285139083862305, "learning_rate": 0.0001363083765752409, "loss": 1.3252, "step": 2156 }, { "epoch": 0.31955555555555554, "grad_norm": 3.122760534286499, "learning_rate": 0.00013627872498146777, "loss": 1.4024, "step": 2157 }, { "epoch": 0.3197037037037037, "grad_norm": 3.5009829998016357, "learning_rate": 0.0001362490733876946, "loss": 1.0491, "step": 2158 }, { "epoch": 0.31985185185185183, "grad_norm": 2.080441951751709, "learning_rate": 0.0001362194217939214, "loss": 0.962, "step": 2159 }, { "epoch": 0.32, "grad_norm": 1.5678153038024902, "learning_rate": 0.00013618977020014828, "loss": 0.9275, "step": 2160 }, { "epoch": 0.3201481481481481, "grad_norm": 3.1842198371887207, "learning_rate": 0.0001361601186063751, "loss": 1.2926, "step": 2161 }, { "epoch": 0.3202962962962963, "grad_norm": 2.5489861965179443, "learning_rate": 0.00013613046701260192, "loss": 1.0225, "step": 2162 }, { "epoch": 0.3204444444444444, "grad_norm": 1.5396262407302856, "learning_rate": 0.0001361008154188288, "loss": 1.1842, "step": 2163 }, { "epoch": 0.3205925925925926, "grad_norm": 2.000410318374634, "learning_rate": 0.0001360711638250556, "loss": 1.1912, "step": 2164 }, { "epoch": 0.3207407407407407, "grad_norm": 3.4067788124084473, "learning_rate": 0.00013604151223128242, "loss": 1.0961, "step": 2165 }, { "epoch": 0.3208888888888889, "grad_norm": 2.5867462158203125, "learning_rate": 
0.0001360118606375093, "loss": 1.5042, "step": 2166 }, { "epoch": 0.321037037037037, "grad_norm": 1.708670735359192, "learning_rate": 0.0001359822090437361, "loss": 1.1048, "step": 2167 }, { "epoch": 0.3211851851851852, "grad_norm": 1.2881687879562378, "learning_rate": 0.00013595255744996293, "loss": 1.2155, "step": 2168 }, { "epoch": 0.32133333333333336, "grad_norm": 1.3208478689193726, "learning_rate": 0.00013592290585618977, "loss": 1.15, "step": 2169 }, { "epoch": 0.3214814814814815, "grad_norm": 2.9517972469329834, "learning_rate": 0.00013589325426241662, "loss": 1.2613, "step": 2170 }, { "epoch": 0.32162962962962965, "grad_norm": 1.9396419525146484, "learning_rate": 0.00013586360266864343, "loss": 1.0884, "step": 2171 }, { "epoch": 0.3217777777777778, "grad_norm": 1.2326692342758179, "learning_rate": 0.00013583395107487028, "loss": 1.2093, "step": 2172 }, { "epoch": 0.32192592592592595, "grad_norm": 1.4012480974197388, "learning_rate": 0.00013580429948109712, "loss": 1.1254, "step": 2173 }, { "epoch": 0.32207407407407407, "grad_norm": 3.0078437328338623, "learning_rate": 0.00013577464788732394, "loss": 1.0937, "step": 2174 }, { "epoch": 0.32222222222222224, "grad_norm": 1.1547621488571167, "learning_rate": 0.00013574499629355079, "loss": 0.9611, "step": 2175 }, { "epoch": 0.32237037037037036, "grad_norm": 2.170196771621704, "learning_rate": 0.00013571534469977763, "loss": 1.2364, "step": 2176 }, { "epoch": 0.32251851851851854, "grad_norm": 1.1854034662246704, "learning_rate": 0.00013568569310600445, "loss": 1.1208, "step": 2177 }, { "epoch": 0.32266666666666666, "grad_norm": 1.8629307746887207, "learning_rate": 0.0001356560415122313, "loss": 1.0558, "step": 2178 }, { "epoch": 0.32281481481481483, "grad_norm": 1.7712525129318237, "learning_rate": 0.00013562638991845814, "loss": 1.1729, "step": 2179 }, { "epoch": 0.32296296296296295, "grad_norm": 1.4765230417251587, "learning_rate": 0.00013559673832468495, "loss": 1.3337, "step": 2180 }, { "epoch": 
0.3231111111111111, "grad_norm": 1.1344188451766968, "learning_rate": 0.0001355670867309118, "loss": 0.9294, "step": 2181 }, { "epoch": 0.32325925925925925, "grad_norm": 1.6488782167434692, "learning_rate": 0.00013553743513713864, "loss": 1.2047, "step": 2182 }, { "epoch": 0.3234074074074074, "grad_norm": 1.8202825784683228, "learning_rate": 0.00013550778354336546, "loss": 1.0434, "step": 2183 }, { "epoch": 0.32355555555555554, "grad_norm": 2.084465265274048, "learning_rate": 0.0001354781319495923, "loss": 1.2276, "step": 2184 }, { "epoch": 0.3237037037037037, "grad_norm": 1.4090747833251953, "learning_rate": 0.00013544848035581915, "loss": 1.1444, "step": 2185 }, { "epoch": 0.32385185185185184, "grad_norm": 1.7574411630630493, "learning_rate": 0.00013541882876204597, "loss": 1.0567, "step": 2186 }, { "epoch": 0.324, "grad_norm": 1.773861050605774, "learning_rate": 0.00013538917716827278, "loss": 1.1064, "step": 2187 }, { "epoch": 0.32414814814814813, "grad_norm": 2.8019587993621826, "learning_rate": 0.00013535952557449965, "loss": 1.1687, "step": 2188 }, { "epoch": 0.3242962962962963, "grad_norm": 1.6108754873275757, "learning_rate": 0.00013532987398072647, "loss": 1.0169, "step": 2189 }, { "epoch": 0.3244444444444444, "grad_norm": 1.2681635618209839, "learning_rate": 0.0001353002223869533, "loss": 0.9246, "step": 2190 }, { "epoch": 0.3245925925925926, "grad_norm": 1.8541462421417236, "learning_rate": 0.00013527057079318013, "loss": 1.2053, "step": 2191 }, { "epoch": 0.3247407407407407, "grad_norm": 1.8434734344482422, "learning_rate": 0.00013524091919940698, "loss": 1.1271, "step": 2192 }, { "epoch": 0.3248888888888889, "grad_norm": 2.0871453285217285, "learning_rate": 0.0001352112676056338, "loss": 1.0076, "step": 2193 }, { "epoch": 0.325037037037037, "grad_norm": 1.3376096487045288, "learning_rate": 0.00013518161601186064, "loss": 1.0293, "step": 2194 }, { "epoch": 0.3251851851851852, "grad_norm": 1.2752751111984253, "learning_rate": 0.00013515196441808748, 
"loss": 1.0601, "step": 2195 }, { "epoch": 0.3253333333333333, "grad_norm": 5.159078121185303, "learning_rate": 0.0001351223128243143, "loss": 0.8771, "step": 2196 }, { "epoch": 0.3254814814814815, "grad_norm": 1.5362927913665771, "learning_rate": 0.00013509266123054115, "loss": 1.1814, "step": 2197 }, { "epoch": 0.3256296296296296, "grad_norm": 1.4354616403579712, "learning_rate": 0.000135063009636768, "loss": 1.5378, "step": 2198 }, { "epoch": 0.3257777777777778, "grad_norm": 2.6065075397491455, "learning_rate": 0.0001350333580429948, "loss": 1.1688, "step": 2199 }, { "epoch": 0.32592592592592595, "grad_norm": 1.4871209859848022, "learning_rate": 0.00013500370644922165, "loss": 1.125, "step": 2200 }, { "epoch": 0.32607407407407407, "grad_norm": 6.628143787384033, "learning_rate": 0.0001349740548554485, "loss": 1.1464, "step": 2201 }, { "epoch": 0.32622222222222225, "grad_norm": 2.4741086959838867, "learning_rate": 0.00013494440326167531, "loss": 1.0454, "step": 2202 }, { "epoch": 0.32637037037037037, "grad_norm": 1.276069164276123, "learning_rate": 0.00013491475166790216, "loss": 0.9532, "step": 2203 }, { "epoch": 0.32651851851851854, "grad_norm": 1.3667021989822388, "learning_rate": 0.000134885100074129, "loss": 1.0145, "step": 2204 }, { "epoch": 0.32666666666666666, "grad_norm": 1.7200194597244263, "learning_rate": 0.00013485544848035582, "loss": 0.8424, "step": 2205 }, { "epoch": 0.32681481481481484, "grad_norm": 1.6975458860397339, "learning_rate": 0.00013482579688658267, "loss": 1.1774, "step": 2206 }, { "epoch": 0.32696296296296296, "grad_norm": 2.316565990447998, "learning_rate": 0.0001347961452928095, "loss": 1.1567, "step": 2207 }, { "epoch": 0.32711111111111113, "grad_norm": 2.421337604522705, "learning_rate": 0.00013476649369903633, "loss": 1.195, "step": 2208 }, { "epoch": 0.32725925925925925, "grad_norm": 1.2722667455673218, "learning_rate": 0.00013473684210526317, "loss": 1.0311, "step": 2209 }, { "epoch": 0.3274074074074074, "grad_norm": 
1.2215676307678223, "learning_rate": 0.00013470719051149, "loss": 1.0461, "step": 2210 }, { "epoch": 0.32755555555555554, "grad_norm": 1.0788629055023193, "learning_rate": 0.00013467753891771683, "loss": 1.2845, "step": 2211 }, { "epoch": 0.3277037037037037, "grad_norm": 1.3409634828567505, "learning_rate": 0.00013464788732394368, "loss": 1.1823, "step": 2212 }, { "epoch": 0.32785185185185184, "grad_norm": 2.9622018337249756, "learning_rate": 0.0001346182357301705, "loss": 1.0053, "step": 2213 }, { "epoch": 0.328, "grad_norm": 1.644832730293274, "learning_rate": 0.00013458858413639734, "loss": 1.1964, "step": 2214 }, { "epoch": 0.32814814814814813, "grad_norm": 1.0058623552322388, "learning_rate": 0.00013455893254262418, "loss": 0.7503, "step": 2215 }, { "epoch": 0.3282962962962963, "grad_norm": 1.2541953325271606, "learning_rate": 0.000134529280948851, "loss": 1.1544, "step": 2216 }, { "epoch": 0.32844444444444443, "grad_norm": 2.020519495010376, "learning_rate": 0.00013449962935507785, "loss": 1.2573, "step": 2217 }, { "epoch": 0.3285925925925926, "grad_norm": 1.1754764318466187, "learning_rate": 0.0001344699777613047, "loss": 1.0214, "step": 2218 }, { "epoch": 0.3287407407407407, "grad_norm": 5.07017183303833, "learning_rate": 0.0001344403261675315, "loss": 0.9595, "step": 2219 }, { "epoch": 0.3288888888888889, "grad_norm": 1.5297579765319824, "learning_rate": 0.00013441067457375835, "loss": 1.2923, "step": 2220 }, { "epoch": 0.329037037037037, "grad_norm": 1.3993247747421265, "learning_rate": 0.00013438102297998517, "loss": 1.1458, "step": 2221 }, { "epoch": 0.3291851851851852, "grad_norm": 1.262351393699646, "learning_rate": 0.00013435137138621201, "loss": 0.966, "step": 2222 }, { "epoch": 0.3293333333333333, "grad_norm": 1.176295518875122, "learning_rate": 0.00013432171979243886, "loss": 1.028, "step": 2223 }, { "epoch": 0.3294814814814815, "grad_norm": 1.2695571184158325, "learning_rate": 0.00013429206819866568, "loss": 1.0897, "step": 2224 }, { "epoch": 
0.3296296296296296, "grad_norm": 1.488978385925293, "learning_rate": 0.00013426241660489252, "loss": 1.1638, "step": 2225 }, { "epoch": 0.3297777777777778, "grad_norm": 4.710011959075928, "learning_rate": 0.00013423276501111934, "loss": 1.2246, "step": 2226 }, { "epoch": 0.3299259259259259, "grad_norm": 1.6328994035720825, "learning_rate": 0.00013420311341734618, "loss": 0.9819, "step": 2227 }, { "epoch": 0.3300740740740741, "grad_norm": 2.277803421020508, "learning_rate": 0.00013417346182357303, "loss": 1.2242, "step": 2228 }, { "epoch": 0.3302222222222222, "grad_norm": 2.4489951133728027, "learning_rate": 0.00013414381022979984, "loss": 1.1634, "step": 2229 }, { "epoch": 0.33037037037037037, "grad_norm": 1.8839341402053833, "learning_rate": 0.0001341141586360267, "loss": 1.1036, "step": 2230 }, { "epoch": 0.33051851851851854, "grad_norm": 2.3348946571350098, "learning_rate": 0.00013408450704225353, "loss": 1.0797, "step": 2231 }, { "epoch": 0.33066666666666666, "grad_norm": 1.9656461477279663, "learning_rate": 0.00013405485544848035, "loss": 1.2083, "step": 2232 }, { "epoch": 0.33081481481481484, "grad_norm": 1.7312856912612915, "learning_rate": 0.0001340252038547072, "loss": 0.9045, "step": 2233 }, { "epoch": 0.33096296296296296, "grad_norm": 1.9389487504959106, "learning_rate": 0.00013399555226093404, "loss": 1.036, "step": 2234 }, { "epoch": 0.33111111111111113, "grad_norm": 1.2917704582214355, "learning_rate": 0.00013396590066716086, "loss": 1.0194, "step": 2235 }, { "epoch": 0.33125925925925925, "grad_norm": 1.4721956253051758, "learning_rate": 0.0001339362490733877, "loss": 1.1603, "step": 2236 }, { "epoch": 0.33140740740740743, "grad_norm": 1.4399042129516602, "learning_rate": 0.00013390659747961455, "loss": 0.8924, "step": 2237 }, { "epoch": 0.33155555555555555, "grad_norm": 1.5402852296829224, "learning_rate": 0.00013387694588584136, "loss": 0.9991, "step": 2238 }, { "epoch": 0.3317037037037037, "grad_norm": 1.7603425979614258, "learning_rate": 
0.0001338472942920682, "loss": 1.2753, "step": 2239 }, { "epoch": 0.33185185185185184, "grad_norm": 9.889684677124023, "learning_rate": 0.00013381764269829505, "loss": 1.0038, "step": 2240 }, { "epoch": 0.332, "grad_norm": 1.8739274740219116, "learning_rate": 0.00013378799110452187, "loss": 1.1637, "step": 2241 }, { "epoch": 0.33214814814814814, "grad_norm": 1.2513951063156128, "learning_rate": 0.0001337583395107487, "loss": 0.8515, "step": 2242 }, { "epoch": 0.3322962962962963, "grad_norm": 1.3573156595230103, "learning_rate": 0.00013372868791697556, "loss": 1.1591, "step": 2243 }, { "epoch": 0.33244444444444443, "grad_norm": 1.0235463380813599, "learning_rate": 0.00013369903632320238, "loss": 1.0468, "step": 2244 }, { "epoch": 0.3325925925925926, "grad_norm": 3.3065717220306396, "learning_rate": 0.0001336693847294292, "loss": 1.3364, "step": 2245 }, { "epoch": 0.3327407407407407, "grad_norm": 1.5484888553619385, "learning_rate": 0.00013363973313565606, "loss": 1.1987, "step": 2246 }, { "epoch": 0.3328888888888889, "grad_norm": 1.9932880401611328, "learning_rate": 0.00013361008154188288, "loss": 1.1988, "step": 2247 }, { "epoch": 0.333037037037037, "grad_norm": 1.6632351875305176, "learning_rate": 0.0001335804299481097, "loss": 1.236, "step": 2248 }, { "epoch": 0.3331851851851852, "grad_norm": 1.444595456123352, "learning_rate": 0.00013355077835433657, "loss": 0.977, "step": 2249 }, { "epoch": 0.3333333333333333, "grad_norm": 2.094960927963257, "learning_rate": 0.0001335211267605634, "loss": 1.2066, "step": 2250 }, { "epoch": 0.3334814814814815, "grad_norm": 1.1526453495025635, "learning_rate": 0.0001334914751667902, "loss": 1.0169, "step": 2251 }, { "epoch": 0.3336296296296296, "grad_norm": 1.17086923122406, "learning_rate": 0.00013346182357301708, "loss": 1.0256, "step": 2252 }, { "epoch": 0.3337777777777778, "grad_norm": 2.183746337890625, "learning_rate": 0.0001334321719792439, "loss": 1.1574, "step": 2253 }, { "epoch": 0.3339259259259259, "grad_norm": 
1.5439538955688477, "learning_rate": 0.0001334025203854707, "loss": 1.0676, "step": 2254 }, { "epoch": 0.3340740740740741, "grad_norm": 2.3153722286224365, "learning_rate": 0.00013337286879169756, "loss": 1.1764, "step": 2255 }, { "epoch": 0.3342222222222222, "grad_norm": 2.0121192932128906, "learning_rate": 0.0001333432171979244, "loss": 1.2408, "step": 2256 }, { "epoch": 0.3343703703703704, "grad_norm": 1.2326407432556152, "learning_rate": 0.00013331356560415122, "loss": 1.0246, "step": 2257 }, { "epoch": 0.3345185185185185, "grad_norm": 1.514545202255249, "learning_rate": 0.00013328391401037806, "loss": 1.1998, "step": 2258 }, { "epoch": 0.33466666666666667, "grad_norm": 2.469101667404175, "learning_rate": 0.0001332542624166049, "loss": 1.1136, "step": 2259 }, { "epoch": 0.3348148148148148, "grad_norm": 1.266455054283142, "learning_rate": 0.00013322461082283172, "loss": 1.1005, "step": 2260 }, { "epoch": 0.33496296296296296, "grad_norm": 1.3625801801681519, "learning_rate": 0.00013319495922905857, "loss": 0.9987, "step": 2261 }, { "epoch": 0.33511111111111114, "grad_norm": 2.7278425693511963, "learning_rate": 0.0001331653076352854, "loss": 1.2047, "step": 2262 }, { "epoch": 0.33525925925925926, "grad_norm": 1.64750075340271, "learning_rate": 0.00013313565604151223, "loss": 1.056, "step": 2263 }, { "epoch": 0.33540740740740743, "grad_norm": 1.3636395931243896, "learning_rate": 0.00013310600444773907, "loss": 1.2012, "step": 2264 }, { "epoch": 0.33555555555555555, "grad_norm": 1.602080225944519, "learning_rate": 0.00013307635285396592, "loss": 1.0032, "step": 2265 }, { "epoch": 0.3357037037037037, "grad_norm": 1.533245325088501, "learning_rate": 0.00013304670126019274, "loss": 1.1727, "step": 2266 }, { "epoch": 0.33585185185185185, "grad_norm": 5.972871780395508, "learning_rate": 0.00013301704966641958, "loss": 1.0066, "step": 2267 }, { "epoch": 0.336, "grad_norm": 2.36637282371521, "learning_rate": 0.00013298739807264643, "loss": 1.068, "step": 2268 }, { "epoch": 
0.33614814814814814, "grad_norm": 1.5738499164581299, "learning_rate": 0.00013295774647887324, "loss": 1.2732, "step": 2269 }, { "epoch": 0.3362962962962963, "grad_norm": 1.3270472288131714, "learning_rate": 0.0001329280948851001, "loss": 1.0262, "step": 2270 }, { "epoch": 0.33644444444444443, "grad_norm": 2.080435276031494, "learning_rate": 0.00013289844329132693, "loss": 1.0533, "step": 2271 }, { "epoch": 0.3365925925925926, "grad_norm": 1.3887215852737427, "learning_rate": 0.00013286879169755375, "loss": 1.0709, "step": 2272 }, { "epoch": 0.33674074074074073, "grad_norm": 2.125825881958008, "learning_rate": 0.00013283914010378057, "loss": 1.1915, "step": 2273 }, { "epoch": 0.3368888888888889, "grad_norm": 1.562106728553772, "learning_rate": 0.00013280948851000744, "loss": 1.1003, "step": 2274 }, { "epoch": 0.337037037037037, "grad_norm": 3.1373605728149414, "learning_rate": 0.00013277983691623426, "loss": 0.8193, "step": 2275 }, { "epoch": 0.3371851851851852, "grad_norm": 1.7475008964538574, "learning_rate": 0.00013275018532246107, "loss": 1.0573, "step": 2276 }, { "epoch": 0.3373333333333333, "grad_norm": 1.6964002847671509, "learning_rate": 0.00013272053372868792, "loss": 1.2842, "step": 2277 }, { "epoch": 0.3374814814814815, "grad_norm": 1.929476022720337, "learning_rate": 0.00013269088213491476, "loss": 1.2528, "step": 2278 }, { "epoch": 0.3376296296296296, "grad_norm": 1.4762523174285889, "learning_rate": 0.00013266123054114158, "loss": 1.3121, "step": 2279 }, { "epoch": 0.3377777777777778, "grad_norm": 1.1908106803894043, "learning_rate": 0.00013263157894736842, "loss": 0.9289, "step": 2280 }, { "epoch": 0.3379259259259259, "grad_norm": 2.59541654586792, "learning_rate": 0.00013260192735359527, "loss": 0.962, "step": 2281 }, { "epoch": 0.3380740740740741, "grad_norm": 1.5643655061721802, "learning_rate": 0.00013257227575982209, "loss": 0.912, "step": 2282 }, { "epoch": 0.3382222222222222, "grad_norm": 3.0142710208892822, "learning_rate": 
0.00013254262416604893, "loss": 1.2697, "step": 2283 }, { "epoch": 0.3383703703703704, "grad_norm": 1.6093851327896118, "learning_rate": 0.00013251297257227577, "loss": 1.0584, "step": 2284 }, { "epoch": 0.3385185185185185, "grad_norm": 2.0272083282470703, "learning_rate": 0.0001324833209785026, "loss": 1.5104, "step": 2285 }, { "epoch": 0.33866666666666667, "grad_norm": 2.518580198287964, "learning_rate": 0.00013245366938472944, "loss": 1.2991, "step": 2286 }, { "epoch": 0.3388148148148148, "grad_norm": 1.4523662328720093, "learning_rate": 0.00013242401779095628, "loss": 1.2912, "step": 2287 }, { "epoch": 0.33896296296296297, "grad_norm": 2.055608034133911, "learning_rate": 0.0001323943661971831, "loss": 0.9778, "step": 2288 }, { "epoch": 0.3391111111111111, "grad_norm": 1.3338221311569214, "learning_rate": 0.00013236471460340994, "loss": 0.8675, "step": 2289 }, { "epoch": 0.33925925925925926, "grad_norm": 1.8887592554092407, "learning_rate": 0.0001323350630096368, "loss": 1.0106, "step": 2290 }, { "epoch": 0.3394074074074074, "grad_norm": 1.289023756980896, "learning_rate": 0.0001323054114158636, "loss": 1.327, "step": 2291 }, { "epoch": 0.33955555555555555, "grad_norm": 1.4453749656677246, "learning_rate": 0.00013227575982209045, "loss": 1.3064, "step": 2292 }, { "epoch": 0.33970370370370373, "grad_norm": 1.5419834852218628, "learning_rate": 0.0001322461082283173, "loss": 1.2386, "step": 2293 }, { "epoch": 0.33985185185185185, "grad_norm": 1.6710052490234375, "learning_rate": 0.0001322164566345441, "loss": 1.0302, "step": 2294 }, { "epoch": 0.34, "grad_norm": 1.6174490451812744, "learning_rate": 0.00013218680504077096, "loss": 1.0299, "step": 2295 }, { "epoch": 0.34014814814814814, "grad_norm": 2.3813655376434326, "learning_rate": 0.00013215715344699777, "loss": 1.3254, "step": 2296 }, { "epoch": 0.3402962962962963, "grad_norm": 2.1109097003936768, "learning_rate": 0.00013212750185322462, "loss": 0.9789, "step": 2297 }, { "epoch": 0.34044444444444444, 
"grad_norm": 2.058483362197876, "learning_rate": 0.00013209785025945146, "loss": 1.065, "step": 2298 }, { "epoch": 0.3405925925925926, "grad_norm": 2.3258121013641357, "learning_rate": 0.00013206819866567828, "loss": 0.9045, "step": 2299 }, { "epoch": 0.34074074074074073, "grad_norm": 2.2851665019989014, "learning_rate": 0.00013203854707190512, "loss": 1.2529, "step": 2300 }, { "epoch": 0.3408888888888889, "grad_norm": 1.613037347793579, "learning_rate": 0.00013200889547813197, "loss": 1.2204, "step": 2301 }, { "epoch": 0.341037037037037, "grad_norm": 1.3707785606384277, "learning_rate": 0.00013197924388435878, "loss": 0.9235, "step": 2302 }, { "epoch": 0.3411851851851852, "grad_norm": 2.2744858264923096, "learning_rate": 0.00013194959229058563, "loss": 1.2832, "step": 2303 }, { "epoch": 0.3413333333333333, "grad_norm": 2.278960943222046, "learning_rate": 0.00013191994069681247, "loss": 1.1904, "step": 2304 }, { "epoch": 0.3414814814814815, "grad_norm": 1.6594761610031128, "learning_rate": 0.0001318902891030393, "loss": 1.3182, "step": 2305 }, { "epoch": 0.3416296296296296, "grad_norm": 1.9333387613296509, "learning_rate": 0.00013186063750926614, "loss": 1.1184, "step": 2306 }, { "epoch": 0.3417777777777778, "grad_norm": 2.1505253314971924, "learning_rate": 0.00013183098591549295, "loss": 1.343, "step": 2307 }, { "epoch": 0.3419259259259259, "grad_norm": 2.506364345550537, "learning_rate": 0.0001318013343217198, "loss": 1.0817, "step": 2308 }, { "epoch": 0.3420740740740741, "grad_norm": 1.2928565740585327, "learning_rate": 0.00013177168272794664, "loss": 1.1916, "step": 2309 }, { "epoch": 0.3422222222222222, "grad_norm": 1.342826247215271, "learning_rate": 0.00013174203113417346, "loss": 1.1338, "step": 2310 }, { "epoch": 0.3423703703703704, "grad_norm": 2.0148978233337402, "learning_rate": 0.0001317123795404003, "loss": 1.2262, "step": 2311 }, { "epoch": 0.3425185185185185, "grad_norm": 1.5498647689819336, "learning_rate": 0.00013168272794662712, "loss": 1.2198, 
"step": 2312 }, { "epoch": 0.3426666666666667, "grad_norm": 1.4745525121688843, "learning_rate": 0.00013165307635285397, "loss": 1.0997, "step": 2313 }, { "epoch": 0.3428148148148148, "grad_norm": 1.331832766532898, "learning_rate": 0.0001316234247590808, "loss": 1.162, "step": 2314 }, { "epoch": 0.34296296296296297, "grad_norm": 1.9498533010482788, "learning_rate": 0.00013159377316530763, "loss": 1.1283, "step": 2315 }, { "epoch": 0.3431111111111111, "grad_norm": 1.7941899299621582, "learning_rate": 0.00013156412157153447, "loss": 1.1096, "step": 2316 }, { "epoch": 0.34325925925925926, "grad_norm": 2.869532823562622, "learning_rate": 0.00013153446997776132, "loss": 0.9594, "step": 2317 }, { "epoch": 0.3434074074074074, "grad_norm": 1.5423548221588135, "learning_rate": 0.00013150481838398813, "loss": 1.0791, "step": 2318 }, { "epoch": 0.34355555555555556, "grad_norm": 2.9745006561279297, "learning_rate": 0.00013147516679021498, "loss": 1.1664, "step": 2319 }, { "epoch": 0.3437037037037037, "grad_norm": 2.872915506362915, "learning_rate": 0.00013144551519644182, "loss": 1.0395, "step": 2320 }, { "epoch": 0.34385185185185185, "grad_norm": 1.4737216234207153, "learning_rate": 0.00013141586360266864, "loss": 1.2031, "step": 2321 }, { "epoch": 0.344, "grad_norm": 2.5522093772888184, "learning_rate": 0.00013138621200889548, "loss": 1.316, "step": 2322 }, { "epoch": 0.34414814814814815, "grad_norm": 1.5588302612304688, "learning_rate": 0.00013135656041512233, "loss": 0.9322, "step": 2323 }, { "epoch": 0.3442962962962963, "grad_norm": 1.507179856300354, "learning_rate": 0.00013132690882134915, "loss": 1.0541, "step": 2324 }, { "epoch": 0.34444444444444444, "grad_norm": 1.4153691530227661, "learning_rate": 0.000131297257227576, "loss": 1.0663, "step": 2325 }, { "epoch": 0.3445925925925926, "grad_norm": 2.8796892166137695, "learning_rate": 0.00013126760563380284, "loss": 0.9802, "step": 2326 }, { "epoch": 0.34474074074074074, "grad_norm": 2.128862142562866, "learning_rate": 
0.00013123795404002965, "loss": 1.0643, "step": 2327 }, { "epoch": 0.3448888888888889, "grad_norm": 7.3458075523376465, "learning_rate": 0.0001312083024462565, "loss": 1.3487, "step": 2328 }, { "epoch": 0.34503703703703703, "grad_norm": 2.286179542541504, "learning_rate": 0.00013117865085248334, "loss": 1.092, "step": 2329 }, { "epoch": 0.3451851851851852, "grad_norm": 1.8476670980453491, "learning_rate": 0.00013114899925871016, "loss": 1.1717, "step": 2330 }, { "epoch": 0.3453333333333333, "grad_norm": 2.7225584983825684, "learning_rate": 0.00013111934766493698, "loss": 1.1744, "step": 2331 }, { "epoch": 0.3454814814814815, "grad_norm": 1.977587103843689, "learning_rate": 0.00013108969607116385, "loss": 1.0366, "step": 2332 }, { "epoch": 0.3456296296296296, "grad_norm": 1.4287402629852295, "learning_rate": 0.00013106004447739067, "loss": 0.94, "step": 2333 }, { "epoch": 0.3457777777777778, "grad_norm": 3.3908700942993164, "learning_rate": 0.00013103039288361748, "loss": 0.9075, "step": 2334 }, { "epoch": 0.3459259259259259, "grad_norm": 1.7279934883117676, "learning_rate": 0.00013100074128984435, "loss": 0.9598, "step": 2335 }, { "epoch": 0.3460740740740741, "grad_norm": 2.6617941856384277, "learning_rate": 0.00013097108969607117, "loss": 1.0555, "step": 2336 }, { "epoch": 0.3462222222222222, "grad_norm": 1.5456342697143555, "learning_rate": 0.000130941438102298, "loss": 0.9512, "step": 2337 }, { "epoch": 0.3463703703703704, "grad_norm": 1.7742456197738647, "learning_rate": 0.00013091178650852486, "loss": 1.1459, "step": 2338 }, { "epoch": 0.3465185185185185, "grad_norm": 1.6286709308624268, "learning_rate": 0.00013088213491475168, "loss": 1.0374, "step": 2339 }, { "epoch": 0.3466666666666667, "grad_norm": 2.609323024749756, "learning_rate": 0.0001308524833209785, "loss": 1.0615, "step": 2340 }, { "epoch": 0.3468148148148148, "grad_norm": 1.8275586366653442, "learning_rate": 0.00013082283172720534, "loss": 1.1204, "step": 2341 }, { "epoch": 0.346962962962963, 
"grad_norm": 1.8131769895553589, "learning_rate": 0.00013079318013343218, "loss": 1.2238, "step": 2342 }, { "epoch": 0.3471111111111111, "grad_norm": 1.819657802581787, "learning_rate": 0.000130763528539659, "loss": 1.1137, "step": 2343 }, { "epoch": 0.34725925925925927, "grad_norm": 1.9716880321502686, "learning_rate": 0.00013073387694588585, "loss": 0.9928, "step": 2344 }, { "epoch": 0.3474074074074074, "grad_norm": 1.4299585819244385, "learning_rate": 0.0001307042253521127, "loss": 1.1994, "step": 2345 }, { "epoch": 0.34755555555555556, "grad_norm": 3.061483383178711, "learning_rate": 0.0001306745737583395, "loss": 1.095, "step": 2346 }, { "epoch": 0.3477037037037037, "grad_norm": 1.1320656538009644, "learning_rate": 0.00013064492216456635, "loss": 1.1769, "step": 2347 }, { "epoch": 0.34785185185185186, "grad_norm": 1.1383968591690063, "learning_rate": 0.0001306152705707932, "loss": 1.2742, "step": 2348 }, { "epoch": 0.348, "grad_norm": 1.5431816577911377, "learning_rate": 0.00013058561897702001, "loss": 0.8871, "step": 2349 }, { "epoch": 0.34814814814814815, "grad_norm": 1.4152706861495972, "learning_rate": 0.00013055596738324686, "loss": 1.1823, "step": 2350 }, { "epoch": 0.34829629629629627, "grad_norm": 1.2930550575256348, "learning_rate": 0.0001305263157894737, "loss": 1.0146, "step": 2351 }, { "epoch": 0.34844444444444445, "grad_norm": 1.5922138690948486, "learning_rate": 0.00013049666419570052, "loss": 0.948, "step": 2352 }, { "epoch": 0.34859259259259257, "grad_norm": 2.2768802642822266, "learning_rate": 0.00013046701260192736, "loss": 1.0215, "step": 2353 }, { "epoch": 0.34874074074074074, "grad_norm": 3.258697986602783, "learning_rate": 0.0001304373610081542, "loss": 1.3143, "step": 2354 }, { "epoch": 0.3488888888888889, "grad_norm": 1.8296974897384644, "learning_rate": 0.00013040770941438103, "loss": 1.0664, "step": 2355 }, { "epoch": 0.34903703703703703, "grad_norm": 1.260080099105835, "learning_rate": 0.00013037805782060787, "loss": 1.1924, "step": 
2356 }, { "epoch": 0.3491851851851852, "grad_norm": 2.329533100128174, "learning_rate": 0.00013034840622683472, "loss": 1.1003, "step": 2357 }, { "epoch": 0.34933333333333333, "grad_norm": 1.5267668962478638, "learning_rate": 0.00013031875463306153, "loss": 1.1741, "step": 2358 }, { "epoch": 0.3494814814814815, "grad_norm": 1.801763653755188, "learning_rate": 0.00013028910303928835, "loss": 1.0512, "step": 2359 }, { "epoch": 0.3496296296296296, "grad_norm": 1.5702931880950928, "learning_rate": 0.00013025945144551522, "loss": 0.9878, "step": 2360 }, { "epoch": 0.3497777777777778, "grad_norm": 1.5275589227676392, "learning_rate": 0.00013022979985174204, "loss": 1.2239, "step": 2361 }, { "epoch": 0.3499259259259259, "grad_norm": 1.3607791662216187, "learning_rate": 0.00013020014825796886, "loss": 0.955, "step": 2362 }, { "epoch": 0.3500740740740741, "grad_norm": 1.9856244325637817, "learning_rate": 0.0001301704966641957, "loss": 1.059, "step": 2363 }, { "epoch": 0.3502222222222222, "grad_norm": 4.6814656257629395, "learning_rate": 0.00013014084507042255, "loss": 1.2651, "step": 2364 }, { "epoch": 0.3503703703703704, "grad_norm": 1.6786004304885864, "learning_rate": 0.00013011119347664936, "loss": 1.4956, "step": 2365 }, { "epoch": 0.3505185185185185, "grad_norm": 1.4799193143844604, "learning_rate": 0.0001300815418828762, "loss": 1.1083, "step": 2366 }, { "epoch": 0.3506666666666667, "grad_norm": 1.5611683130264282, "learning_rate": 0.00013005189028910305, "loss": 1.1887, "step": 2367 }, { "epoch": 0.3508148148148148, "grad_norm": 1.5283243656158447, "learning_rate": 0.00013002223869532987, "loss": 1.0457, "step": 2368 }, { "epoch": 0.350962962962963, "grad_norm": 1.9599016904830933, "learning_rate": 0.0001299925871015567, "loss": 1.2424, "step": 2369 }, { "epoch": 0.3511111111111111, "grad_norm": 2.5228400230407715, "learning_rate": 0.00012996293550778356, "loss": 1.1464, "step": 2370 }, { "epoch": 0.35125925925925927, "grad_norm": 2.2123970985412598, 
"learning_rate": 0.00012993328391401038, "loss": 0.9944, "step": 2371 }, { "epoch": 0.3514074074074074, "grad_norm": 1.8431411981582642, "learning_rate": 0.00012990363232023722, "loss": 0.9995, "step": 2372 }, { "epoch": 0.35155555555555557, "grad_norm": 1.4077692031860352, "learning_rate": 0.00012987398072646406, "loss": 1.195, "step": 2373 }, { "epoch": 0.3517037037037037, "grad_norm": 1.4908804893493652, "learning_rate": 0.00012984432913269088, "loss": 1.1056, "step": 2374 }, { "epoch": 0.35185185185185186, "grad_norm": 1.3918676376342773, "learning_rate": 0.00012981467753891773, "loss": 1.088, "step": 2375 }, { "epoch": 0.352, "grad_norm": 1.3568589687347412, "learning_rate": 0.00012978502594514457, "loss": 0.9954, "step": 2376 }, { "epoch": 0.35214814814814815, "grad_norm": 2.4987220764160156, "learning_rate": 0.0001297553743513714, "loss": 1.2841, "step": 2377 }, { "epoch": 0.3522962962962963, "grad_norm": 1.3742563724517822, "learning_rate": 0.00012972572275759823, "loss": 0.9693, "step": 2378 }, { "epoch": 0.35244444444444445, "grad_norm": 1.2668038606643677, "learning_rate": 0.00012969607116382508, "loss": 1.1097, "step": 2379 }, { "epoch": 0.35259259259259257, "grad_norm": 2.4651150703430176, "learning_rate": 0.0001296664195700519, "loss": 0.9591, "step": 2380 }, { "epoch": 0.35274074074074074, "grad_norm": 2.202927827835083, "learning_rate": 0.00012963676797627874, "loss": 1.0431, "step": 2381 }, { "epoch": 0.35288888888888886, "grad_norm": 3.5075621604919434, "learning_rate": 0.00012960711638250556, "loss": 1.0707, "step": 2382 }, { "epoch": 0.35303703703703704, "grad_norm": 1.142947793006897, "learning_rate": 0.0001295774647887324, "loss": 0.9011, "step": 2383 }, { "epoch": 0.35318518518518516, "grad_norm": 6.0171918869018555, "learning_rate": 0.00012954781319495924, "loss": 1.3032, "step": 2384 }, { "epoch": 0.35333333333333333, "grad_norm": 1.3418155908584595, "learning_rate": 0.00012951816160118606, "loss": 1.1346, "step": 2385 }, { "epoch": 
0.3534814814814815, "grad_norm": 4.483288764953613, "learning_rate": 0.0001294885100074129, "loss": 1.2346, "step": 2386 }, { "epoch": 0.3536296296296296, "grad_norm": 2.130213737487793, "learning_rate": 0.00012945885841363975, "loss": 1.0704, "step": 2387 }, { "epoch": 0.3537777777777778, "grad_norm": 1.1577987670898438, "learning_rate": 0.00012942920681986657, "loss": 1.1326, "step": 2388 }, { "epoch": 0.3539259259259259, "grad_norm": 3.3889710903167725, "learning_rate": 0.0001293995552260934, "loss": 1.2009, "step": 2389 }, { "epoch": 0.3540740740740741, "grad_norm": 1.7422261238098145, "learning_rate": 0.00012936990363232026, "loss": 1.0677, "step": 2390 }, { "epoch": 0.3542222222222222, "grad_norm": 2.3017358779907227, "learning_rate": 0.00012934025203854707, "loss": 1.1908, "step": 2391 }, { "epoch": 0.3543703703703704, "grad_norm": 1.8185927867889404, "learning_rate": 0.00012931060044477392, "loss": 1.0373, "step": 2392 }, { "epoch": 0.3545185185185185, "grad_norm": 2.478407144546509, "learning_rate": 0.00012928094885100074, "loss": 0.9484, "step": 2393 }, { "epoch": 0.3546666666666667, "grad_norm": 1.702976942062378, "learning_rate": 0.00012925129725722758, "loss": 1.105, "step": 2394 }, { "epoch": 0.3548148148148148, "grad_norm": 2.6446189880371094, "learning_rate": 0.00012922164566345443, "loss": 0.8852, "step": 2395 }, { "epoch": 0.354962962962963, "grad_norm": 1.8409467935562134, "learning_rate": 0.00012919199406968124, "loss": 1.031, "step": 2396 }, { "epoch": 0.3551111111111111, "grad_norm": 1.3260278701782227, "learning_rate": 0.0001291623424759081, "loss": 1.0098, "step": 2397 }, { "epoch": 0.3552592592592593, "grad_norm": 1.2395411729812622, "learning_rate": 0.0001291326908821349, "loss": 1.1051, "step": 2398 }, { "epoch": 0.3554074074074074, "grad_norm": 1.3518856763839722, "learning_rate": 0.00012910303928836175, "loss": 0.8233, "step": 2399 }, { "epoch": 0.35555555555555557, "grad_norm": 1.969286322593689, "learning_rate": 0.0001290733876945886, 
"loss": 1.1434, "step": 2400 }, { "epoch": 0.3557037037037037, "grad_norm": 7.381817817687988, "learning_rate": 0.0001290437361008154, "loss": 1.2454, "step": 2401 }, { "epoch": 0.35585185185185186, "grad_norm": 1.8303916454315186, "learning_rate": 0.00012901408450704226, "loss": 1.1364, "step": 2402 }, { "epoch": 0.356, "grad_norm": 2.781118631362915, "learning_rate": 0.0001289844329132691, "loss": 1.318, "step": 2403 }, { "epoch": 0.35614814814814816, "grad_norm": 2.0820934772491455, "learning_rate": 0.00012895478131949592, "loss": 1.1334, "step": 2404 }, { "epoch": 0.3562962962962963, "grad_norm": 2.527456283569336, "learning_rate": 0.00012892512972572276, "loss": 1.019, "step": 2405 }, { "epoch": 0.35644444444444445, "grad_norm": 1.662646770477295, "learning_rate": 0.0001288954781319496, "loss": 1.2195, "step": 2406 }, { "epoch": 0.35659259259259257, "grad_norm": 1.7455997467041016, "learning_rate": 0.00012886582653817642, "loss": 1.0235, "step": 2407 }, { "epoch": 0.35674074074074075, "grad_norm": 1.5952746868133545, "learning_rate": 0.00012883617494440327, "loss": 1.3457, "step": 2408 }, { "epoch": 0.35688888888888887, "grad_norm": 1.823468565940857, "learning_rate": 0.0001288065233506301, "loss": 1.2087, "step": 2409 }, { "epoch": 0.35703703703703704, "grad_norm": 1.7799134254455566, "learning_rate": 0.00012877687175685693, "loss": 1.2204, "step": 2410 }, { "epoch": 0.35718518518518516, "grad_norm": 3.1586012840270996, "learning_rate": 0.00012874722016308377, "loss": 1.117, "step": 2411 }, { "epoch": 0.35733333333333334, "grad_norm": 1.5659916400909424, "learning_rate": 0.00012871756856931062, "loss": 1.0821, "step": 2412 }, { "epoch": 0.35748148148148146, "grad_norm": 1.5964833498001099, "learning_rate": 0.00012868791697553744, "loss": 1.2492, "step": 2413 }, { "epoch": 0.35762962962962963, "grad_norm": 1.5727459192276, "learning_rate": 0.00012865826538176428, "loss": 1.0777, "step": 2414 }, { "epoch": 0.35777777777777775, "grad_norm": 2.3969857692718506, 
"learning_rate": 0.00012862861378799112, "loss": 1.0429, "step": 2415 }, { "epoch": 0.3579259259259259, "grad_norm": 1.8711894750595093, "learning_rate": 0.00012859896219421794, "loss": 1.1727, "step": 2416 }, { "epoch": 0.3580740740740741, "grad_norm": 1.7183738946914673, "learning_rate": 0.00012856931060044476, "loss": 1.1054, "step": 2417 }, { "epoch": 0.3582222222222222, "grad_norm": 19.228012084960938, "learning_rate": 0.00012853965900667163, "loss": 1.0972, "step": 2418 }, { "epoch": 0.3583703703703704, "grad_norm": 2.1944210529327393, "learning_rate": 0.00012851000741289845, "loss": 1.2464, "step": 2419 }, { "epoch": 0.3585185185185185, "grad_norm": 6.018752098083496, "learning_rate": 0.00012848035581912527, "loss": 0.8132, "step": 2420 }, { "epoch": 0.3586666666666667, "grad_norm": 1.8604862689971924, "learning_rate": 0.00012845070422535214, "loss": 1.2575, "step": 2421 }, { "epoch": 0.3588148148148148, "grad_norm": 2.9561004638671875, "learning_rate": 0.00012842105263157895, "loss": 0.9434, "step": 2422 }, { "epoch": 0.358962962962963, "grad_norm": 1.7331130504608154, "learning_rate": 0.00012839140103780577, "loss": 1.1185, "step": 2423 }, { "epoch": 0.3591111111111111, "grad_norm": 2.323881149291992, "learning_rate": 0.00012836174944403264, "loss": 1.067, "step": 2424 }, { "epoch": 0.3592592592592593, "grad_norm": 1.638932466506958, "learning_rate": 0.00012833209785025946, "loss": 1.1443, "step": 2425 }, { "epoch": 0.3594074074074074, "grad_norm": 2.301362991333008, "learning_rate": 0.00012830244625648628, "loss": 0.9504, "step": 2426 }, { "epoch": 0.3595555555555556, "grad_norm": 1.333407998085022, "learning_rate": 0.00012827279466271312, "loss": 0.8333, "step": 2427 }, { "epoch": 0.3597037037037037, "grad_norm": 1.5285286903381348, "learning_rate": 0.00012824314306893997, "loss": 1.1386, "step": 2428 }, { "epoch": 0.35985185185185187, "grad_norm": 1.8714796304702759, "learning_rate": 0.00012821349147516678, "loss": 1.3843, "step": 2429 }, { "epoch": 
0.36, "grad_norm": 1.7737548351287842, "learning_rate": 0.00012818383988139363, "loss": 0.9567, "step": 2430 }, { "epoch": 0.36014814814814816, "grad_norm": 2.0456058979034424, "learning_rate": 0.00012815418828762047, "loss": 0.8945, "step": 2431 }, { "epoch": 0.3602962962962963, "grad_norm": 3.461463212966919, "learning_rate": 0.0001281245366938473, "loss": 1.0448, "step": 2432 }, { "epoch": 0.36044444444444446, "grad_norm": 1.7812738418579102, "learning_rate": 0.00012809488510007414, "loss": 1.1725, "step": 2433 }, { "epoch": 0.3605925925925926, "grad_norm": 1.7331719398498535, "learning_rate": 0.00012806523350630098, "loss": 1.2265, "step": 2434 }, { "epoch": 0.36074074074074075, "grad_norm": 1.94052255153656, "learning_rate": 0.0001280355819125278, "loss": 1.2161, "step": 2435 }, { "epoch": 0.36088888888888887, "grad_norm": 2.128408908843994, "learning_rate": 0.00012800593031875464, "loss": 1.004, "step": 2436 }, { "epoch": 0.36103703703703705, "grad_norm": 1.4985857009887695, "learning_rate": 0.00012797627872498149, "loss": 0.9299, "step": 2437 }, { "epoch": 0.36118518518518516, "grad_norm": 2.550116539001465, "learning_rate": 0.0001279466271312083, "loss": 0.989, "step": 2438 }, { "epoch": 0.36133333333333334, "grad_norm": 1.5930713415145874, "learning_rate": 0.00012791697553743515, "loss": 1.3768, "step": 2439 }, { "epoch": 0.36148148148148146, "grad_norm": 1.9154518842697144, "learning_rate": 0.000127887323943662, "loss": 1.0797, "step": 2440 }, { "epoch": 0.36162962962962963, "grad_norm": 1.6080303192138672, "learning_rate": 0.0001278576723498888, "loss": 0.8371, "step": 2441 }, { "epoch": 0.36177777777777775, "grad_norm": 1.7143136262893677, "learning_rate": 0.00012782802075611565, "loss": 0.997, "step": 2442 }, { "epoch": 0.36192592592592593, "grad_norm": 2.2452192306518555, "learning_rate": 0.0001277983691623425, "loss": 1.0163, "step": 2443 }, { "epoch": 0.36207407407407405, "grad_norm": 1.7062718868255615, "learning_rate": 0.00012776871756856932, 
"loss": 1.0657, "step": 2444 }, { "epoch": 0.3622222222222222, "grad_norm": 1.5455044507980347, "learning_rate": 0.00012773906597479613, "loss": 0.8581, "step": 2445 }, { "epoch": 0.36237037037037034, "grad_norm": 1.2241191864013672, "learning_rate": 0.000127709414381023, "loss": 1.095, "step": 2446 }, { "epoch": 0.3625185185185185, "grad_norm": 1.5044482946395874, "learning_rate": 0.00012767976278724982, "loss": 1.0647, "step": 2447 }, { "epoch": 0.3626666666666667, "grad_norm": 2.371448278427124, "learning_rate": 0.00012765011119347664, "loss": 1.2282, "step": 2448 }, { "epoch": 0.3628148148148148, "grad_norm": 1.5230810642242432, "learning_rate": 0.00012762045959970348, "loss": 0.9934, "step": 2449 }, { "epoch": 0.362962962962963, "grad_norm": 1.5243303775787354, "learning_rate": 0.00012759080800593033, "loss": 0.9776, "step": 2450 }, { "epoch": 0.3631111111111111, "grad_norm": 2.3378851413726807, "learning_rate": 0.00012756115641215715, "loss": 1.2032, "step": 2451 }, { "epoch": 0.3632592592592593, "grad_norm": 1.9390791654586792, "learning_rate": 0.000127531504818384, "loss": 1.1387, "step": 2452 }, { "epoch": 0.3634074074074074, "grad_norm": 1.212646722793579, "learning_rate": 0.00012750185322461083, "loss": 1.2237, "step": 2453 }, { "epoch": 0.3635555555555556, "grad_norm": 2.331469774246216, "learning_rate": 0.00012747220163083765, "loss": 1.0961, "step": 2454 }, { "epoch": 0.3637037037037037, "grad_norm": 3.145979642868042, "learning_rate": 0.0001274425500370645, "loss": 1.1028, "step": 2455 }, { "epoch": 0.36385185185185187, "grad_norm": 3.360153913497925, "learning_rate": 0.00012741289844329134, "loss": 0.9997, "step": 2456 }, { "epoch": 0.364, "grad_norm": 1.4973174333572388, "learning_rate": 0.00012738324684951816, "loss": 1.0894, "step": 2457 }, { "epoch": 0.36414814814814817, "grad_norm": 1.7876518964767456, "learning_rate": 0.000127353595255745, "loss": 1.0056, "step": 2458 }, { "epoch": 0.3642962962962963, "grad_norm": 1.4665687084197998, 
"learning_rate": 0.00012732394366197185, "loss": 1.1241, "step": 2459 }, { "epoch": 0.36444444444444446, "grad_norm": 1.607784390449524, "learning_rate": 0.00012729429206819866, "loss": 1.1712, "step": 2460 }, { "epoch": 0.3645925925925926, "grad_norm": 5.57588005065918, "learning_rate": 0.0001272646404744255, "loss": 1.3167, "step": 2461 }, { "epoch": 0.36474074074074075, "grad_norm": 2.404571294784546, "learning_rate": 0.00012723498888065235, "loss": 1.1764, "step": 2462 }, { "epoch": 0.3648888888888889, "grad_norm": 1.4461548328399658, "learning_rate": 0.00012720533728687917, "loss": 1.2814, "step": 2463 }, { "epoch": 0.36503703703703705, "grad_norm": 1.2893413305282593, "learning_rate": 0.00012717568569310602, "loss": 1.0241, "step": 2464 }, { "epoch": 0.36518518518518517, "grad_norm": 1.6379495859146118, "learning_rate": 0.00012714603409933286, "loss": 1.0907, "step": 2465 }, { "epoch": 0.36533333333333334, "grad_norm": 2.1730432510375977, "learning_rate": 0.00012711638250555968, "loss": 1.2725, "step": 2466 }, { "epoch": 0.36548148148148146, "grad_norm": 1.6737717390060425, "learning_rate": 0.00012708673091178652, "loss": 0.9893, "step": 2467 }, { "epoch": 0.36562962962962964, "grad_norm": 1.4764902591705322, "learning_rate": 0.00012705707931801334, "loss": 1.315, "step": 2468 }, { "epoch": 0.36577777777777776, "grad_norm": 1.63857901096344, "learning_rate": 0.00012702742772424018, "loss": 1.1869, "step": 2469 }, { "epoch": 0.36592592592592593, "grad_norm": 2.2073915004730225, "learning_rate": 0.00012699777613046703, "loss": 1.2164, "step": 2470 }, { "epoch": 0.36607407407407405, "grad_norm": 1.5736753940582275, "learning_rate": 0.00012696812453669385, "loss": 0.9623, "step": 2471 }, { "epoch": 0.3662222222222222, "grad_norm": 4.185050964355469, "learning_rate": 0.0001269384729429207, "loss": 1.2394, "step": 2472 }, { "epoch": 0.36637037037037035, "grad_norm": 2.4215621948242188, "learning_rate": 0.00012690882134914753, "loss": 1.1873, "step": 2473 }, { 
"epoch": 0.3665185185185185, "grad_norm": 1.6673725843429565, "learning_rate": 0.00012687916975537435, "loss": 1.1829, "step": 2474 }, { "epoch": 0.36666666666666664, "grad_norm": 2.9597597122192383, "learning_rate": 0.0001268495181616012, "loss": 1.2275, "step": 2475 }, { "epoch": 0.3668148148148148, "grad_norm": 2.4930925369262695, "learning_rate": 0.00012681986656782804, "loss": 1.1223, "step": 2476 }, { "epoch": 0.36696296296296294, "grad_norm": 3.147881031036377, "learning_rate": 0.00012679021497405486, "loss": 1.0393, "step": 2477 }, { "epoch": 0.3671111111111111, "grad_norm": 3.007131576538086, "learning_rate": 0.0001267605633802817, "loss": 1.0715, "step": 2478 }, { "epoch": 0.3672592592592593, "grad_norm": 8.853863716125488, "learning_rate": 0.00012673091178650852, "loss": 1.0706, "step": 2479 }, { "epoch": 0.3674074074074074, "grad_norm": 2.0026261806488037, "learning_rate": 0.00012670126019273536, "loss": 1.1067, "step": 2480 }, { "epoch": 0.3675555555555556, "grad_norm": 1.833274483680725, "learning_rate": 0.0001266716085989622, "loss": 1.3698, "step": 2481 }, { "epoch": 0.3677037037037037, "grad_norm": 2.059183120727539, "learning_rate": 0.00012664195700518903, "loss": 1.0032, "step": 2482 }, { "epoch": 0.3678518518518519, "grad_norm": 1.295753002166748, "learning_rate": 0.00012661230541141587, "loss": 1.1414, "step": 2483 }, { "epoch": 0.368, "grad_norm": 1.6397961378097534, "learning_rate": 0.0001265826538176427, "loss": 1.1128, "step": 2484 }, { "epoch": 0.36814814814814817, "grad_norm": 1.9030511379241943, "learning_rate": 0.00012655300222386953, "loss": 1.1714, "step": 2485 }, { "epoch": 0.3682962962962963, "grad_norm": 1.5962088108062744, "learning_rate": 0.00012652335063009638, "loss": 1.1224, "step": 2486 }, { "epoch": 0.36844444444444446, "grad_norm": 1.4006625413894653, "learning_rate": 0.0001264936990363232, "loss": 1.0887, "step": 2487 }, { "epoch": 0.3685925925925926, "grad_norm": 4.466986656188965, "learning_rate": 0.00012646404744255004, 
"loss": 0.9706, "step": 2488 }, { "epoch": 0.36874074074074076, "grad_norm": 1.7184315919876099, "learning_rate": 0.00012643439584877688, "loss": 1.0607, "step": 2489 }, { "epoch": 0.3688888888888889, "grad_norm": 2.338414192199707, "learning_rate": 0.0001264047442550037, "loss": 1.2112, "step": 2490 }, { "epoch": 0.36903703703703705, "grad_norm": 1.548397183418274, "learning_rate": 0.00012637509266123054, "loss": 1.0891, "step": 2491 }, { "epoch": 0.36918518518518517, "grad_norm": 1.8215255737304688, "learning_rate": 0.0001263454410674574, "loss": 0.959, "step": 2492 }, { "epoch": 0.36933333333333335, "grad_norm": 1.2169244289398193, "learning_rate": 0.0001263157894736842, "loss": 1.093, "step": 2493 }, { "epoch": 0.36948148148148147, "grad_norm": 3.0639641284942627, "learning_rate": 0.00012628613787991105, "loss": 1.3668, "step": 2494 }, { "epoch": 0.36962962962962964, "grad_norm": 1.5741491317749023, "learning_rate": 0.0001262564862861379, "loss": 0.9399, "step": 2495 }, { "epoch": 0.36977777777777776, "grad_norm": 1.4462112188339233, "learning_rate": 0.0001262268346923647, "loss": 1.139, "step": 2496 }, { "epoch": 0.36992592592592594, "grad_norm": 1.1880522966384888, "learning_rate": 0.00012619718309859156, "loss": 1.1902, "step": 2497 }, { "epoch": 0.37007407407407406, "grad_norm": 1.3176915645599365, "learning_rate": 0.0001261675315048184, "loss": 0.9821, "step": 2498 }, { "epoch": 0.37022222222222223, "grad_norm": 1.6645140647888184, "learning_rate": 0.00012613787991104522, "loss": 1.0211, "step": 2499 }, { "epoch": 0.37037037037037035, "grad_norm": 1.8516615629196167, "learning_rate": 0.00012610822831727206, "loss": 1.101, "step": 2500 }, { "epoch": 0.3705185185185185, "grad_norm": 1.4071452617645264, "learning_rate": 0.0001260785767234989, "loss": 0.9341, "step": 2501 }, { "epoch": 0.37066666666666664, "grad_norm": 2.9631004333496094, "learning_rate": 0.00012604892512972573, "loss": 1.0563, "step": 2502 }, { "epoch": 0.3708148148148148, "grad_norm": 
4.322868824005127, "learning_rate": 0.00012601927353595254, "loss": 1.0793, "step": 2503 }, { "epoch": 0.37096296296296294, "grad_norm": 3.899007797241211, "learning_rate": 0.00012598962194217941, "loss": 1.2543, "step": 2504 }, { "epoch": 0.3711111111111111, "grad_norm": 1.6123673915863037, "learning_rate": 0.00012595997034840623, "loss": 1.1704, "step": 2505 }, { "epoch": 0.37125925925925923, "grad_norm": 2.163992404937744, "learning_rate": 0.00012593031875463305, "loss": 1.0913, "step": 2506 }, { "epoch": 0.3714074074074074, "grad_norm": 1.86015784740448, "learning_rate": 0.00012590066716085992, "loss": 0.9095, "step": 2507 }, { "epoch": 0.37155555555555553, "grad_norm": 1.3718888759613037, "learning_rate": 0.00012587101556708674, "loss": 1.1879, "step": 2508 }, { "epoch": 0.3717037037037037, "grad_norm": 5.352588653564453, "learning_rate": 0.00012584136397331356, "loss": 0.9394, "step": 2509 }, { "epoch": 0.3718518518518519, "grad_norm": 1.686468482017517, "learning_rate": 0.00012581171237954043, "loss": 1.18, "step": 2510 }, { "epoch": 0.372, "grad_norm": 2.798321485519409, "learning_rate": 0.00012578206078576724, "loss": 0.9624, "step": 2511 }, { "epoch": 0.3721481481481482, "grad_norm": 2.6296160221099854, "learning_rate": 0.00012575240919199406, "loss": 1.1563, "step": 2512 }, { "epoch": 0.3722962962962963, "grad_norm": 4.136303901672363, "learning_rate": 0.0001257227575982209, "loss": 1.12, "step": 2513 }, { "epoch": 0.37244444444444447, "grad_norm": 2.5504956245422363, "learning_rate": 0.00012569310600444775, "loss": 0.9938, "step": 2514 }, { "epoch": 0.3725925925925926, "grad_norm": 3.7541561126708984, "learning_rate": 0.00012566345441067457, "loss": 1.0822, "step": 2515 }, { "epoch": 0.37274074074074076, "grad_norm": 2.6331770420074463, "learning_rate": 0.0001256338028169014, "loss": 1.1897, "step": 2516 }, { "epoch": 0.3728888888888889, "grad_norm": 4.061927318572998, "learning_rate": 0.00012560415122312826, "loss": 1.2737, "step": 2517 }, { "epoch": 
0.37303703703703706, "grad_norm": 3.8827645778656006, "learning_rate": 0.00012557449962935507, "loss": 0.9835, "step": 2518 }, { "epoch": 0.3731851851851852, "grad_norm": 1.8469038009643555, "learning_rate": 0.00012554484803558192, "loss": 1.3418, "step": 2519 }, { "epoch": 0.37333333333333335, "grad_norm": 3.677811861038208, "learning_rate": 0.00012551519644180876, "loss": 1.123, "step": 2520 }, { "epoch": 0.37348148148148147, "grad_norm": 1.4161865711212158, "learning_rate": 0.00012548554484803558, "loss": 0.96, "step": 2521 }, { "epoch": 0.37362962962962964, "grad_norm": 16.41788101196289, "learning_rate": 0.00012545589325426242, "loss": 0.7189, "step": 2522 }, { "epoch": 0.37377777777777776, "grad_norm": 3.403994560241699, "learning_rate": 0.00012542624166048927, "loss": 1.0135, "step": 2523 }, { "epoch": 0.37392592592592594, "grad_norm": 3.407245397567749, "learning_rate": 0.0001253965900667161, "loss": 1.1582, "step": 2524 }, { "epoch": 0.37407407407407406, "grad_norm": 9.095651626586914, "learning_rate": 0.00012536693847294293, "loss": 1.4819, "step": 2525 }, { "epoch": 0.37422222222222223, "grad_norm": 3.44998836517334, "learning_rate": 0.00012533728687916978, "loss": 1.0873, "step": 2526 }, { "epoch": 0.37437037037037035, "grad_norm": 1.7175861597061157, "learning_rate": 0.0001253076352853966, "loss": 1.0546, "step": 2527 }, { "epoch": 0.37451851851851853, "grad_norm": 4.137767314910889, "learning_rate": 0.00012527798369162344, "loss": 1.0061, "step": 2528 }, { "epoch": 0.37466666666666665, "grad_norm": 3.3150739669799805, "learning_rate": 0.00012524833209785028, "loss": 1.1405, "step": 2529 }, { "epoch": 0.3748148148148148, "grad_norm": 6.849560737609863, "learning_rate": 0.0001252186805040771, "loss": 1.2259, "step": 2530 }, { "epoch": 0.37496296296296294, "grad_norm": 3.9576241970062256, "learning_rate": 0.00012518902891030392, "loss": 0.9823, "step": 2531 }, { "epoch": 0.3751111111111111, "grad_norm": 4.216405391693115, "learning_rate": 
0.0001251593773165308, "loss": 0.9662, "step": 2532 }, { "epoch": 0.37525925925925924, "grad_norm": 5.597860813140869, "learning_rate": 0.0001251297257227576, "loss": 0.9107, "step": 2533 }, { "epoch": 0.3754074074074074, "grad_norm": 12.664651870727539, "learning_rate": 0.00012510007412898442, "loss": 1.3503, "step": 2534 }, { "epoch": 0.37555555555555553, "grad_norm": 2.6865103244781494, "learning_rate": 0.0001250704225352113, "loss": 0.8692, "step": 2535 }, { "epoch": 0.3757037037037037, "grad_norm": 1.703162670135498, "learning_rate": 0.0001250407709414381, "loss": 1.1979, "step": 2536 }, { "epoch": 0.3758518518518518, "grad_norm": 8.578655242919922, "learning_rate": 0.00012501111934766493, "loss": 1.1494, "step": 2537 }, { "epoch": 0.376, "grad_norm": 4.007734775543213, "learning_rate": 0.00012498146775389177, "loss": 1.1308, "step": 2538 }, { "epoch": 0.3761481481481482, "grad_norm": 1.7926369905471802, "learning_rate": 0.00012495181616011862, "loss": 1.0621, "step": 2539 }, { "epoch": 0.3762962962962963, "grad_norm": 3.823511838912964, "learning_rate": 0.00012492216456634544, "loss": 1.0382, "step": 2540 }, { "epoch": 0.37644444444444447, "grad_norm": 3.8496482372283936, "learning_rate": 0.00012489251297257228, "loss": 1.075, "step": 2541 }, { "epoch": 0.3765925925925926, "grad_norm": 13.723682403564453, "learning_rate": 0.00012486286137879912, "loss": 1.0547, "step": 2542 }, { "epoch": 0.37674074074074076, "grad_norm": 1.737381935119629, "learning_rate": 0.00012483320978502594, "loss": 1.0844, "step": 2543 }, { "epoch": 0.3768888888888889, "grad_norm": 1.9060014486312866, "learning_rate": 0.00012480355819125279, "loss": 1.0751, "step": 2544 }, { "epoch": 0.37703703703703706, "grad_norm": 3.8223044872283936, "learning_rate": 0.00012477390659747963, "loss": 1.2052, "step": 2545 }, { "epoch": 0.3771851851851852, "grad_norm": 4.125335693359375, "learning_rate": 0.00012474425500370645, "loss": 1.2216, "step": 2546 }, { "epoch": 0.37733333333333335, "grad_norm": 
1.7740851640701294, "learning_rate": 0.0001247146034099333, "loss": 1.0954, "step": 2547 }, { "epoch": 0.3774814814814815, "grad_norm": 2.5749096870422363, "learning_rate": 0.00012468495181616014, "loss": 1.0394, "step": 2548 }, { "epoch": 0.37762962962962965, "grad_norm": 1.3280686140060425, "learning_rate": 0.00012465530022238695, "loss": 0.9565, "step": 2549 }, { "epoch": 0.37777777777777777, "grad_norm": 2.461038112640381, "learning_rate": 0.0001246256486286138, "loss": 1.0782, "step": 2550 }, { "epoch": 0.37792592592592594, "grad_norm": 1.3010717630386353, "learning_rate": 0.00012459599703484064, "loss": 1.0623, "step": 2551 }, { "epoch": 0.37807407407407406, "grad_norm": 3.0568859577178955, "learning_rate": 0.00012456634544106746, "loss": 1.2525, "step": 2552 }, { "epoch": 0.37822222222222224, "grad_norm": 1.7495825290679932, "learning_rate": 0.0001245366938472943, "loss": 1.2495, "step": 2553 }, { "epoch": 0.37837037037037036, "grad_norm": 5.899725437164307, "learning_rate": 0.00012450704225352112, "loss": 1.2229, "step": 2554 }, { "epoch": 0.37851851851851853, "grad_norm": 1.1927306652069092, "learning_rate": 0.00012447739065974797, "loss": 1.0398, "step": 2555 }, { "epoch": 0.37866666666666665, "grad_norm": 4.006344795227051, "learning_rate": 0.0001244477390659748, "loss": 1.3708, "step": 2556 }, { "epoch": 0.3788148148148148, "grad_norm": 3.9117653369903564, "learning_rate": 0.00012441808747220163, "loss": 0.8369, "step": 2557 }, { "epoch": 0.37896296296296295, "grad_norm": 1.193713665008545, "learning_rate": 0.00012438843587842847, "loss": 0.9799, "step": 2558 }, { "epoch": 0.3791111111111111, "grad_norm": 7.468748569488525, "learning_rate": 0.00012435878428465532, "loss": 1.0952, "step": 2559 }, { "epoch": 0.37925925925925924, "grad_norm": 9.303503036499023, "learning_rate": 0.00012432913269088213, "loss": 1.1702, "step": 2560 }, { "epoch": 0.3794074074074074, "grad_norm": 1.804075837135315, "learning_rate": 0.00012429948109710898, "loss": 1.3226, 
"step": 2561 }, { "epoch": 0.37955555555555553, "grad_norm": 2.3133153915405273, "learning_rate": 0.00012426982950333582, "loss": 1.1468, "step": 2562 }, { "epoch": 0.3797037037037037, "grad_norm": 3.307896375656128, "learning_rate": 0.00012424017790956264, "loss": 1.1266, "step": 2563 }, { "epoch": 0.37985185185185183, "grad_norm": 1.6659990549087524, "learning_rate": 0.00012421052631578949, "loss": 1.1707, "step": 2564 }, { "epoch": 0.38, "grad_norm": 2.732919454574585, "learning_rate": 0.0001241808747220163, "loss": 1.138, "step": 2565 }, { "epoch": 0.3801481481481481, "grad_norm": 1.9463250637054443, "learning_rate": 0.00012415122312824315, "loss": 0.9848, "step": 2566 }, { "epoch": 0.3802962962962963, "grad_norm": 1.2089314460754395, "learning_rate": 0.00012412157153447, "loss": 1.015, "step": 2567 }, { "epoch": 0.3804444444444444, "grad_norm": 1.5920319557189941, "learning_rate": 0.0001240919199406968, "loss": 1.191, "step": 2568 }, { "epoch": 0.3805925925925926, "grad_norm": 2.5458788871765137, "learning_rate": 0.00012406226834692365, "loss": 0.8645, "step": 2569 }, { "epoch": 0.38074074074074077, "grad_norm": 2.972222089767456, "learning_rate": 0.00012403261675315047, "loss": 1.1067, "step": 2570 }, { "epoch": 0.3808888888888889, "grad_norm": 2.4307265281677246, "learning_rate": 0.00012400296515937732, "loss": 1.3904, "step": 2571 }, { "epoch": 0.38103703703703706, "grad_norm": 1.4049432277679443, "learning_rate": 0.00012397331356560416, "loss": 1.0049, "step": 2572 }, { "epoch": 0.3811851851851852, "grad_norm": 1.3105792999267578, "learning_rate": 0.00012394366197183098, "loss": 0.9569, "step": 2573 }, { "epoch": 0.38133333333333336, "grad_norm": 1.5824027061462402, "learning_rate": 0.00012391401037805782, "loss": 0.9485, "step": 2574 }, { "epoch": 0.3814814814814815, "grad_norm": 1.0780473947525024, "learning_rate": 0.00012388435878428467, "loss": 1.1683, "step": 2575 }, { "epoch": 0.38162962962962965, "grad_norm": 1.4697093963623047, "learning_rate": 
0.00012385470719051148, "loss": 1.0345, "step": 2576 }, { "epoch": 0.38177777777777777, "grad_norm": 1.468275785446167, "learning_rate": 0.00012382505559673833, "loss": 1.191, "step": 2577 }, { "epoch": 0.38192592592592595, "grad_norm": 1.504564881324768, "learning_rate": 0.00012379540400296517, "loss": 1.1971, "step": 2578 }, { "epoch": 0.38207407407407407, "grad_norm": 2.012601375579834, "learning_rate": 0.000123765752409192, "loss": 1.1544, "step": 2579 }, { "epoch": 0.38222222222222224, "grad_norm": 1.1641924381256104, "learning_rate": 0.00012373610081541883, "loss": 0.9978, "step": 2580 }, { "epoch": 0.38237037037037036, "grad_norm": 1.745923638343811, "learning_rate": 0.00012370644922164568, "loss": 1.1242, "step": 2581 }, { "epoch": 0.38251851851851854, "grad_norm": 1.4969233274459839, "learning_rate": 0.0001236767976278725, "loss": 1.1654, "step": 2582 }, { "epoch": 0.38266666666666665, "grad_norm": 1.4478421211242676, "learning_rate": 0.00012364714603409934, "loss": 1.1774, "step": 2583 }, { "epoch": 0.38281481481481483, "grad_norm": 2.4453020095825195, "learning_rate": 0.00012361749444032619, "loss": 1.0666, "step": 2584 }, { "epoch": 0.38296296296296295, "grad_norm": 1.3032078742980957, "learning_rate": 0.000123587842846553, "loss": 1.2511, "step": 2585 }, { "epoch": 0.3831111111111111, "grad_norm": 1.3252873420715332, "learning_rate": 0.00012355819125277985, "loss": 0.9683, "step": 2586 }, { "epoch": 0.38325925925925924, "grad_norm": 1.6873570680618286, "learning_rate": 0.0001235285396590067, "loss": 1.0997, "step": 2587 }, { "epoch": 0.3834074074074074, "grad_norm": 1.2579675912857056, "learning_rate": 0.0001234988880652335, "loss": 1.0251, "step": 2588 }, { "epoch": 0.38355555555555554, "grad_norm": 1.1985095739364624, "learning_rate": 0.00012346923647146033, "loss": 1.2056, "step": 2589 }, { "epoch": 0.3837037037037037, "grad_norm": 6.052125453948975, "learning_rate": 0.0001234395848776872, "loss": 1.1525, "step": 2590 }, { "epoch": 
0.38385185185185183, "grad_norm": 1.3533743619918823, "learning_rate": 0.00012340993328391401, "loss": 1.0307, "step": 2591 }, { "epoch": 0.384, "grad_norm": 1.623461127281189, "learning_rate": 0.00012338028169014083, "loss": 1.053, "step": 2592 }, { "epoch": 0.3841481481481481, "grad_norm": 2.8680176734924316, "learning_rate": 0.0001233506300963677, "loss": 1.1363, "step": 2593 }, { "epoch": 0.3842962962962963, "grad_norm": 5.083739757537842, "learning_rate": 0.00012332097850259452, "loss": 1.051, "step": 2594 }, { "epoch": 0.3844444444444444, "grad_norm": 1.3027294874191284, "learning_rate": 0.00012329132690882134, "loss": 1.2212, "step": 2595 }, { "epoch": 0.3845925925925926, "grad_norm": 2.079653024673462, "learning_rate": 0.0001232616753150482, "loss": 1.0086, "step": 2596 }, { "epoch": 0.3847407407407407, "grad_norm": 1.485935926437378, "learning_rate": 0.00012323202372127503, "loss": 1.485, "step": 2597 }, { "epoch": 0.3848888888888889, "grad_norm": 1.3015352487564087, "learning_rate": 0.00012320237212750184, "loss": 0.9506, "step": 2598 }, { "epoch": 0.385037037037037, "grad_norm": 1.783921718597412, "learning_rate": 0.0001231727205337287, "loss": 1.085, "step": 2599 }, { "epoch": 0.3851851851851852, "grad_norm": 2.8830976486206055, "learning_rate": 0.00012314306893995553, "loss": 0.991, "step": 2600 }, { "epoch": 0.38533333333333336, "grad_norm": 1.2732336521148682, "learning_rate": 0.00012311341734618235, "loss": 1.1099, "step": 2601 }, { "epoch": 0.3854814814814815, "grad_norm": 2.5392229557037354, "learning_rate": 0.0001230837657524092, "loss": 1.0938, "step": 2602 }, { "epoch": 0.38562962962962966, "grad_norm": 2.4780893325805664, "learning_rate": 0.00012305411415863604, "loss": 1.1627, "step": 2603 }, { "epoch": 0.3857777777777778, "grad_norm": 1.658705711364746, "learning_rate": 0.00012302446256486286, "loss": 1.1664, "step": 2604 }, { "epoch": 0.38592592592592595, "grad_norm": 1.5529338121414185, "learning_rate": 0.0001229948109710897, "loss": 
1.1685, "step": 2605 }, { "epoch": 0.38607407407407407, "grad_norm": 3.1977758407592773, "learning_rate": 0.00012296515937731655, "loss": 1.1231, "step": 2606 }, { "epoch": 0.38622222222222224, "grad_norm": 1.6928820610046387, "learning_rate": 0.00012293550778354336, "loss": 1.1304, "step": 2607 }, { "epoch": 0.38637037037037036, "grad_norm": 1.929652452468872, "learning_rate": 0.0001229058561897702, "loss": 1.241, "step": 2608 }, { "epoch": 0.38651851851851854, "grad_norm": 1.2363271713256836, "learning_rate": 0.00012287620459599705, "loss": 1.1009, "step": 2609 }, { "epoch": 0.38666666666666666, "grad_norm": 1.6452393531799316, "learning_rate": 0.00012284655300222387, "loss": 1.1277, "step": 2610 }, { "epoch": 0.38681481481481483, "grad_norm": 1.3308392763137817, "learning_rate": 0.00012281690140845071, "loss": 1.429, "step": 2611 }, { "epoch": 0.38696296296296295, "grad_norm": 1.0953389406204224, "learning_rate": 0.00012278724981467756, "loss": 1.2, "step": 2612 }, { "epoch": 0.38711111111111113, "grad_norm": 2.9433095455169678, "learning_rate": 0.00012275759822090438, "loss": 1.3746, "step": 2613 }, { "epoch": 0.38725925925925925, "grad_norm": 1.1800731420516968, "learning_rate": 0.00012272794662713122, "loss": 1.323, "step": 2614 }, { "epoch": 0.3874074074074074, "grad_norm": 1.9630125761032104, "learning_rate": 0.00012269829503335807, "loss": 1.0169, "step": 2615 }, { "epoch": 0.38755555555555554, "grad_norm": 1.4434458017349243, "learning_rate": 0.00012266864343958488, "loss": 0.7611, "step": 2616 }, { "epoch": 0.3877037037037037, "grad_norm": 1.4789292812347412, "learning_rate": 0.0001226389918458117, "loss": 0.9573, "step": 2617 }, { "epoch": 0.38785185185185184, "grad_norm": 1.034011721611023, "learning_rate": 0.00012260934025203857, "loss": 1.1082, "step": 2618 }, { "epoch": 0.388, "grad_norm": 1.391062617301941, "learning_rate": 0.0001225796886582654, "loss": 1.0864, "step": 2619 }, { "epoch": 0.38814814814814813, "grad_norm": 1.247158169746399, 
"learning_rate": 0.0001225500370644922, "loss": 1.2264, "step": 2620 }, { "epoch": 0.3882962962962963, "grad_norm": 2.1945993900299072, "learning_rate": 0.00012252038547071908, "loss": 1.1592, "step": 2621 }, { "epoch": 0.3884444444444444, "grad_norm": 1.4600727558135986, "learning_rate": 0.0001224907338769459, "loss": 1.138, "step": 2622 }, { "epoch": 0.3885925925925926, "grad_norm": 1.4038389921188354, "learning_rate": 0.0001224610822831727, "loss": 1.0787, "step": 2623 }, { "epoch": 0.3887407407407407, "grad_norm": 1.3697896003723145, "learning_rate": 0.00012243143068939956, "loss": 1.2264, "step": 2624 }, { "epoch": 0.3888888888888889, "grad_norm": 2.064251184463501, "learning_rate": 0.0001224017790956264, "loss": 1.093, "step": 2625 }, { "epoch": 0.389037037037037, "grad_norm": 2.4867541790008545, "learning_rate": 0.00012237212750185322, "loss": 0.8753, "step": 2626 }, { "epoch": 0.3891851851851852, "grad_norm": 3.2699592113494873, "learning_rate": 0.00012234247590808006, "loss": 1.2429, "step": 2627 }, { "epoch": 0.3893333333333333, "grad_norm": 1.61955726146698, "learning_rate": 0.0001223128243143069, "loss": 1.0896, "step": 2628 }, { "epoch": 0.3894814814814815, "grad_norm": 0.942071259021759, "learning_rate": 0.00012228317272053373, "loss": 1.1198, "step": 2629 }, { "epoch": 0.3896296296296296, "grad_norm": 1.041031002998352, "learning_rate": 0.00012225352112676057, "loss": 1.1359, "step": 2630 }, { "epoch": 0.3897777777777778, "grad_norm": 1.2857797145843506, "learning_rate": 0.00012222386953298741, "loss": 1.3863, "step": 2631 }, { "epoch": 0.38992592592592595, "grad_norm": 1.5760654211044312, "learning_rate": 0.00012219421793921423, "loss": 1.0017, "step": 2632 }, { "epoch": 0.3900740740740741, "grad_norm": 1.2841405868530273, "learning_rate": 0.00012216456634544108, "loss": 0.9268, "step": 2633 }, { "epoch": 0.39022222222222225, "grad_norm": 1.0363614559173584, "learning_rate": 0.00012213491475166792, "loss": 0.9656, "step": 2634 }, { "epoch": 
0.39037037037037037, "grad_norm": 2.366209030151367, "learning_rate": 0.00012210526315789474, "loss": 1.4091, "step": 2635 }, { "epoch": 0.39051851851851854, "grad_norm": 1.3012871742248535, "learning_rate": 0.00012207561156412158, "loss": 0.9263, "step": 2636 }, { "epoch": 0.39066666666666666, "grad_norm": 1.8037832975387573, "learning_rate": 0.00012204595997034843, "loss": 0.8226, "step": 2637 }, { "epoch": 0.39081481481481484, "grad_norm": 1.461460828781128, "learning_rate": 0.00012201630837657524, "loss": 1.2998, "step": 2638 }, { "epoch": 0.39096296296296296, "grad_norm": 1.3180598020553589, "learning_rate": 0.00012198665678280207, "loss": 0.9662, "step": 2639 }, { "epoch": 0.39111111111111113, "grad_norm": 1.1122477054595947, "learning_rate": 0.0001219570051890289, "loss": 0.9911, "step": 2640 }, { "epoch": 0.39125925925925925, "grad_norm": 2.2250654697418213, "learning_rate": 0.00012192735359525575, "loss": 1.1587, "step": 2641 }, { "epoch": 0.3914074074074074, "grad_norm": 1.1606261730194092, "learning_rate": 0.00012189770200148258, "loss": 0.8492, "step": 2642 }, { "epoch": 0.39155555555555555, "grad_norm": 1.3771429061889648, "learning_rate": 0.00012186805040770941, "loss": 0.9648, "step": 2643 }, { "epoch": 0.3917037037037037, "grad_norm": 1.557597279548645, "learning_rate": 0.00012183839881393626, "loss": 1.0868, "step": 2644 }, { "epoch": 0.39185185185185184, "grad_norm": 2.8755409717559814, "learning_rate": 0.00012180874722016309, "loss": 1.2169, "step": 2645 }, { "epoch": 0.392, "grad_norm": 2.0587596893310547, "learning_rate": 0.00012177909562638992, "loss": 1.0257, "step": 2646 }, { "epoch": 0.39214814814814813, "grad_norm": 1.3243364095687866, "learning_rate": 0.00012174944403261676, "loss": 0.9206, "step": 2647 }, { "epoch": 0.3922962962962963, "grad_norm": 1.1226966381072998, "learning_rate": 0.0001217197924388436, "loss": 0.9565, "step": 2648 }, { "epoch": 0.39244444444444443, "grad_norm": 1.1772652864456177, "learning_rate": 
0.00012169014084507042, "loss": 0.9498, "step": 2649 }, { "epoch": 0.3925925925925926, "grad_norm": 1.1876426935195923, "learning_rate": 0.00012166048925129727, "loss": 1.2936, "step": 2650 }, { "epoch": 0.3927407407407407, "grad_norm": 1.5671477317810059, "learning_rate": 0.0001216308376575241, "loss": 1.3194, "step": 2651 }, { "epoch": 0.3928888888888889, "grad_norm": 1.7110168933868408, "learning_rate": 0.00012160118606375093, "loss": 1.3425, "step": 2652 }, { "epoch": 0.393037037037037, "grad_norm": 1.093193769454956, "learning_rate": 0.00012157153446997778, "loss": 1.1644, "step": 2653 }, { "epoch": 0.3931851851851852, "grad_norm": 0.9900262355804443, "learning_rate": 0.0001215418828762046, "loss": 1.2502, "step": 2654 }, { "epoch": 0.3933333333333333, "grad_norm": 1.0472580194473267, "learning_rate": 0.00012151223128243144, "loss": 0.9266, "step": 2655 }, { "epoch": 0.3934814814814815, "grad_norm": 1.7296568155288696, "learning_rate": 0.00012148257968865825, "loss": 1.0858, "step": 2656 }, { "epoch": 0.3936296296296296, "grad_norm": 1.21535325050354, "learning_rate": 0.00012145292809488511, "loss": 1.3982, "step": 2657 }, { "epoch": 0.3937777777777778, "grad_norm": 1.611433744430542, "learning_rate": 0.00012142327650111194, "loss": 1.1701, "step": 2658 }, { "epoch": 0.3939259259259259, "grad_norm": 1.3290176391601562, "learning_rate": 0.00012139362490733876, "loss": 0.858, "step": 2659 }, { "epoch": 0.3940740740740741, "grad_norm": 1.7268757820129395, "learning_rate": 0.00012136397331356562, "loss": 1.2488, "step": 2660 }, { "epoch": 0.3942222222222222, "grad_norm": 1.0796136856079102, "learning_rate": 0.00012133432171979245, "loss": 1.1617, "step": 2661 }, { "epoch": 0.39437037037037037, "grad_norm": 1.4177995920181274, "learning_rate": 0.00012130467012601927, "loss": 0.9514, "step": 2662 }, { "epoch": 0.39451851851851855, "grad_norm": 3.1777184009552, "learning_rate": 0.00012127501853224612, "loss": 1.1792, "step": 2663 }, { "epoch": 0.39466666666666667, 
"grad_norm": 1.3864431381225586, "learning_rate": 0.00012124536693847294, "loss": 1.0457, "step": 2664 }, { "epoch": 0.39481481481481484, "grad_norm": 1.309844732284546, "learning_rate": 0.00012121571534469977, "loss": 1.1057, "step": 2665 }, { "epoch": 0.39496296296296296, "grad_norm": 6.418210983276367, "learning_rate": 0.00012118606375092663, "loss": 1.3275, "step": 2666 }, { "epoch": 0.39511111111111114, "grad_norm": 1.2371841669082642, "learning_rate": 0.00012115641215715345, "loss": 1.2082, "step": 2667 }, { "epoch": 0.39525925925925925, "grad_norm": 1.165908932685852, "learning_rate": 0.00012112676056338028, "loss": 1.2033, "step": 2668 }, { "epoch": 0.39540740740740743, "grad_norm": 1.2984356880187988, "learning_rate": 0.00012109710896960714, "loss": 1.1775, "step": 2669 }, { "epoch": 0.39555555555555555, "grad_norm": 1.3221341371536255, "learning_rate": 0.00012106745737583395, "loss": 0.8594, "step": 2670 }, { "epoch": 0.3957037037037037, "grad_norm": 2.138890027999878, "learning_rate": 0.00012103780578206079, "loss": 1.0214, "step": 2671 }, { "epoch": 0.39585185185185184, "grad_norm": 2.4488580226898193, "learning_rate": 0.00012100815418828763, "loss": 0.95, "step": 2672 }, { "epoch": 0.396, "grad_norm": 1.6993088722229004, "learning_rate": 0.00012097850259451446, "loss": 0.8995, "step": 2673 }, { "epoch": 0.39614814814814814, "grad_norm": 1.2788841724395752, "learning_rate": 0.00012094885100074129, "loss": 1.0569, "step": 2674 }, { "epoch": 0.3962962962962963, "grad_norm": 1.178033471107483, "learning_rate": 0.00012091919940696812, "loss": 1.3261, "step": 2675 }, { "epoch": 0.39644444444444443, "grad_norm": 1.4852286577224731, "learning_rate": 0.00012088954781319497, "loss": 1.0252, "step": 2676 }, { "epoch": 0.3965925925925926, "grad_norm": 1.4918464422225952, "learning_rate": 0.0001208598962194218, "loss": 0.9957, "step": 2677 }, { "epoch": 0.3967407407407407, "grad_norm": 1.2921345233917236, "learning_rate": 0.00012083024462564863, "loss": 1.0395, 
"step": 2678 }, { "epoch": 0.3968888888888889, "grad_norm": 1.8526917695999146, "learning_rate": 0.00012080059303187547, "loss": 1.1959, "step": 2679 }, { "epoch": 0.397037037037037, "grad_norm": 1.2774417400360107, "learning_rate": 0.0001207709414381023, "loss": 1.2662, "step": 2680 }, { "epoch": 0.3971851851851852, "grad_norm": 1.7397370338439941, "learning_rate": 0.00012074128984432914, "loss": 0.987, "step": 2681 }, { "epoch": 0.3973333333333333, "grad_norm": 1.5014103651046753, "learning_rate": 0.00012071163825055598, "loss": 1.0948, "step": 2682 }, { "epoch": 0.3974814814814815, "grad_norm": 1.7596739530563354, "learning_rate": 0.00012068198665678281, "loss": 1.1562, "step": 2683 }, { "epoch": 0.3976296296296296, "grad_norm": 1.8999735116958618, "learning_rate": 0.00012065233506300964, "loss": 1.1918, "step": 2684 }, { "epoch": 0.3977777777777778, "grad_norm": 1.339568018913269, "learning_rate": 0.00012062268346923649, "loss": 1.2655, "step": 2685 }, { "epoch": 0.3979259259259259, "grad_norm": 1.881459355354309, "learning_rate": 0.00012059303187546332, "loss": 1.2279, "step": 2686 }, { "epoch": 0.3980740740740741, "grad_norm": 1.3578423261642456, "learning_rate": 0.00012056338028169015, "loss": 1.4419, "step": 2687 }, { "epoch": 0.3982222222222222, "grad_norm": 1.7409465312957764, "learning_rate": 0.00012053372868791699, "loss": 1.0041, "step": 2688 }, { "epoch": 0.3983703703703704, "grad_norm": 1.2117589712142944, "learning_rate": 0.00012050407709414382, "loss": 1.0102, "step": 2689 }, { "epoch": 0.3985185185185185, "grad_norm": 1.3980178833007812, "learning_rate": 0.00012047442550037064, "loss": 0.9528, "step": 2690 }, { "epoch": 0.39866666666666667, "grad_norm": 1.5781069993972778, "learning_rate": 0.00012044477390659747, "loss": 1.0334, "step": 2691 }, { "epoch": 0.3988148148148148, "grad_norm": 1.789389967918396, "learning_rate": 0.00012041512231282433, "loss": 1.1931, "step": 2692 }, { "epoch": 0.39896296296296296, "grad_norm": 2.0397837162017822, 
"learning_rate": 0.00012038547071905115, "loss": 1.0539, "step": 2693 }, { "epoch": 0.39911111111111114, "grad_norm": 1.8052537441253662, "learning_rate": 0.00012035581912527798, "loss": 0.9529, "step": 2694 }, { "epoch": 0.39925925925925926, "grad_norm": 1.8215285539627075, "learning_rate": 0.00012032616753150484, "loss": 1.1164, "step": 2695 }, { "epoch": 0.39940740740740743, "grad_norm": 1.65324866771698, "learning_rate": 0.00012029651593773165, "loss": 1.1857, "step": 2696 }, { "epoch": 0.39955555555555555, "grad_norm": 1.7311931848526, "learning_rate": 0.00012026686434395848, "loss": 1.1854, "step": 2697 }, { "epoch": 0.3997037037037037, "grad_norm": 1.9131438732147217, "learning_rate": 0.00012023721275018533, "loss": 1.2563, "step": 2698 }, { "epoch": 0.39985185185185185, "grad_norm": 1.6355929374694824, "learning_rate": 0.00012020756115641216, "loss": 1.0209, "step": 2699 }, { "epoch": 0.4, "grad_norm": 1.510325312614441, "learning_rate": 0.00012017790956263899, "loss": 1.0529, "step": 2700 }, { "epoch": 0.40014814814814814, "grad_norm": 1.334281325340271, "learning_rate": 0.00012014825796886583, "loss": 0.9216, "step": 2701 }, { "epoch": 0.4002962962962963, "grad_norm": 1.1154178380966187, "learning_rate": 0.00012011860637509267, "loss": 0.9629, "step": 2702 }, { "epoch": 0.40044444444444444, "grad_norm": 1.282820224761963, "learning_rate": 0.0001200889547813195, "loss": 0.9645, "step": 2703 }, { "epoch": 0.4005925925925926, "grad_norm": 1.3689979314804077, "learning_rate": 0.00012005930318754634, "loss": 1.1408, "step": 2704 }, { "epoch": 0.40074074074074073, "grad_norm": 2.0123424530029297, "learning_rate": 0.00012002965159377317, "loss": 1.087, "step": 2705 }, { "epoch": 0.4008888888888889, "grad_norm": 1.6564671993255615, "learning_rate": 0.00012, "loss": 1.0362, "step": 2706 }, { "epoch": 0.401037037037037, "grad_norm": 2.1356120109558105, "learning_rate": 0.00011997034840622685, "loss": 0.8953, "step": 2707 }, { "epoch": 0.4011851851851852, 
"grad_norm": 2.0640385150909424, "learning_rate": 0.00011994069681245368, "loss": 1.0647, "step": 2708 }, { "epoch": 0.4013333333333333, "grad_norm": 1.7021347284317017, "learning_rate": 0.00011991104521868051, "loss": 0.974, "step": 2709 }, { "epoch": 0.4014814814814815, "grad_norm": 2.582080364227295, "learning_rate": 0.00011988139362490734, "loss": 1.1002, "step": 2710 }, { "epoch": 0.4016296296296296, "grad_norm": 1.5332332849502563, "learning_rate": 0.00011985174203113418, "loss": 1.0432, "step": 2711 }, { "epoch": 0.4017777777777778, "grad_norm": 1.7648104429244995, "learning_rate": 0.00011982209043736102, "loss": 0.9823, "step": 2712 }, { "epoch": 0.4019259259259259, "grad_norm": 1.6554726362228394, "learning_rate": 0.00011979243884358785, "loss": 1.4986, "step": 2713 }, { "epoch": 0.4020740740740741, "grad_norm": 3.5072312355041504, "learning_rate": 0.00011976278724981469, "loss": 1.0405, "step": 2714 }, { "epoch": 0.4022222222222222, "grad_norm": 1.4579415321350098, "learning_rate": 0.00011973313565604152, "loss": 0.9843, "step": 2715 }, { "epoch": 0.4023703703703704, "grad_norm": 2.068174123764038, "learning_rate": 0.00011970348406226834, "loss": 1.1261, "step": 2716 }, { "epoch": 0.4025185185185185, "grad_norm": 2.4436683654785156, "learning_rate": 0.0001196738324684952, "loss": 0.9804, "step": 2717 }, { "epoch": 0.4026666666666667, "grad_norm": 1.9487828016281128, "learning_rate": 0.00011964418087472203, "loss": 1.1943, "step": 2718 }, { "epoch": 0.4028148148148148, "grad_norm": 1.7667052745819092, "learning_rate": 0.00011961452928094885, "loss": 1.1153, "step": 2719 }, { "epoch": 0.40296296296296297, "grad_norm": 1.949844479560852, "learning_rate": 0.0001195848776871757, "loss": 0.9712, "step": 2720 }, { "epoch": 0.4031111111111111, "grad_norm": 1.6448479890823364, "learning_rate": 0.00011955522609340253, "loss": 1.0122, "step": 2721 }, { "epoch": 0.40325925925925926, "grad_norm": 1.8042147159576416, "learning_rate": 0.00011952557449962935, "loss": 
1.1163, "step": 2722 }, { "epoch": 0.4034074074074074, "grad_norm": 1.6923189163208008, "learning_rate": 0.00011949592290585621, "loss": 1.2883, "step": 2723 }, { "epoch": 0.40355555555555556, "grad_norm": 2.571420907974243, "learning_rate": 0.00011946627131208303, "loss": 1.1161, "step": 2724 }, { "epoch": 0.40370370370370373, "grad_norm": 2.145495891571045, "learning_rate": 0.00011943661971830986, "loss": 1.0038, "step": 2725 }, { "epoch": 0.40385185185185185, "grad_norm": 1.3031134605407715, "learning_rate": 0.00011940696812453669, "loss": 1.0228, "step": 2726 }, { "epoch": 0.404, "grad_norm": 1.7984037399291992, "learning_rate": 0.00011937731653076353, "loss": 0.9614, "step": 2727 }, { "epoch": 0.40414814814814815, "grad_norm": 1.277489185333252, "learning_rate": 0.00011934766493699036, "loss": 1.0074, "step": 2728 }, { "epoch": 0.4042962962962963, "grad_norm": 2.0690696239471436, "learning_rate": 0.0001193180133432172, "loss": 1.0676, "step": 2729 }, { "epoch": 0.40444444444444444, "grad_norm": 1.6869933605194092, "learning_rate": 0.00011928836174944404, "loss": 1.1119, "step": 2730 }, { "epoch": 0.4045925925925926, "grad_norm": 2.0686795711517334, "learning_rate": 0.00011925871015567087, "loss": 1.0984, "step": 2731 }, { "epoch": 0.40474074074074073, "grad_norm": 2.3337457180023193, "learning_rate": 0.0001192290585618977, "loss": 1.0275, "step": 2732 }, { "epoch": 0.4048888888888889, "grad_norm": 1.5476855039596558, "learning_rate": 0.00011919940696812455, "loss": 1.0743, "step": 2733 }, { "epoch": 0.40503703703703703, "grad_norm": 5.566887378692627, "learning_rate": 0.00011916975537435138, "loss": 0.9098, "step": 2734 }, { "epoch": 0.4051851851851852, "grad_norm": 1.3219506740570068, "learning_rate": 0.00011914010378057821, "loss": 1.2043, "step": 2735 }, { "epoch": 0.4053333333333333, "grad_norm": 1.6084272861480713, "learning_rate": 0.00011911045218680505, "loss": 1.1581, "step": 2736 }, { "epoch": 0.4054814814814815, "grad_norm": 1.5224446058273315, 
"learning_rate": 0.00011908080059303188, "loss": 1.2197, "step": 2737 }, { "epoch": 0.4056296296296296, "grad_norm": 1.6093271970748901, "learning_rate": 0.00011905114899925871, "loss": 1.0036, "step": 2738 }, { "epoch": 0.4057777777777778, "grad_norm": 3.103193759918213, "learning_rate": 0.00011902149740548556, "loss": 1.0624, "step": 2739 }, { "epoch": 0.4059259259259259, "grad_norm": 2.020883560180664, "learning_rate": 0.00011899184581171239, "loss": 1.2473, "step": 2740 }, { "epoch": 0.4060740740740741, "grad_norm": 1.3161982297897339, "learning_rate": 0.00011896219421793922, "loss": 0.961, "step": 2741 }, { "epoch": 0.4062222222222222, "grad_norm": 1.6283003091812134, "learning_rate": 0.00011893254262416606, "loss": 0.9689, "step": 2742 }, { "epoch": 0.4063703703703704, "grad_norm": 3.9993278980255127, "learning_rate": 0.0001189028910303929, "loss": 1.1541, "step": 2743 }, { "epoch": 0.4065185185185185, "grad_norm": 2.251734495162964, "learning_rate": 0.00011887323943661973, "loss": 0.8959, "step": 2744 }, { "epoch": 0.4066666666666667, "grad_norm": 1.5558217763900757, "learning_rate": 0.00011884358784284654, "loss": 1.0407, "step": 2745 }, { "epoch": 0.4068148148148148, "grad_norm": 3.1227128505706787, "learning_rate": 0.0001188139362490734, "loss": 1.0737, "step": 2746 }, { "epoch": 0.40696296296296297, "grad_norm": 1.7677065134048462, "learning_rate": 0.00011878428465530023, "loss": 1.2473, "step": 2747 }, { "epoch": 0.4071111111111111, "grad_norm": 1.9715033769607544, "learning_rate": 0.00011875463306152705, "loss": 1.0825, "step": 2748 }, { "epoch": 0.40725925925925927, "grad_norm": 2.3912477493286133, "learning_rate": 0.00011872498146775391, "loss": 1.0649, "step": 2749 }, { "epoch": 0.4074074074074074, "grad_norm": 2.0233819484710693, "learning_rate": 0.00011869532987398073, "loss": 1.1185, "step": 2750 }, { "epoch": 0.40755555555555556, "grad_norm": 2.5465264320373535, "learning_rate": 0.00011866567828020756, "loss": 0.8653, "step": 2751 }, { "epoch": 
0.4077037037037037, "grad_norm": 1.1739938259124756, "learning_rate": 0.00011863602668643441, "loss": 0.9556, "step": 2752 }, { "epoch": 0.40785185185185185, "grad_norm": 2.1719088554382324, "learning_rate": 0.00011860637509266123, "loss": 1.2272, "step": 2753 }, { "epoch": 0.408, "grad_norm": 1.2471652030944824, "learning_rate": 0.00011857672349888806, "loss": 1.1197, "step": 2754 }, { "epoch": 0.40814814814814815, "grad_norm": 1.5716360807418823, "learning_rate": 0.00011854707190511492, "loss": 1.281, "step": 2755 }, { "epoch": 0.4082962962962963, "grad_norm": 1.5447243452072144, "learning_rate": 0.00011851742031134174, "loss": 1.0123, "step": 2756 }, { "epoch": 0.40844444444444444, "grad_norm": 2.2320587635040283, "learning_rate": 0.00011848776871756857, "loss": 1.1074, "step": 2757 }, { "epoch": 0.4085925925925926, "grad_norm": 1.7709585428237915, "learning_rate": 0.00011845811712379541, "loss": 0.8078, "step": 2758 }, { "epoch": 0.40874074074074074, "grad_norm": 1.587296485900879, "learning_rate": 0.00011842846553002224, "loss": 1.084, "step": 2759 }, { "epoch": 0.4088888888888889, "grad_norm": 1.9970415830612183, "learning_rate": 0.00011839881393624908, "loss": 1.1203, "step": 2760 }, { "epoch": 0.40903703703703703, "grad_norm": 2.760261297225952, "learning_rate": 0.0001183691623424759, "loss": 1.297, "step": 2761 }, { "epoch": 0.4091851851851852, "grad_norm": 1.9074251651763916, "learning_rate": 0.00011833951074870275, "loss": 1.266, "step": 2762 }, { "epoch": 0.4093333333333333, "grad_norm": 1.3241833448410034, "learning_rate": 0.00011830985915492958, "loss": 1.0471, "step": 2763 }, { "epoch": 0.4094814814814815, "grad_norm": 2.138880968093872, "learning_rate": 0.00011828020756115641, "loss": 1.2054, "step": 2764 }, { "epoch": 0.4096296296296296, "grad_norm": 2.0332658290863037, "learning_rate": 0.00011825055596738326, "loss": 1.2343, "step": 2765 }, { "epoch": 0.4097777777777778, "grad_norm": 1.1907109022140503, "learning_rate": 0.00011822090437361009, 
"loss": 1.0939, "step": 2766 }, { "epoch": 0.4099259259259259, "grad_norm": 1.491758108139038, "learning_rate": 0.00011819125277983692, "loss": 1.2937, "step": 2767 }, { "epoch": 0.4100740740740741, "grad_norm": 1.6654256582260132, "learning_rate": 0.00011816160118606376, "loss": 0.9628, "step": 2768 }, { "epoch": 0.4102222222222222, "grad_norm": 1.8008649349212646, "learning_rate": 0.0001181319495922906, "loss": 0.9737, "step": 2769 }, { "epoch": 0.4103703703703704, "grad_norm": 1.270262360572815, "learning_rate": 0.00011810229799851743, "loss": 0.9349, "step": 2770 }, { "epoch": 0.4105185185185185, "grad_norm": 1.9640406370162964, "learning_rate": 0.00011807264640474427, "loss": 1.3547, "step": 2771 }, { "epoch": 0.4106666666666667, "grad_norm": 2.8695318698883057, "learning_rate": 0.0001180429948109711, "loss": 1.1886, "step": 2772 }, { "epoch": 0.4108148148148148, "grad_norm": 1.2530816793441772, "learning_rate": 0.00011801334321719793, "loss": 1.0095, "step": 2773 }, { "epoch": 0.410962962962963, "grad_norm": 1.5395197868347168, "learning_rate": 0.00011798369162342478, "loss": 1.1235, "step": 2774 }, { "epoch": 0.4111111111111111, "grad_norm": 1.469625473022461, "learning_rate": 0.0001179540400296516, "loss": 1.0371, "step": 2775 }, { "epoch": 0.41125925925925927, "grad_norm": 1.732647180557251, "learning_rate": 0.00011792438843587842, "loss": 0.972, "step": 2776 }, { "epoch": 0.4114074074074074, "grad_norm": 1.5699431896209717, "learning_rate": 0.00011789473684210525, "loss": 1.0902, "step": 2777 }, { "epoch": 0.41155555555555556, "grad_norm": 2.115208387374878, "learning_rate": 0.00011786508524833211, "loss": 1.1188, "step": 2778 }, { "epoch": 0.4117037037037037, "grad_norm": 1.4656784534454346, "learning_rate": 0.00011783543365455893, "loss": 0.9521, "step": 2779 }, { "epoch": 0.41185185185185186, "grad_norm": 1.6068055629730225, "learning_rate": 0.00011780578206078576, "loss": 1.0153, "step": 2780 }, { "epoch": 0.412, "grad_norm": 2.062823534011841, 
"learning_rate": 0.00011777613046701262, "loss": 1.0661, "step": 2781 }, { "epoch": 0.41214814814814815, "grad_norm": 2.0889668464660645, "learning_rate": 0.00011774647887323944, "loss": 0.9254, "step": 2782 }, { "epoch": 0.41229629629629627, "grad_norm": 1.8792005777359009, "learning_rate": 0.00011771682727946627, "loss": 1.1005, "step": 2783 }, { "epoch": 0.41244444444444445, "grad_norm": 2.2597243785858154, "learning_rate": 0.00011768717568569311, "loss": 1.0809, "step": 2784 }, { "epoch": 0.41259259259259257, "grad_norm": 1.2650634050369263, "learning_rate": 0.00011765752409191994, "loss": 0.9761, "step": 2785 }, { "epoch": 0.41274074074074074, "grad_norm": 1.583432674407959, "learning_rate": 0.00011762787249814677, "loss": 1.2031, "step": 2786 }, { "epoch": 0.4128888888888889, "grad_norm": 1.9459563493728638, "learning_rate": 0.00011759822090437362, "loss": 1.2632, "step": 2787 }, { "epoch": 0.41303703703703704, "grad_norm": 1.8172357082366943, "learning_rate": 0.00011756856931060045, "loss": 1.199, "step": 2788 }, { "epoch": 0.4131851851851852, "grad_norm": 1.8270275592803955, "learning_rate": 0.00011753891771682728, "loss": 1.0676, "step": 2789 }, { "epoch": 0.41333333333333333, "grad_norm": 2.283125877380371, "learning_rate": 0.00011750926612305412, "loss": 1.2442, "step": 2790 }, { "epoch": 0.4134814814814815, "grad_norm": 1.7654433250427246, "learning_rate": 0.00011747961452928096, "loss": 0.8421, "step": 2791 }, { "epoch": 0.4136296296296296, "grad_norm": 1.8013252019882202, "learning_rate": 0.00011744996293550779, "loss": 0.9433, "step": 2792 }, { "epoch": 0.4137777777777778, "grad_norm": 1.6748549938201904, "learning_rate": 0.00011742031134173463, "loss": 1.2431, "step": 2793 }, { "epoch": 0.4139259259259259, "grad_norm": 2.0394046306610107, "learning_rate": 0.00011739065974796146, "loss": 1.2758, "step": 2794 }, { "epoch": 0.4140740740740741, "grad_norm": 1.8491860628128052, "learning_rate": 0.00011736100815418829, "loss": 1.1828, "step": 2795 }, { 
"epoch": 0.4142222222222222, "grad_norm": 1.5049947500228882, "learning_rate": 0.00011733135656041512, "loss": 1.0096, "step": 2796 }, { "epoch": 0.4143703703703704, "grad_norm": 3.2022616863250732, "learning_rate": 0.00011730170496664197, "loss": 1.1493, "step": 2797 }, { "epoch": 0.4145185185185185, "grad_norm": 3.3280324935913086, "learning_rate": 0.0001172720533728688, "loss": 1.1176, "step": 2798 }, { "epoch": 0.4146666666666667, "grad_norm": 1.351608395576477, "learning_rate": 0.00011724240177909563, "loss": 0.8128, "step": 2799 }, { "epoch": 0.4148148148148148, "grad_norm": 2.201603889465332, "learning_rate": 0.00011721275018532247, "loss": 1.2038, "step": 2800 }, { "epoch": 0.414962962962963, "grad_norm": 1.5568238496780396, "learning_rate": 0.0001171830985915493, "loss": 0.9006, "step": 2801 }, { "epoch": 0.4151111111111111, "grad_norm": 3.2228941917419434, "learning_rate": 0.00011715344699777612, "loss": 1.3825, "step": 2802 }, { "epoch": 0.4152592592592593, "grad_norm": 1.5834277868270874, "learning_rate": 0.00011712379540400298, "loss": 0.9658, "step": 2803 }, { "epoch": 0.4154074074074074, "grad_norm": 1.1434483528137207, "learning_rate": 0.00011709414381022981, "loss": 1.0708, "step": 2804 }, { "epoch": 0.41555555555555557, "grad_norm": 1.8442808389663696, "learning_rate": 0.00011706449221645663, "loss": 1.1493, "step": 2805 }, { "epoch": 0.4157037037037037, "grad_norm": 3.8759419918060303, "learning_rate": 0.00011703484062268349, "loss": 1.1004, "step": 2806 }, { "epoch": 0.41585185185185186, "grad_norm": 1.9965623617172241, "learning_rate": 0.00011700518902891032, "loss": 1.2326, "step": 2807 }, { "epoch": 0.416, "grad_norm": 1.3121302127838135, "learning_rate": 0.00011697553743513714, "loss": 1.2076, "step": 2808 }, { "epoch": 0.41614814814814816, "grad_norm": 1.7846583127975464, "learning_rate": 0.00011694588584136399, "loss": 0.9679, "step": 2809 }, { "epoch": 0.4162962962962963, "grad_norm": 3.997375011444092, "learning_rate": 
0.00011691623424759081, "loss": 1.1087, "step": 2810 }, { "epoch": 0.41644444444444445, "grad_norm": 1.307847499847412, "learning_rate": 0.00011688658265381764, "loss": 1.0535, "step": 2811 }, { "epoch": 0.41659259259259257, "grad_norm": 6.995899200439453, "learning_rate": 0.00011685693106004447, "loss": 0.9963, "step": 2812 }, { "epoch": 0.41674074074074074, "grad_norm": 2.9513638019561768, "learning_rate": 0.00011682727946627132, "loss": 1.1157, "step": 2813 }, { "epoch": 0.41688888888888886, "grad_norm": 1.6517165899276733, "learning_rate": 0.00011679762787249815, "loss": 1.0103, "step": 2814 }, { "epoch": 0.41703703703703704, "grad_norm": 1.0372458696365356, "learning_rate": 0.00011676797627872498, "loss": 0.9458, "step": 2815 }, { "epoch": 0.41718518518518516, "grad_norm": 1.507336974143982, "learning_rate": 0.00011673832468495182, "loss": 0.9639, "step": 2816 }, { "epoch": 0.41733333333333333, "grad_norm": 2.7597787380218506, "learning_rate": 0.00011670867309117865, "loss": 0.8869, "step": 2817 }, { "epoch": 0.4174814814814815, "grad_norm": 1.2514851093292236, "learning_rate": 0.00011667902149740548, "loss": 1.0918, "step": 2818 }, { "epoch": 0.41762962962962963, "grad_norm": 2.244161605834961, "learning_rate": 0.00011664936990363233, "loss": 1.2108, "step": 2819 }, { "epoch": 0.4177777777777778, "grad_norm": 1.3502081632614136, "learning_rate": 0.00011661971830985916, "loss": 1.0707, "step": 2820 }, { "epoch": 0.4179259259259259, "grad_norm": 1.668387532234192, "learning_rate": 0.00011659006671608599, "loss": 1.1988, "step": 2821 }, { "epoch": 0.4180740740740741, "grad_norm": 2.31921124458313, "learning_rate": 0.00011656041512231284, "loss": 0.8422, "step": 2822 }, { "epoch": 0.4182222222222222, "grad_norm": 1.1336761713027954, "learning_rate": 0.00011653076352853967, "loss": 0.7769, "step": 2823 }, { "epoch": 0.4183703703703704, "grad_norm": 7.61287784576416, "learning_rate": 0.0001165011119347665, "loss": 1.152, "step": 2824 }, { "epoch": 
0.4185185185185185, "grad_norm": 1.4002079963684082, "learning_rate": 0.00011647146034099334, "loss": 0.9023, "step": 2825 }, { "epoch": 0.4186666666666667, "grad_norm": 1.6339399814605713, "learning_rate": 0.00011644180874722017, "loss": 1.0011, "step": 2826 }, { "epoch": 0.4188148148148148, "grad_norm": 1.773979663848877, "learning_rate": 0.000116412157153447, "loss": 1.092, "step": 2827 }, { "epoch": 0.418962962962963, "grad_norm": 1.360486388206482, "learning_rate": 0.00011638250555967385, "loss": 1.0493, "step": 2828 }, { "epoch": 0.4191111111111111, "grad_norm": 1.9607775211334229, "learning_rate": 0.00011635285396590068, "loss": 1.2693, "step": 2829 }, { "epoch": 0.4192592592592593, "grad_norm": 2.056821346282959, "learning_rate": 0.00011632320237212751, "loss": 0.9182, "step": 2830 }, { "epoch": 0.4194074074074074, "grad_norm": 1.7508609294891357, "learning_rate": 0.00011629355077835433, "loss": 1.14, "step": 2831 }, { "epoch": 0.41955555555555557, "grad_norm": 2.5750112533569336, "learning_rate": 0.00011626389918458119, "loss": 1.0558, "step": 2832 }, { "epoch": 0.4197037037037037, "grad_norm": 1.7716282606124878, "learning_rate": 0.00011623424759080802, "loss": 0.9906, "step": 2833 }, { "epoch": 0.41985185185185186, "grad_norm": 1.949119210243225, "learning_rate": 0.00011620459599703483, "loss": 1.109, "step": 2834 }, { "epoch": 0.42, "grad_norm": 6.4630961418151855, "learning_rate": 0.00011617494440326169, "loss": 0.9866, "step": 2835 }, { "epoch": 0.42014814814814816, "grad_norm": 1.7842416763305664, "learning_rate": 0.00011614529280948851, "loss": 1.2651, "step": 2836 }, { "epoch": 0.4202962962962963, "grad_norm": 1.5930815935134888, "learning_rate": 0.00011611564121571534, "loss": 1.0781, "step": 2837 }, { "epoch": 0.42044444444444445, "grad_norm": 2.1503567695617676, "learning_rate": 0.0001160859896219422, "loss": 1.0805, "step": 2838 }, { "epoch": 0.4205925925925926, "grad_norm": 3.180330753326416, "learning_rate": 0.00011605633802816902, "loss": 
1.0535, "step": 2839 }, { "epoch": 0.42074074074074075, "grad_norm": 1.7494330406188965, "learning_rate": 0.00011602668643439585, "loss": 1.2937, "step": 2840 }, { "epoch": 0.42088888888888887, "grad_norm": 2.0975334644317627, "learning_rate": 0.0001159970348406227, "loss": 0.8353, "step": 2841 }, { "epoch": 0.42103703703703704, "grad_norm": 1.3983445167541504, "learning_rate": 0.00011596738324684952, "loss": 0.8618, "step": 2842 }, { "epoch": 0.42118518518518516, "grad_norm": 1.968269944190979, "learning_rate": 0.00011593773165307635, "loss": 0.8831, "step": 2843 }, { "epoch": 0.42133333333333334, "grad_norm": 2.646681070327759, "learning_rate": 0.0001159080800593032, "loss": 1.1054, "step": 2844 }, { "epoch": 0.42148148148148146, "grad_norm": 1.488573670387268, "learning_rate": 0.00011587842846553003, "loss": 1.0823, "step": 2845 }, { "epoch": 0.42162962962962963, "grad_norm": 6.95759391784668, "learning_rate": 0.00011584877687175686, "loss": 1.0718, "step": 2846 }, { "epoch": 0.42177777777777775, "grad_norm": 2.1569931507110596, "learning_rate": 0.00011581912527798369, "loss": 1.2504, "step": 2847 }, { "epoch": 0.4219259259259259, "grad_norm": 2.4960248470306396, "learning_rate": 0.00011578947368421053, "loss": 1.0442, "step": 2848 }, { "epoch": 0.4220740740740741, "grad_norm": 3.746467351913452, "learning_rate": 0.00011575982209043736, "loss": 1.1778, "step": 2849 }, { "epoch": 0.4222222222222222, "grad_norm": 2.5490500926971436, "learning_rate": 0.0001157301704966642, "loss": 1.1062, "step": 2850 }, { "epoch": 0.4223703703703704, "grad_norm": 1.6705260276794434, "learning_rate": 0.00011570051890289104, "loss": 1.3907, "step": 2851 }, { "epoch": 0.4225185185185185, "grad_norm": 4.421507835388184, "learning_rate": 0.00011567086730911787, "loss": 0.8469, "step": 2852 }, { "epoch": 0.4226666666666667, "grad_norm": 1.6530746221542358, "learning_rate": 0.0001156412157153447, "loss": 1.1554, "step": 2853 }, { "epoch": 0.4228148148148148, "grad_norm": 
2.410132884979248, "learning_rate": 0.00011561156412157155, "loss": 1.202, "step": 2854 }, { "epoch": 0.422962962962963, "grad_norm": 1.847341775894165, "learning_rate": 0.00011558191252779838, "loss": 1.0665, "step": 2855 }, { "epoch": 0.4231111111111111, "grad_norm": 4.5526347160339355, "learning_rate": 0.00011555226093402521, "loss": 1.1206, "step": 2856 }, { "epoch": 0.4232592592592593, "grad_norm": 8.143963813781738, "learning_rate": 0.00011552260934025205, "loss": 1.2316, "step": 2857 }, { "epoch": 0.4234074074074074, "grad_norm": 1.9890685081481934, "learning_rate": 0.00011549295774647888, "loss": 1.0101, "step": 2858 }, { "epoch": 0.4235555555555556, "grad_norm": 3.8913021087646484, "learning_rate": 0.00011546330615270571, "loss": 0.942, "step": 2859 }, { "epoch": 0.4237037037037037, "grad_norm": 2.6386547088623047, "learning_rate": 0.00011543365455893256, "loss": 1.1263, "step": 2860 }, { "epoch": 0.42385185185185187, "grad_norm": 5.978245735168457, "learning_rate": 0.00011540400296515939, "loss": 1.0431, "step": 2861 }, { "epoch": 0.424, "grad_norm": 7.629427433013916, "learning_rate": 0.00011537435137138621, "loss": 1.1471, "step": 2862 }, { "epoch": 0.42414814814814816, "grad_norm": 4.681686878204346, "learning_rate": 0.00011534469977761304, "loss": 0.9924, "step": 2863 }, { "epoch": 0.4242962962962963, "grad_norm": 3.3226373195648193, "learning_rate": 0.0001153150481838399, "loss": 1.0308, "step": 2864 }, { "epoch": 0.42444444444444446, "grad_norm": 2.119053840637207, "learning_rate": 0.00011528539659006671, "loss": 1.0734, "step": 2865 }, { "epoch": 0.4245925925925926, "grad_norm": 1.8275108337402344, "learning_rate": 0.00011525574499629354, "loss": 0.9565, "step": 2866 }, { "epoch": 0.42474074074074075, "grad_norm": 6.552157402038574, "learning_rate": 0.0001152260934025204, "loss": 1.0816, "step": 2867 }, { "epoch": 0.42488888888888887, "grad_norm": 3.380706548690796, "learning_rate": 0.00011519644180874722, "loss": 1.4825, "step": 2868 }, { "epoch": 
0.42503703703703705, "grad_norm": 4.735677242279053, "learning_rate": 0.00011516679021497405, "loss": 1.2556, "step": 2869 }, { "epoch": 0.42518518518518517, "grad_norm": 5.0357136726379395, "learning_rate": 0.0001151371386212009, "loss": 1.1886, "step": 2870 }, { "epoch": 0.42533333333333334, "grad_norm": 2.4263415336608887, "learning_rate": 0.00011510748702742773, "loss": 1.2772, "step": 2871 }, { "epoch": 0.42548148148148146, "grad_norm": 4.668264389038086, "learning_rate": 0.00011507783543365456, "loss": 1.0242, "step": 2872 }, { "epoch": 0.42562962962962964, "grad_norm": 2.4905412197113037, "learning_rate": 0.0001150481838398814, "loss": 1.1628, "step": 2873 }, { "epoch": 0.42577777777777776, "grad_norm": 2.1681697368621826, "learning_rate": 0.00011501853224610823, "loss": 0.8461, "step": 2874 }, { "epoch": 0.42592592592592593, "grad_norm": 2.9159774780273438, "learning_rate": 0.00011498888065233506, "loss": 1.0069, "step": 2875 }, { "epoch": 0.42607407407407405, "grad_norm": 3.7189862728118896, "learning_rate": 0.00011495922905856191, "loss": 1.6027, "step": 2876 }, { "epoch": 0.4262222222222222, "grad_norm": 2.600834608078003, "learning_rate": 0.00011492957746478874, "loss": 0.9963, "step": 2877 }, { "epoch": 0.42637037037037034, "grad_norm": 3.699162006378174, "learning_rate": 0.00011489992587101557, "loss": 0.9006, "step": 2878 }, { "epoch": 0.4265185185185185, "grad_norm": 5.747298717498779, "learning_rate": 0.00011487027427724241, "loss": 1.1509, "step": 2879 }, { "epoch": 0.4266666666666667, "grad_norm": 1.7831780910491943, "learning_rate": 0.00011484062268346925, "loss": 1.1276, "step": 2880 }, { "epoch": 0.4268148148148148, "grad_norm": 8.267875671386719, "learning_rate": 0.00011481097108969608, "loss": 0.9024, "step": 2881 }, { "epoch": 0.426962962962963, "grad_norm": 2.4939606189727783, "learning_rate": 0.00011478131949592291, "loss": 0.9272, "step": 2882 }, { "epoch": 0.4271111111111111, "grad_norm": 3.7283899784088135, "learning_rate": 
0.00011475166790214975, "loss": 0.9605, "step": 2883 }, { "epoch": 0.4272592592592593, "grad_norm": 2.7416200637817383, "learning_rate": 0.00011472201630837658, "loss": 0.9724, "step": 2884 }, { "epoch": 0.4274074074074074, "grad_norm": 4.044346809387207, "learning_rate": 0.00011469236471460341, "loss": 0.8317, "step": 2885 }, { "epoch": 0.4275555555555556, "grad_norm": 4.139947414398193, "learning_rate": 0.00011466271312083026, "loss": 1.1099, "step": 2886 }, { "epoch": 0.4277037037037037, "grad_norm": 5.855690002441406, "learning_rate": 0.00011463306152705709, "loss": 1.0223, "step": 2887 }, { "epoch": 0.42785185185185187, "grad_norm": 5.331573963165283, "learning_rate": 0.0001146034099332839, "loss": 1.1985, "step": 2888 }, { "epoch": 0.428, "grad_norm": 8.737065315246582, "learning_rate": 0.00011457375833951076, "loss": 1.4157, "step": 2889 }, { "epoch": 0.42814814814814817, "grad_norm": 9.862096786499023, "learning_rate": 0.0001145441067457376, "loss": 1.2015, "step": 2890 }, { "epoch": 0.4282962962962963, "grad_norm": 5.805713176727295, "learning_rate": 0.00011451445515196441, "loss": 1.2362, "step": 2891 }, { "epoch": 0.42844444444444446, "grad_norm": 6.242314338684082, "learning_rate": 0.00011448480355819127, "loss": 1.2281, "step": 2892 }, { "epoch": 0.4285925925925926, "grad_norm": 3.5984320640563965, "learning_rate": 0.0001144551519644181, "loss": 1.06, "step": 2893 }, { "epoch": 0.42874074074074076, "grad_norm": 2.9863204956054688, "learning_rate": 0.00011442550037064492, "loss": 1.4264, "step": 2894 }, { "epoch": 0.4288888888888889, "grad_norm": 7.7998223304748535, "learning_rate": 0.00011439584877687178, "loss": 1.1028, "step": 2895 }, { "epoch": 0.42903703703703705, "grad_norm": 4.614808559417725, "learning_rate": 0.0001143661971830986, "loss": 1.0646, "step": 2896 }, { "epoch": 0.42918518518518517, "grad_norm": 8.673526763916016, "learning_rate": 0.00011433654558932542, "loss": 1.1162, "step": 2897 }, { "epoch": 0.42933333333333334, "grad_norm": 
18.236242294311523, "learning_rate": 0.00011430689399555226, "loss": 1.1246, "step": 2898 }, { "epoch": 0.42948148148148146, "grad_norm": 5.567377090454102, "learning_rate": 0.0001142772424017791, "loss": 1.1503, "step": 2899 }, { "epoch": 0.42962962962962964, "grad_norm": null, "learning_rate": 0.0001142772424017791, "loss": 1.1606, "step": 2900 }, { "epoch": 0.42977777777777776, "grad_norm": 24.497400283813477, "learning_rate": 0.00011424759080800593, "loss": 0.9694, "step": 2901 }, { "epoch": 0.42992592592592593, "grad_norm": 9.419271469116211, "learning_rate": 0.00011421793921423276, "loss": 1.1911, "step": 2902 }, { "epoch": 0.43007407407407405, "grad_norm": 6.582805633544922, "learning_rate": 0.0001141882876204596, "loss": 1.2768, "step": 2903 }, { "epoch": 0.43022222222222223, "grad_norm": 10.915657043457031, "learning_rate": 0.00011415863602668644, "loss": 1.1907, "step": 2904 }, { "epoch": 0.43037037037037035, "grad_norm": 12.22497844696045, "learning_rate": 0.00011412898443291327, "loss": 1.234, "step": 2905 }, { "epoch": 0.4305185185185185, "grad_norm": 3.5595293045043945, "learning_rate": 0.00011409933283914011, "loss": 1.5526, "step": 2906 }, { "epoch": 0.43066666666666664, "grad_norm": 4.788974761962891, "learning_rate": 0.00011406968124536694, "loss": 1.0496, "step": 2907 }, { "epoch": 0.4308148148148148, "grad_norm": 3.6452412605285645, "learning_rate": 0.00011404002965159377, "loss": 1.1436, "step": 2908 }, { "epoch": 0.43096296296296294, "grad_norm": 13.032570838928223, "learning_rate": 0.00011401037805782062, "loss": 0.974, "step": 2909 }, { "epoch": 0.4311111111111111, "grad_norm": 2.916541337966919, "learning_rate": 0.00011398072646404745, "loss": 1.1616, "step": 2910 }, { "epoch": 0.4312592592592593, "grad_norm": 4.221134185791016, "learning_rate": 0.00011395107487027428, "loss": 1.1259, "step": 2911 }, { "epoch": 0.4314074074074074, "grad_norm": 3.0779690742492676, "learning_rate": 0.00011392142327650113, "loss": 0.8523, "step": 2912 }, { 
"epoch": 0.4315555555555556, "grad_norm": 9.860507011413574, "learning_rate": 0.00011389177168272796, "loss": 1.033, "step": 2913 }, { "epoch": 0.4317037037037037, "grad_norm": 2.1072463989257812, "learning_rate": 0.00011386212008895479, "loss": 0.9688, "step": 2914 }, { "epoch": 0.4318518518518519, "grad_norm": 4.792141914367676, "learning_rate": 0.00011383246849518163, "loss": 1.1055, "step": 2915 }, { "epoch": 0.432, "grad_norm": 2.455380439758301, "learning_rate": 0.00011380281690140846, "loss": 1.0688, "step": 2916 }, { "epoch": 0.43214814814814817, "grad_norm": 3.3475184440612793, "learning_rate": 0.00011377316530763529, "loss": 1.1049, "step": 2917 }, { "epoch": 0.4322962962962963, "grad_norm": 6.160754680633545, "learning_rate": 0.00011374351371386211, "loss": 0.9171, "step": 2918 }, { "epoch": 0.43244444444444446, "grad_norm": 2.9104185104370117, "learning_rate": 0.00011371386212008897, "loss": 1.0173, "step": 2919 }, { "epoch": 0.4325925925925926, "grad_norm": 12.792739868164062, "learning_rate": 0.0001136842105263158, "loss": 1.0066, "step": 2920 }, { "epoch": 0.43274074074074076, "grad_norm": 3.8627989292144775, "learning_rate": 0.00011365455893254262, "loss": 0.8961, "step": 2921 }, { "epoch": 0.4328888888888889, "grad_norm": 16.445222854614258, "learning_rate": 0.00011362490733876947, "loss": 1.2134, "step": 2922 }, { "epoch": 0.43303703703703705, "grad_norm": 4.181873798370361, "learning_rate": 0.00011359525574499629, "loss": 0.89, "step": 2923 }, { "epoch": 0.4331851851851852, "grad_norm": 2.4696426391601562, "learning_rate": 0.00011356560415122312, "loss": 1.0442, "step": 2924 }, { "epoch": 0.43333333333333335, "grad_norm": 8.993077278137207, "learning_rate": 0.00011353595255744998, "loss": 1.1675, "step": 2925 }, { "epoch": 0.43348148148148147, "grad_norm": 3.3576834201812744, "learning_rate": 0.0001135063009636768, "loss": 1.0312, "step": 2926 }, { "epoch": 0.43362962962962964, "grad_norm": 14.622025489807129, "learning_rate": 
0.00011347664936990363, "loss": 1.3223, "step": 2927 }, { "epoch": 0.43377777777777776, "grad_norm": 4.972567081451416, "learning_rate": 0.00011344699777613049, "loss": 1.1023, "step": 2928 }, { "epoch": 0.43392592592592594, "grad_norm": 2.8445241451263428, "learning_rate": 0.0001134173461823573, "loss": 1.2656, "step": 2929 }, { "epoch": 0.43407407407407406, "grad_norm": 4.684587478637695, "learning_rate": 0.00011338769458858414, "loss": 0.9524, "step": 2930 }, { "epoch": 0.43422222222222223, "grad_norm": 6.1581268310546875, "learning_rate": 0.00011335804299481098, "loss": 0.9127, "step": 2931 }, { "epoch": 0.43437037037037035, "grad_norm": 8.210823059082031, "learning_rate": 0.00011332839140103781, "loss": 1.0098, "step": 2932 }, { "epoch": 0.4345185185185185, "grad_norm": 3.8922359943389893, "learning_rate": 0.00011329873980726464, "loss": 0.9481, "step": 2933 }, { "epoch": 0.43466666666666665, "grad_norm": 24.909101486206055, "learning_rate": 0.00011326908821349147, "loss": 1.1517, "step": 2934 }, { "epoch": 0.4348148148148148, "grad_norm": 4.422623157501221, "learning_rate": 0.00011323943661971832, "loss": 1.0295, "step": 2935 }, { "epoch": 0.43496296296296294, "grad_norm": 17.422393798828125, "learning_rate": 0.00011320978502594515, "loss": 1.001, "step": 2936 }, { "epoch": 0.4351111111111111, "grad_norm": 4.383595943450928, "learning_rate": 0.00011318013343217198, "loss": 1.2922, "step": 2937 }, { "epoch": 0.43525925925925923, "grad_norm": 5.5702900886535645, "learning_rate": 0.00011315048183839882, "loss": 0.9702, "step": 2938 }, { "epoch": 0.4354074074074074, "grad_norm": 19.464635848999023, "learning_rate": 0.00011312083024462565, "loss": 1.0723, "step": 2939 }, { "epoch": 0.43555555555555553, "grad_norm": 6.28037691116333, "learning_rate": 0.00011309117865085249, "loss": 1.0345, "step": 2940 }, { "epoch": 0.4357037037037037, "grad_norm": 8.55496597290039, "learning_rate": 0.00011306152705707933, "loss": 0.9858, "step": 2941 }, { "epoch": 
0.4358518518518519, "grad_norm": 18.133056640625, "learning_rate": 0.00011303187546330616, "loss": 1.0856, "step": 2942 }, { "epoch": 0.436, "grad_norm": 3.8335206508636475, "learning_rate": 0.00011300222386953299, "loss": 0.9485, "step": 2943 }, { "epoch": 0.4361481481481482, "grad_norm": 6.911520957946777, "learning_rate": 0.00011297257227575984, "loss": 1.0289, "step": 2944 }, { "epoch": 0.4362962962962963, "grad_norm": 3.026308059692383, "learning_rate": 0.00011294292068198667, "loss": 0.9971, "step": 2945 }, { "epoch": 0.43644444444444447, "grad_norm": 7.223231315612793, "learning_rate": 0.0001129132690882135, "loss": 1.169, "step": 2946 }, { "epoch": 0.4365925925925926, "grad_norm": 7.4562153816223145, "learning_rate": 0.00011288361749444034, "loss": 1.0482, "step": 2947 }, { "epoch": 0.43674074074074076, "grad_norm": 5.943459987640381, "learning_rate": 0.00011285396590066717, "loss": 1.2589, "step": 2948 }, { "epoch": 0.4368888888888889, "grad_norm": 5.091812610626221, "learning_rate": 0.00011282431430689399, "loss": 1.1445, "step": 2949 }, { "epoch": 0.43703703703703706, "grad_norm": 7.933882236480713, "learning_rate": 0.00011279466271312085, "loss": 1.1507, "step": 2950 }, { "epoch": 0.4371851851851852, "grad_norm": 8.546607971191406, "learning_rate": 0.00011276501111934768, "loss": 1.0796, "step": 2951 }, { "epoch": 0.43733333333333335, "grad_norm": 8.824572563171387, "learning_rate": 0.0001127353595255745, "loss": 1.335, "step": 2952 }, { "epoch": 0.43748148148148147, "grad_norm": 8.691951751708984, "learning_rate": 0.00011270570793180133, "loss": 1.1626, "step": 2953 }, { "epoch": 0.43762962962962965, "grad_norm": 4.850937366485596, "learning_rate": 0.00011267605633802819, "loss": 1.1599, "step": 2954 }, { "epoch": 0.43777777777777777, "grad_norm": 5.646979331970215, "learning_rate": 0.000112646404744255, "loss": 0.9394, "step": 2955 }, { "epoch": 0.43792592592592594, "grad_norm": 4.293743133544922, "learning_rate": 0.00011261675315048183, "loss": 
1.0186, "step": 2956 }, { "epoch": 0.43807407407407406, "grad_norm": 2.938546895980835, "learning_rate": 0.00011258710155670868, "loss": 1.048, "step": 2957 }, { "epoch": 0.43822222222222224, "grad_norm": 7.627182483673096, "learning_rate": 0.00011255744996293551, "loss": 0.9949, "step": 2958 }, { "epoch": 0.43837037037037035, "grad_norm": 3.260939359664917, "learning_rate": 0.00011252779836916234, "loss": 1.2443, "step": 2959 }, { "epoch": 0.43851851851851853, "grad_norm": 2.731006145477295, "learning_rate": 0.00011249814677538918, "loss": 0.9315, "step": 2960 }, { "epoch": 0.43866666666666665, "grad_norm": 2.959324598312378, "learning_rate": 0.00011246849518161602, "loss": 1.0446, "step": 2961 }, { "epoch": 0.4388148148148148, "grad_norm": 5.511545181274414, "learning_rate": 0.00011243884358784285, "loss": 1.0153, "step": 2962 }, { "epoch": 0.43896296296296294, "grad_norm": 5.317894458770752, "learning_rate": 0.00011240919199406969, "loss": 0.9335, "step": 2963 }, { "epoch": 0.4391111111111111, "grad_norm": 12.788424491882324, "learning_rate": 0.00011237954040029652, "loss": 0.953, "step": 2964 }, { "epoch": 0.43925925925925924, "grad_norm": 24.395565032958984, "learning_rate": 0.00011234988880652335, "loss": 1.1692, "step": 2965 }, { "epoch": 0.4394074074074074, "grad_norm": 8.75581169128418, "learning_rate": 0.0001123202372127502, "loss": 1.0485, "step": 2966 }, { "epoch": 0.43955555555555553, "grad_norm": 17.51173973083496, "learning_rate": 0.00011229058561897703, "loss": 1.0157, "step": 2967 }, { "epoch": 0.4397037037037037, "grad_norm": 2.498159885406494, "learning_rate": 0.00011226093402520386, "loss": 1.1258, "step": 2968 }, { "epoch": 0.4398518518518518, "grad_norm": 17.463228225708008, "learning_rate": 0.00011223128243143069, "loss": 1.1448, "step": 2969 }, { "epoch": 0.44, "grad_norm": 10.378259658813477, "learning_rate": 0.00011220163083765753, "loss": 1.1837, "step": 2970 }, { "epoch": 0.4401481481481481, "grad_norm": 3.2528207302093506, 
"learning_rate": 0.00011217197924388437, "loss": 1.0357, "step": 2971 }, { "epoch": 0.4402962962962963, "grad_norm": 3.818711042404175, "learning_rate": 0.0001121423276501112, "loss": 1.2929, "step": 2972 }, { "epoch": 0.44044444444444447, "grad_norm": 5.835166931152344, "learning_rate": 0.00011211267605633804, "loss": 1.0996, "step": 2973 }, { "epoch": 0.4405925925925926, "grad_norm": 12.154253005981445, "learning_rate": 0.00011208302446256487, "loss": 1.102, "step": 2974 }, { "epoch": 0.44074074074074077, "grad_norm": 3.3864121437072754, "learning_rate": 0.00011205337286879169, "loss": 1.0104, "step": 2975 }, { "epoch": 0.4408888888888889, "grad_norm": 5.395549774169922, "learning_rate": 0.00011202372127501855, "loss": 0.9617, "step": 2976 }, { "epoch": 0.44103703703703706, "grad_norm": 2.800696849822998, "learning_rate": 0.00011199406968124538, "loss": 0.9404, "step": 2977 }, { "epoch": 0.4411851851851852, "grad_norm": 6.2743940353393555, "learning_rate": 0.0001119644180874722, "loss": 1.2399, "step": 2978 }, { "epoch": 0.44133333333333336, "grad_norm": 10.434906005859375, "learning_rate": 0.00011193476649369905, "loss": 1.0988, "step": 2979 }, { "epoch": 0.4414814814814815, "grad_norm": 7.3947858810424805, "learning_rate": 0.00011190511489992588, "loss": 1.1077, "step": 2980 }, { "epoch": 0.44162962962962965, "grad_norm": 9.96932315826416, "learning_rate": 0.0001118754633061527, "loss": 1.1874, "step": 2981 }, { "epoch": 0.44177777777777777, "grad_norm": 1.851417899131775, "learning_rate": 0.00011184581171237956, "loss": 0.866, "step": 2982 }, { "epoch": 0.44192592592592594, "grad_norm": 6.065192222595215, "learning_rate": 0.00011181616011860638, "loss": 0.9024, "step": 2983 }, { "epoch": 0.44207407407407406, "grad_norm": 10.450791358947754, "learning_rate": 0.00011178650852483321, "loss": 1.2193, "step": 2984 }, { "epoch": 0.44222222222222224, "grad_norm": 1.4706751108169556, "learning_rate": 0.00011175685693106004, "loss": 1.0355, "step": 2985 }, { "epoch": 
0.44237037037037036, "grad_norm": 3.530428886413574, "learning_rate": 0.00011172720533728688, "loss": 1.0946, "step": 2986 }, { "epoch": 0.44251851851851853, "grad_norm": 2.906909227371216, "learning_rate": 0.00011169755374351371, "loss": 1.3765, "step": 2987 }, { "epoch": 0.44266666666666665, "grad_norm": 4.955533027648926, "learning_rate": 0.00011166790214974055, "loss": 1.1557, "step": 2988 }, { "epoch": 0.44281481481481483, "grad_norm": 3.982163667678833, "learning_rate": 0.00011163825055596739, "loss": 1.1782, "step": 2989 }, { "epoch": 0.44296296296296295, "grad_norm": 2.162320852279663, "learning_rate": 0.00011160859896219422, "loss": 1.175, "step": 2990 }, { "epoch": 0.4431111111111111, "grad_norm": 2.021183967590332, "learning_rate": 0.00011157894736842105, "loss": 1.3226, "step": 2991 }, { "epoch": 0.44325925925925924, "grad_norm": 7.46436071395874, "learning_rate": 0.0001115492957746479, "loss": 1.0841, "step": 2992 }, { "epoch": 0.4434074074074074, "grad_norm": 2.1430652141571045, "learning_rate": 0.00011151964418087473, "loss": 1.1459, "step": 2993 }, { "epoch": 0.44355555555555554, "grad_norm": 1.8138455152511597, "learning_rate": 0.00011148999258710156, "loss": 1.3235, "step": 2994 }, { "epoch": 0.4437037037037037, "grad_norm": 5.439229488372803, "learning_rate": 0.0001114603409933284, "loss": 1.086, "step": 2995 }, { "epoch": 0.44385185185185183, "grad_norm": 2.4075825214385986, "learning_rate": 0.00011143068939955523, "loss": 1.0681, "step": 2996 }, { "epoch": 0.444, "grad_norm": 3.177294969558716, "learning_rate": 0.00011140103780578206, "loss": 1.11, "step": 2997 }, { "epoch": 0.4441481481481481, "grad_norm": 2.3719303607940674, "learning_rate": 0.00011137138621200891, "loss": 0.9737, "step": 2998 }, { "epoch": 0.4442962962962963, "grad_norm": 6.038328647613525, "learning_rate": 0.00011134173461823574, "loss": 0.9585, "step": 2999 }, { "epoch": 0.4444444444444444, "grad_norm": 2.2186923027038574, "learning_rate": 0.00011131208302446257, "loss": 
1.2404, "step": 3000 }, { "epoch": 0.4445925925925926, "grad_norm": 3.7171268463134766, "learning_rate": 0.00011128243143068941, "loss": 1.1925, "step": 3001 }, { "epoch": 0.4447407407407407, "grad_norm": 2.1566059589385986, "learning_rate": 0.00011125277983691625, "loss": 0.9764, "step": 3002 }, { "epoch": 0.4448888888888889, "grad_norm": 2.386497974395752, "learning_rate": 0.00011122312824314308, "loss": 1.1597, "step": 3003 }, { "epoch": 0.44503703703703706, "grad_norm": 2.0896761417388916, "learning_rate": 0.0001111934766493699, "loss": 0.996, "step": 3004 }, { "epoch": 0.4451851851851852, "grad_norm": 4.1234307289123535, "learning_rate": 0.00011116382505559675, "loss": 1.1501, "step": 3005 }, { "epoch": 0.44533333333333336, "grad_norm": 2.1813316345214844, "learning_rate": 0.00011113417346182358, "loss": 1.2474, "step": 3006 }, { "epoch": 0.4454814814814815, "grad_norm": 2.360969305038452, "learning_rate": 0.0001111045218680504, "loss": 0.7602, "step": 3007 }, { "epoch": 0.44562962962962965, "grad_norm": 2.4061508178710938, "learning_rate": 0.00011107487027427726, "loss": 0.9027, "step": 3008 }, { "epoch": 0.4457777777777778, "grad_norm": 1.799283504486084, "learning_rate": 0.00011104521868050408, "loss": 1.154, "step": 3009 }, { "epoch": 0.44592592592592595, "grad_norm": 1.6981852054595947, "learning_rate": 0.0001110155670867309, "loss": 1.0655, "step": 3010 }, { "epoch": 0.44607407407407407, "grad_norm": 2.8216686248779297, "learning_rate": 0.00011098591549295776, "loss": 1.2816, "step": 3011 }, { "epoch": 0.44622222222222224, "grad_norm": 2.0957045555114746, "learning_rate": 0.00011095626389918458, "loss": 1.0868, "step": 3012 }, { "epoch": 0.44637037037037036, "grad_norm": 1.7423616647720337, "learning_rate": 0.00011092661230541141, "loss": 1.0197, "step": 3013 }, { "epoch": 0.44651851851851854, "grad_norm": 1.9414492845535278, "learning_rate": 0.00011089696071163827, "loss": 0.926, "step": 3014 }, { "epoch": 0.44666666666666666, "grad_norm": 
3.2115371227264404, "learning_rate": 0.00011086730911786509, "loss": 1.1533, "step": 3015 }, { "epoch": 0.44681481481481483, "grad_norm": 2.3400022983551025, "learning_rate": 0.00011083765752409192, "loss": 1.0101, "step": 3016 }, { "epoch": 0.44696296296296295, "grad_norm": 5.575966835021973, "learning_rate": 0.00011080800593031876, "loss": 0.9246, "step": 3017 }, { "epoch": 0.4471111111111111, "grad_norm": 5.21854305267334, "learning_rate": 0.0001107783543365456, "loss": 0.9716, "step": 3018 }, { "epoch": 0.44725925925925925, "grad_norm": 1.3820289373397827, "learning_rate": 0.00011074870274277243, "loss": 1.0381, "step": 3019 }, { "epoch": 0.4474074074074074, "grad_norm": 1.17095947265625, "learning_rate": 0.00011071905114899926, "loss": 1.191, "step": 3020 }, { "epoch": 0.44755555555555554, "grad_norm": 1.6841038465499878, "learning_rate": 0.0001106893995552261, "loss": 1.0123, "step": 3021 }, { "epoch": 0.4477037037037037, "grad_norm": 2.9544711112976074, "learning_rate": 0.00011065974796145293, "loss": 1.0386, "step": 3022 }, { "epoch": 0.44785185185185183, "grad_norm": 2.053281307220459, "learning_rate": 0.00011063009636767976, "loss": 1.174, "step": 3023 }, { "epoch": 0.448, "grad_norm": 3.5765340328216553, "learning_rate": 0.00011060044477390661, "loss": 1.193, "step": 3024 }, { "epoch": 0.44814814814814813, "grad_norm": 1.4198112487792969, "learning_rate": 0.00011057079318013344, "loss": 1.0356, "step": 3025 }, { "epoch": 0.4482962962962963, "grad_norm": 2.1637237071990967, "learning_rate": 0.00011054114158636027, "loss": 1.1385, "step": 3026 }, { "epoch": 0.4484444444444444, "grad_norm": 2.439480781555176, "learning_rate": 0.00011051148999258711, "loss": 1.2328, "step": 3027 }, { "epoch": 0.4485925925925926, "grad_norm": 1.2427433729171753, "learning_rate": 0.00011048183839881394, "loss": 1.1139, "step": 3028 }, { "epoch": 0.4487407407407407, "grad_norm": 3.283803939819336, "learning_rate": 0.00011045218680504077, "loss": 1.1461, "step": 3029 }, { 
"epoch": 0.4488888888888889, "grad_norm": 1.2881802320480347, "learning_rate": 0.00011042253521126762, "loss": 0.9716, "step": 3030 }, { "epoch": 0.449037037037037, "grad_norm": 3.095428228378296, "learning_rate": 0.00011039288361749445, "loss": 1.1238, "step": 3031 }, { "epoch": 0.4491851851851852, "grad_norm": 1.309549331665039, "learning_rate": 0.00011036323202372128, "loss": 1.1793, "step": 3032 }, { "epoch": 0.4493333333333333, "grad_norm": 1.5135389566421509, "learning_rate": 0.00011033358042994813, "loss": 0.953, "step": 3033 }, { "epoch": 0.4494814814814815, "grad_norm": 2.1670916080474854, "learning_rate": 0.00011030392883617496, "loss": 0.9172, "step": 3034 }, { "epoch": 0.44962962962962966, "grad_norm": 2.9388227462768555, "learning_rate": 0.00011027427724240177, "loss": 1.1726, "step": 3035 }, { "epoch": 0.4497777777777778, "grad_norm": 3.653726100921631, "learning_rate": 0.00011024462564862863, "loss": 0.916, "step": 3036 }, { "epoch": 0.44992592592592595, "grad_norm": 1.2855350971221924, "learning_rate": 0.00011021497405485546, "loss": 1.0213, "step": 3037 }, { "epoch": 0.45007407407407407, "grad_norm": 1.843808889389038, "learning_rate": 0.00011018532246108228, "loss": 1.0107, "step": 3038 }, { "epoch": 0.45022222222222225, "grad_norm": 1.0858509540557861, "learning_rate": 0.00011015567086730911, "loss": 1.1499, "step": 3039 }, { "epoch": 0.45037037037037037, "grad_norm": 2.298110008239746, "learning_rate": 0.00011012601927353597, "loss": 1.0576, "step": 3040 }, { "epoch": 0.45051851851851854, "grad_norm": 2.0090999603271484, "learning_rate": 0.00011009636767976279, "loss": 0.9171, "step": 3041 }, { "epoch": 0.45066666666666666, "grad_norm": 1.9219692945480347, "learning_rate": 0.00011006671608598962, "loss": 0.9276, "step": 3042 }, { "epoch": 0.45081481481481483, "grad_norm": 1.3808432817459106, "learning_rate": 0.00011003706449221646, "loss": 0.692, "step": 3043 }, { "epoch": 0.45096296296296295, "grad_norm": 1.2714673280715942, "learning_rate": 
0.00011000741289844329, "loss": 0.9973, "step": 3044 }, { "epoch": 0.45111111111111113, "grad_norm": 5.028180122375488, "learning_rate": 0.00010997776130467012, "loss": 1.0399, "step": 3045 }, { "epoch": 0.45125925925925925, "grad_norm": 1.7946445941925049, "learning_rate": 0.00010994810971089697, "loss": 1.0096, "step": 3046 }, { "epoch": 0.4514074074074074, "grad_norm": 2.270958662033081, "learning_rate": 0.0001099184581171238, "loss": 1.2807, "step": 3047 }, { "epoch": 0.45155555555555554, "grad_norm": 2.8045239448547363, "learning_rate": 0.00010988880652335063, "loss": 0.9194, "step": 3048 }, { "epoch": 0.4517037037037037, "grad_norm": 5.084659576416016, "learning_rate": 0.00010985915492957747, "loss": 0.9291, "step": 3049 }, { "epoch": 0.45185185185185184, "grad_norm": 4.033753395080566, "learning_rate": 0.0001098295033358043, "loss": 1.0498, "step": 3050 }, { "epoch": 0.452, "grad_norm": 1.7575342655181885, "learning_rate": 0.00010979985174203114, "loss": 1.3233, "step": 3051 }, { "epoch": 0.45214814814814813, "grad_norm": 4.462325096130371, "learning_rate": 0.00010977020014825798, "loss": 1.1128, "step": 3052 }, { "epoch": 0.4522962962962963, "grad_norm": 2.3308298587799072, "learning_rate": 0.00010974054855448481, "loss": 1.0498, "step": 3053 }, { "epoch": 0.4524444444444444, "grad_norm": 1.3861079216003418, "learning_rate": 0.00010971089696071164, "loss": 0.9949, "step": 3054 }, { "epoch": 0.4525925925925926, "grad_norm": 4.199979305267334, "learning_rate": 0.00010968124536693847, "loss": 1.2077, "step": 3055 }, { "epoch": 0.4527407407407407, "grad_norm": 1.8853832483291626, "learning_rate": 0.00010965159377316532, "loss": 0.8667, "step": 3056 }, { "epoch": 0.4528888888888889, "grad_norm": 1.4810208082199097, "learning_rate": 0.00010962194217939215, "loss": 1.1541, "step": 3057 }, { "epoch": 0.453037037037037, "grad_norm": 1.997210144996643, "learning_rate": 0.00010959229058561898, "loss": 1.0197, "step": 3058 }, { "epoch": 0.4531851851851852, "grad_norm": 
1.7493116855621338, "learning_rate": 0.00010956263899184582, "loss": 1.0517, "step": 3059 }, { "epoch": 0.4533333333333333, "grad_norm": 2.0254781246185303, "learning_rate": 0.00010953298739807266, "loss": 1.1163, "step": 3060 }, { "epoch": 0.4534814814814815, "grad_norm": 9.770330429077148, "learning_rate": 0.00010950333580429947, "loss": 1.018, "step": 3061 }, { "epoch": 0.4536296296296296, "grad_norm": 3.976106882095337, "learning_rate": 0.00010947368421052633, "loss": 1.0003, "step": 3062 }, { "epoch": 0.4537777777777778, "grad_norm": 3.375537633895874, "learning_rate": 0.00010944403261675316, "loss": 1.1881, "step": 3063 }, { "epoch": 0.4539259259259259, "grad_norm": 7.4661865234375, "learning_rate": 0.00010941438102297998, "loss": 1.0032, "step": 3064 }, { "epoch": 0.4540740740740741, "grad_norm": 13.027575492858887, "learning_rate": 0.00010938472942920684, "loss": 1.1811, "step": 3065 }, { "epoch": 0.45422222222222225, "grad_norm": 9.472670555114746, "learning_rate": 0.00010935507783543367, "loss": 1.1753, "step": 3066 }, { "epoch": 0.45437037037037037, "grad_norm": 1.929787278175354, "learning_rate": 0.00010932542624166048, "loss": 1.2076, "step": 3067 }, { "epoch": 0.45451851851851854, "grad_norm": 3.8757476806640625, "learning_rate": 0.00010929577464788734, "loss": 1.1256, "step": 3068 }, { "epoch": 0.45466666666666666, "grad_norm": 4.183289527893066, "learning_rate": 0.00010926612305411416, "loss": 1.166, "step": 3069 }, { "epoch": 0.45481481481481484, "grad_norm": 3.670625925064087, "learning_rate": 0.00010923647146034099, "loss": 0.9806, "step": 3070 }, { "epoch": 0.45496296296296296, "grad_norm": 2.6360023021698, "learning_rate": 0.00010920681986656782, "loss": 1.1445, "step": 3071 }, { "epoch": 0.45511111111111113, "grad_norm": 1.8747177124023438, "learning_rate": 0.00010917716827279467, "loss": 0.9117, "step": 3072 }, { "epoch": 0.45525925925925925, "grad_norm": 3.689152479171753, "learning_rate": 0.0001091475166790215, "loss": 0.9284, "step": 3073 
}, { "epoch": 0.4554074074074074, "grad_norm": 2.021763563156128, "learning_rate": 0.00010911786508524833, "loss": 1.0789, "step": 3074 }, { "epoch": 0.45555555555555555, "grad_norm": 8.889338493347168, "learning_rate": 0.00010908821349147517, "loss": 1.1416, "step": 3075 }, { "epoch": 0.4557037037037037, "grad_norm": 1.4197607040405273, "learning_rate": 0.000109058561897702, "loss": 0.8435, "step": 3076 }, { "epoch": 0.45585185185185184, "grad_norm": 5.286947250366211, "learning_rate": 0.00010902891030392883, "loss": 1.1242, "step": 3077 }, { "epoch": 0.456, "grad_norm": 2.942779779434204, "learning_rate": 0.00010899925871015568, "loss": 1.1233, "step": 3078 }, { "epoch": 0.45614814814814814, "grad_norm": 2.4830679893493652, "learning_rate": 0.00010896960711638251, "loss": 0.9952, "step": 3079 }, { "epoch": 0.4562962962962963, "grad_norm": 3.3844051361083984, "learning_rate": 0.00010893995552260934, "loss": 1.0727, "step": 3080 }, { "epoch": 0.45644444444444443, "grad_norm": 1.5580177307128906, "learning_rate": 0.00010891030392883619, "loss": 1.0469, "step": 3081 }, { "epoch": 0.4565925925925926, "grad_norm": 2.341848850250244, "learning_rate": 0.00010888065233506302, "loss": 1.1389, "step": 3082 }, { "epoch": 0.4567407407407407, "grad_norm": 3.0161538124084473, "learning_rate": 0.00010885100074128985, "loss": 1.1048, "step": 3083 }, { "epoch": 0.4568888888888889, "grad_norm": 1.2393746376037598, "learning_rate": 0.00010882134914751669, "loss": 1.0041, "step": 3084 }, { "epoch": 0.457037037037037, "grad_norm": 2.847796678543091, "learning_rate": 0.00010879169755374352, "loss": 0.9235, "step": 3085 }, { "epoch": 0.4571851851851852, "grad_norm": 5.12452507019043, "learning_rate": 0.00010876204595997035, "loss": 1.2076, "step": 3086 }, { "epoch": 0.4573333333333333, "grad_norm": 1.9241137504577637, "learning_rate": 0.0001087323943661972, "loss": 0.9998, "step": 3087 }, { "epoch": 0.4574814814814815, "grad_norm": 3.328599452972412, "learning_rate": 
0.00010870274277242403, "loss": 1.2035, "step": 3088 }, { "epoch": 0.4576296296296296, "grad_norm": 2.1363518238067627, "learning_rate": 0.00010867309117865086, "loss": 1.0252, "step": 3089 }, { "epoch": 0.4577777777777778, "grad_norm": 2.4398419857025146, "learning_rate": 0.00010864343958487768, "loss": 1.1888, "step": 3090 }, { "epoch": 0.4579259259259259, "grad_norm": 3.3628358840942383, "learning_rate": 0.00010861378799110454, "loss": 1.0159, "step": 3091 }, { "epoch": 0.4580740740740741, "grad_norm": 2.5573160648345947, "learning_rate": 0.00010858413639733137, "loss": 0.7046, "step": 3092 }, { "epoch": 0.4582222222222222, "grad_norm": 4.458554744720459, "learning_rate": 0.00010855448480355818, "loss": 1.0843, "step": 3093 }, { "epoch": 0.4583703703703704, "grad_norm": 6.643269062042236, "learning_rate": 0.00010852483320978504, "loss": 0.9896, "step": 3094 }, { "epoch": 0.4585185185185185, "grad_norm": 1.6025092601776123, "learning_rate": 0.00010849518161601186, "loss": 0.8736, "step": 3095 }, { "epoch": 0.45866666666666667, "grad_norm": 2.3946588039398193, "learning_rate": 0.00010846553002223869, "loss": 0.9271, "step": 3096 }, { "epoch": 0.45881481481481484, "grad_norm": 4.110802173614502, "learning_rate": 0.00010843587842846555, "loss": 0.9321, "step": 3097 }, { "epoch": 0.45896296296296296, "grad_norm": 3.624279499053955, "learning_rate": 0.00010840622683469237, "loss": 1.1331, "step": 3098 }, { "epoch": 0.45911111111111114, "grad_norm": 1.5807106494903564, "learning_rate": 0.0001083765752409192, "loss": 1.279, "step": 3099 }, { "epoch": 0.45925925925925926, "grad_norm": 4.025631904602051, "learning_rate": 0.00010834692364714605, "loss": 1.2527, "step": 3100 }, { "epoch": 0.45940740740740743, "grad_norm": 1.865668773651123, "learning_rate": 0.00010831727205337287, "loss": 1.1178, "step": 3101 }, { "epoch": 0.45955555555555555, "grad_norm": 10.235735893249512, "learning_rate": 0.0001082876204595997, "loss": 1.0382, "step": 3102 }, { "epoch": 
0.4597037037037037, "grad_norm": 2.669966459274292, "learning_rate": 0.00010825796886582655, "loss": 1.1538, "step": 3103 }, { "epoch": 0.45985185185185184, "grad_norm": 5.778812885284424, "learning_rate": 0.00010822831727205338, "loss": 1.5724, "step": 3104 }, { "epoch": 0.46, "grad_norm": 3.6304867267608643, "learning_rate": 0.00010819866567828021, "loss": 0.8341, "step": 3105 }, { "epoch": 0.46014814814814814, "grad_norm": 3.9482040405273438, "learning_rate": 0.00010816901408450704, "loss": 1.1932, "step": 3106 }, { "epoch": 0.4602962962962963, "grad_norm": 9.13787841796875, "learning_rate": 0.00010813936249073388, "loss": 1.0207, "step": 3107 }, { "epoch": 0.46044444444444443, "grad_norm": 2.339956045150757, "learning_rate": 0.00010810971089696071, "loss": 0.9284, "step": 3108 }, { "epoch": 0.4605925925925926, "grad_norm": 2.340646743774414, "learning_rate": 0.00010808005930318755, "loss": 1.2671, "step": 3109 }, { "epoch": 0.46074074074074073, "grad_norm": 4.460016250610352, "learning_rate": 0.00010805040770941439, "loss": 1.145, "step": 3110 }, { "epoch": 0.4608888888888889, "grad_norm": 6.100407600402832, "learning_rate": 0.00010802075611564122, "loss": 1.029, "step": 3111 }, { "epoch": 0.461037037037037, "grad_norm": 3.0961480140686035, "learning_rate": 0.00010799110452186805, "loss": 1.2583, "step": 3112 }, { "epoch": 0.4611851851851852, "grad_norm": 8.958905220031738, "learning_rate": 0.0001079614529280949, "loss": 1.0846, "step": 3113 }, { "epoch": 0.4613333333333333, "grad_norm": 2.06352162361145, "learning_rate": 0.00010793180133432173, "loss": 1.0211, "step": 3114 }, { "epoch": 0.4614814814814815, "grad_norm": 2.211383104324341, "learning_rate": 0.00010790214974054856, "loss": 1.1329, "step": 3115 }, { "epoch": 0.4616296296296296, "grad_norm": 2.3058247566223145, "learning_rate": 0.0001078724981467754, "loss": 1.0362, "step": 3116 }, { "epoch": 0.4617777777777778, "grad_norm": 5.804232597351074, "learning_rate": 0.00010784284655300223, "loss": 1.1004, 
"step": 3117 }, { "epoch": 0.4619259259259259, "grad_norm": 3.8884549140930176, "learning_rate": 0.00010781319495922906, "loss": 1.0747, "step": 3118 }, { "epoch": 0.4620740740740741, "grad_norm": 1.8256559371948242, "learning_rate": 0.00010778354336545591, "loss": 1.2765, "step": 3119 }, { "epoch": 0.4622222222222222, "grad_norm": 3.3232460021972656, "learning_rate": 0.00010775389177168274, "loss": 0.9998, "step": 3120 }, { "epoch": 0.4623703703703704, "grad_norm": 4.4035139083862305, "learning_rate": 0.00010772424017790956, "loss": 1.1608, "step": 3121 }, { "epoch": 0.4625185185185185, "grad_norm": 2.6258749961853027, "learning_rate": 0.00010769458858413642, "loss": 1.168, "step": 3122 }, { "epoch": 0.46266666666666667, "grad_norm": 5.1971282958984375, "learning_rate": 0.00010766493699036325, "loss": 1.2523, "step": 3123 }, { "epoch": 0.4628148148148148, "grad_norm": 8.231294631958008, "learning_rate": 0.00010763528539659006, "loss": 0.945, "step": 3124 }, { "epoch": 0.46296296296296297, "grad_norm": 7.108486652374268, "learning_rate": 0.0001076056338028169, "loss": 1.1456, "step": 3125 }, { "epoch": 0.4631111111111111, "grad_norm": 4.68864107131958, "learning_rate": 0.00010757598220904375, "loss": 0.9352, "step": 3126 }, { "epoch": 0.46325925925925926, "grad_norm": 3.0115625858306885, "learning_rate": 0.00010754633061527057, "loss": 0.929, "step": 3127 }, { "epoch": 0.46340740740740743, "grad_norm": 2.0125746726989746, "learning_rate": 0.0001075166790214974, "loss": 1.1929, "step": 3128 }, { "epoch": 0.46355555555555555, "grad_norm": 2.8470137119293213, "learning_rate": 0.00010748702742772425, "loss": 1.1472, "step": 3129 }, { "epoch": 0.46370370370370373, "grad_norm": 3.7088513374328613, "learning_rate": 0.00010745737583395108, "loss": 1.3877, "step": 3130 }, { "epoch": 0.46385185185185185, "grad_norm": 2.3978278636932373, "learning_rate": 0.00010742772424017791, "loss": 0.9441, "step": 3131 }, { "epoch": 0.464, "grad_norm": 1.5147128105163574, "learning_rate": 
0.00010739807264640475, "loss": 1.1477, "step": 3132 }, { "epoch": 0.46414814814814814, "grad_norm": 2.765228271484375, "learning_rate": 0.00010736842105263158, "loss": 1.258, "step": 3133 }, { "epoch": 0.4642962962962963, "grad_norm": 7.870732307434082, "learning_rate": 0.00010733876945885841, "loss": 0.8307, "step": 3134 }, { "epoch": 0.46444444444444444, "grad_norm": 3.9861104488372803, "learning_rate": 0.00010730911786508526, "loss": 1.1876, "step": 3135 }, { "epoch": 0.4645925925925926, "grad_norm": 4.176854610443115, "learning_rate": 0.00010727946627131209, "loss": 1.1667, "step": 3136 }, { "epoch": 0.46474074074074073, "grad_norm": 3.1691198348999023, "learning_rate": 0.00010724981467753892, "loss": 1.2163, "step": 3137 }, { "epoch": 0.4648888888888889, "grad_norm": 2.161644220352173, "learning_rate": 0.00010722016308376576, "loss": 1.1669, "step": 3138 }, { "epoch": 0.465037037037037, "grad_norm": 5.941046237945557, "learning_rate": 0.0001071905114899926, "loss": 1.0284, "step": 3139 }, { "epoch": 0.4651851851851852, "grad_norm": 3.7416329383850098, "learning_rate": 0.00010716085989621943, "loss": 1.06, "step": 3140 }, { "epoch": 0.4653333333333333, "grad_norm": 3.7596495151519775, "learning_rate": 0.00010713120830244626, "loss": 1.2669, "step": 3141 }, { "epoch": 0.4654814814814815, "grad_norm": 1.4510890245437622, "learning_rate": 0.0001071015567086731, "loss": 0.9947, "step": 3142 }, { "epoch": 0.4656296296296296, "grad_norm": 2.532569169998169, "learning_rate": 0.00010707190511489993, "loss": 1.0805, "step": 3143 }, { "epoch": 0.4657777777777778, "grad_norm": 7.6669511795043945, "learning_rate": 0.00010704225352112676, "loss": 1.0176, "step": 3144 }, { "epoch": 0.4659259259259259, "grad_norm": 19.293540954589844, "learning_rate": 0.00010701260192735361, "loss": 1.4962, "step": 3145 }, { "epoch": 0.4660740740740741, "grad_norm": 8.845331192016602, "learning_rate": 0.00010698295033358044, "loss": 1.1374, "step": 3146 }, { "epoch": 0.4662222222222222, 
"grad_norm": 2.234372615814209, "learning_rate": 0.00010695329873980726, "loss": 0.961, "step": 3147 }, { "epoch": 0.4663703703703704, "grad_norm": 16.582304000854492, "learning_rate": 0.00010692364714603411, "loss": 1.0187, "step": 3148 }, { "epoch": 0.4665185185185185, "grad_norm": 3.5459134578704834, "learning_rate": 0.00010689399555226094, "loss": 1.0671, "step": 3149 }, { "epoch": 0.4666666666666667, "grad_norm": 3.5274243354797363, "learning_rate": 0.00010686434395848776, "loss": 1.0944, "step": 3150 }, { "epoch": 0.4668148148148148, "grad_norm": 3.0876269340515137, "learning_rate": 0.00010683469236471462, "loss": 1.1617, "step": 3151 }, { "epoch": 0.46696296296296297, "grad_norm": 3.162646532058716, "learning_rate": 0.00010680504077094145, "loss": 1.0033, "step": 3152 }, { "epoch": 0.4671111111111111, "grad_norm": 4.672357082366943, "learning_rate": 0.00010677538917716827, "loss": 1.1112, "step": 3153 }, { "epoch": 0.46725925925925926, "grad_norm": 2.4324915409088135, "learning_rate": 0.00010674573758339513, "loss": 1.2125, "step": 3154 }, { "epoch": 0.4674074074074074, "grad_norm": 3.274158239364624, "learning_rate": 0.00010671608598962194, "loss": 1.2154, "step": 3155 }, { "epoch": 0.46755555555555556, "grad_norm": 2.635927438735962, "learning_rate": 0.00010668643439584877, "loss": 1.1436, "step": 3156 }, { "epoch": 0.4677037037037037, "grad_norm": 3.1742031574249268, "learning_rate": 0.00010665678280207563, "loss": 0.9477, "step": 3157 }, { "epoch": 0.46785185185185185, "grad_norm": 2.651947021484375, "learning_rate": 0.00010662713120830245, "loss": 1.0503, "step": 3158 }, { "epoch": 0.468, "grad_norm": 2.1971304416656494, "learning_rate": 0.00010659747961452928, "loss": 0.995, "step": 3159 }, { "epoch": 0.46814814814814815, "grad_norm": 6.848816394805908, "learning_rate": 0.00010656782802075611, "loss": 0.9247, "step": 3160 }, { "epoch": 0.4682962962962963, "grad_norm": 2.959502935409546, "learning_rate": 0.00010653817642698296, "loss": 1.1678, "step": 
3161 }, { "epoch": 0.46844444444444444, "grad_norm": 2.398146629333496, "learning_rate": 0.00010650852483320979, "loss": 1.1378, "step": 3162 }, { "epoch": 0.4685925925925926, "grad_norm": 2.858717203140259, "learning_rate": 0.00010647887323943662, "loss": 1.017, "step": 3163 }, { "epoch": 0.46874074074074074, "grad_norm": 4.143274307250977, "learning_rate": 0.00010644922164566346, "loss": 0.962, "step": 3164 }, { "epoch": 0.4688888888888889, "grad_norm": 2.289644718170166, "learning_rate": 0.0001064195700518903, "loss": 1.042, "step": 3165 }, { "epoch": 0.46903703703703703, "grad_norm": 3.0466256141662598, "learning_rate": 0.00010638991845811712, "loss": 1.0398, "step": 3166 }, { "epoch": 0.4691851851851852, "grad_norm": 5.905399799346924, "learning_rate": 0.00010636026686434397, "loss": 1.0772, "step": 3167 }, { "epoch": 0.4693333333333333, "grad_norm": 2.9436914920806885, "learning_rate": 0.0001063306152705708, "loss": 0.99, "step": 3168 }, { "epoch": 0.4694814814814815, "grad_norm": 16.72655487060547, "learning_rate": 0.00010630096367679763, "loss": 1.0256, "step": 3169 }, { "epoch": 0.4696296296296296, "grad_norm": 5.662783622741699, "learning_rate": 0.00010627131208302448, "loss": 1.2278, "step": 3170 }, { "epoch": 0.4697777777777778, "grad_norm": 1.763562560081482, "learning_rate": 0.0001062416604892513, "loss": 0.8771, "step": 3171 }, { "epoch": 0.4699259259259259, "grad_norm": 3.000190496444702, "learning_rate": 0.00010621200889547814, "loss": 1.1496, "step": 3172 }, { "epoch": 0.4700740740740741, "grad_norm": 14.939896583557129, "learning_rate": 0.00010618235730170498, "loss": 1.2843, "step": 3173 }, { "epoch": 0.4702222222222222, "grad_norm": 21.34058380126953, "learning_rate": 0.00010615270570793181, "loss": 0.6499, "step": 3174 }, { "epoch": 0.4703703703703704, "grad_norm": 12.188255310058594, "learning_rate": 0.00010612305411415864, "loss": 1.0415, "step": 3175 }, { "epoch": 0.4705185185185185, "grad_norm": 5.454524517059326, "learning_rate": 
0.00010609340252038546, "loss": 1.1986, "step": 3176 }, { "epoch": 0.4706666666666667, "grad_norm": 8.059273719787598, "learning_rate": 0.00010606375092661232, "loss": 1.0591, "step": 3177 }, { "epoch": 0.4708148148148148, "grad_norm": 2.550055742263794, "learning_rate": 0.00010603409933283915, "loss": 1.1642, "step": 3178 }, { "epoch": 0.47096296296296297, "grad_norm": 5.875574588775635, "learning_rate": 0.00010600444773906597, "loss": 1.0948, "step": 3179 }, { "epoch": 0.4711111111111111, "grad_norm": 17.175174713134766, "learning_rate": 0.00010597479614529282, "loss": 1.1653, "step": 3180 }, { "epoch": 0.47125925925925927, "grad_norm": 3.404045581817627, "learning_rate": 0.00010594514455151964, "loss": 1.1246, "step": 3181 }, { "epoch": 0.4714074074074074, "grad_norm": 6.8494086265563965, "learning_rate": 0.00010591549295774647, "loss": 1.0845, "step": 3182 }, { "epoch": 0.47155555555555556, "grad_norm": 10.930824279785156, "learning_rate": 0.00010588584136397333, "loss": 0.9457, "step": 3183 }, { "epoch": 0.4717037037037037, "grad_norm": 2.7525134086608887, "learning_rate": 0.00010585618977020015, "loss": 1.2138, "step": 3184 }, { "epoch": 0.47185185185185186, "grad_norm": 5.941433906555176, "learning_rate": 0.00010582653817642698, "loss": 1.3032, "step": 3185 }, { "epoch": 0.472, "grad_norm": 5.860634803771973, "learning_rate": 0.00010579688658265384, "loss": 0.9713, "step": 3186 }, { "epoch": 0.47214814814814815, "grad_norm": 4.1480865478515625, "learning_rate": 0.00010576723498888065, "loss": 1.3524, "step": 3187 }, { "epoch": 0.47229629629629627, "grad_norm": 6.544581413269043, "learning_rate": 0.00010573758339510749, "loss": 1.1095, "step": 3188 }, { "epoch": 0.47244444444444444, "grad_norm": 4.920466899871826, "learning_rate": 0.00010570793180133433, "loss": 0.7559, "step": 3189 }, { "epoch": 0.4725925925925926, "grad_norm": 8.82383918762207, "learning_rate": 0.00010567828020756116, "loss": 1.0021, "step": 3190 }, { "epoch": 0.47274074074074074, 
"grad_norm": 3.9936301708221436, "learning_rate": 0.00010564862861378799, "loss": 1.1223, "step": 3191 }, { "epoch": 0.4728888888888889, "grad_norm": 5.304333209991455, "learning_rate": 0.00010561897702001482, "loss": 1.2125, "step": 3192 }, { "epoch": 0.47303703703703703, "grad_norm": 3.7484147548675537, "learning_rate": 0.00010558932542624167, "loss": 1.2072, "step": 3193 }, { "epoch": 0.4731851851851852, "grad_norm": 8.583340644836426, "learning_rate": 0.0001055596738324685, "loss": 0.8443, "step": 3194 }, { "epoch": 0.47333333333333333, "grad_norm": 6.135061740875244, "learning_rate": 0.00010553002223869533, "loss": 1.0808, "step": 3195 }, { "epoch": 0.4734814814814815, "grad_norm": 5.279104232788086, "learning_rate": 0.00010550037064492217, "loss": 1.0127, "step": 3196 }, { "epoch": 0.4736296296296296, "grad_norm": 9.33665943145752, "learning_rate": 0.000105470719051149, "loss": 1.0379, "step": 3197 }, { "epoch": 0.4737777777777778, "grad_norm": 1.8126426935195923, "learning_rate": 0.00010544106745737584, "loss": 1.0124, "step": 3198 }, { "epoch": 0.4739259259259259, "grad_norm": 2.6330509185791016, "learning_rate": 0.00010541141586360268, "loss": 1.1142, "step": 3199 }, { "epoch": 0.4740740740740741, "grad_norm": 16.036205291748047, "learning_rate": 0.00010538176426982951, "loss": 0.9784, "step": 3200 }, { "epoch": 0.4742222222222222, "grad_norm": 3.068990468978882, "learning_rate": 0.00010535211267605634, "loss": 1.1511, "step": 3201 }, { "epoch": 0.4743703703703704, "grad_norm": 7.242805480957031, "learning_rate": 0.00010532246108228319, "loss": 0.8941, "step": 3202 }, { "epoch": 0.4745185185185185, "grad_norm": 4.33395528793335, "learning_rate": 0.00010529280948851002, "loss": 0.8751, "step": 3203 }, { "epoch": 0.4746666666666667, "grad_norm": 4.307576656341553, "learning_rate": 0.00010526315789473685, "loss": 1.1531, "step": 3204 }, { "epoch": 0.4748148148148148, "grad_norm": 3.184480667114258, "learning_rate": 0.00010523350630096369, "loss": 1.2952, 
"step": 3205 }, { "epoch": 0.474962962962963, "grad_norm": 3.037388324737549, "learning_rate": 0.00010520385470719052, "loss": 1.0486, "step": 3206 }, { "epoch": 0.4751111111111111, "grad_norm": 2.3104970455169678, "learning_rate": 0.00010517420311341734, "loss": 1.0139, "step": 3207 }, { "epoch": 0.47525925925925927, "grad_norm": 7.615547180175781, "learning_rate": 0.0001051445515196442, "loss": 1.1018, "step": 3208 }, { "epoch": 0.4754074074074074, "grad_norm": 3.9027249813079834, "learning_rate": 0.00010511489992587103, "loss": 1.1016, "step": 3209 }, { "epoch": 0.47555555555555556, "grad_norm": 4.110639572143555, "learning_rate": 0.00010508524833209785, "loss": 0.9585, "step": 3210 }, { "epoch": 0.4757037037037037, "grad_norm": 3.2899138927459717, "learning_rate": 0.00010505559673832468, "loss": 1.0343, "step": 3211 }, { "epoch": 0.47585185185185186, "grad_norm": 6.333221435546875, "learning_rate": 0.00010502594514455154, "loss": 1.0174, "step": 3212 }, { "epoch": 0.476, "grad_norm": 6.963191032409668, "learning_rate": 0.00010499629355077835, "loss": 1.1656, "step": 3213 }, { "epoch": 0.47614814814814815, "grad_norm": 6.781939506530762, "learning_rate": 0.00010496664195700518, "loss": 1.086, "step": 3214 }, { "epoch": 0.4762962962962963, "grad_norm": 10.270206451416016, "learning_rate": 0.00010493699036323203, "loss": 0.7711, "step": 3215 }, { "epoch": 0.47644444444444445, "grad_norm": 6.469484806060791, "learning_rate": 0.00010490733876945886, "loss": 1.314, "step": 3216 }, { "epoch": 0.47659259259259257, "grad_norm": 2.9984261989593506, "learning_rate": 0.00010487768717568569, "loss": 1.2979, "step": 3217 }, { "epoch": 0.47674074074074074, "grad_norm": 4.488178253173828, "learning_rate": 0.00010484803558191253, "loss": 1.196, "step": 3218 }, { "epoch": 0.47688888888888886, "grad_norm": 6.778642177581787, "learning_rate": 0.00010481838398813937, "loss": 1.1411, "step": 3219 }, { "epoch": 0.47703703703703704, "grad_norm": 12.261344909667969, "learning_rate": 
0.0001047887323943662, "loss": 1.0381, "step": 3220 }, { "epoch": 0.4771851851851852, "grad_norm": 5.716145038604736, "learning_rate": 0.00010475908080059304, "loss": 1.1154, "step": 3221 }, { "epoch": 0.47733333333333333, "grad_norm": 3.8252673149108887, "learning_rate": 0.00010472942920681987, "loss": 1.0692, "step": 3222 }, { "epoch": 0.4774814814814815, "grad_norm": 6.230849266052246, "learning_rate": 0.0001046997776130467, "loss": 1.1221, "step": 3223 }, { "epoch": 0.4776296296296296, "grad_norm": 8.367166519165039, "learning_rate": 0.00010467012601927355, "loss": 1.1213, "step": 3224 }, { "epoch": 0.4777777777777778, "grad_norm": 4.575971603393555, "learning_rate": 0.00010464047442550038, "loss": 0.983, "step": 3225 }, { "epoch": 0.4779259259259259, "grad_norm": 11.973864555358887, "learning_rate": 0.00010461082283172721, "loss": 1.1238, "step": 3226 }, { "epoch": 0.4780740740740741, "grad_norm": 3.2397303581237793, "learning_rate": 0.00010458117123795404, "loss": 0.9948, "step": 3227 }, { "epoch": 0.4782222222222222, "grad_norm": 4.968510627746582, "learning_rate": 0.00010455151964418088, "loss": 1.119, "step": 3228 }, { "epoch": 0.4783703703703704, "grad_norm": 5.719479084014893, "learning_rate": 0.00010452186805040772, "loss": 1.3429, "step": 3229 }, { "epoch": 0.4785185185185185, "grad_norm": 7.569019317626953, "learning_rate": 0.00010449221645663455, "loss": 1.024, "step": 3230 }, { "epoch": 0.4786666666666667, "grad_norm": 7.236438274383545, "learning_rate": 0.00010446256486286139, "loss": 0.9336, "step": 3231 }, { "epoch": 0.4788148148148148, "grad_norm": 3.0502982139587402, "learning_rate": 0.00010443291326908822, "loss": 1.2291, "step": 3232 }, { "epoch": 0.478962962962963, "grad_norm": 6.6787872314453125, "learning_rate": 0.00010440326167531504, "loss": 0.9153, "step": 3233 }, { "epoch": 0.4791111111111111, "grad_norm": 8.486490249633789, "learning_rate": 0.0001043736100815419, "loss": 1.3001, "step": 3234 }, { "epoch": 0.4792592592592593, 
"grad_norm": 10.042091369628906, "learning_rate": 0.00010434395848776873, "loss": 0.9567, "step": 3235 }, { "epoch": 0.4794074074074074, "grad_norm": 3.748504638671875, "learning_rate": 0.00010431430689399555, "loss": 1.0477, "step": 3236 }, { "epoch": 0.47955555555555557, "grad_norm": 5.855503082275391, "learning_rate": 0.0001042846553002224, "loss": 1.0012, "step": 3237 }, { "epoch": 0.4797037037037037, "grad_norm": 4.62381649017334, "learning_rate": 0.00010425500370644923, "loss": 1.1407, "step": 3238 }, { "epoch": 0.47985185185185186, "grad_norm": 7.165449142456055, "learning_rate": 0.00010422535211267605, "loss": 1.3546, "step": 3239 }, { "epoch": 0.48, "grad_norm": 5.236550331115723, "learning_rate": 0.00010419570051890291, "loss": 1.0317, "step": 3240 }, { "epoch": 0.48014814814814816, "grad_norm": 7.2281975746154785, "learning_rate": 0.00010416604892512973, "loss": 0.9371, "step": 3241 }, { "epoch": 0.4802962962962963, "grad_norm": 4.920871734619141, "learning_rate": 0.00010413639733135656, "loss": 1.012, "step": 3242 }, { "epoch": 0.48044444444444445, "grad_norm": 3.736154317855835, "learning_rate": 0.00010410674573758342, "loss": 0.9538, "step": 3243 }, { "epoch": 0.48059259259259257, "grad_norm": 6.0349555015563965, "learning_rate": 0.00010407709414381023, "loss": 1.2737, "step": 3244 }, { "epoch": 0.48074074074074075, "grad_norm": 3.131864070892334, "learning_rate": 0.00010404744255003706, "loss": 1.0383, "step": 3245 }, { "epoch": 0.48088888888888887, "grad_norm": 6.332339286804199, "learning_rate": 0.0001040177909562639, "loss": 1.0275, "step": 3246 }, { "epoch": 0.48103703703703704, "grad_norm": 3.2845075130462646, "learning_rate": 0.00010398813936249074, "loss": 1.1353, "step": 3247 }, { "epoch": 0.48118518518518516, "grad_norm": 5.595439434051514, "learning_rate": 0.00010395848776871757, "loss": 1.0353, "step": 3248 }, { "epoch": 0.48133333333333334, "grad_norm": 2.96281099319458, "learning_rate": 0.0001039288361749444, "loss": 0.965, "step": 3249 
}, { "epoch": 0.48148148148148145, "grad_norm": 9.507851600646973, "learning_rate": 0.00010389918458117125, "loss": 1.1361, "step": 3250 }, { "epoch": 0.48162962962962963, "grad_norm": 5.7583465576171875, "learning_rate": 0.00010386953298739808, "loss": 1.0297, "step": 3251 }, { "epoch": 0.4817777777777778, "grad_norm": 17.506040573120117, "learning_rate": 0.00010383988139362491, "loss": 0.9779, "step": 3252 }, { "epoch": 0.4819259259259259, "grad_norm": 18.134029388427734, "learning_rate": 0.00010381022979985175, "loss": 1.1237, "step": 3253 }, { "epoch": 0.4820740740740741, "grad_norm": 8.527471542358398, "learning_rate": 0.00010378057820607858, "loss": 1.0244, "step": 3254 }, { "epoch": 0.4822222222222222, "grad_norm": 2.8711700439453125, "learning_rate": 0.00010375092661230541, "loss": 1.2051, "step": 3255 }, { "epoch": 0.4823703703703704, "grad_norm": 9.874549865722656, "learning_rate": 0.00010372127501853226, "loss": 0.9903, "step": 3256 }, { "epoch": 0.4825185185185185, "grad_norm": 3.829430103302002, "learning_rate": 0.00010369162342475909, "loss": 0.9314, "step": 3257 }, { "epoch": 0.4826666666666667, "grad_norm": 20.969573974609375, "learning_rate": 0.00010366197183098592, "loss": 0.9503, "step": 3258 }, { "epoch": 0.4828148148148148, "grad_norm": 6.133211612701416, "learning_rate": 0.00010363232023721276, "loss": 1.0567, "step": 3259 }, { "epoch": 0.482962962962963, "grad_norm": 5.0121541023254395, "learning_rate": 0.0001036026686434396, "loss": 0.9237, "step": 3260 }, { "epoch": 0.4831111111111111, "grad_norm": 3.7844817638397217, "learning_rate": 0.00010357301704966643, "loss": 1.1164, "step": 3261 }, { "epoch": 0.4832592592592593, "grad_norm": 29.42932891845703, "learning_rate": 0.00010354336545589324, "loss": 1.0503, "step": 3262 }, { "epoch": 0.4834074074074074, "grad_norm": 7.4309401512146, "learning_rate": 0.0001035137138621201, "loss": 1.2604, "step": 3263 }, { "epoch": 0.48355555555555557, "grad_norm": 4.521541118621826, "learning_rate": 
0.00010348406226834693, "loss": 0.8875, "step": 3264 }, { "epoch": 0.4837037037037037, "grad_norm": 4.30009126663208, "learning_rate": 0.00010345441067457375, "loss": 1.0131, "step": 3265 }, { "epoch": 0.48385185185185187, "grad_norm": 4.7270402908325195, "learning_rate": 0.00010342475908080061, "loss": 0.9544, "step": 3266 }, { "epoch": 0.484, "grad_norm": 7.024489879608154, "learning_rate": 0.00010339510748702743, "loss": 1.0964, "step": 3267 }, { "epoch": 0.48414814814814816, "grad_norm": 7.917887210845947, "learning_rate": 0.00010336545589325426, "loss": 1.1018, "step": 3268 }, { "epoch": 0.4842962962962963, "grad_norm": 4.285741806030273, "learning_rate": 0.00010333580429948111, "loss": 0.9772, "step": 3269 }, { "epoch": 0.48444444444444446, "grad_norm": 6.478012561798096, "learning_rate": 0.00010330615270570793, "loss": 0.9533, "step": 3270 }, { "epoch": 0.4845925925925926, "grad_norm": 2.709798574447632, "learning_rate": 0.00010327650111193476, "loss": 1.0569, "step": 3271 }, { "epoch": 0.48474074074074075, "grad_norm": 9.730769157409668, "learning_rate": 0.00010324684951816162, "loss": 1.0849, "step": 3272 }, { "epoch": 0.48488888888888887, "grad_norm": 3.158259391784668, "learning_rate": 0.00010321719792438844, "loss": 1.2448, "step": 3273 }, { "epoch": 0.48503703703703704, "grad_norm": 16.45296287536621, "learning_rate": 0.00010318754633061527, "loss": 0.9933, "step": 3274 }, { "epoch": 0.48518518518518516, "grad_norm": 15.388961791992188, "learning_rate": 0.00010315789473684211, "loss": 0.9935, "step": 3275 }, { "epoch": 0.48533333333333334, "grad_norm": 3.80056095123291, "learning_rate": 0.00010312824314306894, "loss": 1.0549, "step": 3276 }, { "epoch": 0.48548148148148146, "grad_norm": 4.19111442565918, "learning_rate": 0.00010309859154929578, "loss": 1.1943, "step": 3277 }, { "epoch": 0.48562962962962963, "grad_norm": 3.5380570888519287, "learning_rate": 0.0001030689399555226, "loss": 1.0885, "step": 3278 }, { "epoch": 0.48577777777777775, 
"grad_norm": 27.214094161987305, "learning_rate": 0.00010303928836174945, "loss": 0.9228, "step": 3279 }, { "epoch": 0.48592592592592593, "grad_norm": 2.5952913761138916, "learning_rate": 0.00010300963676797628, "loss": 1.1284, "step": 3280 }, { "epoch": 0.48607407407407405, "grad_norm": 3.481954574584961, "learning_rate": 0.00010297998517420311, "loss": 1.1537, "step": 3281 }, { "epoch": 0.4862222222222222, "grad_norm": 9.62716007232666, "learning_rate": 0.00010295033358042996, "loss": 1.0798, "step": 3282 }, { "epoch": 0.4863703703703704, "grad_norm": 3.729365348815918, "learning_rate": 0.00010292068198665679, "loss": 0.9712, "step": 3283 }, { "epoch": 0.4865185185185185, "grad_norm": 6.008885383605957, "learning_rate": 0.00010289103039288362, "loss": 1.0435, "step": 3284 }, { "epoch": 0.4866666666666667, "grad_norm": 6.04400110244751, "learning_rate": 0.00010286137879911046, "loss": 1.0902, "step": 3285 }, { "epoch": 0.4868148148148148, "grad_norm": 8.919031143188477, "learning_rate": 0.0001028317272053373, "loss": 1.0534, "step": 3286 }, { "epoch": 0.486962962962963, "grad_norm": null, "learning_rate": 0.0001028317272053373, "loss": 1.3228, "step": 3287 }, { "epoch": 0.4871111111111111, "grad_norm": 2.824831485748291, "learning_rate": 0.00010280207561156412, "loss": 1.0485, "step": 3288 }, { "epoch": 0.4872592592592593, "grad_norm": 4.498315334320068, "learning_rate": 0.00010277242401779097, "loss": 1.0745, "step": 3289 }, { "epoch": 0.4874074074074074, "grad_norm": 3.279341220855713, "learning_rate": 0.0001027427724240178, "loss": 1.0256, "step": 3290 }, { "epoch": 0.4875555555555556, "grad_norm": 3.054405927658081, "learning_rate": 0.00010271312083024463, "loss": 1.1039, "step": 3291 }, { "epoch": 0.4877037037037037, "grad_norm": 4.371935844421387, "learning_rate": 0.00010268346923647148, "loss": 1.0527, "step": 3292 }, { "epoch": 0.48785185185185187, "grad_norm": 7.293436050415039, "learning_rate": 0.0001026538176426983, "loss": 0.9356, "step": 3293 }, { 
"epoch": 0.488, "grad_norm": 4.384091854095459, "learning_rate": 0.00010262416604892512, "loss": 1.1196, "step": 3294 }, { "epoch": 0.48814814814814816, "grad_norm": 5.955717086791992, "learning_rate": 0.00010259451445515198, "loss": 0.8373, "step": 3295 }, { "epoch": 0.4882962962962963, "grad_norm": 2.367436170578003, "learning_rate": 0.00010256486286137881, "loss": 0.9994, "step": 3296 }, { "epoch": 0.48844444444444446, "grad_norm": 3.521695375442505, "learning_rate": 0.00010253521126760563, "loss": 1.0349, "step": 3297 }, { "epoch": 0.4885925925925926, "grad_norm": 10.636849403381348, "learning_rate": 0.00010250555967383246, "loss": 0.9018, "step": 3298 }, { "epoch": 0.48874074074074075, "grad_norm": 2.3019065856933594, "learning_rate": 0.00010247590808005932, "loss": 0.9242, "step": 3299 }, { "epoch": 0.4888888888888889, "grad_norm": 6.471230983734131, "learning_rate": 0.00010244625648628614, "loss": 1.0473, "step": 3300 }, { "epoch": 0.48903703703703705, "grad_norm": 4.0714592933654785, "learning_rate": 0.00010241660489251297, "loss": 1.1151, "step": 3301 }, { "epoch": 0.48918518518518517, "grad_norm": 4.892930507659912, "learning_rate": 0.00010238695329873981, "loss": 1.1018, "step": 3302 }, { "epoch": 0.48933333333333334, "grad_norm": 3.136021614074707, "learning_rate": 0.00010235730170496664, "loss": 0.9623, "step": 3303 }, { "epoch": 0.48948148148148146, "grad_norm": 3.2672665119171143, "learning_rate": 0.00010232765011119347, "loss": 1.0466, "step": 3304 }, { "epoch": 0.48962962962962964, "grad_norm": 3.7321572303771973, "learning_rate": 0.00010229799851742032, "loss": 1.0502, "step": 3305 }, { "epoch": 0.48977777777777776, "grad_norm": 3.520411491394043, "learning_rate": 0.00010226834692364715, "loss": 0.9759, "step": 3306 }, { "epoch": 0.48992592592592593, "grad_norm": 3.4311301708221436, "learning_rate": 0.00010223869532987398, "loss": 0.913, "step": 3307 }, { "epoch": 0.49007407407407405, "grad_norm": 5.491833686828613, "learning_rate": 
0.00010220904373610082, "loss": 1.1901, "step": 3308 }, { "epoch": 0.4902222222222222, "grad_norm": 4.865017414093018, "learning_rate": 0.00010217939214232766, "loss": 0.9807, "step": 3309 }, { "epoch": 0.49037037037037035, "grad_norm": 5.523568630218506, "learning_rate": 0.00010214974054855449, "loss": 1.0945, "step": 3310 }, { "epoch": 0.4905185185185185, "grad_norm": 7.591002464294434, "learning_rate": 0.00010212008895478133, "loss": 1.1448, "step": 3311 }, { "epoch": 0.49066666666666664, "grad_norm": 2.7854530811309814, "learning_rate": 0.00010209043736100816, "loss": 1.0046, "step": 3312 }, { "epoch": 0.4908148148148148, "grad_norm": 3.1816959381103516, "learning_rate": 0.00010206078576723499, "loss": 1.0797, "step": 3313 }, { "epoch": 0.490962962962963, "grad_norm": 2.2549257278442383, "learning_rate": 0.00010203113417346182, "loss": 1.012, "step": 3314 }, { "epoch": 0.4911111111111111, "grad_norm": 4.814846992492676, "learning_rate": 0.00010200148257968867, "loss": 1.1395, "step": 3315 }, { "epoch": 0.4912592592592593, "grad_norm": 8.303160667419434, "learning_rate": 0.0001019718309859155, "loss": 0.9523, "step": 3316 }, { "epoch": 0.4914074074074074, "grad_norm": 7.3843464851379395, "learning_rate": 0.00010194217939214233, "loss": 0.9284, "step": 3317 }, { "epoch": 0.4915555555555556, "grad_norm": 3.921623706817627, "learning_rate": 0.00010191252779836917, "loss": 1.0163, "step": 3318 }, { "epoch": 0.4917037037037037, "grad_norm": 10.940088272094727, "learning_rate": 0.000101882876204596, "loss": 1.1748, "step": 3319 }, { "epoch": 0.4918518518518519, "grad_norm": 38.7925910949707, "learning_rate": 0.00010185322461082282, "loss": 1.1016, "step": 3320 }, { "epoch": 0.492, "grad_norm": 7.665594577789307, "learning_rate": 0.00010182357301704968, "loss": 1.0122, "step": 3321 }, { "epoch": 0.49214814814814817, "grad_norm": 2.942826747894287, "learning_rate": 0.00010179392142327651, "loss": 0.9124, "step": 3322 }, { "epoch": 0.4922962962962963, "grad_norm": 
6.7701239585876465, "learning_rate": 0.00010176426982950333, "loss": 1.1139, "step": 3323 }, { "epoch": 0.49244444444444446, "grad_norm": 5.877721786499023, "learning_rate": 0.00010173461823573019, "loss": 1.0566, "step": 3324 }, { "epoch": 0.4925925925925926, "grad_norm": 6.935197830200195, "learning_rate": 0.00010170496664195702, "loss": 1.2815, "step": 3325 }, { "epoch": 0.49274074074074076, "grad_norm": 2.7543389797210693, "learning_rate": 0.00010167531504818383, "loss": 1.0665, "step": 3326 }, { "epoch": 0.4928888888888889, "grad_norm": 2.9426321983337402, "learning_rate": 0.00010164566345441069, "loss": 1.0679, "step": 3327 }, { "epoch": 0.49303703703703705, "grad_norm": 4.462264537811279, "learning_rate": 0.00010161601186063751, "loss": 1.1118, "step": 3328 }, { "epoch": 0.49318518518518517, "grad_norm": 5.7362060546875, "learning_rate": 0.00010158636026686434, "loss": 0.9373, "step": 3329 }, { "epoch": 0.49333333333333335, "grad_norm": 11.453153610229492, "learning_rate": 0.0001015567086730912, "loss": 0.9115, "step": 3330 }, { "epoch": 0.49348148148148147, "grad_norm": 3.6631534099578857, "learning_rate": 0.00010152705707931802, "loss": 0.8081, "step": 3331 }, { "epoch": 0.49362962962962964, "grad_norm": 4.551608562469482, "learning_rate": 0.00010149740548554485, "loss": 1.0346, "step": 3332 }, { "epoch": 0.49377777777777776, "grad_norm": 19.665569305419922, "learning_rate": 0.00010146775389177168, "loss": 1.1854, "step": 3333 }, { "epoch": 0.49392592592592593, "grad_norm": 3.854532480239868, "learning_rate": 0.00010143810229799852, "loss": 0.9575, "step": 3334 }, { "epoch": 0.49407407407407405, "grad_norm": 7.537748336791992, "learning_rate": 0.00010140845070422535, "loss": 1.1637, "step": 3335 }, { "epoch": 0.49422222222222223, "grad_norm": 5.545541763305664, "learning_rate": 0.00010137879911045218, "loss": 1.0542, "step": 3336 }, { "epoch": 0.49437037037037035, "grad_norm": 5.337022304534912, "learning_rate": 0.00010134914751667903, "loss": 1.1365, 
"step": 3337 }, { "epoch": 0.4945185185185185, "grad_norm": 3.3391358852386475, "learning_rate": 0.00010131949592290586, "loss": 1.0307, "step": 3338 }, { "epoch": 0.49466666666666664, "grad_norm": 3.6873624324798584, "learning_rate": 0.00010128984432913269, "loss": 1.0978, "step": 3339 }, { "epoch": 0.4948148148148148, "grad_norm": 9.957468032836914, "learning_rate": 0.00010126019273535954, "loss": 1.0894, "step": 3340 }, { "epoch": 0.49496296296296294, "grad_norm": 3.154163122177124, "learning_rate": 0.00010123054114158637, "loss": 1.1681, "step": 3341 }, { "epoch": 0.4951111111111111, "grad_norm": 2.4883615970611572, "learning_rate": 0.0001012008895478132, "loss": 0.9564, "step": 3342 }, { "epoch": 0.49525925925925923, "grad_norm": 2.372100353240967, "learning_rate": 0.00010117123795404004, "loss": 1.1688, "step": 3343 }, { "epoch": 0.4954074074074074, "grad_norm": 6.041884899139404, "learning_rate": 0.00010114158636026687, "loss": 1.0489, "step": 3344 }, { "epoch": 0.4955555555555556, "grad_norm": 3.130751132965088, "learning_rate": 0.0001011119347664937, "loss": 1.461, "step": 3345 }, { "epoch": 0.4957037037037037, "grad_norm": 4.726568222045898, "learning_rate": 0.00010108228317272055, "loss": 1.0306, "step": 3346 }, { "epoch": 0.4958518518518519, "grad_norm": 9.536551475524902, "learning_rate": 0.00010105263157894738, "loss": 1.0686, "step": 3347 }, { "epoch": 0.496, "grad_norm": 4.3145551681518555, "learning_rate": 0.00010102297998517421, "loss": 0.9891, "step": 3348 }, { "epoch": 0.49614814814814817, "grad_norm": 2.656224012374878, "learning_rate": 0.00010099332839140103, "loss": 1.4133, "step": 3349 }, { "epoch": 0.4962962962962963, "grad_norm": 2.4819960594177246, "learning_rate": 0.00010096367679762789, "loss": 1.1467, "step": 3350 }, { "epoch": 0.49644444444444447, "grad_norm": 22.741085052490234, "learning_rate": 0.00010093402520385472, "loss": 1.1741, "step": 3351 }, { "epoch": 0.4965925925925926, "grad_norm": 5.024966716766357, "learning_rate": 
0.00010090437361008153, "loss": 1.2479, "step": 3352 }, { "epoch": 0.49674074074074076, "grad_norm": 4.4860382080078125, "learning_rate": 0.00010087472201630839, "loss": 1.0149, "step": 3353 }, { "epoch": 0.4968888888888889, "grad_norm": 18.099023818969727, "learning_rate": 0.00010084507042253521, "loss": 0.9571, "step": 3354 }, { "epoch": 0.49703703703703705, "grad_norm": 2.42246675491333, "learning_rate": 0.00010081541882876204, "loss": 1.1217, "step": 3355 }, { "epoch": 0.4971851851851852, "grad_norm": 12.542469024658203, "learning_rate": 0.0001007857672349889, "loss": 1.1278, "step": 3356 }, { "epoch": 0.49733333333333335, "grad_norm": 3.75882887840271, "learning_rate": 0.00010075611564121572, "loss": 1.3366, "step": 3357 }, { "epoch": 0.49748148148148147, "grad_norm": 2.7175066471099854, "learning_rate": 0.00010072646404744255, "loss": 1.0177, "step": 3358 }, { "epoch": 0.49762962962962964, "grad_norm": 9.794149398803711, "learning_rate": 0.0001006968124536694, "loss": 1.1153, "step": 3359 }, { "epoch": 0.49777777777777776, "grad_norm": 5.885090351104736, "learning_rate": 0.00010066716085989622, "loss": 1.0297, "step": 3360 }, { "epoch": 0.49792592592592594, "grad_norm": 2.174469470977783, "learning_rate": 0.00010063750926612305, "loss": 0.8401, "step": 3361 }, { "epoch": 0.49807407407407406, "grad_norm": 16.41845703125, "learning_rate": 0.0001006078576723499, "loss": 1.0287, "step": 3362 }, { "epoch": 0.49822222222222223, "grad_norm": 3.114132881164551, "learning_rate": 0.00010057820607857673, "loss": 0.9149, "step": 3363 }, { "epoch": 0.49837037037037035, "grad_norm": 10.348255157470703, "learning_rate": 0.00010054855448480356, "loss": 0.9482, "step": 3364 }, { "epoch": 0.4985185185185185, "grad_norm": 2.6116645336151123, "learning_rate": 0.0001005189028910304, "loss": 1.0156, "step": 3365 }, { "epoch": 0.49866666666666665, "grad_norm": 8.228729248046875, "learning_rate": 0.00010048925129725723, "loss": 1.0018, "step": 3366 }, { "epoch": 0.4988148148148148, 
"grad_norm": 5.662245273590088, "learning_rate": 0.00010045959970348406, "loss": 1.2499, "step": 3367 }, { "epoch": 0.49896296296296294, "grad_norm": 2.135146379470825, "learning_rate": 0.0001004299481097109, "loss": 1.5001, "step": 3368 }, { "epoch": 0.4991111111111111, "grad_norm": 3.819507122039795, "learning_rate": 0.00010040029651593774, "loss": 1.0538, "step": 3369 }, { "epoch": 0.49925925925925924, "grad_norm": 3.6428349018096924, "learning_rate": 0.00010037064492216457, "loss": 0.8779, "step": 3370 }, { "epoch": 0.4994074074074074, "grad_norm": 1.6564476490020752, "learning_rate": 0.0001003409933283914, "loss": 1.2154, "step": 3371 }, { "epoch": 0.49955555555555553, "grad_norm": 2.1467227935791016, "learning_rate": 0.00010031134173461825, "loss": 0.927, "step": 3372 }, { "epoch": 0.4997037037037037, "grad_norm": 3.4069087505340576, "learning_rate": 0.00010028169014084508, "loss": 1.1846, "step": 3373 }, { "epoch": 0.4998518518518518, "grad_norm": 1.6431936025619507, "learning_rate": 0.00010025203854707191, "loss": 0.9053, "step": 3374 }, { "epoch": 0.5, "grad_norm": 2.8949391841888428, "learning_rate": 0.00010022238695329875, "loss": 1.1747, "step": 3375 }, { "epoch": 0.5001481481481481, "grad_norm": 1.8398103713989258, "learning_rate": 0.00010019273535952558, "loss": 0.8153, "step": 3376 }, { "epoch": 0.5002962962962964, "grad_norm": 2.2975878715515137, "learning_rate": 0.00010016308376575241, "loss": 1.0039, "step": 3377 }, { "epoch": 0.5004444444444445, "grad_norm": 4.093273639678955, "learning_rate": 0.00010013343217197926, "loss": 1.2182, "step": 3378 }, { "epoch": 0.5005925925925926, "grad_norm": 3.4979758262634277, "learning_rate": 0.00010010378057820609, "loss": 1.115, "step": 3379 }, { "epoch": 0.5007407407407407, "grad_norm": 2.741710662841797, "learning_rate": 0.00010007412898443291, "loss": 0.9741, "step": 3380 }, { "epoch": 0.5008888888888889, "grad_norm": 3.284590482711792, "learning_rate": 0.00010004447739065977, "loss": 0.9585, "step": 3381 
}, { "epoch": 0.5010370370370371, "grad_norm": 6.5590081214904785, "learning_rate": 0.0001000148257968866, "loss": 1.0111, "step": 3382 }, { "epoch": 0.5011851851851852, "grad_norm": 1.995739221572876, "learning_rate": 9.998517420311341e-05, "loss": 1.2119, "step": 3383 }, { "epoch": 0.5013333333333333, "grad_norm": 5.810882568359375, "learning_rate": 9.995552260934026e-05, "loss": 0.8591, "step": 3384 }, { "epoch": 0.5014814814814815, "grad_norm": 1.9267522096633911, "learning_rate": 9.99258710155671e-05, "loss": 1.2118, "step": 3385 }, { "epoch": 0.5016296296296296, "grad_norm": 2.457502603530884, "learning_rate": 9.989621942179392e-05, "loss": 1.194, "step": 3386 }, { "epoch": 0.5017777777777778, "grad_norm": 2.373825788497925, "learning_rate": 9.986656782802076e-05, "loss": 1.1115, "step": 3387 }, { "epoch": 0.5019259259259259, "grad_norm": 2.949768304824829, "learning_rate": 9.98369162342476e-05, "loss": 1.1156, "step": 3388 }, { "epoch": 0.5020740740740741, "grad_norm": 2.7182395458221436, "learning_rate": 9.980726464047443e-05, "loss": 1.155, "step": 3389 }, { "epoch": 0.5022222222222222, "grad_norm": 3.5946409702301025, "learning_rate": 9.977761304670127e-05, "loss": 1.2636, "step": 3390 }, { "epoch": 0.5023703703703704, "grad_norm": 3.0228304862976074, "learning_rate": 9.97479614529281e-05, "loss": 1.1162, "step": 3391 }, { "epoch": 0.5025185185185185, "grad_norm": 2.010730028152466, "learning_rate": 9.971830985915493e-05, "loss": 0.9778, "step": 3392 }, { "epoch": 0.5026666666666667, "grad_norm": 2.3101558685302734, "learning_rate": 9.968865826538178e-05, "loss": 0.9486, "step": 3393 }, { "epoch": 0.5028148148148148, "grad_norm": 4.203605651855469, "learning_rate": 9.965900667160861e-05, "loss": 1.4041, "step": 3394 }, { "epoch": 0.502962962962963, "grad_norm": 2.7500808238983154, "learning_rate": 9.962935507783544e-05, "loss": 1.0569, "step": 3395 }, { "epoch": 0.5031111111111111, "grad_norm": 2.914515972137451, "learning_rate": 9.959970348406227e-05, 
"loss": 1.0734, "step": 3396 }, { "epoch": 0.5032592592592593, "grad_norm": 4.110001087188721, "learning_rate": 9.95700518902891e-05, "loss": 1.1533, "step": 3397 }, { "epoch": 0.5034074074074074, "grad_norm": 1.5626189708709717, "learning_rate": 9.954040029651594e-05, "loss": 1.1873, "step": 3398 }, { "epoch": 0.5035555555555555, "grad_norm": 2.6834592819213867, "learning_rate": 9.951074870274278e-05, "loss": 1.1417, "step": 3399 }, { "epoch": 0.5037037037037037, "grad_norm": 1.8137449026107788, "learning_rate": 9.94810971089696e-05, "loss": 1.0368, "step": 3400 }, { "epoch": 0.5038518518518519, "grad_norm": 1.611829161643982, "learning_rate": 9.945144551519645e-05, "loss": 0.8365, "step": 3401 }, { "epoch": 0.504, "grad_norm": 2.8950698375701904, "learning_rate": 9.942179392142328e-05, "loss": 1.103, "step": 3402 }, { "epoch": 0.5041481481481481, "grad_norm": 3.5890421867370605, "learning_rate": 9.939214232765011e-05, "loss": 1.0627, "step": 3403 }, { "epoch": 0.5042962962962962, "grad_norm": 1.892509937286377, "learning_rate": 9.936249073387696e-05, "loss": 1.0033, "step": 3404 }, { "epoch": 0.5044444444444445, "grad_norm": 2.3791167736053467, "learning_rate": 9.933283914010379e-05, "loss": 1.243, "step": 3405 }, { "epoch": 0.5045925925925926, "grad_norm": 1.5246496200561523, "learning_rate": 9.930318754633062e-05, "loss": 1.0807, "step": 3406 }, { "epoch": 0.5047407407407407, "grad_norm": 1.6053105592727661, "learning_rate": 9.927353595255745e-05, "loss": 1.0406, "step": 3407 }, { "epoch": 0.5048888888888889, "grad_norm": 2.311598539352417, "learning_rate": 9.92438843587843e-05, "loss": 1.0438, "step": 3408 }, { "epoch": 0.5050370370370371, "grad_norm": 5.0231781005859375, "learning_rate": 9.921423276501113e-05, "loss": 0.9384, "step": 3409 }, { "epoch": 0.5051851851851852, "grad_norm": 1.5837359428405762, "learning_rate": 9.918458117123796e-05, "loss": 1.1756, "step": 3410 }, { "epoch": 0.5053333333333333, "grad_norm": 1.6837211847305298, "learning_rate": 
9.91549295774648e-05, "loss": 0.8449, "step": 3411 }, { "epoch": 0.5054814814814815, "grad_norm": 2.0716450214385986, "learning_rate": 9.912527798369163e-05, "loss": 1.2139, "step": 3412 }, { "epoch": 0.5056296296296297, "grad_norm": 1.12191641330719, "learning_rate": 9.909562638991846e-05, "loss": 0.8494, "step": 3413 }, { "epoch": 0.5057777777777778, "grad_norm": 1.795994520187378, "learning_rate": 9.90659747961453e-05, "loss": 1.3463, "step": 3414 }, { "epoch": 0.5059259259259259, "grad_norm": 1.747239112854004, "learning_rate": 9.903632320237212e-05, "loss": 1.0093, "step": 3415 }, { "epoch": 0.5060740740740741, "grad_norm": 1.6786613464355469, "learning_rate": 9.900667160859897e-05, "loss": 1.0296, "step": 3416 }, { "epoch": 0.5062222222222222, "grad_norm": 1.5573384761810303, "learning_rate": 9.89770200148258e-05, "loss": 0.7602, "step": 3417 }, { "epoch": 0.5063703703703704, "grad_norm": 1.5690990686416626, "learning_rate": 9.894736842105263e-05, "loss": 1.0206, "step": 3418 }, { "epoch": 0.5065185185185185, "grad_norm": 2.2538905143737793, "learning_rate": 9.891771682727948e-05, "loss": 1.2526, "step": 3419 }, { "epoch": 0.5066666666666667, "grad_norm": 2.059178590774536, "learning_rate": 9.88880652335063e-05, "loss": 0.9341, "step": 3420 }, { "epoch": 0.5068148148148148, "grad_norm": 3.653055429458618, "learning_rate": 9.885841363973314e-05, "loss": 1.1119, "step": 3421 }, { "epoch": 0.506962962962963, "grad_norm": 2.135084629058838, "learning_rate": 9.882876204595998e-05, "loss": 0.9503, "step": 3422 }, { "epoch": 0.5071111111111111, "grad_norm": 1.3559598922729492, "learning_rate": 9.87991104521868e-05, "loss": 1.0422, "step": 3423 }, { "epoch": 0.5072592592592593, "grad_norm": 3.151815414428711, "learning_rate": 9.876945885841364e-05, "loss": 0.8861, "step": 3424 }, { "epoch": 0.5074074074074074, "grad_norm": 1.5202436447143555, "learning_rate": 9.873980726464049e-05, "loss": 1.2197, "step": 3425 }, { "epoch": 0.5075555555555555, "grad_norm": 
2.9413371086120605, "learning_rate": 9.87101556708673e-05, "loss": 1.0491, "step": 3426 }, { "epoch": 0.5077037037037037, "grad_norm": 1.1613496541976929, "learning_rate": 9.868050407709415e-05, "loss": 1.1598, "step": 3427 }, { "epoch": 0.5078518518518519, "grad_norm": 2.011124849319458, "learning_rate": 9.8650852483321e-05, "loss": 1.1567, "step": 3428 }, { "epoch": 0.508, "grad_norm": 2.108271360397339, "learning_rate": 9.862120088954781e-05, "loss": 1.0974, "step": 3429 }, { "epoch": 0.5081481481481481, "grad_norm": 3.502854585647583, "learning_rate": 9.859154929577466e-05, "loss": 1.1057, "step": 3430 }, { "epoch": 0.5082962962962962, "grad_norm": 1.2759106159210205, "learning_rate": 9.856189770200149e-05, "loss": 1.0656, "step": 3431 }, { "epoch": 0.5084444444444445, "grad_norm": 2.211440086364746, "learning_rate": 9.853224610822832e-05, "loss": 0.9594, "step": 3432 }, { "epoch": 0.5085925925925926, "grad_norm": 1.1375516653060913, "learning_rate": 9.850259451445516e-05, "loss": 1.166, "step": 3433 }, { "epoch": 0.5087407407407407, "grad_norm": 1.9250344038009644, "learning_rate": 9.847294292068199e-05, "loss": 0.9071, "step": 3434 }, { "epoch": 0.5088888888888888, "grad_norm": 1.8056732416152954, "learning_rate": 9.844329132690882e-05, "loss": 1.1854, "step": 3435 }, { "epoch": 0.5090370370370371, "grad_norm": 1.496208667755127, "learning_rate": 9.841363973313567e-05, "loss": 0.9071, "step": 3436 }, { "epoch": 0.5091851851851852, "grad_norm": 2.020498037338257, "learning_rate": 9.83839881393625e-05, "loss": 1.02, "step": 3437 }, { "epoch": 0.5093333333333333, "grad_norm": 1.7645292282104492, "learning_rate": 9.835433654558933e-05, "loss": 1.0352, "step": 3438 }, { "epoch": 0.5094814814814815, "grad_norm": 3.5330545902252197, "learning_rate": 9.832468495181616e-05, "loss": 0.8771, "step": 3439 }, { "epoch": 0.5096296296296297, "grad_norm": 1.1161566972732544, "learning_rate": 9.829503335804299e-05, "loss": 1.3086, "step": 3440 }, { "epoch": 
0.5097777777777778, "grad_norm": 1.4570153951644897, "learning_rate": 9.826538176426984e-05, "loss": 1.1426, "step": 3441 }, { "epoch": 0.5099259259259259, "grad_norm": 2.8942079544067383, "learning_rate": 9.823573017049667e-05, "loss": 0.8561, "step": 3442 }, { "epoch": 0.5100740740740741, "grad_norm": 2.263978958129883, "learning_rate": 9.82060785767235e-05, "loss": 1.1924, "step": 3443 }, { "epoch": 0.5102222222222222, "grad_norm": 1.3524726629257202, "learning_rate": 9.817642698295034e-05, "loss": 1.1621, "step": 3444 }, { "epoch": 0.5103703703703704, "grad_norm": 1.2318508625030518, "learning_rate": 9.814677538917717e-05, "loss": 1.1966, "step": 3445 }, { "epoch": 0.5105185185185185, "grad_norm": 2.823561191558838, "learning_rate": 9.8117123795404e-05, "loss": 0.9551, "step": 3446 }, { "epoch": 0.5106666666666667, "grad_norm": 1.9692493677139282, "learning_rate": 9.808747220163085e-05, "loss": 1.0933, "step": 3447 }, { "epoch": 0.5108148148148148, "grad_norm": 2.315603017807007, "learning_rate": 9.805782060785768e-05, "loss": 1.1432, "step": 3448 }, { "epoch": 0.510962962962963, "grad_norm": 1.3484950065612793, "learning_rate": 9.802816901408451e-05, "loss": 0.9704, "step": 3449 }, { "epoch": 0.5111111111111111, "grad_norm": 4.833320140838623, "learning_rate": 9.799851742031134e-05, "loss": 0.9685, "step": 3450 }, { "epoch": 0.5112592592592593, "grad_norm": 1.722293734550476, "learning_rate": 9.796886582653819e-05, "loss": 1.0885, "step": 3451 }, { "epoch": 0.5114074074074074, "grad_norm": 1.3373132944107056, "learning_rate": 9.793921423276502e-05, "loss": 0.9328, "step": 3452 }, { "epoch": 0.5115555555555555, "grad_norm": 2.0415165424346924, "learning_rate": 9.790956263899185e-05, "loss": 1.0558, "step": 3453 }, { "epoch": 0.5117037037037037, "grad_norm": 1.8608633279800415, "learning_rate": 9.787991104521869e-05, "loss": 1.1091, "step": 3454 }, { "epoch": 0.5118518518518519, "grad_norm": 1.510023832321167, "learning_rate": 9.785025945144552e-05, "loss": 
0.9175, "step": 3455 }, { "epoch": 0.512, "grad_norm": 1.702152967453003, "learning_rate": 9.782060785767235e-05, "loss": 1.1188, "step": 3456 }, { "epoch": 0.5121481481481481, "grad_norm": 1.7295708656311035, "learning_rate": 9.779095626389919e-05, "loss": 1.1258, "step": 3457 }, { "epoch": 0.5122962962962962, "grad_norm": 1.4218422174453735, "learning_rate": 9.776130467012602e-05, "loss": 1.1501, "step": 3458 }, { "epoch": 0.5124444444444445, "grad_norm": 2.1711299419403076, "learning_rate": 9.773165307635286e-05, "loss": 1.0482, "step": 3459 }, { "epoch": 0.5125925925925926, "grad_norm": 2.0415642261505127, "learning_rate": 9.770200148257969e-05, "loss": 1.2222, "step": 3460 }, { "epoch": 0.5127407407407407, "grad_norm": 1.3829699754714966, "learning_rate": 9.767234988880652e-05, "loss": 0.9807, "step": 3461 }, { "epoch": 0.5128888888888888, "grad_norm": 2.3946306705474854, "learning_rate": 9.764269829503337e-05, "loss": 1.1625, "step": 3462 }, { "epoch": 0.5130370370370371, "grad_norm": 1.7022687196731567, "learning_rate": 9.76130467012602e-05, "loss": 1.0249, "step": 3463 }, { "epoch": 0.5131851851851852, "grad_norm": 2.9826409816741943, "learning_rate": 9.758339510748703e-05, "loss": 1.1052, "step": 3464 }, { "epoch": 0.5133333333333333, "grad_norm": 3.7996156215667725, "learning_rate": 9.755374351371387e-05, "loss": 1.0881, "step": 3465 }, { "epoch": 0.5134814814814814, "grad_norm": 7.144508361816406, "learning_rate": 9.752409191994069e-05, "loss": 1.2364, "step": 3466 }, { "epoch": 0.5136296296296297, "grad_norm": 1.8672593832015991, "learning_rate": 9.749444032616753e-05, "loss": 1.0515, "step": 3467 }, { "epoch": 0.5137777777777778, "grad_norm": 1.5496947765350342, "learning_rate": 9.746478873239438e-05, "loss": 1.0754, "step": 3468 }, { "epoch": 0.5139259259259259, "grad_norm": 1.7912988662719727, "learning_rate": 9.74351371386212e-05, "loss": 1.054, "step": 3469 }, { "epoch": 0.5140740740740741, "grad_norm": 1.3586891889572144, "learning_rate": 
9.740548554484804e-05, "loss": 1.1254, "step": 3470 }, { "epoch": 0.5142222222222222, "grad_norm": 2.686096429824829, "learning_rate": 9.737583395107489e-05, "loss": 1.1777, "step": 3471 }, { "epoch": 0.5143703703703704, "grad_norm": 1.231162190437317, "learning_rate": 9.73461823573017e-05, "loss": 1.097, "step": 3472 }, { "epoch": 0.5145185185185185, "grad_norm": 1.7783644199371338, "learning_rate": 9.731653076352855e-05, "loss": 0.9981, "step": 3473 }, { "epoch": 0.5146666666666667, "grad_norm": 2.513786554336548, "learning_rate": 9.728687916975538e-05, "loss": 0.9621, "step": 3474 }, { "epoch": 0.5148148148148148, "grad_norm": 7.590953350067139, "learning_rate": 9.725722757598221e-05, "loss": 1.263, "step": 3475 }, { "epoch": 0.514962962962963, "grad_norm": 1.2946828603744507, "learning_rate": 9.722757598220905e-05, "loss": 1.0425, "step": 3476 }, { "epoch": 0.5151111111111111, "grad_norm": 1.7475824356079102, "learning_rate": 9.719792438843588e-05, "loss": 1.0472, "step": 3477 }, { "epoch": 0.5152592592592593, "grad_norm": 1.539388656616211, "learning_rate": 9.716827279466272e-05, "loss": 1.0123, "step": 3478 }, { "epoch": 0.5154074074074074, "grad_norm": 2.2849812507629395, "learning_rate": 9.713862120088956e-05, "loss": 1.3275, "step": 3479 }, { "epoch": 0.5155555555555555, "grad_norm": 2.3221042156219482, "learning_rate": 9.710896960711639e-05, "loss": 0.8895, "step": 3480 }, { "epoch": 0.5157037037037037, "grad_norm": 2.655463933944702, "learning_rate": 9.707931801334322e-05, "loss": 0.9543, "step": 3481 }, { "epoch": 0.5158518518518519, "grad_norm": 1.56253182888031, "learning_rate": 9.704966641957005e-05, "loss": 1.0628, "step": 3482 }, { "epoch": 0.516, "grad_norm": 1.8649121522903442, "learning_rate": 9.702001482579688e-05, "loss": 0.8793, "step": 3483 }, { "epoch": 0.5161481481481481, "grad_norm": 2.0733768939971924, "learning_rate": 9.699036323202373e-05, "loss": 1.2105, "step": 3484 }, { "epoch": 0.5162962962962963, "grad_norm": 2.5004401206970215, 
"learning_rate": 9.696071163825056e-05, "loss": 1.2187, "step": 3485 }, { "epoch": 0.5164444444444445, "grad_norm": 2.358077049255371, "learning_rate": 9.693106004447739e-05, "loss": 1.0586, "step": 3486 }, { "epoch": 0.5165925925925926, "grad_norm": 1.6531083583831787, "learning_rate": 9.690140845070423e-05, "loss": 0.8671, "step": 3487 }, { "epoch": 0.5167407407407407, "grad_norm": 1.7980809211730957, "learning_rate": 9.687175685693107e-05, "loss": 1.0647, "step": 3488 }, { "epoch": 0.5168888888888888, "grad_norm": 2.022136926651001, "learning_rate": 9.68421052631579e-05, "loss": 0.9466, "step": 3489 }, { "epoch": 0.5170370370370371, "grad_norm": 1.33211350440979, "learning_rate": 9.681245366938474e-05, "loss": 1.1837, "step": 3490 }, { "epoch": 0.5171851851851852, "grad_norm": 2.676328182220459, "learning_rate": 9.678280207561157e-05, "loss": 1.0544, "step": 3491 }, { "epoch": 0.5173333333333333, "grad_norm": 1.818198800086975, "learning_rate": 9.67531504818384e-05, "loss": 1.1062, "step": 3492 }, { "epoch": 0.5174814814814814, "grad_norm": 1.833338975906372, "learning_rate": 9.672349888806523e-05, "loss": 1.0738, "step": 3493 }, { "epoch": 0.5176296296296297, "grad_norm": 2.3086183071136475, "learning_rate": 9.669384729429208e-05, "loss": 1.0157, "step": 3494 }, { "epoch": 0.5177777777777778, "grad_norm": 1.5982645750045776, "learning_rate": 9.666419570051891e-05, "loss": 1.0132, "step": 3495 }, { "epoch": 0.5179259259259259, "grad_norm": 1.5193531513214111, "learning_rate": 9.663454410674574e-05, "loss": 1.0794, "step": 3496 }, { "epoch": 0.518074074074074, "grad_norm": 7.415847301483154, "learning_rate": 9.660489251297258e-05, "loss": 0.8472, "step": 3497 }, { "epoch": 0.5182222222222223, "grad_norm": 1.256219506263733, "learning_rate": 9.657524091919942e-05, "loss": 1.0972, "step": 3498 }, { "epoch": 0.5183703703703704, "grad_norm": 4.069087028503418, "learning_rate": 9.654558932542625e-05, "loss": 1.1666, "step": 3499 }, { "epoch": 0.5185185185185185, 
"grad_norm": 2.4944255352020264, "learning_rate": 9.651593773165308e-05, "loss": 0.8349, "step": 3500 }, { "epoch": 0.5186666666666667, "grad_norm": 2.0719282627105713, "learning_rate": 9.648628613787991e-05, "loss": 1.1829, "step": 3501 }, { "epoch": 0.5188148148148148, "grad_norm": 3.4375545978546143, "learning_rate": 9.645663454410675e-05, "loss": 1.0, "step": 3502 }, { "epoch": 0.518962962962963, "grad_norm": 2.382495641708374, "learning_rate": 9.642698295033358e-05, "loss": 1.275, "step": 3503 }, { "epoch": 0.5191111111111111, "grad_norm": 2.8072335720062256, "learning_rate": 9.639733135656041e-05, "loss": 1.3857, "step": 3504 }, { "epoch": 0.5192592592592593, "grad_norm": 2.0779919624328613, "learning_rate": 9.636767976278726e-05, "loss": 1.1554, "step": 3505 }, { "epoch": 0.5194074074074074, "grad_norm": 2.170212507247925, "learning_rate": 9.633802816901409e-05, "loss": 0.9806, "step": 3506 }, { "epoch": 0.5195555555555555, "grad_norm": 1.8128262758255005, "learning_rate": 9.630837657524092e-05, "loss": 1.0177, "step": 3507 }, { "epoch": 0.5197037037037037, "grad_norm": 1.5690702199935913, "learning_rate": 9.627872498146776e-05, "loss": 0.9887, "step": 3508 }, { "epoch": 0.5198518518518519, "grad_norm": 2.0018036365509033, "learning_rate": 9.624907338769458e-05, "loss": 0.9906, "step": 3509 }, { "epoch": 0.52, "grad_norm": 2.020087718963623, "learning_rate": 9.621942179392143e-05, "loss": 0.9797, "step": 3510 }, { "epoch": 0.5201481481481481, "grad_norm": 3.4835586547851562, "learning_rate": 9.618977020014827e-05, "loss": 0.8488, "step": 3511 }, { "epoch": 0.5202962962962963, "grad_norm": 1.7970818281173706, "learning_rate": 9.616011860637509e-05, "loss": 1.145, "step": 3512 }, { "epoch": 0.5204444444444445, "grad_norm": 2.0925204753875732, "learning_rate": 9.613046701260193e-05, "loss": 1.3318, "step": 3513 }, { "epoch": 0.5205925925925926, "grad_norm": 2.5351264476776123, "learning_rate": 9.610081541882878e-05, "loss": 0.9796, "step": 3514 }, { "epoch": 
0.5207407407407407, "grad_norm": 1.360851764678955, "learning_rate": 9.60711638250556e-05, "loss": 1.0119, "step": 3515 }, { "epoch": 0.5208888888888888, "grad_norm": 2.807292938232422, "learning_rate": 9.604151223128244e-05, "loss": 0.9417, "step": 3516 }, { "epoch": 0.5210370370370371, "grad_norm": 2.252411127090454, "learning_rate": 9.601186063750927e-05, "loss": 1.3305, "step": 3517 }, { "epoch": 0.5211851851851852, "grad_norm": 1.8232671022415161, "learning_rate": 9.59822090437361e-05, "loss": 1.0784, "step": 3518 }, { "epoch": 0.5213333333333333, "grad_norm": 2.167621612548828, "learning_rate": 9.595255744996295e-05, "loss": 0.9162, "step": 3519 }, { "epoch": 0.5214814814814814, "grad_norm": 2.444425106048584, "learning_rate": 9.592290585618978e-05, "loss": 1.021, "step": 3520 }, { "epoch": 0.5216296296296297, "grad_norm": 1.9010529518127441, "learning_rate": 9.589325426241661e-05, "loss": 0.9694, "step": 3521 }, { "epoch": 0.5217777777777778, "grad_norm": 1.6443415880203247, "learning_rate": 9.586360266864345e-05, "loss": 1.0086, "step": 3522 }, { "epoch": 0.5219259259259259, "grad_norm": 2.088010311126709, "learning_rate": 9.583395107487028e-05, "loss": 1.1408, "step": 3523 }, { "epoch": 0.522074074074074, "grad_norm": 1.4378952980041504, "learning_rate": 9.580429948109711e-05, "loss": 0.9919, "step": 3524 }, { "epoch": 0.5222222222222223, "grad_norm": 2.5560710430145264, "learning_rate": 9.577464788732394e-05, "loss": 0.9739, "step": 3525 }, { "epoch": 0.5223703703703704, "grad_norm": 1.7038018703460693, "learning_rate": 9.574499629355078e-05, "loss": 1.1463, "step": 3526 }, { "epoch": 0.5225185185185185, "grad_norm": 2.424886703491211, "learning_rate": 9.571534469977762e-05, "loss": 1.2281, "step": 3527 }, { "epoch": 0.5226666666666666, "grad_norm": 2.7288095951080322, "learning_rate": 9.568569310600445e-05, "loss": 1.1308, "step": 3528 }, { "epoch": 0.5228148148148148, "grad_norm": 2.1019182205200195, "learning_rate": 9.565604151223128e-05, "loss": 
0.9406, "step": 3529 }, { "epoch": 0.522962962962963, "grad_norm": 4.319130897521973, "learning_rate": 9.562638991845813e-05, "loss": 1.015, "step": 3530 }, { "epoch": 0.5231111111111111, "grad_norm": 1.8898524045944214, "learning_rate": 9.559673832468496e-05, "loss": 0.9298, "step": 3531 }, { "epoch": 0.5232592592592593, "grad_norm": 3.966099500656128, "learning_rate": 9.556708673091179e-05, "loss": 0.919, "step": 3532 }, { "epoch": 0.5234074074074074, "grad_norm": 2.4514873027801514, "learning_rate": 9.553743513713863e-05, "loss": 1.2715, "step": 3533 }, { "epoch": 0.5235555555555556, "grad_norm": 1.3653898239135742, "learning_rate": 9.550778354336546e-05, "loss": 1.0814, "step": 3534 }, { "epoch": 0.5237037037037037, "grad_norm": 1.8816627264022827, "learning_rate": 9.54781319495923e-05, "loss": 0.9403, "step": 3535 }, { "epoch": 0.5238518518518519, "grad_norm": 4.248720169067383, "learning_rate": 9.544848035581913e-05, "loss": 1.0868, "step": 3536 }, { "epoch": 0.524, "grad_norm": 2.754378318786621, "learning_rate": 9.541882876204597e-05, "loss": 1.1859, "step": 3537 }, { "epoch": 0.5241481481481481, "grad_norm": 2.7608413696289062, "learning_rate": 9.53891771682728e-05, "loss": 0.9651, "step": 3538 }, { "epoch": 0.5242962962962963, "grad_norm": 3.119210720062256, "learning_rate": 9.535952557449963e-05, "loss": 1.2463, "step": 3539 }, { "epoch": 0.5244444444444445, "grad_norm": 4.5977654457092285, "learning_rate": 9.532987398072648e-05, "loss": 0.8846, "step": 3540 }, { "epoch": 0.5245925925925926, "grad_norm": 1.846290946006775, "learning_rate": 9.53002223869533e-05, "loss": 1.1083, "step": 3541 }, { "epoch": 0.5247407407407407, "grad_norm": 3.4465091228485107, "learning_rate": 9.527057079318014e-05, "loss": 0.9773, "step": 3542 }, { "epoch": 0.5248888888888888, "grad_norm": 4.273923873901367, "learning_rate": 9.524091919940697e-05, "loss": 1.175, "step": 3543 }, { "epoch": 0.5250370370370371, "grad_norm": 2.4471707344055176, "learning_rate": 
9.52112676056338e-05, "loss": 0.9078, "step": 3544 }, { "epoch": 0.5251851851851852, "grad_norm": 1.573663592338562, "learning_rate": 9.518161601186064e-05, "loss": 1.0537, "step": 3545 }, { "epoch": 0.5253333333333333, "grad_norm": 3.942758321762085, "learning_rate": 9.515196441808747e-05, "loss": 1.2459, "step": 3546 }, { "epoch": 0.5254814814814814, "grad_norm": 2.88981032371521, "learning_rate": 9.51223128243143e-05, "loss": 1.0162, "step": 3547 }, { "epoch": 0.5256296296296297, "grad_norm": 6.503210544586182, "learning_rate": 9.509266123054115e-05, "loss": 1.2065, "step": 3548 }, { "epoch": 0.5257777777777778, "grad_norm": 3.7239830493927, "learning_rate": 9.506300963676798e-05, "loss": 1.2433, "step": 3549 }, { "epoch": 0.5259259259259259, "grad_norm": 3.274492025375366, "learning_rate": 9.503335804299481e-05, "loss": 1.2985, "step": 3550 }, { "epoch": 0.526074074074074, "grad_norm": 5.178545951843262, "learning_rate": 9.500370644922166e-05, "loss": 1.1406, "step": 3551 }, { "epoch": 0.5262222222222223, "grad_norm": 4.6761393547058105, "learning_rate": 9.497405485544847e-05, "loss": 1.2129, "step": 3552 }, { "epoch": 0.5263703703703704, "grad_norm": 2.7359654903411865, "learning_rate": 9.494440326167532e-05, "loss": 1.2082, "step": 3553 }, { "epoch": 0.5265185185185185, "grad_norm": 7.111677169799805, "learning_rate": 9.491475166790216e-05, "loss": 0.9609, "step": 3554 }, { "epoch": 0.5266666666666666, "grad_norm": 1.8699086904525757, "learning_rate": 9.488510007412898e-05, "loss": 0.8755, "step": 3555 }, { "epoch": 0.5268148148148148, "grad_norm": 2.1888856887817383, "learning_rate": 9.485544848035582e-05, "loss": 0.9932, "step": 3556 }, { "epoch": 0.526962962962963, "grad_norm": 2.197059154510498, "learning_rate": 9.482579688658267e-05, "loss": 1.0316, "step": 3557 }, { "epoch": 0.5271111111111111, "grad_norm": 3.5654101371765137, "learning_rate": 9.479614529280949e-05, "loss": 1.1914, "step": 3558 }, { "epoch": 0.5272592592592592, "grad_norm": 
4.099245548248291, "learning_rate": 9.476649369903633e-05, "loss": 1.0678, "step": 3559 }, { "epoch": 0.5274074074074074, "grad_norm": 2.1959383487701416, "learning_rate": 9.473684210526316e-05, "loss": 1.2129, "step": 3560 }, { "epoch": 0.5275555555555556, "grad_norm": 3.4108617305755615, "learning_rate": 9.470719051148999e-05, "loss": 0.9935, "step": 3561 }, { "epoch": 0.5277037037037037, "grad_norm": 4.734465599060059, "learning_rate": 9.467753891771684e-05, "loss": 0.7632, "step": 3562 }, { "epoch": 0.5278518518518519, "grad_norm": 1.6672844886779785, "learning_rate": 9.464788732394367e-05, "loss": 0.931, "step": 3563 }, { "epoch": 0.528, "grad_norm": 3.7147114276885986, "learning_rate": 9.46182357301705e-05, "loss": 0.9767, "step": 3564 }, { "epoch": 0.5281481481481481, "grad_norm": 2.0520565509796143, "learning_rate": 9.458858413639734e-05, "loss": 0.7752, "step": 3565 }, { "epoch": 0.5282962962962963, "grad_norm": 2.508117914199829, "learning_rate": 9.455893254262417e-05, "loss": 1.0447, "step": 3566 }, { "epoch": 0.5284444444444445, "grad_norm": 2.8419182300567627, "learning_rate": 9.4529280948851e-05, "loss": 0.7909, "step": 3567 }, { "epoch": 0.5285925925925926, "grad_norm": 2.5379035472869873, "learning_rate": 9.449962935507784e-05, "loss": 1.2331, "step": 3568 }, { "epoch": 0.5287407407407407, "grad_norm": 4.583249092102051, "learning_rate": 9.446997776130467e-05, "loss": 1.1977, "step": 3569 }, { "epoch": 0.5288888888888889, "grad_norm": 2.7971770763397217, "learning_rate": 9.444032616753151e-05, "loss": 1.168, "step": 3570 }, { "epoch": 0.5290370370370371, "grad_norm": 7.165088653564453, "learning_rate": 9.441067457375834e-05, "loss": 1.0675, "step": 3571 }, { "epoch": 0.5291851851851852, "grad_norm": 3.349947690963745, "learning_rate": 9.438102297998517e-05, "loss": 0.9698, "step": 3572 }, { "epoch": 0.5293333333333333, "grad_norm": 2.9816622734069824, "learning_rate": 9.435137138621202e-05, "loss": 1.1645, "step": 3573 }, { "epoch": 
0.5294814814814814, "grad_norm": 6.285075664520264, "learning_rate": 9.432171979243885e-05, "loss": 1.2269, "step": 3574 }, { "epoch": 0.5296296296296297, "grad_norm": 2.5549850463867188, "learning_rate": 9.429206819866568e-05, "loss": 1.0708, "step": 3575 }, { "epoch": 0.5297777777777778, "grad_norm": 5.181352615356445, "learning_rate": 9.426241660489252e-05, "loss": 1.0041, "step": 3576 }, { "epoch": 0.5299259259259259, "grad_norm": 3.1909756660461426, "learning_rate": 9.423276501111935e-05, "loss": 0.9387, "step": 3577 }, { "epoch": 0.530074074074074, "grad_norm": 3.0633323192596436, "learning_rate": 9.420311341734619e-05, "loss": 1.1736, "step": 3578 }, { "epoch": 0.5302222222222223, "grad_norm": 3.279362916946411, "learning_rate": 9.417346182357302e-05, "loss": 0.8721, "step": 3579 }, { "epoch": 0.5303703703703704, "grad_norm": 3.5336806774139404, "learning_rate": 9.414381022979986e-05, "loss": 1.3407, "step": 3580 }, { "epoch": 0.5305185185185185, "grad_norm": 2.430006504058838, "learning_rate": 9.411415863602669e-05, "loss": 0.9274, "step": 3581 }, { "epoch": 0.5306666666666666, "grad_norm": 2.469728946685791, "learning_rate": 9.408450704225352e-05, "loss": 1.1849, "step": 3582 }, { "epoch": 0.5308148148148149, "grad_norm": 8.362934112548828, "learning_rate": 9.405485544848037e-05, "loss": 1.0888, "step": 3583 }, { "epoch": 0.530962962962963, "grad_norm": 5.342774391174316, "learning_rate": 9.40252038547072e-05, "loss": 1.2708, "step": 3584 }, { "epoch": 0.5311111111111111, "grad_norm": 11.297273635864258, "learning_rate": 9.399555226093403e-05, "loss": 1.0547, "step": 3585 }, { "epoch": 0.5312592592592592, "grad_norm": 3.0192387104034424, "learning_rate": 9.396590066716086e-05, "loss": 1.0153, "step": 3586 }, { "epoch": 0.5314074074074074, "grad_norm": 3.046156883239746, "learning_rate": 9.393624907338769e-05, "loss": 1.0377, "step": 3587 }, { "epoch": 0.5315555555555556, "grad_norm": 2.5736989974975586, "learning_rate": 9.390659747961454e-05, "loss": 
0.9452, "step": 3588 }, { "epoch": 0.5317037037037037, "grad_norm": 4.893460750579834, "learning_rate": 9.387694588584137e-05, "loss": 1.1708, "step": 3589 }, { "epoch": 0.5318518518518518, "grad_norm": 4.664818286895752, "learning_rate": 9.38472942920682e-05, "loss": 1.1881, "step": 3590 }, { "epoch": 0.532, "grad_norm": 6.313598155975342, "learning_rate": 9.381764269829504e-05, "loss": 1.0385, "step": 3591 }, { "epoch": 0.5321481481481481, "grad_norm": 2.325152635574341, "learning_rate": 9.378799110452187e-05, "loss": 1.2152, "step": 3592 }, { "epoch": 0.5322962962962963, "grad_norm": 3.308443546295166, "learning_rate": 9.37583395107487e-05, "loss": 0.8925, "step": 3593 }, { "epoch": 0.5324444444444445, "grad_norm": 4.343135356903076, "learning_rate": 9.372868791697555e-05, "loss": 1.1041, "step": 3594 }, { "epoch": 0.5325925925925926, "grad_norm": 5.608693599700928, "learning_rate": 9.369903632320237e-05, "loss": 1.0201, "step": 3595 }, { "epoch": 0.5327407407407407, "grad_norm": 3.607280969619751, "learning_rate": 9.366938472942921e-05, "loss": 1.1627, "step": 3596 }, { "epoch": 0.5328888888888889, "grad_norm": 4.711460590362549, "learning_rate": 9.363973313565605e-05, "loss": 1.1231, "step": 3597 }, { "epoch": 0.5330370370370371, "grad_norm": 1.5293848514556885, "learning_rate": 9.361008154188287e-05, "loss": 1.3455, "step": 3598 }, { "epoch": 0.5331851851851852, "grad_norm": 2.6837525367736816, "learning_rate": 9.358042994810972e-05, "loss": 1.0517, "step": 3599 }, { "epoch": 0.5333333333333333, "grad_norm": 2.2800650596618652, "learning_rate": 9.355077835433656e-05, "loss": 1.2113, "step": 3600 }, { "epoch": 0.5334814814814814, "grad_norm": 7.9449944496154785, "learning_rate": 9.352112676056338e-05, "loss": 1.0929, "step": 3601 }, { "epoch": 0.5336296296296297, "grad_norm": 2.1599514484405518, "learning_rate": 9.349147516679022e-05, "loss": 1.1329, "step": 3602 }, { "epoch": 0.5337777777777778, "grad_norm": 8.514766693115234, "learning_rate": 
9.346182357301705e-05, "loss": 0.9507, "step": 3603 }, { "epoch": 0.5339259259259259, "grad_norm": 1.950701117515564, "learning_rate": 9.343217197924388e-05, "loss": 1.0046, "step": 3604 }, { "epoch": 0.534074074074074, "grad_norm": 2.9352715015411377, "learning_rate": 9.340252038547073e-05, "loss": 0.9885, "step": 3605 }, { "epoch": 0.5342222222222223, "grad_norm": 3.379699230194092, "learning_rate": 9.337286879169756e-05, "loss": 1.0122, "step": 3606 }, { "epoch": 0.5343703703703704, "grad_norm": 2.198274850845337, "learning_rate": 9.334321719792439e-05, "loss": 1.1297, "step": 3607 }, { "epoch": 0.5345185185185185, "grad_norm": 2.858485460281372, "learning_rate": 9.331356560415124e-05, "loss": 1.0752, "step": 3608 }, { "epoch": 0.5346666666666666, "grad_norm": 2.4457948207855225, "learning_rate": 9.328391401037807e-05, "loss": 1.1746, "step": 3609 }, { "epoch": 0.5348148148148149, "grad_norm": 2.0533862113952637, "learning_rate": 9.32542624166049e-05, "loss": 1.1646, "step": 3610 }, { "epoch": 0.534962962962963, "grad_norm": 4.793309211730957, "learning_rate": 9.322461082283174e-05, "loss": 1.1582, "step": 3611 }, { "epoch": 0.5351111111111111, "grad_norm": 3.439361810684204, "learning_rate": 9.319495922905856e-05, "loss": 1.0282, "step": 3612 }, { "epoch": 0.5352592592592592, "grad_norm": 4.565910339355469, "learning_rate": 9.31653076352854e-05, "loss": 1.2051, "step": 3613 }, { "epoch": 0.5354074074074074, "grad_norm": 4.375285625457764, "learning_rate": 9.313565604151223e-05, "loss": 0.7596, "step": 3614 }, { "epoch": 0.5355555555555556, "grad_norm": 2.1360411643981934, "learning_rate": 9.310600444773906e-05, "loss": 0.8976, "step": 3615 }, { "epoch": 0.5357037037037037, "grad_norm": 1.7414849996566772, "learning_rate": 9.307635285396591e-05, "loss": 0.9914, "step": 3616 }, { "epoch": 0.5358518518518518, "grad_norm": 1.7469319105148315, "learning_rate": 9.304670126019274e-05, "loss": 1.1223, "step": 3617 }, { "epoch": 0.536, "grad_norm": 2.1778478622436523, 
"learning_rate": 9.301704966641957e-05, "loss": 1.0785, "step": 3618 }, { "epoch": 0.5361481481481482, "grad_norm": 2.8595550060272217, "learning_rate": 9.298739807264642e-05, "loss": 1.0983, "step": 3619 }, { "epoch": 0.5362962962962963, "grad_norm": 5.059350967407227, "learning_rate": 9.295774647887325e-05, "loss": 1.2499, "step": 3620 }, { "epoch": 0.5364444444444444, "grad_norm": 1.920396089553833, "learning_rate": 9.292809488510008e-05, "loss": 0.8308, "step": 3621 }, { "epoch": 0.5365925925925926, "grad_norm": 3.148193836212158, "learning_rate": 9.289844329132691e-05, "loss": 0.8431, "step": 3622 }, { "epoch": 0.5367407407407407, "grad_norm": 4.7985405921936035, "learning_rate": 9.286879169755375e-05, "loss": 1.0488, "step": 3623 }, { "epoch": 0.5368888888888889, "grad_norm": 1.8743398189544678, "learning_rate": 9.283914010378058e-05, "loss": 1.1173, "step": 3624 }, { "epoch": 0.5370370370370371, "grad_norm": 3.4600822925567627, "learning_rate": 9.280948851000741e-05, "loss": 0.9806, "step": 3625 }, { "epoch": 0.5371851851851852, "grad_norm": 2.07930850982666, "learning_rate": 9.277983691623426e-05, "loss": 0.9602, "step": 3626 }, { "epoch": 0.5373333333333333, "grad_norm": 2.4907588958740234, "learning_rate": 9.275018532246109e-05, "loss": 1.1242, "step": 3627 }, { "epoch": 0.5374814814814814, "grad_norm": 1.5953439474105835, "learning_rate": 9.272053372868792e-05, "loss": 0.739, "step": 3628 }, { "epoch": 0.5376296296296297, "grad_norm": 3.6003265380859375, "learning_rate": 9.269088213491475e-05, "loss": 1.0478, "step": 3629 }, { "epoch": 0.5377777777777778, "grad_norm": 1.954933524131775, "learning_rate": 9.266123054114158e-05, "loss": 1.0159, "step": 3630 }, { "epoch": 0.5379259259259259, "grad_norm": 4.888095378875732, "learning_rate": 9.263157894736843e-05, "loss": 1.1897, "step": 3631 }, { "epoch": 0.538074074074074, "grad_norm": 2.864197015762329, "learning_rate": 9.260192735359526e-05, "loss": 0.8302, "step": 3632 }, { "epoch": 0.5382222222222223, 
"grad_norm": 1.9595667123794556, "learning_rate": 9.257227575982209e-05, "loss": 1.0346, "step": 3633 }, { "epoch": 0.5383703703703704, "grad_norm": 2.2659835815429688, "learning_rate": 9.254262416604893e-05, "loss": 0.9851, "step": 3634 }, { "epoch": 0.5385185185185185, "grad_norm": 1.8228520154953003, "learning_rate": 9.251297257227576e-05, "loss": 0.9439, "step": 3635 }, { "epoch": 0.5386666666666666, "grad_norm": 2.0427815914154053, "learning_rate": 9.24833209785026e-05, "loss": 0.8628, "step": 3636 }, { "epoch": 0.5388148148148149, "grad_norm": 3.836122751235962, "learning_rate": 9.245366938472944e-05, "loss": 0.9758, "step": 3637 }, { "epoch": 0.538962962962963, "grad_norm": 1.4492243528366089, "learning_rate": 9.242401779095626e-05, "loss": 1.1122, "step": 3638 }, { "epoch": 0.5391111111111111, "grad_norm": 2.1065256595611572, "learning_rate": 9.23943661971831e-05, "loss": 0.966, "step": 3639 }, { "epoch": 0.5392592592592592, "grad_norm": 1.8754328489303589, "learning_rate": 9.236471460340995e-05, "loss": 0.9189, "step": 3640 }, { "epoch": 0.5394074074074074, "grad_norm": 4.311052322387695, "learning_rate": 9.233506300963676e-05, "loss": 1.1817, "step": 3641 }, { "epoch": 0.5395555555555556, "grad_norm": 3.1392056941986084, "learning_rate": 9.230541141586361e-05, "loss": 0.9071, "step": 3642 }, { "epoch": 0.5397037037037037, "grad_norm": 2.574453353881836, "learning_rate": 9.227575982209045e-05, "loss": 1.26, "step": 3643 }, { "epoch": 0.5398518518518518, "grad_norm": 2.1404294967651367, "learning_rate": 9.224610822831727e-05, "loss": 1.3647, "step": 3644 }, { "epoch": 0.54, "grad_norm": 3.1237967014312744, "learning_rate": 9.221645663454411e-05, "loss": 1.0603, "step": 3645 }, { "epoch": 0.5401481481481482, "grad_norm": 4.297988414764404, "learning_rate": 9.218680504077095e-05, "loss": 0.9979, "step": 3646 }, { "epoch": 0.5402962962962963, "grad_norm": 1.8716181516647339, "learning_rate": 9.215715344699778e-05, "loss": 0.9667, "step": 3647 }, { "epoch": 
0.5404444444444444, "grad_norm": 2.383779287338257, "learning_rate": 9.212750185322462e-05, "loss": 0.9201, "step": 3648 }, { "epoch": 0.5405925925925926, "grad_norm": 2.4546163082122803, "learning_rate": 9.209785025945145e-05, "loss": 0.9959, "step": 3649 }, { "epoch": 0.5407407407407407, "grad_norm": 2.4224185943603516, "learning_rate": 9.206819866567828e-05, "loss": 1.1507, "step": 3650 }, { "epoch": 0.5408888888888889, "grad_norm": 1.896492600440979, "learning_rate": 9.203854707190513e-05, "loss": 1.1051, "step": 3651 }, { "epoch": 0.541037037037037, "grad_norm": 2.1329288482666016, "learning_rate": 9.200889547813196e-05, "loss": 1.1851, "step": 3652 }, { "epoch": 0.5411851851851852, "grad_norm": 2.4231183528900146, "learning_rate": 9.197924388435879e-05, "loss": 1.1228, "step": 3653 }, { "epoch": 0.5413333333333333, "grad_norm": 1.6991504430770874, "learning_rate": 9.194959229058563e-05, "loss": 1.1305, "step": 3654 }, { "epoch": 0.5414814814814815, "grad_norm": 1.461769938468933, "learning_rate": 9.191994069681245e-05, "loss": 0.9971, "step": 3655 }, { "epoch": 0.5416296296296297, "grad_norm": 2.796705961227417, "learning_rate": 9.18902891030393e-05, "loss": 1.0234, "step": 3656 }, { "epoch": 0.5417777777777778, "grad_norm": 3.8761069774627686, "learning_rate": 9.186063750926613e-05, "loss": 1.2682, "step": 3657 }, { "epoch": 0.5419259259259259, "grad_norm": 2.3537259101867676, "learning_rate": 9.183098591549296e-05, "loss": 1.1272, "step": 3658 }, { "epoch": 0.542074074074074, "grad_norm": 2.1708521842956543, "learning_rate": 9.18013343217198e-05, "loss": 0.9924, "step": 3659 }, { "epoch": 0.5422222222222223, "grad_norm": 2.080087184906006, "learning_rate": 9.177168272794663e-05, "loss": 1.2023, "step": 3660 }, { "epoch": 0.5423703703703704, "grad_norm": 1.725771427154541, "learning_rate": 9.174203113417346e-05, "loss": 1.0793, "step": 3661 }, { "epoch": 0.5425185185185185, "grad_norm": 1.996046781539917, "learning_rate": 9.171237954040031e-05, "loss": 
0.9576, "step": 3662 }, { "epoch": 0.5426666666666666, "grad_norm": 2.237187147140503, "learning_rate": 9.168272794662714e-05, "loss": 1.059, "step": 3663 }, { "epoch": 0.5428148148148149, "grad_norm": 1.372070550918579, "learning_rate": 9.165307635285397e-05, "loss": 1.032, "step": 3664 }, { "epoch": 0.542962962962963, "grad_norm": 1.853932499885559, "learning_rate": 9.16234247590808e-05, "loss": 0.9676, "step": 3665 }, { "epoch": 0.5431111111111111, "grad_norm": 2.8059439659118652, "learning_rate": 9.159377316530764e-05, "loss": 1.0134, "step": 3666 }, { "epoch": 0.5432592592592592, "grad_norm": 1.68291437625885, "learning_rate": 9.156412157153448e-05, "loss": 1.1153, "step": 3667 }, { "epoch": 0.5434074074074075, "grad_norm": 4.564503192901611, "learning_rate": 9.15344699777613e-05, "loss": 1.0873, "step": 3668 }, { "epoch": 0.5435555555555556, "grad_norm": 3.0195443630218506, "learning_rate": 9.150481838398815e-05, "loss": 0.8743, "step": 3669 }, { "epoch": 0.5437037037037037, "grad_norm": 1.9659086465835571, "learning_rate": 9.147516679021498e-05, "loss": 0.9922, "step": 3670 }, { "epoch": 0.5438518518518518, "grad_norm": 1.9579589366912842, "learning_rate": 9.144551519644181e-05, "loss": 1.0172, "step": 3671 }, { "epoch": 0.544, "grad_norm": 3.0811991691589355, "learning_rate": 9.141586360266864e-05, "loss": 1.0922, "step": 3672 }, { "epoch": 0.5441481481481482, "grad_norm": 2.1591901779174805, "learning_rate": 9.138621200889547e-05, "loss": 0.9586, "step": 3673 }, { "epoch": 0.5442962962962963, "grad_norm": 1.5976217985153198, "learning_rate": 9.135656041512232e-05, "loss": 1.086, "step": 3674 }, { "epoch": 0.5444444444444444, "grad_norm": 4.451835632324219, "learning_rate": 9.132690882134915e-05, "loss": 1.2514, "step": 3675 }, { "epoch": 0.5445925925925926, "grad_norm": 1.4515033960342407, "learning_rate": 9.129725722757598e-05, "loss": 1.0563, "step": 3676 }, { "epoch": 0.5447407407407407, "grad_norm": 1.9500216245651245, "learning_rate": 
9.126760563380283e-05, "loss": 0.7786, "step": 3677 }, { "epoch": 0.5448888888888889, "grad_norm": 2.538944721221924, "learning_rate": 9.123795404002966e-05, "loss": 1.0119, "step": 3678 }, { "epoch": 0.545037037037037, "grad_norm": 1.7618365287780762, "learning_rate": 9.120830244625649e-05, "loss": 0.9883, "step": 3679 }, { "epoch": 0.5451851851851852, "grad_norm": 1.7227729558944702, "learning_rate": 9.117865085248333e-05, "loss": 1.0522, "step": 3680 }, { "epoch": 0.5453333333333333, "grad_norm": 2.5352048873901367, "learning_rate": 9.114899925871015e-05, "loss": 1.2212, "step": 3681 }, { "epoch": 0.5454814814814815, "grad_norm": 2.4464914798736572, "learning_rate": 9.111934766493699e-05, "loss": 0.939, "step": 3682 }, { "epoch": 0.5456296296296296, "grad_norm": 2.1241180896759033, "learning_rate": 9.108969607116384e-05, "loss": 0.9484, "step": 3683 }, { "epoch": 0.5457777777777778, "grad_norm": 1.5858129262924194, "learning_rate": 9.106004447739066e-05, "loss": 1.106, "step": 3684 }, { "epoch": 0.5459259259259259, "grad_norm": 2.267939567565918, "learning_rate": 9.10303928836175e-05, "loss": 1.1996, "step": 3685 }, { "epoch": 0.546074074074074, "grad_norm": 2.072701930999756, "learning_rate": 9.100074128984434e-05, "loss": 1.3082, "step": 3686 }, { "epoch": 0.5462222222222223, "grad_norm": 4.704676151275635, "learning_rate": 9.097108969607116e-05, "loss": 0.8774, "step": 3687 }, { "epoch": 0.5463703703703704, "grad_norm": 2.327634334564209, "learning_rate": 9.0941438102298e-05, "loss": 1.3083, "step": 3688 }, { "epoch": 0.5465185185185185, "grad_norm": 1.9294790029525757, "learning_rate": 9.091178650852484e-05, "loss": 1.0257, "step": 3689 }, { "epoch": 0.5466666666666666, "grad_norm": 2.1735949516296387, "learning_rate": 9.088213491475167e-05, "loss": 1.0953, "step": 3690 }, { "epoch": 0.5468148148148149, "grad_norm": 2.482743501663208, "learning_rate": 9.085248332097851e-05, "loss": 1.0466, "step": 3691 }, { "epoch": 0.546962962962963, "grad_norm": 
1.8486324548721313, "learning_rate": 9.082283172720534e-05, "loss": 1.1685, "step": 3692 }, { "epoch": 0.5471111111111111, "grad_norm": 1.4710181951522827, "learning_rate": 9.079318013343217e-05, "loss": 0.9895, "step": 3693 }, { "epoch": 0.5472592592592592, "grad_norm": 1.2480143308639526, "learning_rate": 9.076352853965902e-05, "loss": 0.8952, "step": 3694 }, { "epoch": 0.5474074074074075, "grad_norm": 1.9864888191223145, "learning_rate": 9.073387694588585e-05, "loss": 1.1691, "step": 3695 }, { "epoch": 0.5475555555555556, "grad_norm": 1.6673661470413208, "learning_rate": 9.070422535211268e-05, "loss": 1.1285, "step": 3696 }, { "epoch": 0.5477037037037037, "grad_norm": 2.3615503311157227, "learning_rate": 9.067457375833952e-05, "loss": 1.1099, "step": 3697 }, { "epoch": 0.5478518518518518, "grad_norm": 1.8084917068481445, "learning_rate": 9.064492216456634e-05, "loss": 1.0073, "step": 3698 }, { "epoch": 0.548, "grad_norm": 1.7501620054244995, "learning_rate": 9.061527057079319e-05, "loss": 1.1668, "step": 3699 }, { "epoch": 0.5481481481481482, "grad_norm": 2.0538253784179688, "learning_rate": 9.058561897702002e-05, "loss": 0.875, "step": 3700 }, { "epoch": 0.5482962962962963, "grad_norm": 2.4856984615325928, "learning_rate": 9.055596738324685e-05, "loss": 1.0177, "step": 3701 }, { "epoch": 0.5484444444444444, "grad_norm": 1.54976224899292, "learning_rate": 9.052631578947369e-05, "loss": 1.1514, "step": 3702 }, { "epoch": 0.5485925925925926, "grad_norm": 1.382947564125061, "learning_rate": 9.049666419570052e-05, "loss": 0.8129, "step": 3703 }, { "epoch": 0.5487407407407408, "grad_norm": 2.57087779045105, "learning_rate": 9.046701260192735e-05, "loss": 1.0847, "step": 3704 }, { "epoch": 0.5488888888888889, "grad_norm": 1.8630986213684082, "learning_rate": 9.04373610081542e-05, "loss": 0.9154, "step": 3705 }, { "epoch": 0.549037037037037, "grad_norm": 2.1322314739227295, "learning_rate": 9.040770941438103e-05, "loss": 0.9647, "step": 3706 }, { "epoch": 
0.5491851851851852, "grad_norm": 2.444566488265991, "learning_rate": 9.037805782060786e-05, "loss": 0.9911, "step": 3707 }, { "epoch": 0.5493333333333333, "grad_norm": 1.566116452217102, "learning_rate": 9.034840622683469e-05, "loss": 0.9378, "step": 3708 }, { "epoch": 0.5494814814814815, "grad_norm": 15.759946823120117, "learning_rate": 9.031875463306154e-05, "loss": 0.924, "step": 3709 }, { "epoch": 0.5496296296296296, "grad_norm": 1.5855779647827148, "learning_rate": 9.028910303928837e-05, "loss": 1.2671, "step": 3710 }, { "epoch": 0.5497777777777778, "grad_norm": 2.2595438957214355, "learning_rate": 9.02594514455152e-05, "loss": 0.9608, "step": 3711 }, { "epoch": 0.5499259259259259, "grad_norm": 2.2142670154571533, "learning_rate": 9.022979985174204e-05, "loss": 0.9906, "step": 3712 }, { "epoch": 0.550074074074074, "grad_norm": 1.7206697463989258, "learning_rate": 9.020014825796887e-05, "loss": 1.1186, "step": 3713 }, { "epoch": 0.5502222222222222, "grad_norm": 1.3970435857772827, "learning_rate": 9.01704966641957e-05, "loss": 1.1455, "step": 3714 }, { "epoch": 0.5503703703703704, "grad_norm": 2.5814034938812256, "learning_rate": 9.014084507042254e-05, "loss": 1.3222, "step": 3715 }, { "epoch": 0.5505185185185185, "grad_norm": 2.862255096435547, "learning_rate": 9.011119347664937e-05, "loss": 0.9744, "step": 3716 }, { "epoch": 0.5506666666666666, "grad_norm": 2.0777993202209473, "learning_rate": 9.008154188287621e-05, "loss": 1.1848, "step": 3717 }, { "epoch": 0.5508148148148149, "grad_norm": 1.3220739364624023, "learning_rate": 9.005189028910304e-05, "loss": 1.0493, "step": 3718 }, { "epoch": 0.550962962962963, "grad_norm": 1.9476999044418335, "learning_rate": 9.002223869532987e-05, "loss": 0.7834, "step": 3719 }, { "epoch": 0.5511111111111111, "grad_norm": 1.7757718563079834, "learning_rate": 8.999258710155672e-05, "loss": 1.152, "step": 3720 }, { "epoch": 0.5512592592592592, "grad_norm": 1.1972419023513794, "learning_rate": 8.996293550778355e-05, "loss": 
0.9338, "step": 3721 }, { "epoch": 0.5514074074074075, "grad_norm": 1.3252272605895996, "learning_rate": 8.993328391401038e-05, "loss": 1.1488, "step": 3722 }, { "epoch": 0.5515555555555556, "grad_norm": 1.7388628721237183, "learning_rate": 8.990363232023722e-05, "loss": 0.988, "step": 3723 }, { "epoch": 0.5517037037037037, "grad_norm": 1.9051963090896606, "learning_rate": 8.987398072646404e-05, "loss": 1.315, "step": 3724 }, { "epoch": 0.5518518518518518, "grad_norm": 1.2011630535125732, "learning_rate": 8.984432913269088e-05, "loss": 0.9855, "step": 3725 }, { "epoch": 0.552, "grad_norm": 1.7776042222976685, "learning_rate": 8.981467753891773e-05, "loss": 1.1439, "step": 3726 }, { "epoch": 0.5521481481481482, "grad_norm": 1.6855988502502441, "learning_rate": 8.978502594514455e-05, "loss": 0.9103, "step": 3727 }, { "epoch": 0.5522962962962963, "grad_norm": 1.8196964263916016, "learning_rate": 8.975537435137139e-05, "loss": 1.1186, "step": 3728 }, { "epoch": 0.5524444444444444, "grad_norm": 2.096670150756836, "learning_rate": 8.972572275759824e-05, "loss": 0.9407, "step": 3729 }, { "epoch": 0.5525925925925926, "grad_norm": 2.373717784881592, "learning_rate": 8.969607116382505e-05, "loss": 1.3133, "step": 3730 }, { "epoch": 0.5527407407407408, "grad_norm": 1.496888279914856, "learning_rate": 8.96664195700519e-05, "loss": 0.9431, "step": 3731 }, { "epoch": 0.5528888888888889, "grad_norm": 1.5524799823760986, "learning_rate": 8.963676797627873e-05, "loss": 0.8583, "step": 3732 }, { "epoch": 0.553037037037037, "grad_norm": 1.5310463905334473, "learning_rate": 8.960711638250556e-05, "loss": 1.4312, "step": 3733 }, { "epoch": 0.5531851851851852, "grad_norm": 1.8233987092971802, "learning_rate": 8.95774647887324e-05, "loss": 1.0469, "step": 3734 }, { "epoch": 0.5533333333333333, "grad_norm": 1.8349663019180298, "learning_rate": 8.954781319495923e-05, "loss": 1.0884, "step": 3735 }, { "epoch": 0.5534814814814815, "grad_norm": 1.9651541709899902, "learning_rate": 
8.951816160118607e-05, "loss": 1.2182, "step": 3736 }, { "epoch": 0.5536296296296296, "grad_norm": 4.574952602386475, "learning_rate": 8.948851000741291e-05, "loss": 1.073, "step": 3737 }, { "epoch": 0.5537777777777778, "grad_norm": 1.901864767074585, "learning_rate": 8.945885841363974e-05, "loss": 1.3331, "step": 3738 }, { "epoch": 0.5539259259259259, "grad_norm": 1.2893147468566895, "learning_rate": 8.942920681986657e-05, "loss": 1.0266, "step": 3739 }, { "epoch": 0.554074074074074, "grad_norm": 1.4766526222229004, "learning_rate": 8.939955522609342e-05, "loss": 1.0866, "step": 3740 }, { "epoch": 0.5542222222222222, "grad_norm": 1.5488594770431519, "learning_rate": 8.936990363232023e-05, "loss": 0.9393, "step": 3741 }, { "epoch": 0.5543703703703704, "grad_norm": 1.9072670936584473, "learning_rate": 8.934025203854708e-05, "loss": 1.027, "step": 3742 }, { "epoch": 0.5545185185185185, "grad_norm": 1.2895652055740356, "learning_rate": 8.931060044477391e-05, "loss": 0.9981, "step": 3743 }, { "epoch": 0.5546666666666666, "grad_norm": 3.711061477661133, "learning_rate": 8.928094885100074e-05, "loss": 1.0616, "step": 3744 }, { "epoch": 0.5548148148148148, "grad_norm": 2.482109546661377, "learning_rate": 8.925129725722758e-05, "loss": 0.8922, "step": 3745 }, { "epoch": 0.554962962962963, "grad_norm": 2.232032060623169, "learning_rate": 8.922164566345442e-05, "loss": 1.2899, "step": 3746 }, { "epoch": 0.5551111111111111, "grad_norm": 1.6179907321929932, "learning_rate": 8.919199406968125e-05, "loss": 1.1324, "step": 3747 }, { "epoch": 0.5552592592592592, "grad_norm": 2.103853940963745, "learning_rate": 8.916234247590809e-05, "loss": 1.2801, "step": 3748 }, { "epoch": 0.5554074074074075, "grad_norm": 1.3915637731552124, "learning_rate": 8.913269088213492e-05, "loss": 0.9872, "step": 3749 }, { "epoch": 0.5555555555555556, "grad_norm": 1.6203278303146362, "learning_rate": 8.910303928836175e-05, "loss": 0.8849, "step": 3750 }, { "epoch": 0.5557037037037037, "grad_norm": 
1.936846375465393, "learning_rate": 8.907338769458858e-05, "loss": 1.2005, "step": 3751 }, { "epoch": 0.5558518518518518, "grad_norm": 2.231778144836426, "learning_rate": 8.904373610081543e-05, "loss": 1.1457, "step": 3752 }, { "epoch": 0.556, "grad_norm": 1.567258358001709, "learning_rate": 8.901408450704226e-05, "loss": 1.2589, "step": 3753 }, { "epoch": 0.5561481481481482, "grad_norm": 1.4013599157333374, "learning_rate": 8.898443291326909e-05, "loss": 1.1576, "step": 3754 }, { "epoch": 0.5562962962962963, "grad_norm": 1.2259019613265991, "learning_rate": 8.895478131949593e-05, "loss": 1.0385, "step": 3755 }, { "epoch": 0.5564444444444444, "grad_norm": 2.0349345207214355, "learning_rate": 8.892512972572276e-05, "loss": 1.1771, "step": 3756 }, { "epoch": 0.5565925925925926, "grad_norm": 1.4776968955993652, "learning_rate": 8.88954781319496e-05, "loss": 0.9647, "step": 3757 }, { "epoch": 0.5567407407407408, "grad_norm": 2.031043529510498, "learning_rate": 8.886582653817643e-05, "loss": 0.99, "step": 3758 }, { "epoch": 0.5568888888888889, "grad_norm": 1.9026827812194824, "learning_rate": 8.883617494440326e-05, "loss": 1.049, "step": 3759 }, { "epoch": 0.557037037037037, "grad_norm": 2.278316020965576, "learning_rate": 8.88065233506301e-05, "loss": 1.2736, "step": 3760 }, { "epoch": 0.5571851851851852, "grad_norm": 1.2471530437469482, "learning_rate": 8.877687175685693e-05, "loss": 0.9425, "step": 3761 }, { "epoch": 0.5573333333333333, "grad_norm": 1.2254701852798462, "learning_rate": 8.874722016308376e-05, "loss": 1.0904, "step": 3762 }, { "epoch": 0.5574814814814815, "grad_norm": 1.9684616327285767, "learning_rate": 8.871756856931061e-05, "loss": 1.022, "step": 3763 }, { "epoch": 0.5576296296296296, "grad_norm": 1.5332367420196533, "learning_rate": 8.868791697553744e-05, "loss": 1.0997, "step": 3764 }, { "epoch": 0.5577777777777778, "grad_norm": 1.2924193143844604, "learning_rate": 8.865826538176427e-05, "loss": 1.0744, "step": 3765 }, { "epoch": 
0.5579259259259259, "grad_norm": 1.6238971948623657, "learning_rate": 8.862861378799111e-05, "loss": 1.0373, "step": 3766 }, { "epoch": 0.5580740740740741, "grad_norm": 1.2670892477035522, "learning_rate": 8.859896219421793e-05, "loss": 1.0113, "step": 3767 }, { "epoch": 0.5582222222222222, "grad_norm": 1.2489657402038574, "learning_rate": 8.856931060044478e-05, "loss": 0.9363, "step": 3768 }, { "epoch": 0.5583703703703704, "grad_norm": 1.6227606534957886, "learning_rate": 8.853965900667162e-05, "loss": 0.7972, "step": 3769 }, { "epoch": 0.5585185185185185, "grad_norm": 2.044673204421997, "learning_rate": 8.851000741289844e-05, "loss": 0.9603, "step": 3770 }, { "epoch": 0.5586666666666666, "grad_norm": 2.5743000507354736, "learning_rate": 8.848035581912528e-05, "loss": 0.9594, "step": 3771 }, { "epoch": 0.5588148148148148, "grad_norm": 1.3743785619735718, "learning_rate": 8.845070422535213e-05, "loss": 1.0438, "step": 3772 }, { "epoch": 0.558962962962963, "grad_norm": 1.4109739065170288, "learning_rate": 8.842105263157894e-05, "loss": 0.9083, "step": 3773 }, { "epoch": 0.5591111111111111, "grad_norm": 1.4330799579620361, "learning_rate": 8.839140103780579e-05, "loss": 1.0214, "step": 3774 }, { "epoch": 0.5592592592592592, "grad_norm": 2.8556342124938965, "learning_rate": 8.836174944403262e-05, "loss": 1.1462, "step": 3775 }, { "epoch": 0.5594074074074074, "grad_norm": 2.1466240882873535, "learning_rate": 8.833209785025945e-05, "loss": 0.9762, "step": 3776 }, { "epoch": 0.5595555555555556, "grad_norm": 1.436170220375061, "learning_rate": 8.83024462564863e-05, "loss": 1.0515, "step": 3777 }, { "epoch": 0.5597037037037037, "grad_norm": 1.9817818403244019, "learning_rate": 8.827279466271313e-05, "loss": 0.9498, "step": 3778 }, { "epoch": 0.5598518518518518, "grad_norm": 1.9754718542099, "learning_rate": 8.824314306893996e-05, "loss": 1.1527, "step": 3779 }, { "epoch": 0.56, "grad_norm": 1.9548002481460571, "learning_rate": 8.82134914751668e-05, "loss": 1.0014, "step": 
3780 }, { "epoch": 0.5601481481481482, "grad_norm": 1.8182411193847656, "learning_rate": 8.818383988139363e-05, "loss": 1.2531, "step": 3781 }, { "epoch": 0.5602962962962963, "grad_norm": 4.865880966186523, "learning_rate": 8.815418828762046e-05, "loss": 1.1013, "step": 3782 }, { "epoch": 0.5604444444444444, "grad_norm": 2.1615490913391113, "learning_rate": 8.812453669384731e-05, "loss": 1.2701, "step": 3783 }, { "epoch": 0.5605925925925926, "grad_norm": 1.86410391330719, "learning_rate": 8.809488510007413e-05, "loss": 1.1001, "step": 3784 }, { "epoch": 0.5607407407407408, "grad_norm": 1.4472167491912842, "learning_rate": 8.806523350630097e-05, "loss": 0.9705, "step": 3785 }, { "epoch": 0.5608888888888889, "grad_norm": 2.6446444988250732, "learning_rate": 8.80355819125278e-05, "loss": 1.1533, "step": 3786 }, { "epoch": 0.561037037037037, "grad_norm": 1.4465818405151367, "learning_rate": 8.800593031875463e-05, "loss": 0.9612, "step": 3787 }, { "epoch": 0.5611851851851852, "grad_norm": 2.375319480895996, "learning_rate": 8.797627872498148e-05, "loss": 1.1046, "step": 3788 }, { "epoch": 0.5613333333333334, "grad_norm": 1.54768967628479, "learning_rate": 8.794662713120831e-05, "loss": 0.7747, "step": 3789 }, { "epoch": 0.5614814814814815, "grad_norm": 1.7208143472671509, "learning_rate": 8.791697553743514e-05, "loss": 1.1109, "step": 3790 }, { "epoch": 0.5616296296296296, "grad_norm": 1.2667361497879028, "learning_rate": 8.788732394366198e-05, "loss": 1.1482, "step": 3791 }, { "epoch": 0.5617777777777778, "grad_norm": 1.4096691608428955, "learning_rate": 8.785767234988881e-05, "loss": 1.0901, "step": 3792 }, { "epoch": 0.5619259259259259, "grad_norm": 1.3970277309417725, "learning_rate": 8.782802075611564e-05, "loss": 1.1533, "step": 3793 }, { "epoch": 0.5620740740740741, "grad_norm": 1.7509400844573975, "learning_rate": 8.779836916234247e-05, "loss": 1.0374, "step": 3794 }, { "epoch": 0.5622222222222222, "grad_norm": 1.8194208145141602, "learning_rate": 
8.776871756856932e-05, "loss": 1.141, "step": 3795 }, { "epoch": 0.5623703703703704, "grad_norm": 1.4984582662582397, "learning_rate": 8.773906597479615e-05, "loss": 1.0448, "step": 3796 }, { "epoch": 0.5625185185185185, "grad_norm": 2.3701395988464355, "learning_rate": 8.770941438102298e-05, "loss": 0.8618, "step": 3797 }, { "epoch": 0.5626666666666666, "grad_norm": 1.72593092918396, "learning_rate": 8.767976278724983e-05, "loss": 1.2991, "step": 3798 }, { "epoch": 0.5628148148148148, "grad_norm": 3.1371865272521973, "learning_rate": 8.765011119347666e-05, "loss": 0.9859, "step": 3799 }, { "epoch": 0.562962962962963, "grad_norm": 1.9553066492080688, "learning_rate": 8.762045959970349e-05, "loss": 1.1226, "step": 3800 }, { "epoch": 0.5631111111111111, "grad_norm": 1.190905213356018, "learning_rate": 8.759080800593032e-05, "loss": 1.0534, "step": 3801 }, { "epoch": 0.5632592592592592, "grad_norm": 1.4159561395645142, "learning_rate": 8.756115641215715e-05, "loss": 1.0524, "step": 3802 }, { "epoch": 0.5634074074074074, "grad_norm": 1.7796388864517212, "learning_rate": 8.7531504818384e-05, "loss": 1.4437, "step": 3803 }, { "epoch": 0.5635555555555556, "grad_norm": 3.057600259780884, "learning_rate": 8.750185322461082e-05, "loss": 1.1243, "step": 3804 }, { "epoch": 0.5637037037037037, "grad_norm": 1.2936995029449463, "learning_rate": 8.747220163083766e-05, "loss": 1.2014, "step": 3805 }, { "epoch": 0.5638518518518518, "grad_norm": 1.568647861480713, "learning_rate": 8.74425500370645e-05, "loss": 1.1087, "step": 3806 }, { "epoch": 0.564, "grad_norm": 2.1068265438079834, "learning_rate": 8.741289844329133e-05, "loss": 1.0517, "step": 3807 }, { "epoch": 0.5641481481481482, "grad_norm": 1.7895489931106567, "learning_rate": 8.738324684951816e-05, "loss": 1.0477, "step": 3808 }, { "epoch": 0.5642962962962963, "grad_norm": 1.199803352355957, "learning_rate": 8.7353595255745e-05, "loss": 1.1233, "step": 3809 }, { "epoch": 0.5644444444444444, "grad_norm": 1.5444388389587402, 
"learning_rate": 8.732394366197182e-05, "loss": 1.2986, "step": 3810 }, { "epoch": 0.5645925925925926, "grad_norm": 3.2322614192962646, "learning_rate": 8.729429206819867e-05, "loss": 1.0613, "step": 3811 }, { "epoch": 0.5647407407407408, "grad_norm": 2.483393430709839, "learning_rate": 8.726464047442551e-05, "loss": 1.043, "step": 3812 }, { "epoch": 0.5648888888888889, "grad_norm": 2.9713010787963867, "learning_rate": 8.723498888065233e-05, "loss": 1.0973, "step": 3813 }, { "epoch": 0.565037037037037, "grad_norm": 1.9532005786895752, "learning_rate": 8.720533728687917e-05, "loss": 1.1491, "step": 3814 }, { "epoch": 0.5651851851851852, "grad_norm": 1.2219641208648682, "learning_rate": 8.717568569310602e-05, "loss": 0.9145, "step": 3815 }, { "epoch": 0.5653333333333334, "grad_norm": 1.3188670873641968, "learning_rate": 8.714603409933284e-05, "loss": 1.0009, "step": 3816 }, { "epoch": 0.5654814814814815, "grad_norm": 2.6909098625183105, "learning_rate": 8.711638250555968e-05, "loss": 1.1565, "step": 3817 }, { "epoch": 0.5656296296296296, "grad_norm": 2.079143762588501, "learning_rate": 8.708673091178651e-05, "loss": 1.0825, "step": 3818 }, { "epoch": 0.5657777777777778, "grad_norm": 4.6160054206848145, "learning_rate": 8.705707931801334e-05, "loss": 0.9953, "step": 3819 }, { "epoch": 0.5659259259259259, "grad_norm": 1.2882026433944702, "learning_rate": 8.702742772424019e-05, "loss": 0.8762, "step": 3820 }, { "epoch": 0.5660740740740741, "grad_norm": 1.4951224327087402, "learning_rate": 8.699777613046702e-05, "loss": 1.1549, "step": 3821 }, { "epoch": 0.5662222222222222, "grad_norm": 1.5525470972061157, "learning_rate": 8.696812453669385e-05, "loss": 1.0891, "step": 3822 }, { "epoch": 0.5663703703703704, "grad_norm": 1.595550775527954, "learning_rate": 8.69384729429207e-05, "loss": 1.0471, "step": 3823 }, { "epoch": 0.5665185185185185, "grad_norm": 1.8549158573150635, "learning_rate": 8.690882134914752e-05, "loss": 1.1468, "step": 3824 }, { "epoch": 
0.5666666666666667, "grad_norm": 1.8177645206451416, "learning_rate": 8.687916975537436e-05, "loss": 1.2461, "step": 3825 }, { "epoch": 0.5668148148148148, "grad_norm": 3.8642890453338623, "learning_rate": 8.68495181616012e-05, "loss": 1.2282, "step": 3826 }, { "epoch": 0.566962962962963, "grad_norm": 2.238409996032715, "learning_rate": 8.681986656782802e-05, "loss": 0.9814, "step": 3827 }, { "epoch": 0.5671111111111111, "grad_norm": 2.2041618824005127, "learning_rate": 8.679021497405486e-05, "loss": 1.1013, "step": 3828 }, { "epoch": 0.5672592592592592, "grad_norm": 2.5299365520477295, "learning_rate": 8.676056338028169e-05, "loss": 1.1499, "step": 3829 }, { "epoch": 0.5674074074074074, "grad_norm": 1.562060832977295, "learning_rate": 8.673091178650852e-05, "loss": 0.8048, "step": 3830 }, { "epoch": 0.5675555555555556, "grad_norm": 1.3858586549758911, "learning_rate": 8.670126019273537e-05, "loss": 1.1815, "step": 3831 }, { "epoch": 0.5677037037037037, "grad_norm": 1.5052539110183716, "learning_rate": 8.66716085989622e-05, "loss": 1.0912, "step": 3832 }, { "epoch": 0.5678518518518518, "grad_norm": 1.2242339849472046, "learning_rate": 8.664195700518903e-05, "loss": 1.014, "step": 3833 }, { "epoch": 0.568, "grad_norm": 2.0522894859313965, "learning_rate": 8.661230541141587e-05, "loss": 0.8479, "step": 3834 }, { "epoch": 0.5681481481481482, "grad_norm": 2.4743332862854004, "learning_rate": 8.65826538176427e-05, "loss": 0.8606, "step": 3835 }, { "epoch": 0.5682962962962963, "grad_norm": 1.8897267580032349, "learning_rate": 8.655300222386954e-05, "loss": 1.1797, "step": 3836 }, { "epoch": 0.5684444444444444, "grad_norm": 1.2706682682037354, "learning_rate": 8.652335063009637e-05, "loss": 1.2249, "step": 3837 }, { "epoch": 0.5685925925925925, "grad_norm": 1.5865893363952637, "learning_rate": 8.649369903632321e-05, "loss": 1.3758, "step": 3838 }, { "epoch": 0.5687407407407408, "grad_norm": 2.2144436836242676, "learning_rate": 8.646404744255004e-05, "loss": 1.1577, 
"step": 3839 }, { "epoch": 0.5688888888888889, "grad_norm": 1.9299513101577759, "learning_rate": 8.643439584877687e-05, "loss": 1.1502, "step": 3840 }, { "epoch": 0.569037037037037, "grad_norm": 2.1713242530822754, "learning_rate": 8.640474425500372e-05, "loss": 0.8978, "step": 3841 }, { "epoch": 0.5691851851851852, "grad_norm": 1.302879810333252, "learning_rate": 8.637509266123055e-05, "loss": 0.8991, "step": 3842 }, { "epoch": 0.5693333333333334, "grad_norm": 1.569977045059204, "learning_rate": 8.634544106745738e-05, "loss": 0.9357, "step": 3843 }, { "epoch": 0.5694814814814815, "grad_norm": 1.9835530519485474, "learning_rate": 8.631578947368421e-05, "loss": 1.0916, "step": 3844 }, { "epoch": 0.5696296296296296, "grad_norm": 1.7564622163772583, "learning_rate": 8.628613787991104e-05, "loss": 1.1367, "step": 3845 }, { "epoch": 0.5697777777777778, "grad_norm": 1.8343626260757446, "learning_rate": 8.625648628613789e-05, "loss": 1.1506, "step": 3846 }, { "epoch": 0.569925925925926, "grad_norm": 1.1550328731536865, "learning_rate": 8.622683469236472e-05, "loss": 0.7496, "step": 3847 }, { "epoch": 0.5700740740740741, "grad_norm": 1.8206971883773804, "learning_rate": 8.619718309859155e-05, "loss": 1.334, "step": 3848 }, { "epoch": 0.5702222222222222, "grad_norm": 1.7791658639907837, "learning_rate": 8.616753150481839e-05, "loss": 1.1134, "step": 3849 }, { "epoch": 0.5703703703703704, "grad_norm": 1.7074779272079468, "learning_rate": 8.613787991104522e-05, "loss": 1.218, "step": 3850 }, { "epoch": 0.5705185185185185, "grad_norm": 1.5398327112197876, "learning_rate": 8.610822831727205e-05, "loss": 1.0318, "step": 3851 }, { "epoch": 0.5706666666666667, "grad_norm": 1.385349154472351, "learning_rate": 8.60785767234989e-05, "loss": 1.0344, "step": 3852 }, { "epoch": 0.5708148148148148, "grad_norm": 3.941965103149414, "learning_rate": 8.604892512972572e-05, "loss": 1.204, "step": 3853 }, { "epoch": 0.570962962962963, "grad_norm": 2.1788978576660156, "learning_rate": 
8.601927353595256e-05, "loss": 1.0896, "step": 3854 }, { "epoch": 0.5711111111111111, "grad_norm": 1.6998543739318848, "learning_rate": 8.59896219421794e-05, "loss": 0.907, "step": 3855 }, { "epoch": 0.5712592592592592, "grad_norm": 1.5642954111099243, "learning_rate": 8.595997034840622e-05, "loss": 1.1863, "step": 3856 }, { "epoch": 0.5714074074074074, "grad_norm": 1.456916332244873, "learning_rate": 8.593031875463307e-05, "loss": 0.9201, "step": 3857 }, { "epoch": 0.5715555555555556, "grad_norm": 1.270564317703247, "learning_rate": 8.590066716085991e-05, "loss": 0.982, "step": 3858 }, { "epoch": 0.5717037037037037, "grad_norm": 1.182762622833252, "learning_rate": 8.587101556708673e-05, "loss": 1.0951, "step": 3859 }, { "epoch": 0.5718518518518518, "grad_norm": 1.4090367555618286, "learning_rate": 8.584136397331357e-05, "loss": 0.9046, "step": 3860 }, { "epoch": 0.572, "grad_norm": 1.2275861501693726, "learning_rate": 8.58117123795404e-05, "loss": 0.993, "step": 3861 }, { "epoch": 0.5721481481481482, "grad_norm": 3.029972791671753, "learning_rate": 8.578206078576723e-05, "loss": 1.0086, "step": 3862 }, { "epoch": 0.5722962962962963, "grad_norm": 3.029181957244873, "learning_rate": 8.575240919199408e-05, "loss": 1.0326, "step": 3863 }, { "epoch": 0.5724444444444444, "grad_norm": 1.5321046113967896, "learning_rate": 8.572275759822091e-05, "loss": 0.9353, "step": 3864 }, { "epoch": 0.5725925925925925, "grad_norm": 4.442076206207275, "learning_rate": 8.569310600444774e-05, "loss": 1.0312, "step": 3865 }, { "epoch": 0.5727407407407408, "grad_norm": 1.3870460987091064, "learning_rate": 8.566345441067458e-05, "loss": 1.0636, "step": 3866 }, { "epoch": 0.5728888888888889, "grad_norm": 1.3915530443191528, "learning_rate": 8.563380281690142e-05, "loss": 0.9028, "step": 3867 }, { "epoch": 0.573037037037037, "grad_norm": 5.211930274963379, "learning_rate": 8.560415122312825e-05, "loss": 1.0612, "step": 3868 }, { "epoch": 0.5731851851851851, "grad_norm": 1.4853953123092651, 
"learning_rate": 8.557449962935509e-05, "loss": 0.9738, "step": 3869 }, { "epoch": 0.5733333333333334, "grad_norm": 1.6683987379074097, "learning_rate": 8.554484803558191e-05, "loss": 1.0143, "step": 3870 }, { "epoch": 0.5734814814814815, "grad_norm": 1.8787882328033447, "learning_rate": 8.551519644180875e-05, "loss": 1.3646, "step": 3871 }, { "epoch": 0.5736296296296296, "grad_norm": 2.6080405712127686, "learning_rate": 8.548554484803558e-05, "loss": 1.2406, "step": 3872 }, { "epoch": 0.5737777777777778, "grad_norm": 1.8387919664382935, "learning_rate": 8.545589325426241e-05, "loss": 1.009, "step": 3873 }, { "epoch": 0.573925925925926, "grad_norm": 1.450384497642517, "learning_rate": 8.542624166048926e-05, "loss": 1.0718, "step": 3874 }, { "epoch": 0.5740740740740741, "grad_norm": 1.6187493801116943, "learning_rate": 8.539659006671609e-05, "loss": 1.2854, "step": 3875 }, { "epoch": 0.5742222222222222, "grad_norm": 1.410619854927063, "learning_rate": 8.536693847294292e-05, "loss": 1.0601, "step": 3876 }, { "epoch": 0.5743703703703704, "grad_norm": 1.3554319143295288, "learning_rate": 8.533728687916977e-05, "loss": 0.8737, "step": 3877 }, { "epoch": 0.5745185185185185, "grad_norm": 1.593678593635559, "learning_rate": 8.53076352853966e-05, "loss": 1.1269, "step": 3878 }, { "epoch": 0.5746666666666667, "grad_norm": 2.516368865966797, "learning_rate": 8.527798369162343e-05, "loss": 0.9533, "step": 3879 }, { "epoch": 0.5748148148148148, "grad_norm": 1.5936317443847656, "learning_rate": 8.524833209785026e-05, "loss": 1.1332, "step": 3880 }, { "epoch": 0.574962962962963, "grad_norm": 2.9010207653045654, "learning_rate": 8.52186805040771e-05, "loss": 0.9723, "step": 3881 }, { "epoch": 0.5751111111111111, "grad_norm": 1.9528433084487915, "learning_rate": 8.518902891030393e-05, "loss": 1.0244, "step": 3882 }, { "epoch": 0.5752592592592592, "grad_norm": 3.864577293395996, "learning_rate": 8.515937731653076e-05, "loss": 1.1317, "step": 3883 }, { "epoch": 0.5754074074074074, 
"grad_norm": 1.400748610496521, "learning_rate": 8.512972572275761e-05, "loss": 1.1082, "step": 3884 }, { "epoch": 0.5755555555555556, "grad_norm": 1.3284187316894531, "learning_rate": 8.510007412898444e-05, "loss": 1.1752, "step": 3885 }, { "epoch": 0.5757037037037037, "grad_norm": 3.269716739654541, "learning_rate": 8.507042253521127e-05, "loss": 1.0402, "step": 3886 }, { "epoch": 0.5758518518518518, "grad_norm": 1.8288670778274536, "learning_rate": 8.50407709414381e-05, "loss": 1.2304, "step": 3887 }, { "epoch": 0.576, "grad_norm": 1.7066706418991089, "learning_rate": 8.501111934766493e-05, "loss": 0.8752, "step": 3888 }, { "epoch": 0.5761481481481482, "grad_norm": 1.2042165994644165, "learning_rate": 8.498146775389178e-05, "loss": 0.9482, "step": 3889 }, { "epoch": 0.5762962962962963, "grad_norm": 1.4405831098556519, "learning_rate": 8.495181616011861e-05, "loss": 0.9983, "step": 3890 }, { "epoch": 0.5764444444444444, "grad_norm": 1.7180677652359009, "learning_rate": 8.492216456634544e-05, "loss": 1.1633, "step": 3891 }, { "epoch": 0.5765925925925925, "grad_norm": 1.9741321802139282, "learning_rate": 8.489251297257228e-05, "loss": 1.0343, "step": 3892 }, { "epoch": 0.5767407407407408, "grad_norm": 1.2920258045196533, "learning_rate": 8.486286137879911e-05, "loss": 0.7823, "step": 3893 }, { "epoch": 0.5768888888888889, "grad_norm": 2.000192642211914, "learning_rate": 8.483320978502595e-05, "loss": 1.1409, "step": 3894 }, { "epoch": 0.577037037037037, "grad_norm": 2.24792742729187, "learning_rate": 8.480355819125279e-05, "loss": 1.3058, "step": 3895 }, { "epoch": 0.5771851851851851, "grad_norm": 1.621907114982605, "learning_rate": 8.477390659747961e-05, "loss": 1.0469, "step": 3896 }, { "epoch": 0.5773333333333334, "grad_norm": 2.199373722076416, "learning_rate": 8.474425500370645e-05, "loss": 1.047, "step": 3897 }, { "epoch": 0.5774814814814815, "grad_norm": 1.3308420181274414, "learning_rate": 8.47146034099333e-05, "loss": 0.9544, "step": 3898 }, { "epoch": 
0.5776296296296296, "grad_norm": 1.9536174535751343, "learning_rate": 8.468495181616011e-05, "loss": 0.9893, "step": 3899 }, { "epoch": 0.5777777777777777, "grad_norm": 1.3404901027679443, "learning_rate": 8.465530022238696e-05, "loss": 1.3704, "step": 3900 }, { "epoch": 0.577925925925926, "grad_norm": 1.9785478115081787, "learning_rate": 8.46256486286138e-05, "loss": 0.6877, "step": 3901 }, { "epoch": 0.5780740740740741, "grad_norm": 1.2857595682144165, "learning_rate": 8.459599703484062e-05, "loss": 0.8752, "step": 3902 }, { "epoch": 0.5782222222222222, "grad_norm": 2.6999502182006836, "learning_rate": 8.456634544106746e-05, "loss": 1.2352, "step": 3903 }, { "epoch": 0.5783703703703704, "grad_norm": 1.5243096351623535, "learning_rate": 8.45366938472943e-05, "loss": 0.8517, "step": 3904 }, { "epoch": 0.5785185185185185, "grad_norm": 26.335729598999023, "learning_rate": 8.450704225352113e-05, "loss": 0.9115, "step": 3905 }, { "epoch": 0.5786666666666667, "grad_norm": 3.49088454246521, "learning_rate": 8.447739065974797e-05, "loss": 1.0258, "step": 3906 }, { "epoch": 0.5788148148148148, "grad_norm": 1.7637948989868164, "learning_rate": 8.44477390659748e-05, "loss": 0.9831, "step": 3907 }, { "epoch": 0.578962962962963, "grad_norm": 1.4845067262649536, "learning_rate": 8.441808747220163e-05, "loss": 0.9553, "step": 3908 }, { "epoch": 0.5791111111111111, "grad_norm": 1.3211807012557983, "learning_rate": 8.438843587842848e-05, "loss": 0.9891, "step": 3909 }, { "epoch": 0.5792592592592593, "grad_norm": 1.7925214767456055, "learning_rate": 8.435878428465531e-05, "loss": 1.1427, "step": 3910 }, { "epoch": 0.5794074074074074, "grad_norm": 1.4540693759918213, "learning_rate": 8.432913269088214e-05, "loss": 1.1146, "step": 3911 }, { "epoch": 0.5795555555555556, "grad_norm": 1.6134746074676514, "learning_rate": 8.429948109710898e-05, "loss": 1.0095, "step": 3912 }, { "epoch": 0.5797037037037037, "grad_norm": 1.8240565061569214, "learning_rate": 8.42698295033358e-05, "loss": 
1.0438, "step": 3913 }, { "epoch": 0.5798518518518518, "grad_norm": 2.772273063659668, "learning_rate": 8.424017790956264e-05, "loss": 1.051, "step": 3914 }, { "epoch": 0.58, "grad_norm": 1.9138096570968628, "learning_rate": 8.421052631578948e-05, "loss": 1.0445, "step": 3915 }, { "epoch": 0.5801481481481482, "grad_norm": 1.457571029663086, "learning_rate": 8.41808747220163e-05, "loss": 1.1765, "step": 3916 }, { "epoch": 0.5802962962962963, "grad_norm": 1.4449666738510132, "learning_rate": 8.415122312824315e-05, "loss": 0.9397, "step": 3917 }, { "epoch": 0.5804444444444444, "grad_norm": 2.546419143676758, "learning_rate": 8.412157153446998e-05, "loss": 1.2447, "step": 3918 }, { "epoch": 0.5805925925925925, "grad_norm": 2.1319639682769775, "learning_rate": 8.409191994069681e-05, "loss": 1.0725, "step": 3919 }, { "epoch": 0.5807407407407408, "grad_norm": 1.8370797634124756, "learning_rate": 8.406226834692366e-05, "loss": 1.1535, "step": 3920 }, { "epoch": 0.5808888888888889, "grad_norm": 1.722662091255188, "learning_rate": 8.403261675315049e-05, "loss": 1.1676, "step": 3921 }, { "epoch": 0.581037037037037, "grad_norm": 2.8595473766326904, "learning_rate": 8.400296515937732e-05, "loss": 1.167, "step": 3922 }, { "epoch": 0.5811851851851851, "grad_norm": 5.709082126617432, "learning_rate": 8.397331356560415e-05, "loss": 1.0609, "step": 3923 }, { "epoch": 0.5813333333333334, "grad_norm": 2.37556791305542, "learning_rate": 8.3943661971831e-05, "loss": 1.0483, "step": 3924 }, { "epoch": 0.5814814814814815, "grad_norm": 2.6315367221832275, "learning_rate": 8.391401037805783e-05, "loss": 0.9119, "step": 3925 }, { "epoch": 0.5816296296296296, "grad_norm": 2.009979009628296, "learning_rate": 8.388435878428466e-05, "loss": 1.2372, "step": 3926 }, { "epoch": 0.5817777777777777, "grad_norm": 2.6860642433166504, "learning_rate": 8.38547071905115e-05, "loss": 1.2327, "step": 3927 }, { "epoch": 0.581925925925926, "grad_norm": 1.9737948179244995, "learning_rate": 
8.382505559673833e-05, "loss": 0.9675, "step": 3928 }, { "epoch": 0.5820740740740741, "grad_norm": 1.1909997463226318, "learning_rate": 8.379540400296516e-05, "loss": 0.9533, "step": 3929 }, { "epoch": 0.5822222222222222, "grad_norm": 1.620888590812683, "learning_rate": 8.3765752409192e-05, "loss": 1.0316, "step": 3930 }, { "epoch": 0.5823703703703703, "grad_norm": 1.3121371269226074, "learning_rate": 8.373610081541882e-05, "loss": 1.0043, "step": 3931 }, { "epoch": 0.5825185185185185, "grad_norm": 1.4426884651184082, "learning_rate": 8.370644922164567e-05, "loss": 0.9521, "step": 3932 }, { "epoch": 0.5826666666666667, "grad_norm": 1.5646693706512451, "learning_rate": 8.36767976278725e-05, "loss": 1.1229, "step": 3933 }, { "epoch": 0.5828148148148148, "grad_norm": 1.7909319400787354, "learning_rate": 8.364714603409933e-05, "loss": 1.0374, "step": 3934 }, { "epoch": 0.582962962962963, "grad_norm": 1.7373473644256592, "learning_rate": 8.361749444032618e-05, "loss": 1.1421, "step": 3935 }, { "epoch": 0.5831111111111111, "grad_norm": 2.039731740951538, "learning_rate": 8.3587842846553e-05, "loss": 0.8892, "step": 3936 }, { "epoch": 0.5832592592592593, "grad_norm": 2.041718006134033, "learning_rate": 8.355819125277984e-05, "loss": 0.9515, "step": 3937 }, { "epoch": 0.5834074074074074, "grad_norm": 1.4242746829986572, "learning_rate": 8.352853965900668e-05, "loss": 0.9712, "step": 3938 }, { "epoch": 0.5835555555555556, "grad_norm": 3.0257763862609863, "learning_rate": 8.34988880652335e-05, "loss": 0.8978, "step": 3939 }, { "epoch": 0.5837037037037037, "grad_norm": 1.5052884817123413, "learning_rate": 8.346923647146034e-05, "loss": 1.1112, "step": 3940 }, { "epoch": 0.5838518518518518, "grad_norm": 1.7193881273269653, "learning_rate": 8.343958487768719e-05, "loss": 0.9639, "step": 3941 }, { "epoch": 0.584, "grad_norm": 1.8135850429534912, "learning_rate": 8.3409933283914e-05, "loss": 1.0764, "step": 3942 }, { "epoch": 0.5841481481481482, "grad_norm": 1.574263572692871, 
"learning_rate": 8.338028169014085e-05, "loss": 1.1113, "step": 3943 }, { "epoch": 0.5842962962962963, "grad_norm": 1.1261448860168457, "learning_rate": 8.33506300963677e-05, "loss": 1.2041, "step": 3944 }, { "epoch": 0.5844444444444444, "grad_norm": 1.75725519657135, "learning_rate": 8.332097850259451e-05, "loss": 1.1305, "step": 3945 }, { "epoch": 0.5845925925925926, "grad_norm": 1.8887760639190674, "learning_rate": 8.329132690882136e-05, "loss": 1.041, "step": 3946 }, { "epoch": 0.5847407407407408, "grad_norm": 1.9972128868103027, "learning_rate": 8.326167531504819e-05, "loss": 1.1726, "step": 3947 }, { "epoch": 0.5848888888888889, "grad_norm": 2.7304162979125977, "learning_rate": 8.323202372127502e-05, "loss": 1.0078, "step": 3948 }, { "epoch": 0.585037037037037, "grad_norm": 1.2695363759994507, "learning_rate": 8.320237212750186e-05, "loss": 1.2488, "step": 3949 }, { "epoch": 0.5851851851851851, "grad_norm": 1.4677480459213257, "learning_rate": 8.317272053372869e-05, "loss": 0.9995, "step": 3950 }, { "epoch": 0.5853333333333334, "grad_norm": 1.2333437204360962, "learning_rate": 8.314306893995552e-05, "loss": 0.9881, "step": 3951 }, { "epoch": 0.5854814814814815, "grad_norm": 1.8984434604644775, "learning_rate": 8.311341734618237e-05, "loss": 0.8546, "step": 3952 }, { "epoch": 0.5856296296296296, "grad_norm": 1.391685962677002, "learning_rate": 8.30837657524092e-05, "loss": 1.0106, "step": 3953 }, { "epoch": 0.5857777777777777, "grad_norm": 1.926734209060669, "learning_rate": 8.305411415863603e-05, "loss": 1.1263, "step": 3954 }, { "epoch": 0.585925925925926, "grad_norm": 1.428473949432373, "learning_rate": 8.302446256486287e-05, "loss": 1.0882, "step": 3955 }, { "epoch": 0.5860740740740741, "grad_norm": 2.3048830032348633, "learning_rate": 8.299481097108969e-05, "loss": 0.6306, "step": 3956 }, { "epoch": 0.5862222222222222, "grad_norm": 1.389608383178711, "learning_rate": 8.296515937731654e-05, "loss": 0.9708, "step": 3957 }, { "epoch": 0.5863703703703703, 
"grad_norm": 1.9436511993408203, "learning_rate": 8.293550778354337e-05, "loss": 0.9615, "step": 3958 }, { "epoch": 0.5865185185185186, "grad_norm": 2.1171786785125732, "learning_rate": 8.29058561897702e-05, "loss": 1.2669, "step": 3959 }, { "epoch": 0.5866666666666667, "grad_norm": 1.4744733572006226, "learning_rate": 8.287620459599704e-05, "loss": 1.185, "step": 3960 }, { "epoch": 0.5868148148148148, "grad_norm": 1.9858527183532715, "learning_rate": 8.284655300222387e-05, "loss": 1.0073, "step": 3961 }, { "epoch": 0.5869629629629629, "grad_norm": 1.8568344116210938, "learning_rate": 8.28169014084507e-05, "loss": 1.0161, "step": 3962 }, { "epoch": 0.5871111111111111, "grad_norm": 1.805935025215149, "learning_rate": 8.278724981467755e-05, "loss": 1.0971, "step": 3963 }, { "epoch": 0.5872592592592593, "grad_norm": 1.8495519161224365, "learning_rate": 8.275759822090438e-05, "loss": 1.1824, "step": 3964 }, { "epoch": 0.5874074074074074, "grad_norm": 1.497774362564087, "learning_rate": 8.272794662713121e-05, "loss": 1.063, "step": 3965 }, { "epoch": 0.5875555555555556, "grad_norm": 1.5078151226043701, "learning_rate": 8.269829503335804e-05, "loss": 1.0814, "step": 3966 }, { "epoch": 0.5877037037037037, "grad_norm": 1.7053334712982178, "learning_rate": 8.266864343958489e-05, "loss": 1.0877, "step": 3967 }, { "epoch": 0.5878518518518518, "grad_norm": 1.7991783618927002, "learning_rate": 8.263899184581172e-05, "loss": 0.8419, "step": 3968 }, { "epoch": 0.588, "grad_norm": 1.92278254032135, "learning_rate": 8.260934025203855e-05, "loss": 1.2756, "step": 3969 }, { "epoch": 0.5881481481481482, "grad_norm": 1.3542948961257935, "learning_rate": 8.257968865826539e-05, "loss": 0.8694, "step": 3970 }, { "epoch": 0.5882962962962963, "grad_norm": 1.9901429414749146, "learning_rate": 8.255003706449222e-05, "loss": 0.931, "step": 3971 }, { "epoch": 0.5884444444444444, "grad_norm": 1.8600529432296753, "learning_rate": 8.252038547071905e-05, "loss": 1.0207, "step": 3972 }, { "epoch": 
0.5885925925925926, "grad_norm": 1.4098716974258423, "learning_rate": 8.249073387694589e-05, "loss": 1.0243, "step": 3973 }, { "epoch": 0.5887407407407408, "grad_norm": 1.8596630096435547, "learning_rate": 8.246108228317272e-05, "loss": 1.0433, "step": 3974 }, { "epoch": 0.5888888888888889, "grad_norm": 1.580466628074646, "learning_rate": 8.243143068939956e-05, "loss": 1.1893, "step": 3975 }, { "epoch": 0.589037037037037, "grad_norm": 1.5999795198440552, "learning_rate": 8.240177909562639e-05, "loss": 1.0266, "step": 3976 }, { "epoch": 0.5891851851851851, "grad_norm": 2.5227270126342773, "learning_rate": 8.237212750185322e-05, "loss": 1.1009, "step": 3977 }, { "epoch": 0.5893333333333334, "grad_norm": 1.55117666721344, "learning_rate": 8.234247590808007e-05, "loss": 1.0404, "step": 3978 }, { "epoch": 0.5894814814814815, "grad_norm": 1.7022956609725952, "learning_rate": 8.23128243143069e-05, "loss": 1.0096, "step": 3979 }, { "epoch": 0.5896296296296296, "grad_norm": 1.8657565116882324, "learning_rate": 8.228317272053373e-05, "loss": 1.0103, "step": 3980 }, { "epoch": 0.5897777777777777, "grad_norm": 2.14446759223938, "learning_rate": 8.225352112676057e-05, "loss": 1.0509, "step": 3981 }, { "epoch": 0.589925925925926, "grad_norm": 1.3459311723709106, "learning_rate": 8.222386953298739e-05, "loss": 0.9706, "step": 3982 }, { "epoch": 0.5900740740740741, "grad_norm": 1.5558202266693115, "learning_rate": 8.219421793921423e-05, "loss": 1.0645, "step": 3983 }, { "epoch": 0.5902222222222222, "grad_norm": 1.6945942640304565, "learning_rate": 8.216456634544108e-05, "loss": 1.3269, "step": 3984 }, { "epoch": 0.5903703703703703, "grad_norm": 1.5880628824234009, "learning_rate": 8.21349147516679e-05, "loss": 0.9565, "step": 3985 }, { "epoch": 0.5905185185185186, "grad_norm": 1.7825582027435303, "learning_rate": 8.210526315789474e-05, "loss": 0.8963, "step": 3986 }, { "epoch": 0.5906666666666667, "grad_norm": 1.5919945240020752, "learning_rate": 8.207561156412159e-05, "loss": 
0.9176, "step": 3987 }, { "epoch": 0.5908148148148148, "grad_norm": 1.5249149799346924, "learning_rate": 8.20459599703484e-05, "loss": 0.9807, "step": 3988 }, { "epoch": 0.5909629629629629, "grad_norm": 5.834096908569336, "learning_rate": 8.201630837657525e-05, "loss": 0.9669, "step": 3989 }, { "epoch": 0.5911111111111111, "grad_norm": 1.2772576808929443, "learning_rate": 8.198665678280208e-05, "loss": 1.0903, "step": 3990 }, { "epoch": 0.5912592592592593, "grad_norm": 1.7056981325149536, "learning_rate": 8.195700518902891e-05, "loss": 0.9943, "step": 3991 }, { "epoch": 0.5914074074074074, "grad_norm": 2.801191806793213, "learning_rate": 8.192735359525575e-05, "loss": 1.1217, "step": 3992 }, { "epoch": 0.5915555555555555, "grad_norm": 1.6783663034439087, "learning_rate": 8.189770200148258e-05, "loss": 1.0719, "step": 3993 }, { "epoch": 0.5917037037037037, "grad_norm": 2.6965856552124023, "learning_rate": 8.186805040770942e-05, "loss": 0.9692, "step": 3994 }, { "epoch": 0.5918518518518519, "grad_norm": 2.0375962257385254, "learning_rate": 8.183839881393626e-05, "loss": 1.2247, "step": 3995 }, { "epoch": 0.592, "grad_norm": 1.5867644548416138, "learning_rate": 8.180874722016309e-05, "loss": 1.1221, "step": 3996 }, { "epoch": 0.5921481481481482, "grad_norm": 1.8249834775924683, "learning_rate": 8.177909562638992e-05, "loss": 1.0135, "step": 3997 }, { "epoch": 0.5922962962962963, "grad_norm": 2.1309573650360107, "learning_rate": 8.174944403261677e-05, "loss": 1.1323, "step": 3998 }, { "epoch": 0.5924444444444444, "grad_norm": 2.425473690032959, "learning_rate": 8.171979243884358e-05, "loss": 0.971, "step": 3999 }, { "epoch": 0.5925925925925926, "grad_norm": 1.6478471755981445, "learning_rate": 8.169014084507043e-05, "loss": 0.9494, "step": 4000 }, { "epoch": 0.5927407407407408, "grad_norm": 1.8858792781829834, "learning_rate": 8.166048925129726e-05, "loss": 1.0459, "step": 4001 }, { "epoch": 0.5928888888888889, "grad_norm": 1.6564253568649292, "learning_rate": 
8.163083765752409e-05, "loss": 0.9786, "step": 4002 }, { "epoch": 0.593037037037037, "grad_norm": 1.7090998888015747, "learning_rate": 8.160118606375093e-05, "loss": 0.924, "step": 4003 }, { "epoch": 0.5931851851851851, "grad_norm": 1.7010623216629028, "learning_rate": 8.157153446997777e-05, "loss": 0.966, "step": 4004 }, { "epoch": 0.5933333333333334, "grad_norm": 1.4552000761032104, "learning_rate": 8.15418828762046e-05, "loss": 0.8709, "step": 4005 }, { "epoch": 0.5934814814814815, "grad_norm": 2.0067336559295654, "learning_rate": 8.151223128243144e-05, "loss": 1.0272, "step": 4006 }, { "epoch": 0.5936296296296296, "grad_norm": 1.1080560684204102, "learning_rate": 8.148257968865827e-05, "loss": 1.0738, "step": 4007 }, { "epoch": 0.5937777777777777, "grad_norm": 1.604054570198059, "learning_rate": 8.14529280948851e-05, "loss": 1.0267, "step": 4008 }, { "epoch": 0.593925925925926, "grad_norm": 3.8693463802337646, "learning_rate": 8.142327650111193e-05, "loss": 1.1424, "step": 4009 }, { "epoch": 0.5940740740740741, "grad_norm": 1.270259141921997, "learning_rate": 8.139362490733878e-05, "loss": 1.0009, "step": 4010 }, { "epoch": 0.5942222222222222, "grad_norm": 1.2867693901062012, "learning_rate": 8.136397331356561e-05, "loss": 0.7596, "step": 4011 }, { "epoch": 0.5943703703703703, "grad_norm": 2.2901933193206787, "learning_rate": 8.133432171979244e-05, "loss": 1.0978, "step": 4012 }, { "epoch": 0.5945185185185186, "grad_norm": 1.6102063655853271, "learning_rate": 8.130467012601928e-05, "loss": 1.1288, "step": 4013 }, { "epoch": 0.5946666666666667, "grad_norm": 1.5397627353668213, "learning_rate": 8.127501853224611e-05, "loss": 1.0938, "step": 4014 }, { "epoch": 0.5948148148148148, "grad_norm": 1.4628108739852905, "learning_rate": 8.124536693847295e-05, "loss": 0.9546, "step": 4015 }, { "epoch": 0.5949629629629629, "grad_norm": 1.4136143922805786, "learning_rate": 8.121571534469978e-05, "loss": 0.9113, "step": 4016 }, { "epoch": 0.5951111111111111, "grad_norm": 
1.436536431312561, "learning_rate": 8.118606375092661e-05, "loss": 1.1298, "step": 4017 }, { "epoch": 0.5952592592592593, "grad_norm": 1.4656097888946533, "learning_rate": 8.115641215715345e-05, "loss": 1.1187, "step": 4018 }, { "epoch": 0.5954074074074074, "grad_norm": 1.8365551233291626, "learning_rate": 8.112676056338028e-05, "loss": 1.0314, "step": 4019 }, { "epoch": 0.5955555555555555, "grad_norm": 1.4153543710708618, "learning_rate": 8.109710896960711e-05, "loss": 1.1519, "step": 4020 }, { "epoch": 0.5957037037037037, "grad_norm": 1.3734936714172363, "learning_rate": 8.106745737583396e-05, "loss": 1.3338, "step": 4021 }, { "epoch": 0.5958518518518519, "grad_norm": 2.5461981296539307, "learning_rate": 8.103780578206079e-05, "loss": 1.2022, "step": 4022 }, { "epoch": 0.596, "grad_norm": 2.483335256576538, "learning_rate": 8.100815418828762e-05, "loss": 1.0438, "step": 4023 }, { "epoch": 0.5961481481481481, "grad_norm": 1.7527753114700317, "learning_rate": 8.097850259451446e-05, "loss": 1.0306, "step": 4024 }, { "epoch": 0.5962962962962963, "grad_norm": 1.9566283226013184, "learning_rate": 8.09488510007413e-05, "loss": 0.8303, "step": 4025 }, { "epoch": 0.5964444444444444, "grad_norm": 2.9948341846466064, "learning_rate": 8.091919940696813e-05, "loss": 0.9456, "step": 4026 }, { "epoch": 0.5965925925925926, "grad_norm": 1.8734164237976074, "learning_rate": 8.088954781319497e-05, "loss": 1.0004, "step": 4027 }, { "epoch": 0.5967407407407408, "grad_norm": 2.0065784454345703, "learning_rate": 8.085989621942179e-05, "loss": 0.9237, "step": 4028 }, { "epoch": 0.5968888888888889, "grad_norm": 1.6082143783569336, "learning_rate": 8.083024462564863e-05, "loss": 0.9663, "step": 4029 }, { "epoch": 0.597037037037037, "grad_norm": 1.461788296699524, "learning_rate": 8.080059303187548e-05, "loss": 1.0964, "step": 4030 }, { "epoch": 0.5971851851851852, "grad_norm": 1.3981989622116089, "learning_rate": 8.07709414381023e-05, "loss": 0.9772, "step": 4031 }, { "epoch": 
0.5973333333333334, "grad_norm": 2.0816566944122314, "learning_rate": 8.074128984432914e-05, "loss": 0.8227, "step": 4032 }, { "epoch": 0.5974814814814815, "grad_norm": 2.549114465713501, "learning_rate": 8.071163825055597e-05, "loss": 1.1723, "step": 4033 }, { "epoch": 0.5976296296296296, "grad_norm": 2.281337022781372, "learning_rate": 8.06819866567828e-05, "loss": 1.1551, "step": 4034 }, { "epoch": 0.5977777777777777, "grad_norm": 1.365309238433838, "learning_rate": 8.065233506300965e-05, "loss": 1.0515, "step": 4035 }, { "epoch": 0.597925925925926, "grad_norm": 1.4237347841262817, "learning_rate": 8.062268346923648e-05, "loss": 1.0027, "step": 4036 }, { "epoch": 0.5980740740740741, "grad_norm": 1.2399948835372925, "learning_rate": 8.059303187546331e-05, "loss": 0.9247, "step": 4037 }, { "epoch": 0.5982222222222222, "grad_norm": 1.2645602226257324, "learning_rate": 8.056338028169015e-05, "loss": 0.7925, "step": 4038 }, { "epoch": 0.5983703703703703, "grad_norm": 1.2598143815994263, "learning_rate": 8.053372868791698e-05, "loss": 0.8836, "step": 4039 }, { "epoch": 0.5985185185185186, "grad_norm": 1.4593044519424438, "learning_rate": 8.050407709414381e-05, "loss": 1.1674, "step": 4040 }, { "epoch": 0.5986666666666667, "grad_norm": 2.0067574977874756, "learning_rate": 8.047442550037066e-05, "loss": 1.3531, "step": 4041 }, { "epoch": 0.5988148148148148, "grad_norm": 1.540021538734436, "learning_rate": 8.044477390659748e-05, "loss": 0.8199, "step": 4042 }, { "epoch": 0.5989629629629629, "grad_norm": 1.3369797468185425, "learning_rate": 8.041512231282432e-05, "loss": 0.9617, "step": 4043 }, { "epoch": 0.5991111111111111, "grad_norm": 2.357513666152954, "learning_rate": 8.038547071905115e-05, "loss": 1.0855, "step": 4044 }, { "epoch": 0.5992592592592593, "grad_norm": 3.003789186477661, "learning_rate": 8.035581912527798e-05, "loss": 1.1802, "step": 4045 }, { "epoch": 0.5994074074074074, "grad_norm": 1.4910309314727783, "learning_rate": 8.032616753150483e-05, "loss": 
1.0519, "step": 4046 }, { "epoch": 0.5995555555555555, "grad_norm": 1.4305469989776611, "learning_rate": 8.029651593773166e-05, "loss": 1.0129, "step": 4047 }, { "epoch": 0.5997037037037037, "grad_norm": 1.8170634508132935, "learning_rate": 8.026686434395849e-05, "loss": 0.8642, "step": 4048 }, { "epoch": 0.5998518518518519, "grad_norm": 1.469193935394287, "learning_rate": 8.023721275018533e-05, "loss": 1.0446, "step": 4049 }, { "epoch": 0.6, "grad_norm": 1.5894759893417358, "learning_rate": 8.020756115641216e-05, "loss": 0.9202, "step": 4050 }, { "epoch": 0.6001481481481481, "grad_norm": 1.668437123298645, "learning_rate": 8.0177909562639e-05, "loss": 0.8982, "step": 4051 }, { "epoch": 0.6002962962962963, "grad_norm": 1.35999596118927, "learning_rate": 8.014825796886582e-05, "loss": 0.9149, "step": 4052 }, { "epoch": 0.6004444444444444, "grad_norm": 1.5806087255477905, "learning_rate": 8.011860637509267e-05, "loss": 1.1388, "step": 4053 }, { "epoch": 0.6005925925925926, "grad_norm": 1.7249394655227661, "learning_rate": 8.00889547813195e-05, "loss": 1.1687, "step": 4054 }, { "epoch": 0.6007407407407407, "grad_norm": 1.5930378437042236, "learning_rate": 8.005930318754633e-05, "loss": 1.2331, "step": 4055 }, { "epoch": 0.6008888888888889, "grad_norm": 1.7165197134017944, "learning_rate": 8.002965159377318e-05, "loss": 1.1119, "step": 4056 }, { "epoch": 0.601037037037037, "grad_norm": 2.0387492179870605, "learning_rate": 8e-05, "loss": 0.9951, "step": 4057 }, { "epoch": 0.6011851851851852, "grad_norm": 2.3698105812072754, "learning_rate": 7.997034840622684e-05, "loss": 1.0545, "step": 4058 }, { "epoch": 0.6013333333333334, "grad_norm": 1.5965845584869385, "learning_rate": 7.994069681245367e-05, "loss": 0.9012, "step": 4059 }, { "epoch": 0.6014814814814815, "grad_norm": 2.0365452766418457, "learning_rate": 7.99110452186805e-05, "loss": 1.1274, "step": 4060 }, { "epoch": 0.6016296296296296, "grad_norm": 1.958509087562561, "learning_rate": 7.988139362490734e-05, "loss": 
1.1167, "step": 4061 }, { "epoch": 0.6017777777777777, "grad_norm": 2.1451406478881836, "learning_rate": 7.985174203113417e-05, "loss": 1.3345, "step": 4062 }, { "epoch": 0.601925925925926, "grad_norm": 1.3491348028182983, "learning_rate": 7.9822090437361e-05, "loss": 1.0724, "step": 4063 }, { "epoch": 0.6020740740740741, "grad_norm": 2.01298451423645, "learning_rate": 7.979243884358785e-05, "loss": 0.8657, "step": 4064 }, { "epoch": 0.6022222222222222, "grad_norm": 1.8076258897781372, "learning_rate": 7.976278724981468e-05, "loss": 0.8764, "step": 4065 }, { "epoch": 0.6023703703703703, "grad_norm": 2.361900806427002, "learning_rate": 7.973313565604151e-05, "loss": 1.2356, "step": 4066 }, { "epoch": 0.6025185185185186, "grad_norm": 1.8414537906646729, "learning_rate": 7.970348406226836e-05, "loss": 1.0375, "step": 4067 }, { "epoch": 0.6026666666666667, "grad_norm": 2.9812514781951904, "learning_rate": 7.967383246849519e-05, "loss": 1.0114, "step": 4068 }, { "epoch": 0.6028148148148148, "grad_norm": 2.112748861312866, "learning_rate": 7.964418087472202e-05, "loss": 1.0682, "step": 4069 }, { "epoch": 0.6029629629629629, "grad_norm": 2.281156063079834, "learning_rate": 7.961452928094886e-05, "loss": 1.0197, "step": 4070 }, { "epoch": 0.6031111111111112, "grad_norm": 1.7563687562942505, "learning_rate": 7.958487768717568e-05, "loss": 1.1047, "step": 4071 }, { "epoch": 0.6032592592592593, "grad_norm": 1.8988922834396362, "learning_rate": 7.955522609340252e-05, "loss": 1.1074, "step": 4072 }, { "epoch": 0.6034074074074074, "grad_norm": 1.6233725547790527, "learning_rate": 7.952557449962937e-05, "loss": 1.1455, "step": 4073 }, { "epoch": 0.6035555555555555, "grad_norm": 5.494024276733398, "learning_rate": 7.949592290585619e-05, "loss": 1.0763, "step": 4074 }, { "epoch": 0.6037037037037037, "grad_norm": 2.2812435626983643, "learning_rate": 7.946627131208303e-05, "loss": 1.0217, "step": 4075 }, { "epoch": 0.6038518518518519, "grad_norm": 1.5280718803405762, "learning_rate": 
7.943661971830986e-05, "loss": 1.1622, "step": 4076 }, { "epoch": 0.604, "grad_norm": 3.2437915802001953, "learning_rate": 7.940696812453669e-05, "loss": 0.8585, "step": 4077 }, { "epoch": 0.6041481481481481, "grad_norm": 1.639431357383728, "learning_rate": 7.937731653076354e-05, "loss": 1.2248, "step": 4078 }, { "epoch": 0.6042962962962963, "grad_norm": 2.29898738861084, "learning_rate": 7.934766493699037e-05, "loss": 1.0092, "step": 4079 }, { "epoch": 0.6044444444444445, "grad_norm": 1.4584858417510986, "learning_rate": 7.93180133432172e-05, "loss": 0.9846, "step": 4080 }, { "epoch": 0.6045925925925926, "grad_norm": 1.757988691329956, "learning_rate": 7.928836174944404e-05, "loss": 1.1713, "step": 4081 }, { "epoch": 0.6047407407407407, "grad_norm": 1.628948450088501, "learning_rate": 7.925871015567087e-05, "loss": 1.0576, "step": 4082 }, { "epoch": 0.6048888888888889, "grad_norm": 1.3002852201461792, "learning_rate": 7.92290585618977e-05, "loss": 1.0973, "step": 4083 }, { "epoch": 0.605037037037037, "grad_norm": 1.8399114608764648, "learning_rate": 7.919940696812455e-05, "loss": 0.8469, "step": 4084 }, { "epoch": 0.6051851851851852, "grad_norm": 2.028707504272461, "learning_rate": 7.916975537435137e-05, "loss": 1.1424, "step": 4085 }, { "epoch": 0.6053333333333333, "grad_norm": 1.6397368907928467, "learning_rate": 7.914010378057821e-05, "loss": 1.3036, "step": 4086 }, { "epoch": 0.6054814814814815, "grad_norm": 2.5899932384490967, "learning_rate": 7.911045218680504e-05, "loss": 0.8892, "step": 4087 }, { "epoch": 0.6056296296296296, "grad_norm": 4.704237461090088, "learning_rate": 7.908080059303187e-05, "loss": 1.246, "step": 4088 }, { "epoch": 0.6057777777777777, "grad_norm": 1.4413715600967407, "learning_rate": 7.905114899925872e-05, "loss": 0.9217, "step": 4089 }, { "epoch": 0.605925925925926, "grad_norm": 2.4111835956573486, "learning_rate": 7.902149740548555e-05, "loss": 1.1111, "step": 4090 }, { "epoch": 0.6060740740740741, "grad_norm": 1.3525553941726685, 
"learning_rate": 7.899184581171238e-05, "loss": 0.866, "step": 4091 }, { "epoch": 0.6062222222222222, "grad_norm": 1.4736638069152832, "learning_rate": 7.896219421793922e-05, "loss": 1.0034, "step": 4092 }, { "epoch": 0.6063703703703703, "grad_norm": 3.423008680343628, "learning_rate": 7.893254262416605e-05, "loss": 1.1446, "step": 4093 }, { "epoch": 0.6065185185185186, "grad_norm": 2.5786850452423096, "learning_rate": 7.890289103039289e-05, "loss": 1.1736, "step": 4094 }, { "epoch": 0.6066666666666667, "grad_norm": 1.7646931409835815, "learning_rate": 7.887323943661972e-05, "loss": 1.1437, "step": 4095 }, { "epoch": 0.6068148148148148, "grad_norm": 1.339115858078003, "learning_rate": 7.884358784284656e-05, "loss": 1.0239, "step": 4096 }, { "epoch": 0.6069629629629629, "grad_norm": 1.8531190156936646, "learning_rate": 7.881393624907339e-05, "loss": 1.0883, "step": 4097 }, { "epoch": 0.6071111111111112, "grad_norm": 1.5783220529556274, "learning_rate": 7.878428465530022e-05, "loss": 0.9258, "step": 4098 }, { "epoch": 0.6072592592592593, "grad_norm": 1.628716230392456, "learning_rate": 7.875463306152707e-05, "loss": 1.0058, "step": 4099 }, { "epoch": 0.6074074074074074, "grad_norm": 1.6141797304153442, "learning_rate": 7.87249814677539e-05, "loss": 0.9564, "step": 4100 }, { "epoch": 0.6075555555555555, "grad_norm": 2.645317554473877, "learning_rate": 7.869532987398073e-05, "loss": 0.9276, "step": 4101 }, { "epoch": 0.6077037037037037, "grad_norm": 1.5307976007461548, "learning_rate": 7.866567828020756e-05, "loss": 0.9949, "step": 4102 }, { "epoch": 0.6078518518518519, "grad_norm": 1.6276675462722778, "learning_rate": 7.863602668643439e-05, "loss": 1.0947, "step": 4103 }, { "epoch": 0.608, "grad_norm": 2.62444806098938, "learning_rate": 7.860637509266124e-05, "loss": 1.1976, "step": 4104 }, { "epoch": 0.6081481481481481, "grad_norm": 1.280867099761963, "learning_rate": 7.857672349888807e-05, "loss": 0.9916, "step": 4105 }, { "epoch": 0.6082962962962963, "grad_norm": 
1.7362924814224243, "learning_rate": 7.85470719051149e-05, "loss": 1.379, "step": 4106 }, { "epoch": 0.6084444444444445, "grad_norm": 1.80226731300354, "learning_rate": 7.851742031134174e-05, "loss": 1.1435, "step": 4107 }, { "epoch": 0.6085925925925926, "grad_norm": 9.848043441772461, "learning_rate": 7.848776871756857e-05, "loss": 1.0301, "step": 4108 }, { "epoch": 0.6087407407407407, "grad_norm": 1.266819953918457, "learning_rate": 7.84581171237954e-05, "loss": 0.917, "step": 4109 }, { "epoch": 0.6088888888888889, "grad_norm": 1.533785104751587, "learning_rate": 7.842846553002225e-05, "loss": 0.9731, "step": 4110 }, { "epoch": 0.609037037037037, "grad_norm": 3.039918899536133, "learning_rate": 7.839881393624908e-05, "loss": 1.0057, "step": 4111 }, { "epoch": 0.6091851851851852, "grad_norm": 1.6496689319610596, "learning_rate": 7.836916234247591e-05, "loss": 0.9676, "step": 4112 }, { "epoch": 0.6093333333333333, "grad_norm": 2.629190444946289, "learning_rate": 7.833951074870275e-05, "loss": 0.9329, "step": 4113 }, { "epoch": 0.6094814814814815, "grad_norm": 1.5872153043746948, "learning_rate": 7.830985915492957e-05, "loss": 1.3053, "step": 4114 }, { "epoch": 0.6096296296296296, "grad_norm": 2.0701448917388916, "learning_rate": 7.828020756115642e-05, "loss": 1.0366, "step": 4115 }, { "epoch": 0.6097777777777778, "grad_norm": 2.4702115058898926, "learning_rate": 7.825055596738326e-05, "loss": 1.2577, "step": 4116 }, { "epoch": 0.6099259259259259, "grad_norm": 1.9070110321044922, "learning_rate": 7.822090437361008e-05, "loss": 0.8908, "step": 4117 }, { "epoch": 0.6100740740740741, "grad_norm": 1.2122715711593628, "learning_rate": 7.819125277983692e-05, "loss": 0.9514, "step": 4118 }, { "epoch": 0.6102222222222222, "grad_norm": 2.643510580062866, "learning_rate": 7.816160118606375e-05, "loss": 1.1475, "step": 4119 }, { "epoch": 0.6103703703703703, "grad_norm": 1.505334734916687, "learning_rate": 7.813194959229058e-05, "loss": 1.0381, "step": 4120 }, { "epoch": 
0.6105185185185186, "grad_norm": 1.9772828817367554, "learning_rate": 7.810229799851743e-05, "loss": 1.0905, "step": 4121 }, { "epoch": 0.6106666666666667, "grad_norm": 1.8512394428253174, "learning_rate": 7.807264640474426e-05, "loss": 0.8946, "step": 4122 }, { "epoch": 0.6108148148148148, "grad_norm": 1.8181568384170532, "learning_rate": 7.804299481097109e-05, "loss": 1.1931, "step": 4123 }, { "epoch": 0.6109629629629629, "grad_norm": 2.4302375316619873, "learning_rate": 7.801334321719793e-05, "loss": 1.0536, "step": 4124 }, { "epoch": 0.6111111111111112, "grad_norm": 2.5360796451568604, "learning_rate": 7.798369162342477e-05, "loss": 1.1119, "step": 4125 }, { "epoch": 0.6112592592592593, "grad_norm": 2.764610767364502, "learning_rate": 7.79540400296516e-05, "loss": 1.0809, "step": 4126 }, { "epoch": 0.6114074074074074, "grad_norm": 8.232843399047852, "learning_rate": 7.792438843587844e-05, "loss": 1.0302, "step": 4127 }, { "epoch": 0.6115555555555555, "grad_norm": 2.0853981971740723, "learning_rate": 7.789473684210526e-05, "loss": 1.1032, "step": 4128 }, { "epoch": 0.6117037037037037, "grad_norm": 4.052962779998779, "learning_rate": 7.78650852483321e-05, "loss": 0.9787, "step": 4129 }, { "epoch": 0.6118518518518519, "grad_norm": 2.480470895767212, "learning_rate": 7.783543365455893e-05, "loss": 0.9696, "step": 4130 }, { "epoch": 0.612, "grad_norm": 4.445143699645996, "learning_rate": 7.780578206078576e-05, "loss": 1.2104, "step": 4131 }, { "epoch": 0.6121481481481481, "grad_norm": 2.423997402191162, "learning_rate": 7.777613046701261e-05, "loss": 0.8823, "step": 4132 }, { "epoch": 0.6122962962962963, "grad_norm": 1.5796433687210083, "learning_rate": 7.774647887323944e-05, "loss": 1.1221, "step": 4133 }, { "epoch": 0.6124444444444445, "grad_norm": 1.7948166131973267, "learning_rate": 7.771682727946627e-05, "loss": 0.9861, "step": 4134 }, { "epoch": 0.6125925925925926, "grad_norm": 2.499213933944702, "learning_rate": 7.768717568569312e-05, "loss": 0.9916, "step": 
4135 }, { "epoch": 0.6127407407407407, "grad_norm": 2.29291033744812, "learning_rate": 7.765752409191995e-05, "loss": 1.0726, "step": 4136 }, { "epoch": 0.6128888888888889, "grad_norm": 2.7643020153045654, "learning_rate": 7.762787249814678e-05, "loss": 0.9617, "step": 4137 }, { "epoch": 0.613037037037037, "grad_norm": 1.8919548988342285, "learning_rate": 7.759822090437361e-05, "loss": 0.9031, "step": 4138 }, { "epoch": 0.6131851851851852, "grad_norm": 1.8989598751068115, "learning_rate": 7.756856931060045e-05, "loss": 1.0372, "step": 4139 }, { "epoch": 0.6133333333333333, "grad_norm": 1.5658788681030273, "learning_rate": 7.753891771682728e-05, "loss": 1.273, "step": 4140 }, { "epoch": 0.6134814814814815, "grad_norm": 2.625537872314453, "learning_rate": 7.750926612305411e-05, "loss": 1.0783, "step": 4141 }, { "epoch": 0.6136296296296296, "grad_norm": 2.4267075061798096, "learning_rate": 7.747961452928096e-05, "loss": 1.094, "step": 4142 }, { "epoch": 0.6137777777777778, "grad_norm": 4.063164710998535, "learning_rate": 7.744996293550779e-05, "loss": 0.9473, "step": 4143 }, { "epoch": 0.6139259259259259, "grad_norm": 2.0295581817626953, "learning_rate": 7.742031134173462e-05, "loss": 0.7895, "step": 4144 }, { "epoch": 0.6140740740740741, "grad_norm": 1.3276896476745605, "learning_rate": 7.739065974796145e-05, "loss": 0.8924, "step": 4145 }, { "epoch": 0.6142222222222222, "grad_norm": 1.562353491783142, "learning_rate": 7.736100815418828e-05, "loss": 0.9593, "step": 4146 }, { "epoch": 0.6143703703703703, "grad_norm": 1.0941139459609985, "learning_rate": 7.733135656041513e-05, "loss": 1.0222, "step": 4147 }, { "epoch": 0.6145185185185185, "grad_norm": 2.2305283546447754, "learning_rate": 7.730170496664196e-05, "loss": 1.1955, "step": 4148 }, { "epoch": 0.6146666666666667, "grad_norm": 1.459812045097351, "learning_rate": 7.727205337286879e-05, "loss": 1.198, "step": 4149 }, { "epoch": 0.6148148148148148, "grad_norm": 1.284201979637146, "learning_rate": 
7.724240177909563e-05, "loss": 1.1772, "step": 4150 }, { "epoch": 0.6149629629629629, "grad_norm": 1.9952270984649658, "learning_rate": 7.721275018532246e-05, "loss": 1.0394, "step": 4151 }, { "epoch": 0.6151111111111112, "grad_norm": 2.0768494606018066, "learning_rate": 7.71830985915493e-05, "loss": 1.1319, "step": 4152 }, { "epoch": 0.6152592592592593, "grad_norm": 2.1895029544830322, "learning_rate": 7.715344699777614e-05, "loss": 1.1949, "step": 4153 }, { "epoch": 0.6154074074074074, "grad_norm": 1.666520118713379, "learning_rate": 7.712379540400297e-05, "loss": 0.9457, "step": 4154 }, { "epoch": 0.6155555555555555, "grad_norm": 2.864825963973999, "learning_rate": 7.70941438102298e-05, "loss": 1.1301, "step": 4155 }, { "epoch": 0.6157037037037038, "grad_norm": 1.4113929271697998, "learning_rate": 7.706449221645665e-05, "loss": 1.0233, "step": 4156 }, { "epoch": 0.6158518518518519, "grad_norm": 1.3558247089385986, "learning_rate": 7.703484062268346e-05, "loss": 0.836, "step": 4157 }, { "epoch": 0.616, "grad_norm": 2.3334364891052246, "learning_rate": 7.700518902891031e-05, "loss": 0.9674, "step": 4158 }, { "epoch": 0.6161481481481481, "grad_norm": 1.3023676872253418, "learning_rate": 7.697553743513715e-05, "loss": 0.9478, "step": 4159 }, { "epoch": 0.6162962962962963, "grad_norm": 2.030163049697876, "learning_rate": 7.694588584136397e-05, "loss": 1.3565, "step": 4160 }, { "epoch": 0.6164444444444445, "grad_norm": 1.7704468965530396, "learning_rate": 7.691623424759081e-05, "loss": 1.2324, "step": 4161 }, { "epoch": 0.6165925925925926, "grad_norm": 1.984651803970337, "learning_rate": 7.688658265381764e-05, "loss": 0.9833, "step": 4162 }, { "epoch": 0.6167407407407407, "grad_norm": 4.376496315002441, "learning_rate": 7.685693106004448e-05, "loss": 1.0987, "step": 4163 }, { "epoch": 0.6168888888888889, "grad_norm": 1.5003368854522705, "learning_rate": 7.682727946627132e-05, "loss": 0.9752, "step": 4164 }, { "epoch": 0.617037037037037, "grad_norm": 2.00019907951355, 
"learning_rate": 7.679762787249815e-05, "loss": 1.0655, "step": 4165 }, { "epoch": 0.6171851851851852, "grad_norm": 2.5613484382629395, "learning_rate": 7.676797627872498e-05, "loss": 0.9663, "step": 4166 }, { "epoch": 0.6173333333333333, "grad_norm": 1.5503530502319336, "learning_rate": 7.673832468495183e-05, "loss": 0.9448, "step": 4167 }, { "epoch": 0.6174814814814815, "grad_norm": 2.6386940479278564, "learning_rate": 7.670867309117866e-05, "loss": 1.0828, "step": 4168 }, { "epoch": 0.6176296296296296, "grad_norm": 1.0050711631774902, "learning_rate": 7.667902149740549e-05, "loss": 0.7964, "step": 4169 }, { "epoch": 0.6177777777777778, "grad_norm": 1.5181121826171875, "learning_rate": 7.664936990363233e-05, "loss": 0.6968, "step": 4170 }, { "epoch": 0.6179259259259259, "grad_norm": 1.0668538808822632, "learning_rate": 7.661971830985915e-05, "loss": 1.2148, "step": 4171 }, { "epoch": 0.6180740740740741, "grad_norm": 2.245246410369873, "learning_rate": 7.6590066716086e-05, "loss": 0.9375, "step": 4172 }, { "epoch": 0.6182222222222222, "grad_norm": 1.1597541570663452, "learning_rate": 7.656041512231283e-05, "loss": 0.7565, "step": 4173 }, { "epoch": 0.6183703703703703, "grad_norm": 2.1912178993225098, "learning_rate": 7.653076352853966e-05, "loss": 1.1889, "step": 4174 }, { "epoch": 0.6185185185185185, "grad_norm": 1.9651681184768677, "learning_rate": 7.65011119347665e-05, "loss": 0.99, "step": 4175 }, { "epoch": 0.6186666666666667, "grad_norm": 1.5237642526626587, "learning_rate": 7.647146034099333e-05, "loss": 0.9575, "step": 4176 }, { "epoch": 0.6188148148148148, "grad_norm": 1.6119823455810547, "learning_rate": 7.644180874722016e-05, "loss": 0.9707, "step": 4177 }, { "epoch": 0.6189629629629629, "grad_norm": 1.219313621520996, "learning_rate": 7.641215715344701e-05, "loss": 0.7978, "step": 4178 }, { "epoch": 0.6191111111111111, "grad_norm": 2.2975165843963623, "learning_rate": 7.638250555967384e-05, "loss": 1.1831, "step": 4179 }, { "epoch": 0.6192592592592593, 
"grad_norm": 1.8074194192886353, "learning_rate": 7.635285396590067e-05, "loss": 1.0527, "step": 4180 }, { "epoch": 0.6194074074074074, "grad_norm": 2.2357099056243896, "learning_rate": 7.63232023721275e-05, "loss": 1.0034, "step": 4181 }, { "epoch": 0.6195555555555555, "grad_norm": 1.9494400024414062, "learning_rate": 7.629355077835434e-05, "loss": 0.9735, "step": 4182 }, { "epoch": 0.6197037037037038, "grad_norm": 1.4775364398956299, "learning_rate": 7.626389918458118e-05, "loss": 1.0355, "step": 4183 }, { "epoch": 0.6198518518518519, "grad_norm": 1.3309204578399658, "learning_rate": 7.6234247590808e-05, "loss": 1.2182, "step": 4184 }, { "epoch": 0.62, "grad_norm": 2.209710121154785, "learning_rate": 7.620459599703485e-05, "loss": 1.266, "step": 4185 }, { "epoch": 0.6201481481481481, "grad_norm": 3.240086078643799, "learning_rate": 7.617494440326168e-05, "loss": 0.8897, "step": 4186 }, { "epoch": 0.6202962962962963, "grad_norm": 2.5816378593444824, "learning_rate": 7.614529280948851e-05, "loss": 0.9752, "step": 4187 }, { "epoch": 0.6204444444444445, "grad_norm": 2.542442560195923, "learning_rate": 7.611564121571534e-05, "loss": 0.9243, "step": 4188 }, { "epoch": 0.6205925925925926, "grad_norm": 2.0120692253112793, "learning_rate": 7.608598962194217e-05, "loss": 1.1543, "step": 4189 }, { "epoch": 0.6207407407407407, "grad_norm": 1.7393430471420288, "learning_rate": 7.605633802816902e-05, "loss": 1.0426, "step": 4190 }, { "epoch": 0.6208888888888889, "grad_norm": 1.8278281688690186, "learning_rate": 7.602668643439585e-05, "loss": 0.929, "step": 4191 }, { "epoch": 0.621037037037037, "grad_norm": 1.3061013221740723, "learning_rate": 7.599703484062268e-05, "loss": 1.0186, "step": 4192 }, { "epoch": 0.6211851851851852, "grad_norm": 2.045588493347168, "learning_rate": 7.596738324684952e-05, "loss": 0.7341, "step": 4193 }, { "epoch": 0.6213333333333333, "grad_norm": 1.4220703840255737, "learning_rate": 7.593773165307636e-05, "loss": 0.9295, "step": 4194 }, { "epoch": 
0.6214814814814815, "grad_norm": 3.4312973022460938, "learning_rate": 7.590808005930319e-05, "loss": 1.0083, "step": 4195 }, { "epoch": 0.6216296296296296, "grad_norm": 3.1611483097076416, "learning_rate": 7.587842846553003e-05, "loss": 1.0974, "step": 4196 }, { "epoch": 0.6217777777777778, "grad_norm": 2.1379802227020264, "learning_rate": 7.584877687175686e-05, "loss": 1.2359, "step": 4197 }, { "epoch": 0.6219259259259259, "grad_norm": 1.9825153350830078, "learning_rate": 7.581912527798369e-05, "loss": 1.1976, "step": 4198 }, { "epoch": 0.6220740740740741, "grad_norm": 1.353455901145935, "learning_rate": 7.578947368421054e-05, "loss": 0.9957, "step": 4199 }, { "epoch": 0.6222222222222222, "grad_norm": 1.3882861137390137, "learning_rate": 7.575982209043735e-05, "loss": 1.0771, "step": 4200 }, { "epoch": 0.6223703703703704, "grad_norm": 1.9619619846343994, "learning_rate": 7.57301704966642e-05, "loss": 0.7962, "step": 4201 }, { "epoch": 0.6225185185185185, "grad_norm": 1.5832520723342896, "learning_rate": 7.570051890289104e-05, "loss": 0.9227, "step": 4202 }, { "epoch": 0.6226666666666667, "grad_norm": 2.1912996768951416, "learning_rate": 7.567086730911786e-05, "loss": 1.1511, "step": 4203 }, { "epoch": 0.6228148148148148, "grad_norm": 1.8055832386016846, "learning_rate": 7.56412157153447e-05, "loss": 0.9385, "step": 4204 }, { "epoch": 0.6229629629629629, "grad_norm": 2.094910144805908, "learning_rate": 7.561156412157154e-05, "loss": 1.0901, "step": 4205 }, { "epoch": 0.6231111111111111, "grad_norm": 1.8955321311950684, "learning_rate": 7.558191252779837e-05, "loss": 1.0497, "step": 4206 }, { "epoch": 0.6232592592592593, "grad_norm": 12.515061378479004, "learning_rate": 7.555226093402521e-05, "loss": 1.021, "step": 4207 }, { "epoch": 0.6234074074074074, "grad_norm": 1.5139273405075073, "learning_rate": 7.552260934025204e-05, "loss": 1.0022, "step": 4208 }, { "epoch": 0.6235555555555555, "grad_norm": 1.5754523277282715, "learning_rate": 7.549295774647887e-05, "loss": 
1.2412, "step": 4209 }, { "epoch": 0.6237037037037036, "grad_norm": 1.48861563205719, "learning_rate": 7.546330615270572e-05, "loss": 0.9615, "step": 4210 }, { "epoch": 0.6238518518518519, "grad_norm": 1.5839914083480835, "learning_rate": 7.543365455893255e-05, "loss": 1.0142, "step": 4211 }, { "epoch": 0.624, "grad_norm": 2.623595714569092, "learning_rate": 7.540400296515938e-05, "loss": 1.0416, "step": 4212 }, { "epoch": 0.6241481481481481, "grad_norm": 2.2146899700164795, "learning_rate": 7.537435137138622e-05, "loss": 1.267, "step": 4213 }, { "epoch": 0.6242962962962963, "grad_norm": 2.8189847469329834, "learning_rate": 7.534469977761304e-05, "loss": 1.1923, "step": 4214 }, { "epoch": 0.6244444444444445, "grad_norm": 1.65372633934021, "learning_rate": 7.531504818383989e-05, "loss": 0.9996, "step": 4215 }, { "epoch": 0.6245925925925926, "grad_norm": 1.4494138956069946, "learning_rate": 7.528539659006672e-05, "loss": 0.9434, "step": 4216 }, { "epoch": 0.6247407407407407, "grad_norm": 2.095656156539917, "learning_rate": 7.525574499629355e-05, "loss": 1.0989, "step": 4217 }, { "epoch": 0.6248888888888889, "grad_norm": 1.9197618961334229, "learning_rate": 7.522609340252039e-05, "loss": 1.2848, "step": 4218 }, { "epoch": 0.6250370370370371, "grad_norm": 2.0667176246643066, "learning_rate": 7.519644180874722e-05, "loss": 0.8713, "step": 4219 }, { "epoch": 0.6251851851851852, "grad_norm": 1.7209436893463135, "learning_rate": 7.516679021497405e-05, "loss": 1.1994, "step": 4220 }, { "epoch": 0.6253333333333333, "grad_norm": 1.2906612157821655, "learning_rate": 7.51371386212009e-05, "loss": 1.1263, "step": 4221 }, { "epoch": 0.6254814814814815, "grad_norm": 2.0552279949188232, "learning_rate": 7.510748702742773e-05, "loss": 1.0431, "step": 4222 }, { "epoch": 0.6256296296296296, "grad_norm": 1.662018895149231, "learning_rate": 7.507783543365456e-05, "loss": 1.1519, "step": 4223 }, { "epoch": 0.6257777777777778, "grad_norm": 2.582836866378784, "learning_rate": 
7.504818383988139e-05, "loss": 0.9639, "step": 4224 }, { "epoch": 0.6259259259259259, "grad_norm": 1.4937838315963745, "learning_rate": 7.501853224610824e-05, "loss": 1.0501, "step": 4225 }, { "epoch": 0.6260740740740741, "grad_norm": 1.4058259725570679, "learning_rate": 7.498888065233507e-05, "loss": 0.8024, "step": 4226 }, { "epoch": 0.6262222222222222, "grad_norm": 1.589437484741211, "learning_rate": 7.49592290585619e-05, "loss": 1.1069, "step": 4227 }, { "epoch": 0.6263703703703704, "grad_norm": 1.9938610792160034, "learning_rate": 7.492957746478874e-05, "loss": 1.169, "step": 4228 }, { "epoch": 0.6265185185185185, "grad_norm": 1.3336560726165771, "learning_rate": 7.489992587101557e-05, "loss": 1.2742, "step": 4229 }, { "epoch": 0.6266666666666667, "grad_norm": 2.6536037921905518, "learning_rate": 7.48702742772424e-05, "loss": 1.2188, "step": 4230 }, { "epoch": 0.6268148148148148, "grad_norm": 1.3016421794891357, "learning_rate": 7.484062268346923e-05, "loss": 0.9154, "step": 4231 }, { "epoch": 0.6269629629629629, "grad_norm": 1.5644383430480957, "learning_rate": 7.481097108969608e-05, "loss": 1.0773, "step": 4232 }, { "epoch": 0.6271111111111111, "grad_norm": 1.9621175527572632, "learning_rate": 7.478131949592291e-05, "loss": 0.9856, "step": 4233 }, { "epoch": 0.6272592592592593, "grad_norm": 1.5468896627426147, "learning_rate": 7.475166790214974e-05, "loss": 0.9565, "step": 4234 }, { "epoch": 0.6274074074074074, "grad_norm": 1.7012988328933716, "learning_rate": 7.472201630837657e-05, "loss": 0.9858, "step": 4235 }, { "epoch": 0.6275555555555555, "grad_norm": 1.4528248310089111, "learning_rate": 7.469236471460342e-05, "loss": 0.854, "step": 4236 }, { "epoch": 0.6277037037037037, "grad_norm": 2.4127585887908936, "learning_rate": 7.466271312083025e-05, "loss": 1.1117, "step": 4237 }, { "epoch": 0.6278518518518519, "grad_norm": 2.762421131134033, "learning_rate": 7.463306152705708e-05, "loss": 0.9908, "step": 4238 }, { "epoch": 0.628, "grad_norm": 
1.190571665763855, "learning_rate": 7.460340993328392e-05, "loss": 0.83, "step": 4239 }, { "epoch": 0.6281481481481481, "grad_norm": 1.5424587726593018, "learning_rate": 7.457375833951075e-05, "loss": 1.1369, "step": 4240 }, { "epoch": 0.6282962962962962, "grad_norm": 2.1614809036254883, "learning_rate": 7.454410674573758e-05, "loss": 1.0667, "step": 4241 }, { "epoch": 0.6284444444444445, "grad_norm": 1.5766479969024658, "learning_rate": 7.451445515196443e-05, "loss": 0.934, "step": 4242 }, { "epoch": 0.6285925925925926, "grad_norm": 1.3466310501098633, "learning_rate": 7.448480355819125e-05, "loss": 1.0371, "step": 4243 }, { "epoch": 0.6287407407407407, "grad_norm": 2.342902183532715, "learning_rate": 7.445515196441809e-05, "loss": 1.1869, "step": 4244 }, { "epoch": 0.6288888888888889, "grad_norm": 1.3603520393371582, "learning_rate": 7.442550037064494e-05, "loss": 1.2303, "step": 4245 }, { "epoch": 0.6290370370370371, "grad_norm": 1.6758443117141724, "learning_rate": 7.439584877687175e-05, "loss": 1.1124, "step": 4246 }, { "epoch": 0.6291851851851852, "grad_norm": 2.4252185821533203, "learning_rate": 7.43661971830986e-05, "loss": 1.0878, "step": 4247 }, { "epoch": 0.6293333333333333, "grad_norm": 2.861835241317749, "learning_rate": 7.433654558932543e-05, "loss": 1.0192, "step": 4248 }, { "epoch": 0.6294814814814815, "grad_norm": 1.5305482149124146, "learning_rate": 7.430689399555226e-05, "loss": 1.0219, "step": 4249 }, { "epoch": 0.6296296296296297, "grad_norm": 1.8694067001342773, "learning_rate": 7.42772424017791e-05, "loss": 1.1271, "step": 4250 }, { "epoch": 0.6297777777777778, "grad_norm": 4.762666702270508, "learning_rate": 7.424759080800593e-05, "loss": 0.9223, "step": 4251 }, { "epoch": 0.6299259259259259, "grad_norm": 1.920195460319519, "learning_rate": 7.421793921423277e-05, "loss": 1.1276, "step": 4252 }, { "epoch": 0.6300740740740741, "grad_norm": 2.142634153366089, "learning_rate": 7.418828762045961e-05, "loss": 0.878, "step": 4253 }, { "epoch": 
0.6302222222222222, "grad_norm": 1.4534026384353638, "learning_rate": 7.415863602668644e-05, "loss": 1.0016, "step": 4254 }, { "epoch": 0.6303703703703704, "grad_norm": 1.5006121397018433, "learning_rate": 7.412898443291327e-05, "loss": 0.6872, "step": 4255 }, { "epoch": 0.6305185185185185, "grad_norm": 1.8782285451889038, "learning_rate": 7.409933283914012e-05, "loss": 1.0539, "step": 4256 }, { "epoch": 0.6306666666666667, "grad_norm": 1.6077821254730225, "learning_rate": 7.406968124536693e-05, "loss": 0.9447, "step": 4257 }, { "epoch": 0.6308148148148148, "grad_norm": 1.6158119440078735, "learning_rate": 7.404002965159378e-05, "loss": 1.2271, "step": 4258 }, { "epoch": 0.630962962962963, "grad_norm": 1.2205272912979126, "learning_rate": 7.401037805782061e-05, "loss": 0.9001, "step": 4259 }, { "epoch": 0.6311111111111111, "grad_norm": 6.948575019836426, "learning_rate": 7.398072646404744e-05, "loss": 1.2105, "step": 4260 }, { "epoch": 0.6312592592592593, "grad_norm": 1.7600739002227783, "learning_rate": 7.395107487027428e-05, "loss": 1.3201, "step": 4261 }, { "epoch": 0.6314074074074074, "grad_norm": 1.9330729246139526, "learning_rate": 7.392142327650112e-05, "loss": 0.8782, "step": 4262 }, { "epoch": 0.6315555555555555, "grad_norm": 1.58470618724823, "learning_rate": 7.389177168272795e-05, "loss": 0.9092, "step": 4263 }, { "epoch": 0.6317037037037037, "grad_norm": 1.2753225564956665, "learning_rate": 7.386212008895479e-05, "loss": 0.8799, "step": 4264 }, { "epoch": 0.6318518518518519, "grad_norm": 1.2597649097442627, "learning_rate": 7.383246849518162e-05, "loss": 0.9713, "step": 4265 }, { "epoch": 0.632, "grad_norm": 1.6237164735794067, "learning_rate": 7.380281690140845e-05, "loss": 1.3855, "step": 4266 }, { "epoch": 0.6321481481481481, "grad_norm": 1.4116196632385254, "learning_rate": 7.377316530763528e-05, "loss": 0.8775, "step": 4267 }, { "epoch": 0.6322962962962962, "grad_norm": 1.9042555093765259, "learning_rate": 7.374351371386213e-05, "loss": 0.9302, 
"step": 4268 }, { "epoch": 0.6324444444444445, "grad_norm": 1.2561272382736206, "learning_rate": 7.371386212008896e-05, "loss": 1.0513, "step": 4269 }, { "epoch": 0.6325925925925926, "grad_norm": 1.7997751235961914, "learning_rate": 7.368421052631579e-05, "loss": 1.2709, "step": 4270 }, { "epoch": 0.6327407407407407, "grad_norm": 1.4882794618606567, "learning_rate": 7.365455893254263e-05, "loss": 0.9939, "step": 4271 }, { "epoch": 0.6328888888888888, "grad_norm": 1.924720048904419, "learning_rate": 7.362490733876946e-05, "loss": 0.9963, "step": 4272 }, { "epoch": 0.6330370370370371, "grad_norm": 1.4113523960113525, "learning_rate": 7.35952557449963e-05, "loss": 0.9659, "step": 4273 }, { "epoch": 0.6331851851851852, "grad_norm": 1.4288746118545532, "learning_rate": 7.356560415122313e-05, "loss": 0.9953, "step": 4274 }, { "epoch": 0.6333333333333333, "grad_norm": 2.565685272216797, "learning_rate": 7.353595255744997e-05, "loss": 0.9741, "step": 4275 }, { "epoch": 0.6334814814814815, "grad_norm": 1.199468731880188, "learning_rate": 7.35063009636768e-05, "loss": 1.2029, "step": 4276 }, { "epoch": 0.6336296296296297, "grad_norm": 1.4904115200042725, "learning_rate": 7.347664936990363e-05, "loss": 1.0986, "step": 4277 }, { "epoch": 0.6337777777777778, "grad_norm": 1.748367428779602, "learning_rate": 7.344699777613046e-05, "loss": 0.882, "step": 4278 }, { "epoch": 0.6339259259259259, "grad_norm": 1.7592281103134155, "learning_rate": 7.341734618235731e-05, "loss": 1.0141, "step": 4279 }, { "epoch": 0.6340740740740741, "grad_norm": 1.6296261548995972, "learning_rate": 7.338769458858414e-05, "loss": 0.9284, "step": 4280 }, { "epoch": 0.6342222222222222, "grad_norm": 6.17998743057251, "learning_rate": 7.335804299481097e-05, "loss": 0.9908, "step": 4281 }, { "epoch": 0.6343703703703704, "grad_norm": 1.2737665176391602, "learning_rate": 7.332839140103781e-05, "loss": 0.9491, "step": 4282 }, { "epoch": 0.6345185185185185, "grad_norm": 1.1072090864181519, "learning_rate": 
7.329873980726465e-05, "loss": 1.0856, "step": 4283 }, { "epoch": 0.6346666666666667, "grad_norm": 1.6702488660812378, "learning_rate": 7.326908821349148e-05, "loss": 1.187, "step": 4284 }, { "epoch": 0.6348148148148148, "grad_norm": 1.4215946197509766, "learning_rate": 7.323943661971832e-05, "loss": 0.8082, "step": 4285 }, { "epoch": 0.634962962962963, "grad_norm": 1.2191710472106934, "learning_rate": 7.320978502594514e-05, "loss": 0.9704, "step": 4286 }, { "epoch": 0.6351111111111111, "grad_norm": 1.2213389873504639, "learning_rate": 7.318013343217198e-05, "loss": 0.9641, "step": 4287 }, { "epoch": 0.6352592592592593, "grad_norm": 4.540380001068115, "learning_rate": 7.315048183839883e-05, "loss": 0.9253, "step": 4288 }, { "epoch": 0.6354074074074074, "grad_norm": 1.5974304676055908, "learning_rate": 7.312083024462564e-05, "loss": 1.3511, "step": 4289 }, { "epoch": 0.6355555555555555, "grad_norm": 1.6083300113677979, "learning_rate": 7.309117865085249e-05, "loss": 0.958, "step": 4290 }, { "epoch": 0.6357037037037037, "grad_norm": 1.281067967414856, "learning_rate": 7.306152705707932e-05, "loss": 0.8817, "step": 4291 }, { "epoch": 0.6358518518518519, "grad_norm": 1.8055102825164795, "learning_rate": 7.303187546330615e-05, "loss": 1.3341, "step": 4292 }, { "epoch": 0.636, "grad_norm": 1.7039903402328491, "learning_rate": 7.3002223869533e-05, "loss": 1.0581, "step": 4293 }, { "epoch": 0.6361481481481481, "grad_norm": 1.5735193490982056, "learning_rate": 7.297257227575983e-05, "loss": 1.0945, "step": 4294 }, { "epoch": 0.6362962962962962, "grad_norm": 2.5691006183624268, "learning_rate": 7.294292068198666e-05, "loss": 0.9692, "step": 4295 }, { "epoch": 0.6364444444444445, "grad_norm": 1.5633037090301514, "learning_rate": 7.29132690882135e-05, "loss": 0.9985, "step": 4296 }, { "epoch": 0.6365925925925926, "grad_norm": 1.8769036531448364, "learning_rate": 7.288361749444033e-05, "loss": 1.1071, "step": 4297 }, { "epoch": 0.6367407407407407, "grad_norm": 
1.3452436923980713, "learning_rate": 7.285396590066716e-05, "loss": 1.0296, "step": 4298 }, { "epoch": 0.6368888888888888, "grad_norm": 1.531742811203003, "learning_rate": 7.282431430689401e-05, "loss": 1.1471, "step": 4299 }, { "epoch": 0.6370370370370371, "grad_norm": 1.4082554578781128, "learning_rate": 7.279466271312083e-05, "loss": 1.0521, "step": 4300 }, { "epoch": 0.6371851851851852, "grad_norm": 1.5034687519073486, "learning_rate": 7.276501111934767e-05, "loss": 1.1396, "step": 4301 }, { "epoch": 0.6373333333333333, "grad_norm": 4.933014392852783, "learning_rate": 7.27353595255745e-05, "loss": 1.2253, "step": 4302 }, { "epoch": 0.6374814814814814, "grad_norm": 1.5926105976104736, "learning_rate": 7.270570793180133e-05, "loss": 1.2105, "step": 4303 }, { "epoch": 0.6376296296296297, "grad_norm": 3.1412436962127686, "learning_rate": 7.267605633802818e-05, "loss": 1.0453, "step": 4304 }, { "epoch": 0.6377777777777778, "grad_norm": 1.523639440536499, "learning_rate": 7.2646404744255e-05, "loss": 0.7762, "step": 4305 }, { "epoch": 0.6379259259259259, "grad_norm": 1.5867841243743896, "learning_rate": 7.261675315048184e-05, "loss": 0.8973, "step": 4306 }, { "epoch": 0.6380740740740741, "grad_norm": 1.5310838222503662, "learning_rate": 7.258710155670868e-05, "loss": 1.4496, "step": 4307 }, { "epoch": 0.6382222222222222, "grad_norm": 1.4425774812698364, "learning_rate": 7.255744996293551e-05, "loss": 1.047, "step": 4308 }, { "epoch": 0.6383703703703704, "grad_norm": 1.5306428670883179, "learning_rate": 7.252779836916234e-05, "loss": 0.8203, "step": 4309 }, { "epoch": 0.6385185185185185, "grad_norm": 1.9451457262039185, "learning_rate": 7.249814677538917e-05, "loss": 1.117, "step": 4310 }, { "epoch": 0.6386666666666667, "grad_norm": 0.9834615588188171, "learning_rate": 7.246849518161602e-05, "loss": 0.9603, "step": 4311 }, { "epoch": 0.6388148148148148, "grad_norm": 1.9894959926605225, "learning_rate": 7.243884358784285e-05, "loss": 1.0404, "step": 4312 }, { "epoch": 
0.638962962962963, "grad_norm": 1.2576085329055786, "learning_rate": 7.240919199406968e-05, "loss": 0.8318, "step": 4313 }, { "epoch": 0.6391111111111111, "grad_norm": 2.3639659881591797, "learning_rate": 7.237954040029653e-05, "loss": 0.9326, "step": 4314 }, { "epoch": 0.6392592592592593, "grad_norm": 1.5994187593460083, "learning_rate": 7.234988880652336e-05, "loss": 0.9818, "step": 4315 }, { "epoch": 0.6394074074074074, "grad_norm": 1.3508126735687256, "learning_rate": 7.232023721275019e-05, "loss": 1.0248, "step": 4316 }, { "epoch": 0.6395555555555555, "grad_norm": 2.3110697269439697, "learning_rate": 7.229058561897702e-05, "loss": 0.9616, "step": 4317 }, { "epoch": 0.6397037037037037, "grad_norm": 1.4802743196487427, "learning_rate": 7.226093402520386e-05, "loss": 1.0018, "step": 4318 }, { "epoch": 0.6398518518518519, "grad_norm": 1.8502402305603027, "learning_rate": 7.22312824314307e-05, "loss": 1.1073, "step": 4319 }, { "epoch": 0.64, "grad_norm": 2.272400140762329, "learning_rate": 7.220163083765752e-05, "loss": 1.0234, "step": 4320 }, { "epoch": 0.6401481481481481, "grad_norm": 1.1387325525283813, "learning_rate": 7.217197924388436e-05, "loss": 0.8343, "step": 4321 }, { "epoch": 0.6402962962962963, "grad_norm": 1.1921565532684326, "learning_rate": 7.21423276501112e-05, "loss": 1.0573, "step": 4322 }, { "epoch": 0.6404444444444445, "grad_norm": 1.639599084854126, "learning_rate": 7.211267605633803e-05, "loss": 0.7863, "step": 4323 }, { "epoch": 0.6405925925925926, "grad_norm": 1.4308897256851196, "learning_rate": 7.208302446256486e-05, "loss": 0.7823, "step": 4324 }, { "epoch": 0.6407407407407407, "grad_norm": 3.084224224090576, "learning_rate": 7.20533728687917e-05, "loss": 1.2159, "step": 4325 }, { "epoch": 0.6408888888888888, "grad_norm": 1.5659797191619873, "learning_rate": 7.202372127501854e-05, "loss": 1.1887, "step": 4326 }, { "epoch": 0.6410370370370371, "grad_norm": 1.1393101215362549, "learning_rate": 7.199406968124537e-05, "loss": 0.8358, "step": 
4327 }, { "epoch": 0.6411851851851852, "grad_norm": 1.5044102668762207, "learning_rate": 7.196441808747221e-05, "loss": 0.942, "step": 4328 }, { "epoch": 0.6413333333333333, "grad_norm": 1.4487009048461914, "learning_rate": 7.193476649369903e-05, "loss": 0.9936, "step": 4329 }, { "epoch": 0.6414814814814814, "grad_norm": 1.853836178779602, "learning_rate": 7.190511489992587e-05, "loss": 1.1098, "step": 4330 }, { "epoch": 0.6416296296296297, "grad_norm": 3.4412965774536133, "learning_rate": 7.187546330615272e-05, "loss": 0.8773, "step": 4331 }, { "epoch": 0.6417777777777778, "grad_norm": 3.3973379135131836, "learning_rate": 7.184581171237954e-05, "loss": 1.0711, "step": 4332 }, { "epoch": 0.6419259259259259, "grad_norm": 1.1628870964050293, "learning_rate": 7.181616011860638e-05, "loss": 0.9314, "step": 4333 }, { "epoch": 0.642074074074074, "grad_norm": 1.3469637632369995, "learning_rate": 7.178650852483321e-05, "loss": 0.9951, "step": 4334 }, { "epoch": 0.6422222222222222, "grad_norm": 1.5088274478912354, "learning_rate": 7.175685693106004e-05, "loss": 0.8278, "step": 4335 }, { "epoch": 0.6423703703703704, "grad_norm": 1.4323192834854126, "learning_rate": 7.172720533728689e-05, "loss": 0.9538, "step": 4336 }, { "epoch": 0.6425185185185185, "grad_norm": 1.4325075149536133, "learning_rate": 7.169755374351372e-05, "loss": 1.0913, "step": 4337 }, { "epoch": 0.6426666666666667, "grad_norm": 1.6242316961288452, "learning_rate": 7.166790214974055e-05, "loss": 1.1175, "step": 4338 }, { "epoch": 0.6428148148148148, "grad_norm": 2.5923986434936523, "learning_rate": 7.163825055596739e-05, "loss": 1.1285, "step": 4339 }, { "epoch": 0.642962962962963, "grad_norm": 1.162093162536621, "learning_rate": 7.160859896219422e-05, "loss": 0.9083, "step": 4340 }, { "epoch": 0.6431111111111111, "grad_norm": 1.2753523588180542, "learning_rate": 7.157894736842105e-05, "loss": 0.9541, "step": 4341 }, { "epoch": 0.6432592592592593, "grad_norm": 1.8385745286941528, "learning_rate": 
7.15492957746479e-05, "loss": 0.9997, "step": 4342 }, { "epoch": 0.6434074074074074, "grad_norm": 1.579655647277832, "learning_rate": 7.151964418087472e-05, "loss": 1.3064, "step": 4343 }, { "epoch": 0.6435555555555555, "grad_norm": 1.538081169128418, "learning_rate": 7.148999258710156e-05, "loss": 1.0038, "step": 4344 }, { "epoch": 0.6437037037037037, "grad_norm": 1.4161475896835327, "learning_rate": 7.146034099332839e-05, "loss": 0.8822, "step": 4345 }, { "epoch": 0.6438518518518519, "grad_norm": 1.350731611251831, "learning_rate": 7.143068939955522e-05, "loss": 1.0213, "step": 4346 }, { "epoch": 0.644, "grad_norm": 2.6670544147491455, "learning_rate": 7.140103780578207e-05, "loss": 1.2181, "step": 4347 }, { "epoch": 0.6441481481481481, "grad_norm": 1.6661410331726074, "learning_rate": 7.13713862120089e-05, "loss": 0.9935, "step": 4348 }, { "epoch": 0.6442962962962963, "grad_norm": 1.722327709197998, "learning_rate": 7.134173461823573e-05, "loss": 0.996, "step": 4349 }, { "epoch": 0.6444444444444445, "grad_norm": 1.233064889907837, "learning_rate": 7.131208302446257e-05, "loss": 1.0661, "step": 4350 }, { "epoch": 0.6445925925925926, "grad_norm": 1.1755728721618652, "learning_rate": 7.12824314306894e-05, "loss": 0.9878, "step": 4351 }, { "epoch": 0.6447407407407407, "grad_norm": 2.2716925144195557, "learning_rate": 7.125277983691624e-05, "loss": 0.8776, "step": 4352 }, { "epoch": 0.6448888888888888, "grad_norm": 1.3941679000854492, "learning_rate": 7.122312824314307e-05, "loss": 1.3136, "step": 4353 }, { "epoch": 0.6450370370370371, "grad_norm": 1.9948437213897705, "learning_rate": 7.119347664936991e-05, "loss": 1.3365, "step": 4354 }, { "epoch": 0.6451851851851852, "grad_norm": 1.6200815439224243, "learning_rate": 7.116382505559674e-05, "loss": 1.1891, "step": 4355 }, { "epoch": 0.6453333333333333, "grad_norm": 1.497673511505127, "learning_rate": 7.113417346182357e-05, "loss": 0.9411, "step": 4356 }, { "epoch": 0.6454814814814814, "grad_norm": 1.9665746688842773, 
"learning_rate": 7.110452186805042e-05, "loss": 0.7485, "step": 4357 }, { "epoch": 0.6456296296296297, "grad_norm": 1.8524481058120728, "learning_rate": 7.107487027427725e-05, "loss": 0.9909, "step": 4358 }, { "epoch": 0.6457777777777778, "grad_norm": 1.7892533540725708, "learning_rate": 7.104521868050408e-05, "loss": 1.0559, "step": 4359 }, { "epoch": 0.6459259259259259, "grad_norm": 2.46882963180542, "learning_rate": 7.101556708673091e-05, "loss": 0.8663, "step": 4360 }, { "epoch": 0.646074074074074, "grad_norm": 1.3626261949539185, "learning_rate": 7.098591549295775e-05, "loss": 1.0529, "step": 4361 }, { "epoch": 0.6462222222222223, "grad_norm": 1.4369004964828491, "learning_rate": 7.095626389918459e-05, "loss": 1.3747, "step": 4362 }, { "epoch": 0.6463703703703704, "grad_norm": 1.6594363451004028, "learning_rate": 7.092661230541142e-05, "loss": 1.0476, "step": 4363 }, { "epoch": 0.6465185185185185, "grad_norm": 1.728981614112854, "learning_rate": 7.089696071163825e-05, "loss": 0.9783, "step": 4364 }, { "epoch": 0.6466666666666666, "grad_norm": 1.7096563577651978, "learning_rate": 7.086730911786509e-05, "loss": 1.0713, "step": 4365 }, { "epoch": 0.6468148148148148, "grad_norm": 2.27201509475708, "learning_rate": 7.083765752409192e-05, "loss": 0.9459, "step": 4366 }, { "epoch": 0.646962962962963, "grad_norm": 3.5282950401306152, "learning_rate": 7.080800593031875e-05, "loss": 1.1179, "step": 4367 }, { "epoch": 0.6471111111111111, "grad_norm": 6.547924518585205, "learning_rate": 7.07783543365456e-05, "loss": 1.1707, "step": 4368 }, { "epoch": 0.6472592592592593, "grad_norm": 1.124451994895935, "learning_rate": 7.074870274277243e-05, "loss": 0.9589, "step": 4369 }, { "epoch": 0.6474074074074074, "grad_norm": 1.4242360591888428, "learning_rate": 7.071905114899926e-05, "loss": 1.1477, "step": 4370 }, { "epoch": 0.6475555555555556, "grad_norm": 1.1883344650268555, "learning_rate": 7.06893995552261e-05, "loss": 1.0449, "step": 4371 }, { "epoch": 0.6477037037037037, 
"grad_norm": 1.572464942932129, "learning_rate": 7.065974796145292e-05, "loss": 1.1263, "step": 4372 }, { "epoch": 0.6478518518518519, "grad_norm": 1.5645495653152466, "learning_rate": 7.063009636767977e-05, "loss": 1.0772, "step": 4373 }, { "epoch": 0.648, "grad_norm": 1.2831021547317505, "learning_rate": 7.060044477390661e-05, "loss": 1.0004, "step": 4374 }, { "epoch": 0.6481481481481481, "grad_norm": 2.3488590717315674, "learning_rate": 7.057079318013343e-05, "loss": 1.2111, "step": 4375 }, { "epoch": 0.6482962962962963, "grad_norm": 2.050532341003418, "learning_rate": 7.054114158636027e-05, "loss": 1.2414, "step": 4376 }, { "epoch": 0.6484444444444445, "grad_norm": 1.1811741590499878, "learning_rate": 7.05114899925871e-05, "loss": 1.1716, "step": 4377 }, { "epoch": 0.6485925925925926, "grad_norm": 1.6432982683181763, "learning_rate": 7.048183839881393e-05, "loss": 0.9345, "step": 4378 }, { "epoch": 0.6487407407407407, "grad_norm": 1.1864477396011353, "learning_rate": 7.045218680504078e-05, "loss": 0.8852, "step": 4379 }, { "epoch": 0.6488888888888888, "grad_norm": 1.747707724571228, "learning_rate": 7.042253521126761e-05, "loss": 1.1688, "step": 4380 }, { "epoch": 0.6490370370370371, "grad_norm": 1.7994520664215088, "learning_rate": 7.039288361749444e-05, "loss": 1.3091, "step": 4381 }, { "epoch": 0.6491851851851852, "grad_norm": 2.1519625186920166, "learning_rate": 7.036323202372128e-05, "loss": 1.2301, "step": 4382 }, { "epoch": 0.6493333333333333, "grad_norm": 2.0501434803009033, "learning_rate": 7.033358042994812e-05, "loss": 1.1329, "step": 4383 }, { "epoch": 0.6494814814814814, "grad_norm": 1.7140653133392334, "learning_rate": 7.030392883617495e-05, "loss": 1.1757, "step": 4384 }, { "epoch": 0.6496296296296297, "grad_norm": 1.445603370666504, "learning_rate": 7.027427724240179e-05, "loss": 1.0283, "step": 4385 }, { "epoch": 0.6497777777777778, "grad_norm": 1.7610565423965454, "learning_rate": 7.024462564862861e-05, "loss": 1.0102, "step": 4386 }, { 
"epoch": 0.6499259259259259, "grad_norm": 1.5968871116638184, "learning_rate": 7.021497405485545e-05, "loss": 0.9477, "step": 4387 }, { "epoch": 0.650074074074074, "grad_norm": 1.4317080974578857, "learning_rate": 7.018532246108228e-05, "loss": 0.9569, "step": 4388 }, { "epoch": 0.6502222222222223, "grad_norm": 1.1420197486877441, "learning_rate": 7.015567086730911e-05, "loss": 1.0412, "step": 4389 }, { "epoch": 0.6503703703703704, "grad_norm": 1.4308433532714844, "learning_rate": 7.012601927353596e-05, "loss": 1.0494, "step": 4390 }, { "epoch": 0.6505185185185185, "grad_norm": 1.580278754234314, "learning_rate": 7.009636767976279e-05, "loss": 0.856, "step": 4391 }, { "epoch": 0.6506666666666666, "grad_norm": 3.086088180541992, "learning_rate": 7.006671608598962e-05, "loss": 1.1996, "step": 4392 }, { "epoch": 0.6508148148148148, "grad_norm": 1.5802485942840576, "learning_rate": 7.003706449221647e-05, "loss": 1.1041, "step": 4393 }, { "epoch": 0.650962962962963, "grad_norm": 1.2996973991394043, "learning_rate": 7.00074128984433e-05, "loss": 1.0198, "step": 4394 }, { "epoch": 0.6511111111111111, "grad_norm": 1.907769799232483, "learning_rate": 6.997776130467013e-05, "loss": 0.9092, "step": 4395 }, { "epoch": 0.6512592592592592, "grad_norm": 2.091052770614624, "learning_rate": 6.994810971089696e-05, "loss": 1.0609, "step": 4396 }, { "epoch": 0.6514074074074074, "grad_norm": 1.3710469007492065, "learning_rate": 6.99184581171238e-05, "loss": 1.237, "step": 4397 }, { "epoch": 0.6515555555555556, "grad_norm": 1.3312311172485352, "learning_rate": 6.988880652335063e-05, "loss": 0.9948, "step": 4398 }, { "epoch": 0.6517037037037037, "grad_norm": 1.4814784526824951, "learning_rate": 6.985915492957746e-05, "loss": 1.0974, "step": 4399 }, { "epoch": 0.6518518518518519, "grad_norm": 1.5356966257095337, "learning_rate": 6.982950333580431e-05, "loss": 1.1478, "step": 4400 }, { "epoch": 0.652, "grad_norm": 6.357550621032715, "learning_rate": 6.979985174203114e-05, "loss": 1.1715, 
"step": 4401 }, { "epoch": 0.6521481481481481, "grad_norm": 1.4539903402328491, "learning_rate": 6.977020014825797e-05, "loss": 0.952, "step": 4402 }, { "epoch": 0.6522962962962963, "grad_norm": 1.4701181650161743, "learning_rate": 6.97405485544848e-05, "loss": 1.1111, "step": 4403 }, { "epoch": 0.6524444444444445, "grad_norm": 1.5456289052963257, "learning_rate": 6.971089696071165e-05, "loss": 0.8952, "step": 4404 }, { "epoch": 0.6525925925925926, "grad_norm": 1.3559447526931763, "learning_rate": 6.968124536693848e-05, "loss": 0.8807, "step": 4405 }, { "epoch": 0.6527407407407407, "grad_norm": 1.323038935661316, "learning_rate": 6.965159377316531e-05, "loss": 0.8733, "step": 4406 }, { "epoch": 0.6528888888888889, "grad_norm": 1.5306757688522339, "learning_rate": 6.962194217939214e-05, "loss": 1.1169, "step": 4407 }, { "epoch": 0.6530370370370371, "grad_norm": 2.25394344329834, "learning_rate": 6.959229058561898e-05, "loss": 0.9831, "step": 4408 }, { "epoch": 0.6531851851851852, "grad_norm": 1.4161977767944336, "learning_rate": 6.956263899184581e-05, "loss": 1.0598, "step": 4409 }, { "epoch": 0.6533333333333333, "grad_norm": 1.2248963117599487, "learning_rate": 6.953298739807265e-05, "loss": 0.9447, "step": 4410 }, { "epoch": 0.6534814814814814, "grad_norm": 2.0185866355895996, "learning_rate": 6.950333580429949e-05, "loss": 1.0465, "step": 4411 }, { "epoch": 0.6536296296296297, "grad_norm": 1.0591628551483154, "learning_rate": 6.947368421052632e-05, "loss": 1.4689, "step": 4412 }, { "epoch": 0.6537777777777778, "grad_norm": 1.6865726709365845, "learning_rate": 6.944403261675315e-05, "loss": 1.1569, "step": 4413 }, { "epoch": 0.6539259259259259, "grad_norm": 3.0567121505737305, "learning_rate": 6.941438102298e-05, "loss": 0.8923, "step": 4414 }, { "epoch": 0.654074074074074, "grad_norm": 1.4168965816497803, "learning_rate": 6.938472942920681e-05, "loss": 0.9829, "step": 4415 }, { "epoch": 0.6542222222222223, "grad_norm": 1.2122267484664917, "learning_rate": 
6.935507783543366e-05, "loss": 0.7592, "step": 4416 }, { "epoch": 0.6543703703703704, "grad_norm": 1.550054907798767, "learning_rate": 6.93254262416605e-05, "loss": 1.2356, "step": 4417 }, { "epoch": 0.6545185185185185, "grad_norm": 1.6436558961868286, "learning_rate": 6.929577464788732e-05, "loss": 0.9523, "step": 4418 }, { "epoch": 0.6546666666666666, "grad_norm": 1.7433936595916748, "learning_rate": 6.926612305411416e-05, "loss": 1.1111, "step": 4419 }, { "epoch": 0.6548148148148148, "grad_norm": 1.400778889656067, "learning_rate": 6.9236471460341e-05, "loss": 1.0042, "step": 4420 }, { "epoch": 0.654962962962963, "grad_norm": 1.8073093891143799, "learning_rate": 6.920681986656783e-05, "loss": 1.0994, "step": 4421 }, { "epoch": 0.6551111111111111, "grad_norm": 3.2208495140075684, "learning_rate": 6.917716827279467e-05, "loss": 0.8091, "step": 4422 }, { "epoch": 0.6552592592592592, "grad_norm": 1.918202519416809, "learning_rate": 6.91475166790215e-05, "loss": 1.1474, "step": 4423 }, { "epoch": 0.6554074074074074, "grad_norm": 2.4938154220581055, "learning_rate": 6.911786508524833e-05, "loss": 1.0839, "step": 4424 }, { "epoch": 0.6555555555555556, "grad_norm": 1.207390308380127, "learning_rate": 6.908821349147518e-05, "loss": 0.9566, "step": 4425 }, { "epoch": 0.6557037037037037, "grad_norm": 1.9529876708984375, "learning_rate": 6.905856189770201e-05, "loss": 1.0909, "step": 4426 }, { "epoch": 0.6558518518518518, "grad_norm": 1.7125343084335327, "learning_rate": 6.902891030392884e-05, "loss": 1.1568, "step": 4427 }, { "epoch": 0.656, "grad_norm": 1.4207946062088013, "learning_rate": 6.899925871015568e-05, "loss": 1.1674, "step": 4428 }, { "epoch": 0.6561481481481481, "grad_norm": 1.2349700927734375, "learning_rate": 6.89696071163825e-05, "loss": 0.8894, "step": 4429 }, { "epoch": 0.6562962962962963, "grad_norm": 1.6173549890518188, "learning_rate": 6.893995552260934e-05, "loss": 0.9113, "step": 4430 }, { "epoch": 0.6564444444444445, "grad_norm": 1.610507845878601, 
"learning_rate": 6.891030392883618e-05, "loss": 1.1148, "step": 4431 }, { "epoch": 0.6565925925925926, "grad_norm": 1.5079268217086792, "learning_rate": 6.8880652335063e-05, "loss": 0.8821, "step": 4432 }, { "epoch": 0.6567407407407407, "grad_norm": 1.8289453983306885, "learning_rate": 6.885100074128985e-05, "loss": 1.2582, "step": 4433 }, { "epoch": 0.6568888888888889, "grad_norm": 2.1192467212677, "learning_rate": 6.882134914751668e-05, "loss": 1.1065, "step": 4434 }, { "epoch": 0.6570370370370371, "grad_norm": 1.39765465259552, "learning_rate": 6.879169755374351e-05, "loss": 0.8152, "step": 4435 }, { "epoch": 0.6571851851851852, "grad_norm": 2.078397274017334, "learning_rate": 6.876204595997036e-05, "loss": 1.059, "step": 4436 }, { "epoch": 0.6573333333333333, "grad_norm": 1.189782977104187, "learning_rate": 6.873239436619719e-05, "loss": 0.838, "step": 4437 }, { "epoch": 0.6574814814814814, "grad_norm": 1.6565886735916138, "learning_rate": 6.870274277242402e-05, "loss": 1.0258, "step": 4438 }, { "epoch": 0.6576296296296297, "grad_norm": 1.6193196773529053, "learning_rate": 6.867309117865086e-05, "loss": 0.9524, "step": 4439 }, { "epoch": 0.6577777777777778, "grad_norm": 2.2543153762817383, "learning_rate": 6.86434395848777e-05, "loss": 1.1232, "step": 4440 }, { "epoch": 0.6579259259259259, "grad_norm": 2.2773020267486572, "learning_rate": 6.861378799110453e-05, "loss": 1.2689, "step": 4441 }, { "epoch": 0.658074074074074, "grad_norm": 1.8485618829727173, "learning_rate": 6.858413639733136e-05, "loss": 1.0171, "step": 4442 }, { "epoch": 0.6582222222222223, "grad_norm": 1.9104846715927124, "learning_rate": 6.85544848035582e-05, "loss": 1.1316, "step": 4443 }, { "epoch": 0.6583703703703704, "grad_norm": 1.614223599433899, "learning_rate": 6.852483320978503e-05, "loss": 0.9565, "step": 4444 }, { "epoch": 0.6585185185185185, "grad_norm": 1.0929920673370361, "learning_rate": 6.849518161601186e-05, "loss": 1.3088, "step": 4445 }, { "epoch": 0.6586666666666666, 
"grad_norm": 1.3854314088821411, "learning_rate": 6.84655300222387e-05, "loss": 1.0783, "step": 4446 }, { "epoch": 0.6588148148148149, "grad_norm": 1.7415268421173096, "learning_rate": 6.843587842846554e-05, "loss": 1.0249, "step": 4447 }, { "epoch": 0.658962962962963, "grad_norm": 1.6687849760055542, "learning_rate": 6.840622683469237e-05, "loss": 1.0307, "step": 4448 }, { "epoch": 0.6591111111111111, "grad_norm": 1.1237823963165283, "learning_rate": 6.83765752409192e-05, "loss": 0.8642, "step": 4449 }, { "epoch": 0.6592592592592592, "grad_norm": 1.1712908744812012, "learning_rate": 6.834692364714603e-05, "loss": 0.8663, "step": 4450 }, { "epoch": 0.6594074074074074, "grad_norm": 2.365463972091675, "learning_rate": 6.831727205337287e-05, "loss": 1.0883, "step": 4451 }, { "epoch": 0.6595555555555556, "grad_norm": 1.5662684440612793, "learning_rate": 6.82876204595997e-05, "loss": 1.2927, "step": 4452 }, { "epoch": 0.6597037037037037, "grad_norm": 1.570806622505188, "learning_rate": 6.825796886582654e-05, "loss": 1.1432, "step": 4453 }, { "epoch": 0.6598518518518518, "grad_norm": 1.8160992860794067, "learning_rate": 6.822831727205338e-05, "loss": 1.1425, "step": 4454 }, { "epoch": 0.66, "grad_norm": 7.27062463760376, "learning_rate": 6.819866567828021e-05, "loss": 1.063, "step": 4455 }, { "epoch": 0.6601481481481482, "grad_norm": 1.5494304895401, "learning_rate": 6.816901408450704e-05, "loss": 1.1038, "step": 4456 }, { "epoch": 0.6602962962962963, "grad_norm": 1.6469160318374634, "learning_rate": 6.813936249073389e-05, "loss": 1.0867, "step": 4457 }, { "epoch": 0.6604444444444444, "grad_norm": 1.5185201168060303, "learning_rate": 6.81097108969607e-05, "loss": 0.8585, "step": 4458 }, { "epoch": 0.6605925925925926, "grad_norm": 1.7621744871139526, "learning_rate": 6.808005930318755e-05, "loss": 0.9402, "step": 4459 }, { "epoch": 0.6607407407407407, "grad_norm": 1.990752935409546, "learning_rate": 6.80504077094144e-05, "loss": 1.2131, "step": 4460 }, { "epoch": 
0.6608888888888889, "grad_norm": 4.080032825469971, "learning_rate": 6.802075611564121e-05, "loss": 1.1293, "step": 4461 }, { "epoch": 0.6610370370370371, "grad_norm": 1.4148558378219604, "learning_rate": 6.799110452186806e-05, "loss": 1.1308, "step": 4462 }, { "epoch": 0.6611851851851852, "grad_norm": 3.500199317932129, "learning_rate": 6.796145292809489e-05, "loss": 1.2161, "step": 4463 }, { "epoch": 0.6613333333333333, "grad_norm": 1.847669243812561, "learning_rate": 6.793180133432172e-05, "loss": 1.1194, "step": 4464 }, { "epoch": 0.6614814814814814, "grad_norm": 1.549932599067688, "learning_rate": 6.790214974054856e-05, "loss": 1.097, "step": 4465 }, { "epoch": 0.6616296296296297, "grad_norm": 1.6146255731582642, "learning_rate": 6.787249814677539e-05, "loss": 1.1995, "step": 4466 }, { "epoch": 0.6617777777777778, "grad_norm": 1.697512149810791, "learning_rate": 6.784284655300222e-05, "loss": 1.2261, "step": 4467 }, { "epoch": 0.6619259259259259, "grad_norm": 2.989250421524048, "learning_rate": 6.781319495922907e-05, "loss": 0.976, "step": 4468 }, { "epoch": 0.662074074074074, "grad_norm": 2.1078250408172607, "learning_rate": 6.77835433654559e-05, "loss": 0.9773, "step": 4469 }, { "epoch": 0.6622222222222223, "grad_norm": 1.2377699613571167, "learning_rate": 6.775389177168273e-05, "loss": 0.9623, "step": 4470 }, { "epoch": 0.6623703703703704, "grad_norm": 2.2702996730804443, "learning_rate": 6.772424017790957e-05, "loss": 1.196, "step": 4471 }, { "epoch": 0.6625185185185185, "grad_norm": 2.383726119995117, "learning_rate": 6.769458858413639e-05, "loss": 1.0343, "step": 4472 }, { "epoch": 0.6626666666666666, "grad_norm": 1.5570571422576904, "learning_rate": 6.766493699036324e-05, "loss": 1.1076, "step": 4473 }, { "epoch": 0.6628148148148149, "grad_norm": 1.4325950145721436, "learning_rate": 6.763528539659007e-05, "loss": 1.2037, "step": 4474 }, { "epoch": 0.662962962962963, "grad_norm": 1.4265539646148682, "learning_rate": 6.76056338028169e-05, "loss": 1.0731, 
"step": 4475 }, { "epoch": 0.6631111111111111, "grad_norm": 1.4804054498672485, "learning_rate": 6.757598220904374e-05, "loss": 1.049, "step": 4476 }, { "epoch": 0.6632592592592592, "grad_norm": 1.302612543106079, "learning_rate": 6.754633061527057e-05, "loss": 0.8392, "step": 4477 }, { "epoch": 0.6634074074074074, "grad_norm": 2.4564685821533203, "learning_rate": 6.75166790214974e-05, "loss": 1.0384, "step": 4478 }, { "epoch": 0.6635555555555556, "grad_norm": 1.816359281539917, "learning_rate": 6.748702742772425e-05, "loss": 1.0906, "step": 4479 }, { "epoch": 0.6637037037037037, "grad_norm": 1.585599660873413, "learning_rate": 6.745737583395108e-05, "loss": 0.9097, "step": 4480 }, { "epoch": 0.6638518518518518, "grad_norm": 1.2226183414459229, "learning_rate": 6.742772424017791e-05, "loss": 0.8926, "step": 4481 }, { "epoch": 0.664, "grad_norm": 1.1785897016525269, "learning_rate": 6.739807264640475e-05, "loss": 0.8586, "step": 4482 }, { "epoch": 0.6641481481481482, "grad_norm": 1.2020103931427002, "learning_rate": 6.736842105263159e-05, "loss": 0.8763, "step": 4483 }, { "epoch": 0.6642962962962963, "grad_norm": 1.3404290676116943, "learning_rate": 6.733876945885842e-05, "loss": 1.0221, "step": 4484 }, { "epoch": 0.6644444444444444, "grad_norm": 1.8340681791305542, "learning_rate": 6.730911786508525e-05, "loss": 1.1126, "step": 4485 }, { "epoch": 0.6645925925925926, "grad_norm": 1.6338748931884766, "learning_rate": 6.727946627131209e-05, "loss": 1.1481, "step": 4486 }, { "epoch": 0.6647407407407407, "grad_norm": 1.3059214353561401, "learning_rate": 6.724981467753892e-05, "loss": 0.8895, "step": 4487 }, { "epoch": 0.6648888888888889, "grad_norm": 4.59125280380249, "learning_rate": 6.722016308376575e-05, "loss": 1.0238, "step": 4488 }, { "epoch": 0.665037037037037, "grad_norm": 2.042271614074707, "learning_rate": 6.719051148999258e-05, "loss": 1.2073, "step": 4489 }, { "epoch": 0.6651851851851852, "grad_norm": 1.6275269985198975, "learning_rate": 
6.716085989621943e-05, "loss": 0.9926, "step": 4490 }, { "epoch": 0.6653333333333333, "grad_norm": 3.977426290512085, "learning_rate": 6.713120830244626e-05, "loss": 1.2023, "step": 4491 }, { "epoch": 0.6654814814814815, "grad_norm": 1.3853273391723633, "learning_rate": 6.710155670867309e-05, "loss": 0.9428, "step": 4492 }, { "epoch": 0.6656296296296297, "grad_norm": 6.533312797546387, "learning_rate": 6.707190511489992e-05, "loss": 0.8783, "step": 4493 }, { "epoch": 0.6657777777777778, "grad_norm": 1.368784785270691, "learning_rate": 6.704225352112677e-05, "loss": 0.8937, "step": 4494 }, { "epoch": 0.6659259259259259, "grad_norm": 1.5556150674819946, "learning_rate": 6.70126019273536e-05, "loss": 1.0887, "step": 4495 }, { "epoch": 0.666074074074074, "grad_norm": 1.5616638660430908, "learning_rate": 6.698295033358043e-05, "loss": 0.9505, "step": 4496 }, { "epoch": 0.6662222222222223, "grad_norm": 1.4048346281051636, "learning_rate": 6.695329873980727e-05, "loss": 1.1939, "step": 4497 }, { "epoch": 0.6663703703703704, "grad_norm": 2.11753511428833, "learning_rate": 6.69236471460341e-05, "loss": 1.0441, "step": 4498 }, { "epoch": 0.6665185185185185, "grad_norm": 1.3496278524398804, "learning_rate": 6.689399555226093e-05, "loss": 1.0327, "step": 4499 }, { "epoch": 0.6666666666666666, "grad_norm": 1.2425817251205444, "learning_rate": 6.686434395848778e-05, "loss": 0.8351, "step": 4500 }, { "epoch": 0.6668148148148149, "grad_norm": 1.1833040714263916, "learning_rate": 6.68346923647146e-05, "loss": 1.1542, "step": 4501 }, { "epoch": 0.666962962962963, "grad_norm": 3.7058610916137695, "learning_rate": 6.680504077094144e-05, "loss": 0.8921, "step": 4502 }, { "epoch": 0.6671111111111111, "grad_norm": 2.4060218334198, "learning_rate": 6.677538917716829e-05, "loss": 1.2563, "step": 4503 }, { "epoch": 0.6672592592592592, "grad_norm": 1.6111152172088623, "learning_rate": 6.67457375833951e-05, "loss": 0.9096, "step": 4504 }, { "epoch": 0.6674074074074074, "grad_norm": 
1.828894853591919, "learning_rate": 6.671608598962195e-05, "loss": 0.9255, "step": 4505 }, { "epoch": 0.6675555555555556, "grad_norm": 1.4932504892349243, "learning_rate": 6.668643439584878e-05, "loss": 1.1503, "step": 4506 }, { "epoch": 0.6677037037037037, "grad_norm": 2.4620230197906494, "learning_rate": 6.665678280207561e-05, "loss": 1.3892, "step": 4507 }, { "epoch": 0.6678518518518518, "grad_norm": 1.4247181415557861, "learning_rate": 6.662713120830245e-05, "loss": 1.1199, "step": 4508 }, { "epoch": 0.668, "grad_norm": 1.8170108795166016, "learning_rate": 6.659747961452928e-05, "loss": 0.9776, "step": 4509 }, { "epoch": 0.6681481481481482, "grad_norm": 1.342509388923645, "learning_rate": 6.656782802075612e-05, "loss": 1.0256, "step": 4510 }, { "epoch": 0.6682962962962963, "grad_norm": 1.2939727306365967, "learning_rate": 6.653817642698296e-05, "loss": 1.1221, "step": 4511 }, { "epoch": 0.6684444444444444, "grad_norm": 2.5898356437683105, "learning_rate": 6.650852483320979e-05, "loss": 0.9475, "step": 4512 }, { "epoch": 0.6685925925925926, "grad_norm": 2.077467203140259, "learning_rate": 6.647887323943662e-05, "loss": 0.9947, "step": 4513 }, { "epoch": 0.6687407407407407, "grad_norm": 2.0203447341918945, "learning_rate": 6.644922164566347e-05, "loss": 1.1886, "step": 4514 }, { "epoch": 0.6688888888888889, "grad_norm": 1.8688184022903442, "learning_rate": 6.641957005189028e-05, "loss": 1.0013, "step": 4515 }, { "epoch": 0.669037037037037, "grad_norm": 2.0238397121429443, "learning_rate": 6.638991845811713e-05, "loss": 1.2235, "step": 4516 }, { "epoch": 0.6691851851851852, "grad_norm": 1.3287385702133179, "learning_rate": 6.636026686434396e-05, "loss": 0.8432, "step": 4517 }, { "epoch": 0.6693333333333333, "grad_norm": 1.7164368629455566, "learning_rate": 6.633061527057079e-05, "loss": 1.0692, "step": 4518 }, { "epoch": 0.6694814814814815, "grad_norm": 1.6082435846328735, "learning_rate": 6.630096367679763e-05, "loss": 1.1004, "step": 4519 }, { "epoch": 
0.6696296296296296, "grad_norm": 2.6166815757751465, "learning_rate": 6.627131208302446e-05, "loss": 0.9445, "step": 4520 }, { "epoch": 0.6697777777777778, "grad_norm": 1.7008702754974365, "learning_rate": 6.62416604892513e-05, "loss": 0.8787, "step": 4521 }, { "epoch": 0.6699259259259259, "grad_norm": 1.8740018606185913, "learning_rate": 6.621200889547814e-05, "loss": 0.8673, "step": 4522 }, { "epoch": 0.670074074074074, "grad_norm": 2.3854541778564453, "learning_rate": 6.618235730170497e-05, "loss": 1.1032, "step": 4523 }, { "epoch": 0.6702222222222223, "grad_norm": 1.7728660106658936, "learning_rate": 6.61527057079318e-05, "loss": 0.9903, "step": 4524 }, { "epoch": 0.6703703703703704, "grad_norm": 1.9628788232803345, "learning_rate": 6.612305411415865e-05, "loss": 1.004, "step": 4525 }, { "epoch": 0.6705185185185185, "grad_norm": 1.3167074918746948, "learning_rate": 6.609340252038548e-05, "loss": 1.1517, "step": 4526 }, { "epoch": 0.6706666666666666, "grad_norm": 1.9610397815704346, "learning_rate": 6.606375092661231e-05, "loss": 0.9651, "step": 4527 }, { "epoch": 0.6708148148148149, "grad_norm": 1.2325596809387207, "learning_rate": 6.603409933283914e-05, "loss": 0.8683, "step": 4528 }, { "epoch": 0.670962962962963, "grad_norm": 1.4824199676513672, "learning_rate": 6.600444773906598e-05, "loss": 0.9708, "step": 4529 }, { "epoch": 0.6711111111111111, "grad_norm": 1.3262386322021484, "learning_rate": 6.597479614529281e-05, "loss": 1.1023, "step": 4530 }, { "epoch": 0.6712592592592592, "grad_norm": 1.7843201160430908, "learning_rate": 6.594514455151965e-05, "loss": 0.9397, "step": 4531 }, { "epoch": 0.6714074074074075, "grad_norm": 1.5266560316085815, "learning_rate": 6.591549295774648e-05, "loss": 1.059, "step": 4532 }, { "epoch": 0.6715555555555556, "grad_norm": 1.3263297080993652, "learning_rate": 6.588584136397332e-05, "loss": 0.9717, "step": 4533 }, { "epoch": 0.6717037037037037, "grad_norm": 1.3532426357269287, "learning_rate": 6.585618977020015e-05, "loss": 
0.9635, "step": 4534 }, { "epoch": 0.6718518518518518, "grad_norm": 1.5721492767333984, "learning_rate": 6.582653817642698e-05, "loss": 0.9163, "step": 4535 }, { "epoch": 0.672, "grad_norm": 3.239018678665161, "learning_rate": 6.579688658265381e-05, "loss": 0.939, "step": 4536 }, { "epoch": 0.6721481481481482, "grad_norm": 2.1135377883911133, "learning_rate": 6.576723498888066e-05, "loss": 1.1707, "step": 4537 }, { "epoch": 0.6722962962962963, "grad_norm": 1.684714436531067, "learning_rate": 6.573758339510749e-05, "loss": 0.9886, "step": 4538 }, { "epoch": 0.6724444444444444, "grad_norm": 1.624194860458374, "learning_rate": 6.570793180133432e-05, "loss": 0.9391, "step": 4539 }, { "epoch": 0.6725925925925926, "grad_norm": 1.5050755739212036, "learning_rate": 6.567828020756116e-05, "loss": 0.9477, "step": 4540 }, { "epoch": 0.6727407407407408, "grad_norm": 2.0678727626800537, "learning_rate": 6.5648628613788e-05, "loss": 0.9755, "step": 4541 }, { "epoch": 0.6728888888888889, "grad_norm": 5.3233561515808105, "learning_rate": 6.561897702001483e-05, "loss": 1.2563, "step": 4542 }, { "epoch": 0.673037037037037, "grad_norm": 1.804870843887329, "learning_rate": 6.558932542624167e-05, "loss": 1.1061, "step": 4543 }, { "epoch": 0.6731851851851852, "grad_norm": 1.8689392805099487, "learning_rate": 6.555967383246849e-05, "loss": 1.0492, "step": 4544 }, { "epoch": 0.6733333333333333, "grad_norm": 2.034116268157959, "learning_rate": 6.553002223869533e-05, "loss": 1.0905, "step": 4545 }, { "epoch": 0.6734814814814815, "grad_norm": 2.48861026763916, "learning_rate": 6.550037064492218e-05, "loss": 1.1014, "step": 4546 }, { "epoch": 0.6736296296296296, "grad_norm": 1.8998236656188965, "learning_rate": 6.5470719051149e-05, "loss": 1.1355, "step": 4547 }, { "epoch": 0.6737777777777778, "grad_norm": 3.2076668739318848, "learning_rate": 6.544106745737584e-05, "loss": 0.9577, "step": 4548 }, { "epoch": 0.6739259259259259, "grad_norm": 2.001138925552368, "learning_rate": 
6.541141586360267e-05, "loss": 0.988, "step": 4549 }, { "epoch": 0.674074074074074, "grad_norm": 1.9601798057556152, "learning_rate": 6.53817642698295e-05, "loss": 1.1651, "step": 4550 }, { "epoch": 0.6742222222222222, "grad_norm": 2.0755982398986816, "learning_rate": 6.535211267605635e-05, "loss": 0.9835, "step": 4551 }, { "epoch": 0.6743703703703704, "grad_norm": 2.2885663509368896, "learning_rate": 6.532246108228318e-05, "loss": 0.9954, "step": 4552 }, { "epoch": 0.6745185185185185, "grad_norm": 2.1896133422851562, "learning_rate": 6.529280948851001e-05, "loss": 1.0637, "step": 4553 }, { "epoch": 0.6746666666666666, "grad_norm": 1.3759965896606445, "learning_rate": 6.526315789473685e-05, "loss": 1.0231, "step": 4554 }, { "epoch": 0.6748148148148149, "grad_norm": 3.140455484390259, "learning_rate": 6.523350630096368e-05, "loss": 1.2912, "step": 4555 }, { "epoch": 0.674962962962963, "grad_norm": 1.1157242059707642, "learning_rate": 6.520385470719051e-05, "loss": 0.8042, "step": 4556 }, { "epoch": 0.6751111111111111, "grad_norm": 1.8724523782730103, "learning_rate": 6.517420311341736e-05, "loss": 1.0487, "step": 4557 }, { "epoch": 0.6752592592592592, "grad_norm": 1.4933273792266846, "learning_rate": 6.514455151964417e-05, "loss": 1.0804, "step": 4558 }, { "epoch": 0.6754074074074075, "grad_norm": 1.7172576189041138, "learning_rate": 6.511489992587102e-05, "loss": 1.4236, "step": 4559 }, { "epoch": 0.6755555555555556, "grad_norm": 2.238006591796875, "learning_rate": 6.508524833209785e-05, "loss": 1.1543, "step": 4560 }, { "epoch": 0.6757037037037037, "grad_norm": 2.3506276607513428, "learning_rate": 6.505559673832468e-05, "loss": 1.0662, "step": 4561 }, { "epoch": 0.6758518518518518, "grad_norm": 4.046957015991211, "learning_rate": 6.502594514455153e-05, "loss": 1.2876, "step": 4562 }, { "epoch": 0.676, "grad_norm": 1.5363638401031494, "learning_rate": 6.499629355077836e-05, "loss": 0.8477, "step": 4563 }, { "epoch": 0.6761481481481482, "grad_norm": 
1.6767934560775757, "learning_rate": 6.496664195700519e-05, "loss": 0.9757, "step": 4564 }, { "epoch": 0.6762962962962963, "grad_norm": 1.6577056646347046, "learning_rate": 6.493699036323203e-05, "loss": 1.0816, "step": 4565 }, { "epoch": 0.6764444444444444, "grad_norm": 1.40839684009552, "learning_rate": 6.490733876945886e-05, "loss": 1.245, "step": 4566 }, { "epoch": 0.6765925925925926, "grad_norm": 1.8399194478988647, "learning_rate": 6.48776871756857e-05, "loss": 1.1084, "step": 4567 }, { "epoch": 0.6767407407407408, "grad_norm": 1.5465725660324097, "learning_rate": 6.484803558191254e-05, "loss": 1.1317, "step": 4568 }, { "epoch": 0.6768888888888889, "grad_norm": 1.1534277200698853, "learning_rate": 6.481838398813937e-05, "loss": 0.9297, "step": 4569 }, { "epoch": 0.677037037037037, "grad_norm": 1.9321751594543457, "learning_rate": 6.47887323943662e-05, "loss": 1.0722, "step": 4570 }, { "epoch": 0.6771851851851852, "grad_norm": 2.9229111671447754, "learning_rate": 6.475908080059303e-05, "loss": 1.0186, "step": 4571 }, { "epoch": 0.6773333333333333, "grad_norm": 1.6638622283935547, "learning_rate": 6.472942920681988e-05, "loss": 1.314, "step": 4572 }, { "epoch": 0.6774814814814815, "grad_norm": 1.4501103162765503, "learning_rate": 6.46997776130467e-05, "loss": 1.2855, "step": 4573 }, { "epoch": 0.6776296296296296, "grad_norm": 1.2768062353134155, "learning_rate": 6.467012601927354e-05, "loss": 0.8762, "step": 4574 }, { "epoch": 0.6777777777777778, "grad_norm": 5.492815017700195, "learning_rate": 6.464047442550037e-05, "loss": 1.169, "step": 4575 }, { "epoch": 0.6779259259259259, "grad_norm": 1.2650996446609497, "learning_rate": 6.461082283172721e-05, "loss": 1.0619, "step": 4576 }, { "epoch": 0.678074074074074, "grad_norm": 1.293104887008667, "learning_rate": 6.458117123795404e-05, "loss": 0.9211, "step": 4577 }, { "epoch": 0.6782222222222222, "grad_norm": 1.4094767570495605, "learning_rate": 6.455151964418087e-05, "loss": 1.0138, "step": 4578 }, { "epoch": 
0.6783703703703704, "grad_norm": 1.4086834192276, "learning_rate": 6.45218680504077e-05, "loss": 0.9488, "step": 4579 }, { "epoch": 0.6785185185185185, "grad_norm": 1.2480393648147583, "learning_rate": 6.449221645663455e-05, "loss": 1.0364, "step": 4580 }, { "epoch": 0.6786666666666666, "grad_norm": 2.318099021911621, "learning_rate": 6.446256486286138e-05, "loss": 0.8435, "step": 4581 }, { "epoch": 0.6788148148148148, "grad_norm": 1.66366708278656, "learning_rate": 6.443291326908821e-05, "loss": 0.9257, "step": 4582 }, { "epoch": 0.678962962962963, "grad_norm": 1.1964306831359863, "learning_rate": 6.440326167531506e-05, "loss": 1.1364, "step": 4583 }, { "epoch": 0.6791111111111111, "grad_norm": 2.636220932006836, "learning_rate": 6.437361008154189e-05, "loss": 1.0545, "step": 4584 }, { "epoch": 0.6792592592592592, "grad_norm": 1.8049631118774414, "learning_rate": 6.434395848776872e-05, "loss": 0.8934, "step": 4585 }, { "epoch": 0.6794074074074075, "grad_norm": 2.333420753479004, "learning_rate": 6.431430689399556e-05, "loss": 0.8483, "step": 4586 }, { "epoch": 0.6795555555555556, "grad_norm": 1.6943825483322144, "learning_rate": 6.428465530022238e-05, "loss": 1.3099, "step": 4587 }, { "epoch": 0.6797037037037037, "grad_norm": 1.1508939266204834, "learning_rate": 6.425500370644922e-05, "loss": 0.7506, "step": 4588 }, { "epoch": 0.6798518518518518, "grad_norm": 1.2238248586654663, "learning_rate": 6.422535211267607e-05, "loss": 1.012, "step": 4589 }, { "epoch": 0.68, "grad_norm": 1.4984291791915894, "learning_rate": 6.419570051890289e-05, "loss": 1.0019, "step": 4590 }, { "epoch": 0.6801481481481482, "grad_norm": 1.367092251777649, "learning_rate": 6.416604892512973e-05, "loss": 0.9964, "step": 4591 }, { "epoch": 0.6802962962962963, "grad_norm": 1.9505276679992676, "learning_rate": 6.413639733135656e-05, "loss": 0.8053, "step": 4592 }, { "epoch": 0.6804444444444444, "grad_norm": 1.4009976387023926, "learning_rate": 6.410674573758339e-05, "loss": 0.7596, "step": 4593 
}, { "epoch": 0.6805925925925926, "grad_norm": 2.9519119262695312, "learning_rate": 6.407709414381024e-05, "loss": 1.1529, "step": 4594 }, { "epoch": 0.6807407407407408, "grad_norm": 1.8711851835250854, "learning_rate": 6.404744255003707e-05, "loss": 1.017, "step": 4595 }, { "epoch": 0.6808888888888889, "grad_norm": 1.149286150932312, "learning_rate": 6.40177909562639e-05, "loss": 0.9867, "step": 4596 }, { "epoch": 0.681037037037037, "grad_norm": 1.6351492404937744, "learning_rate": 6.398813936249074e-05, "loss": 1.0733, "step": 4597 }, { "epoch": 0.6811851851851852, "grad_norm": 3.9957165718078613, "learning_rate": 6.395848776871757e-05, "loss": 1.2609, "step": 4598 }, { "epoch": 0.6813333333333333, "grad_norm": 3.0349299907684326, "learning_rate": 6.39288361749444e-05, "loss": 1.0371, "step": 4599 }, { "epoch": 0.6814814814814815, "grad_norm": 1.1951899528503418, "learning_rate": 6.389918458117125e-05, "loss": 0.8061, "step": 4600 }, { "epoch": 0.6816296296296296, "grad_norm": 2.5305910110473633, "learning_rate": 6.386953298739807e-05, "loss": 1.0257, "step": 4601 }, { "epoch": 0.6817777777777778, "grad_norm": 1.5263696908950806, "learning_rate": 6.383988139362491e-05, "loss": 0.9928, "step": 4602 }, { "epoch": 0.6819259259259259, "grad_norm": 1.5052536725997925, "learning_rate": 6.381022979985174e-05, "loss": 1.0276, "step": 4603 }, { "epoch": 0.682074074074074, "grad_norm": 2.024991989135742, "learning_rate": 6.378057820607857e-05, "loss": 0.8347, "step": 4604 }, { "epoch": 0.6822222222222222, "grad_norm": 1.2899211645126343, "learning_rate": 6.375092661230542e-05, "loss": 0.9684, "step": 4605 }, { "epoch": 0.6823703703703704, "grad_norm": 1.3688863515853882, "learning_rate": 6.372127501853225e-05, "loss": 1.0025, "step": 4606 }, { "epoch": 0.6825185185185185, "grad_norm": 1.3318511247634888, "learning_rate": 6.369162342475908e-05, "loss": 0.8014, "step": 4607 }, { "epoch": 0.6826666666666666, "grad_norm": 1.8962894678115845, "learning_rate": 
6.366197183098592e-05, "loss": 1.1632, "step": 4608 }, { "epoch": 0.6828148148148148, "grad_norm": 1.9377597570419312, "learning_rate": 6.363232023721275e-05, "loss": 0.9583, "step": 4609 }, { "epoch": 0.682962962962963, "grad_norm": 1.8377279043197632, "learning_rate": 6.360266864343959e-05, "loss": 0.8932, "step": 4610 }, { "epoch": 0.6831111111111111, "grad_norm": 1.5697270631790161, "learning_rate": 6.357301704966643e-05, "loss": 1.1786, "step": 4611 }, { "epoch": 0.6832592592592592, "grad_norm": 1.098049283027649, "learning_rate": 6.354336545589326e-05, "loss": 1.4777, "step": 4612 }, { "epoch": 0.6834074074074074, "grad_norm": 2.8057503700256348, "learning_rate": 6.351371386212009e-05, "loss": 1.1235, "step": 4613 }, { "epoch": 0.6835555555555556, "grad_norm": 1.5012096166610718, "learning_rate": 6.348406226834692e-05, "loss": 1.0603, "step": 4614 }, { "epoch": 0.6837037037037037, "grad_norm": 1.646206021308899, "learning_rate": 6.345441067457377e-05, "loss": 0.7903, "step": 4615 }, { "epoch": 0.6838518518518518, "grad_norm": 6.089558124542236, "learning_rate": 6.34247590808006e-05, "loss": 0.7942, "step": 4616 }, { "epoch": 0.684, "grad_norm": 2.203416585922241, "learning_rate": 6.339510748702743e-05, "loss": 1.0105, "step": 4617 }, { "epoch": 0.6841481481481482, "grad_norm": 1.3859854936599731, "learning_rate": 6.336545589325426e-05, "loss": 0.9016, "step": 4618 }, { "epoch": 0.6842962962962963, "grad_norm": 1.4855002164840698, "learning_rate": 6.33358042994811e-05, "loss": 0.7378, "step": 4619 }, { "epoch": 0.6844444444444444, "grad_norm": 1.8707270622253418, "learning_rate": 6.330615270570794e-05, "loss": 1.0435, "step": 4620 }, { "epoch": 0.6845925925925926, "grad_norm": 1.3705289363861084, "learning_rate": 6.327650111193477e-05, "loss": 1.2175, "step": 4621 }, { "epoch": 0.6847407407407408, "grad_norm": 6.896151542663574, "learning_rate": 6.32468495181616e-05, "loss": 1.0735, "step": 4622 }, { "epoch": 0.6848888888888889, "grad_norm": 2.116827964782715, 
"learning_rate": 6.321719792438844e-05, "loss": 1.0728, "step": 4623 }, { "epoch": 0.685037037037037, "grad_norm": 2.238344669342041, "learning_rate": 6.318754633061527e-05, "loss": 0.8672, "step": 4624 }, { "epoch": 0.6851851851851852, "grad_norm": 2.1863036155700684, "learning_rate": 6.31578947368421e-05, "loss": 1.0841, "step": 4625 }, { "epoch": 0.6853333333333333, "grad_norm": 1.78851318359375, "learning_rate": 6.312824314306895e-05, "loss": 1.0287, "step": 4626 }, { "epoch": 0.6854814814814815, "grad_norm": 5.450416088104248, "learning_rate": 6.309859154929578e-05, "loss": 1.1345, "step": 4627 }, { "epoch": 0.6856296296296296, "grad_norm": 1.4445854425430298, "learning_rate": 6.306893995552261e-05, "loss": 1.2755, "step": 4628 }, { "epoch": 0.6857777777777778, "grad_norm": 1.173293113708496, "learning_rate": 6.303928836174945e-05, "loss": 0.8193, "step": 4629 }, { "epoch": 0.6859259259259259, "grad_norm": 1.6796793937683105, "learning_rate": 6.300963676797627e-05, "loss": 1.0992, "step": 4630 }, { "epoch": 0.6860740740740741, "grad_norm": 5.284728050231934, "learning_rate": 6.297998517420312e-05, "loss": 1.0728, "step": 4631 }, { "epoch": 0.6862222222222222, "grad_norm": 1.464169979095459, "learning_rate": 6.295033358042996e-05, "loss": 1.0381, "step": 4632 }, { "epoch": 0.6863703703703704, "grad_norm": 2.6172666549682617, "learning_rate": 6.292068198665678e-05, "loss": 1.0066, "step": 4633 }, { "epoch": 0.6865185185185185, "grad_norm": 1.7775756120681763, "learning_rate": 6.289103039288362e-05, "loss": 1.1639, "step": 4634 }, { "epoch": 0.6866666666666666, "grad_norm": 3.5914671421051025, "learning_rate": 6.286137879911045e-05, "loss": 1.0721, "step": 4635 }, { "epoch": 0.6868148148148148, "grad_norm": 2.1291871070861816, "learning_rate": 6.283172720533728e-05, "loss": 0.9818, "step": 4636 }, { "epoch": 0.686962962962963, "grad_norm": 3.232025384902954, "learning_rate": 6.280207561156413e-05, "loss": 1.1499, "step": 4637 }, { "epoch": 0.6871111111111111, 
"grad_norm": 1.3152642250061035, "learning_rate": 6.277242401779096e-05, "loss": 0.8339, "step": 4638 }, { "epoch": 0.6872592592592592, "grad_norm": 2.176630973815918, "learning_rate": 6.274277242401779e-05, "loss": 1.1671, "step": 4639 }, { "epoch": 0.6874074074074074, "grad_norm": 1.4894682168960571, "learning_rate": 6.271312083024463e-05, "loss": 1.3326, "step": 4640 }, { "epoch": 0.6875555555555556, "grad_norm": 5.215499401092529, "learning_rate": 6.268346923647147e-05, "loss": 0.8718, "step": 4641 }, { "epoch": 0.6877037037037037, "grad_norm": 1.5449154376983643, "learning_rate": 6.26538176426983e-05, "loss": 1.0189, "step": 4642 }, { "epoch": 0.6878518518518518, "grad_norm": 1.9991947412490845, "learning_rate": 6.262416604892514e-05, "loss": 0.9451, "step": 4643 }, { "epoch": 0.688, "grad_norm": 1.1996254920959473, "learning_rate": 6.259451445515196e-05, "loss": 1.0544, "step": 4644 }, { "epoch": 0.6881481481481482, "grad_norm": 1.4383140802383423, "learning_rate": 6.25648628613788e-05, "loss": 0.8975, "step": 4645 }, { "epoch": 0.6882962962962963, "grad_norm": 4.084737300872803, "learning_rate": 6.253521126760565e-05, "loss": 1.143, "step": 4646 }, { "epoch": 0.6884444444444444, "grad_norm": 1.930940866470337, "learning_rate": 6.250555967383246e-05, "loss": 1.2551, "step": 4647 }, { "epoch": 0.6885925925925926, "grad_norm": 1.4857474565505981, "learning_rate": 6.247590808005931e-05, "loss": 1.1241, "step": 4648 }, { "epoch": 0.6887407407407408, "grad_norm": 1.0609443187713623, "learning_rate": 6.244625648628614e-05, "loss": 0.7137, "step": 4649 }, { "epoch": 0.6888888888888889, "grad_norm": 2.1343209743499756, "learning_rate": 6.241660489251297e-05, "loss": 1.1902, "step": 4650 }, { "epoch": 0.689037037037037, "grad_norm": 5.095562934875488, "learning_rate": 6.238695329873982e-05, "loss": 1.007, "step": 4651 }, { "epoch": 0.6891851851851852, "grad_norm": 1.7792041301727295, "learning_rate": 6.235730170496665e-05, "loss": 1.1408, "step": 4652 }, { "epoch": 
0.6893333333333334, "grad_norm": 1.6858946084976196, "learning_rate": 6.232765011119348e-05, "loss": 1.1224, "step": 4653 }, { "epoch": 0.6894814814814815, "grad_norm": 1.2887392044067383, "learning_rate": 6.229799851742032e-05, "loss": 1.0686, "step": 4654 }, { "epoch": 0.6896296296296296, "grad_norm": 1.3962105512619019, "learning_rate": 6.226834692364715e-05, "loss": 1.0521, "step": 4655 }, { "epoch": 0.6897777777777778, "grad_norm": 1.8462930917739868, "learning_rate": 6.223869532987398e-05, "loss": 0.9927, "step": 4656 }, { "epoch": 0.6899259259259259, "grad_norm": 1.4207310676574707, "learning_rate": 6.220904373610081e-05, "loss": 1.1223, "step": 4657 }, { "epoch": 0.6900740740740741, "grad_norm": 1.438154935836792, "learning_rate": 6.217939214232766e-05, "loss": 0.988, "step": 4658 }, { "epoch": 0.6902222222222222, "grad_norm": 1.3789966106414795, "learning_rate": 6.214974054855449e-05, "loss": 0.9999, "step": 4659 }, { "epoch": 0.6903703703703704, "grad_norm": 1.7231265306472778, "learning_rate": 6.212008895478132e-05, "loss": 0.8048, "step": 4660 }, { "epoch": 0.6905185185185185, "grad_norm": 2.3367037773132324, "learning_rate": 6.209043736100815e-05, "loss": 0.7982, "step": 4661 }, { "epoch": 0.6906666666666667, "grad_norm": 1.5115360021591187, "learning_rate": 6.2060785767235e-05, "loss": 0.8287, "step": 4662 }, { "epoch": 0.6908148148148148, "grad_norm": 2.3880302906036377, "learning_rate": 6.203113417346183e-05, "loss": 1.1945, "step": 4663 }, { "epoch": 0.690962962962963, "grad_norm": 1.965254306793213, "learning_rate": 6.200148257968866e-05, "loss": 1.0172, "step": 4664 }, { "epoch": 0.6911111111111111, "grad_norm": 1.6692672967910767, "learning_rate": 6.197183098591549e-05, "loss": 0.801, "step": 4665 }, { "epoch": 0.6912592592592592, "grad_norm": 1.7301688194274902, "learning_rate": 6.194217939214233e-05, "loss": 1.1828, "step": 4666 }, { "epoch": 0.6914074074074074, "grad_norm": 2.8348965644836426, "learning_rate": 6.191252779836916e-05, "loss": 
1.0651, "step": 4667 }, { "epoch": 0.6915555555555556, "grad_norm": 1.343113899230957, "learning_rate": 6.1882876204596e-05, "loss": 0.9749, "step": 4668 }, { "epoch": 0.6917037037037037, "grad_norm": 1.9785374402999878, "learning_rate": 6.185322461082284e-05, "loss": 1.1086, "step": 4669 }, { "epoch": 0.6918518518518518, "grad_norm": 1.3531380891799927, "learning_rate": 6.182357301704967e-05, "loss": 1.1721, "step": 4670 }, { "epoch": 0.692, "grad_norm": 1.2428358793258667, "learning_rate": 6.17939214232765e-05, "loss": 1.0404, "step": 4671 }, { "epoch": 0.6921481481481482, "grad_norm": 3.1928300857543945, "learning_rate": 6.176426982950335e-05, "loss": 1.0914, "step": 4672 }, { "epoch": 0.6922962962962963, "grad_norm": 1.8552119731903076, "learning_rate": 6.173461823573016e-05, "loss": 0.9968, "step": 4673 }, { "epoch": 0.6924444444444444, "grad_norm": 2.0166265964508057, "learning_rate": 6.170496664195701e-05, "loss": 1.1317, "step": 4674 }, { "epoch": 0.6925925925925925, "grad_norm": 1.3955466747283936, "learning_rate": 6.167531504818385e-05, "loss": 0.9586, "step": 4675 }, { "epoch": 0.6927407407407408, "grad_norm": 1.3913319110870361, "learning_rate": 6.164566345441067e-05, "loss": 1.2034, "step": 4676 }, { "epoch": 0.6928888888888889, "grad_norm": 2.064141273498535, "learning_rate": 6.161601186063751e-05, "loss": 0.9239, "step": 4677 }, { "epoch": 0.693037037037037, "grad_norm": 1.7810102701187134, "learning_rate": 6.158636026686434e-05, "loss": 0.9786, "step": 4678 }, { "epoch": 0.6931851851851852, "grad_norm": 1.4174787998199463, "learning_rate": 6.155670867309118e-05, "loss": 0.934, "step": 4679 }, { "epoch": 0.6933333333333334, "grad_norm": 1.6481225490570068, "learning_rate": 6.152705707931802e-05, "loss": 1.0113, "step": 4680 }, { "epoch": 0.6934814814814815, "grad_norm": 1.7207773923873901, "learning_rate": 6.149740548554485e-05, "loss": 0.9592, "step": 4681 }, { "epoch": 0.6936296296296296, "grad_norm": 1.2530018091201782, "learning_rate": 
6.146775389177168e-05, "loss": 0.97, "step": 4682 }, { "epoch": 0.6937777777777778, "grad_norm": 2.6475136280059814, "learning_rate": 6.143810229799853e-05, "loss": 1.1792, "step": 4683 }, { "epoch": 0.693925925925926, "grad_norm": 2.30336856842041, "learning_rate": 6.140845070422536e-05, "loss": 1.0311, "step": 4684 }, { "epoch": 0.6940740740740741, "grad_norm": 1.6658436059951782, "learning_rate": 6.137879911045219e-05, "loss": 1.0753, "step": 4685 }, { "epoch": 0.6942222222222222, "grad_norm": 2.4629809856414795, "learning_rate": 6.134914751667903e-05, "loss": 1.0303, "step": 4686 }, { "epoch": 0.6943703703703704, "grad_norm": 1.2104218006134033, "learning_rate": 6.131949592290585e-05, "loss": 0.9383, "step": 4687 }, { "epoch": 0.6945185185185185, "grad_norm": 1.5628855228424072, "learning_rate": 6.12898443291327e-05, "loss": 0.9355, "step": 4688 }, { "epoch": 0.6946666666666667, "grad_norm": 1.9757931232452393, "learning_rate": 6.126019273535954e-05, "loss": 0.9127, "step": 4689 }, { "epoch": 0.6948148148148148, "grad_norm": 1.963261604309082, "learning_rate": 6.123054114158636e-05, "loss": 1.0436, "step": 4690 }, { "epoch": 0.694962962962963, "grad_norm": 1.6282535791397095, "learning_rate": 6.12008895478132e-05, "loss": 0.8477, "step": 4691 }, { "epoch": 0.6951111111111111, "grad_norm": 1.7320218086242676, "learning_rate": 6.117123795404003e-05, "loss": 1.2204, "step": 4692 }, { "epoch": 0.6952592592592592, "grad_norm": 2.105339765548706, "learning_rate": 6.114158636026686e-05, "loss": 0.9144, "step": 4693 }, { "epoch": 0.6954074074074074, "grad_norm": 2.56491756439209, "learning_rate": 6.111193476649371e-05, "loss": 1.0554, "step": 4694 }, { "epoch": 0.6955555555555556, "grad_norm": 1.3379398584365845, "learning_rate": 6.108228317272054e-05, "loss": 1.0433, "step": 4695 }, { "epoch": 0.6957037037037037, "grad_norm": 1.3718205690383911, "learning_rate": 6.105263157894737e-05, "loss": 0.9201, "step": 4696 }, { "epoch": 0.6958518518518518, "grad_norm": 
2.1932365894317627, "learning_rate": 6.102297998517421e-05, "loss": 1.3829, "step": 4697 }, { "epoch": 0.696, "grad_norm": 2.6814064979553223, "learning_rate": 6.099332839140104e-05, "loss": 1.3149, "step": 4698 }, { "epoch": 0.6961481481481482, "grad_norm": 2.553849935531616, "learning_rate": 6.0963676797627875e-05, "loss": 1.0724, "step": 4699 }, { "epoch": 0.6962962962962963, "grad_norm": 1.985586166381836, "learning_rate": 6.0934025203854706e-05, "loss": 1.1211, "step": 4700 }, { "epoch": 0.6964444444444444, "grad_norm": 1.577874779701233, "learning_rate": 6.0904373610081544e-05, "loss": 0.9315, "step": 4701 }, { "epoch": 0.6965925925925925, "grad_norm": 2.6683530807495117, "learning_rate": 6.087472201630838e-05, "loss": 0.8294, "step": 4702 }, { "epoch": 0.6967407407407408, "grad_norm": 2.568171262741089, "learning_rate": 6.084507042253521e-05, "loss": 0.9812, "step": 4703 }, { "epoch": 0.6968888888888889, "grad_norm": 1.4571040868759155, "learning_rate": 6.081541882876205e-05, "loss": 1.2184, "step": 4704 }, { "epoch": 0.697037037037037, "grad_norm": 3.402040481567383, "learning_rate": 6.078576723498889e-05, "loss": 0.9914, "step": 4705 }, { "epoch": 0.6971851851851851, "grad_norm": 1.8167369365692139, "learning_rate": 6.075611564121572e-05, "loss": 0.96, "step": 4706 }, { "epoch": 0.6973333333333334, "grad_norm": 1.9412450790405273, "learning_rate": 6.0726464047442556e-05, "loss": 1.1004, "step": 4707 }, { "epoch": 0.6974814814814815, "grad_norm": 1.4077459573745728, "learning_rate": 6.069681245366938e-05, "loss": 0.8931, "step": 4708 }, { "epoch": 0.6976296296296296, "grad_norm": 2.860391139984131, "learning_rate": 6.0667160859896225e-05, "loss": 1.0071, "step": 4709 }, { "epoch": 0.6977777777777778, "grad_norm": 2.3669731616973877, "learning_rate": 6.063750926612306e-05, "loss": 0.9619, "step": 4710 }, { "epoch": 0.697925925925926, "grad_norm": 1.5092662572860718, "learning_rate": 6.0607857672349887e-05, "loss": 1.1466, "step": 4711 }, { "epoch": 
0.6980740740740741, "grad_norm": 1.7964667081832886, "learning_rate": 6.0578206078576724e-05, "loss": 0.9678, "step": 4712 }, { "epoch": 0.6982222222222222, "grad_norm": 1.487083077430725, "learning_rate": 6.054855448480357e-05, "loss": 0.8466, "step": 4713 }, { "epoch": 0.6983703703703704, "grad_norm": 1.8605631589889526, "learning_rate": 6.051890289103039e-05, "loss": 0.9773, "step": 4714 }, { "epoch": 0.6985185185185185, "grad_norm": 1.8182073831558228, "learning_rate": 6.048925129725723e-05, "loss": 1.0576, "step": 4715 }, { "epoch": 0.6986666666666667, "grad_norm": 1.7778788805007935, "learning_rate": 6.045959970348406e-05, "loss": 0.9264, "step": 4716 }, { "epoch": 0.6988148148148148, "grad_norm": 1.9893543720245361, "learning_rate": 6.04299481097109e-05, "loss": 0.9176, "step": 4717 }, { "epoch": 0.698962962962963, "grad_norm": 1.6655364036560059, "learning_rate": 6.040029651593774e-05, "loss": 1.0479, "step": 4718 }, { "epoch": 0.6991111111111111, "grad_norm": 1.5168418884277344, "learning_rate": 6.037064492216457e-05, "loss": 0.983, "step": 4719 }, { "epoch": 0.6992592592592592, "grad_norm": 1.7691303491592407, "learning_rate": 6.0340993328391405e-05, "loss": 1.0607, "step": 4720 }, { "epoch": 0.6994074074074074, "grad_norm": 1.6258023977279663, "learning_rate": 6.031134173461824e-05, "loss": 0.8576, "step": 4721 }, { "epoch": 0.6995555555555556, "grad_norm": 1.5737048387527466, "learning_rate": 6.0281690140845074e-05, "loss": 0.9638, "step": 4722 }, { "epoch": 0.6997037037037037, "grad_norm": 2.3338394165039062, "learning_rate": 6.025203854707191e-05, "loss": 1.0194, "step": 4723 }, { "epoch": 0.6998518518518518, "grad_norm": 1.656455159187317, "learning_rate": 6.0222386953298736e-05, "loss": 1.0185, "step": 4724 }, { "epoch": 0.7, "grad_norm": 2.012742519378662, "learning_rate": 6.0192735359525574e-05, "loss": 1.0691, "step": 4725 }, { "epoch": 0.7001481481481482, "grad_norm": 1.199476718902588, "learning_rate": 6.016308376575242e-05, "loss": 0.9874, 
"step": 4726 }, { "epoch": 0.7002962962962963, "grad_norm": 1.5636026859283447, "learning_rate": 6.013343217197924e-05, "loss": 1.1817, "step": 4727 }, { "epoch": 0.7004444444444444, "grad_norm": 1.44326913356781, "learning_rate": 6.010378057820608e-05, "loss": 1.0238, "step": 4728 }, { "epoch": 0.7005925925925925, "grad_norm": 1.7215958833694458, "learning_rate": 6.007412898443292e-05, "loss": 1.1073, "step": 4729 }, { "epoch": 0.7007407407407408, "grad_norm": 2.1584391593933105, "learning_rate": 6.004447739065975e-05, "loss": 1.1707, "step": 4730 }, { "epoch": 0.7008888888888889, "grad_norm": 1.508112907409668, "learning_rate": 6.0014825796886586e-05, "loss": 0.9277, "step": 4731 }, { "epoch": 0.701037037037037, "grad_norm": 1.450149416923523, "learning_rate": 5.9985174203113424e-05, "loss": 1.1159, "step": 4732 }, { "epoch": 0.7011851851851851, "grad_norm": 1.7273179292678833, "learning_rate": 5.9955522609340255e-05, "loss": 0.9167, "step": 4733 }, { "epoch": 0.7013333333333334, "grad_norm": 1.7336523532867432, "learning_rate": 5.992587101556709e-05, "loss": 0.9558, "step": 4734 }, { "epoch": 0.7014814814814815, "grad_norm": 1.4574651718139648, "learning_rate": 5.989621942179392e-05, "loss": 0.8977, "step": 4735 }, { "epoch": 0.7016296296296296, "grad_norm": 2.9354634284973145, "learning_rate": 5.986656782802076e-05, "loss": 1.0963, "step": 4736 }, { "epoch": 0.7017777777777777, "grad_norm": 3.131915807723999, "learning_rate": 5.98369162342476e-05, "loss": 0.8543, "step": 4737 }, { "epoch": 0.701925925925926, "grad_norm": 1.915164828300476, "learning_rate": 5.980726464047442e-05, "loss": 1.021, "step": 4738 }, { "epoch": 0.7020740740740741, "grad_norm": 2.250742197036743, "learning_rate": 5.977761304670127e-05, "loss": 1.1014, "step": 4739 }, { "epoch": 0.7022222222222222, "grad_norm": 1.5038580894470215, "learning_rate": 5.9747961452928105e-05, "loss": 1.1808, "step": 4740 }, { "epoch": 0.7023703703703704, "grad_norm": 1.679661512374878, "learning_rate": 
5.971830985915493e-05, "loss": 0.996, "step": 4741 }, { "epoch": 0.7025185185185185, "grad_norm": 1.9651271104812622, "learning_rate": 5.968865826538177e-05, "loss": 1.1116, "step": 4742 }, { "epoch": 0.7026666666666667, "grad_norm": 2.024658203125, "learning_rate": 5.96590066716086e-05, "loss": 0.8722, "step": 4743 }, { "epoch": 0.7028148148148148, "grad_norm": 4.580418586730957, "learning_rate": 5.9629355077835435e-05, "loss": 1.0557, "step": 4744 }, { "epoch": 0.702962962962963, "grad_norm": 2.757688045501709, "learning_rate": 5.959970348406227e-05, "loss": 1.3364, "step": 4745 }, { "epoch": 0.7031111111111111, "grad_norm": 1.519632339477539, "learning_rate": 5.9570051890289104e-05, "loss": 0.932, "step": 4746 }, { "epoch": 0.7032592592592593, "grad_norm": 1.3939028978347778, "learning_rate": 5.954040029651594e-05, "loss": 0.968, "step": 4747 }, { "epoch": 0.7034074074074074, "grad_norm": 1.3090217113494873, "learning_rate": 5.951074870274278e-05, "loss": 1.1631, "step": 4748 }, { "epoch": 0.7035555555555556, "grad_norm": 1.3829586505889893, "learning_rate": 5.948109710896961e-05, "loss": 0.7478, "step": 4749 }, { "epoch": 0.7037037037037037, "grad_norm": 1.5770014524459839, "learning_rate": 5.945144551519645e-05, "loss": 0.7757, "step": 4750 }, { "epoch": 0.7038518518518518, "grad_norm": 1.3989028930664062, "learning_rate": 5.942179392142327e-05, "loss": 0.7791, "step": 4751 }, { "epoch": 0.704, "grad_norm": 2.3870842456817627, "learning_rate": 5.9392142327650116e-05, "loss": 1.2263, "step": 4752 }, { "epoch": 0.7041481481481482, "grad_norm": 1.4271196126937866, "learning_rate": 5.9362490733876954e-05, "loss": 0.858, "step": 4753 }, { "epoch": 0.7042962962962963, "grad_norm": 1.4456055164337158, "learning_rate": 5.933283914010378e-05, "loss": 0.9463, "step": 4754 }, { "epoch": 0.7044444444444444, "grad_norm": 1.2765156030654907, "learning_rate": 5.9303187546330616e-05, "loss": 1.0271, "step": 4755 }, { "epoch": 0.7045925925925925, "grad_norm": 
1.2719509601593018, "learning_rate": 5.927353595255746e-05, "loss": 1.1208, "step": 4756 }, { "epoch": 0.7047407407407408, "grad_norm": 2.141507148742676, "learning_rate": 5.9243884358784285e-05, "loss": 1.0317, "step": 4757 }, { "epoch": 0.7048888888888889, "grad_norm": 2.1003317832946777, "learning_rate": 5.921423276501112e-05, "loss": 1.1045, "step": 4758 }, { "epoch": 0.705037037037037, "grad_norm": 1.7553337812423706, "learning_rate": 5.918458117123795e-05, "loss": 0.8747, "step": 4759 }, { "epoch": 0.7051851851851851, "grad_norm": 1.8500460386276245, "learning_rate": 5.915492957746479e-05, "loss": 0.8808, "step": 4760 }, { "epoch": 0.7053333333333334, "grad_norm": 2.0247552394866943, "learning_rate": 5.912527798369163e-05, "loss": 1.1156, "step": 4761 }, { "epoch": 0.7054814814814815, "grad_norm": 1.9151968955993652, "learning_rate": 5.909562638991846e-05, "loss": 0.9452, "step": 4762 }, { "epoch": 0.7056296296296296, "grad_norm": 1.699159026145935, "learning_rate": 5.90659747961453e-05, "loss": 0.9527, "step": 4763 }, { "epoch": 0.7057777777777777, "grad_norm": 1.6331653594970703, "learning_rate": 5.9036323202372135e-05, "loss": 0.89, "step": 4764 }, { "epoch": 0.705925925925926, "grad_norm": 2.28598690032959, "learning_rate": 5.9006671608598966e-05, "loss": 1.108, "step": 4765 }, { "epoch": 0.7060740740740741, "grad_norm": 2.5625808238983154, "learning_rate": 5.89770200148258e-05, "loss": 1.0557, "step": 4766 }, { "epoch": 0.7062222222222222, "grad_norm": 3.2963948249816895, "learning_rate": 5.894736842105263e-05, "loss": 1.1277, "step": 4767 }, { "epoch": 0.7063703703703703, "grad_norm": 1.650059461593628, "learning_rate": 5.8917716827279465e-05, "loss": 1.0941, "step": 4768 }, { "epoch": 0.7065185185185185, "grad_norm": 1.6791961193084717, "learning_rate": 5.888806523350631e-05, "loss": 1.2284, "step": 4769 }, { "epoch": 0.7066666666666667, "grad_norm": 1.6996461153030396, "learning_rate": 5.8858413639733134e-05, "loss": 0.8496, "step": 4770 }, { "epoch": 
0.7068148148148148, "grad_norm": 1.7521129846572876, "learning_rate": 5.882876204595997e-05, "loss": 0.9027, "step": 4771 }, { "epoch": 0.706962962962963, "grad_norm": 1.7088426351547241, "learning_rate": 5.879911045218681e-05, "loss": 1.1444, "step": 4772 }, { "epoch": 0.7071111111111111, "grad_norm": 3.168114423751831, "learning_rate": 5.876945885841364e-05, "loss": 1.1774, "step": 4773 }, { "epoch": 0.7072592592592593, "grad_norm": 1.2813169956207275, "learning_rate": 5.873980726464048e-05, "loss": 0.9264, "step": 4774 }, { "epoch": 0.7074074074074074, "grad_norm": 1.6186448335647583, "learning_rate": 5.8710155670867315e-05, "loss": 1.1366, "step": 4775 }, { "epoch": 0.7075555555555556, "grad_norm": 1.6248599290847778, "learning_rate": 5.8680504077094146e-05, "loss": 1.295, "step": 4776 }, { "epoch": 0.7077037037037037, "grad_norm": 1.871125340461731, "learning_rate": 5.8650852483320984e-05, "loss": 1.3185, "step": 4777 }, { "epoch": 0.7078518518518518, "grad_norm": 1.6283950805664062, "learning_rate": 5.8621200889547815e-05, "loss": 1.1021, "step": 4778 }, { "epoch": 0.708, "grad_norm": 2.1870861053466797, "learning_rate": 5.859154929577465e-05, "loss": 0.9308, "step": 4779 }, { "epoch": 0.7081481481481482, "grad_norm": 1.734559416770935, "learning_rate": 5.856189770200149e-05, "loss": 1.3067, "step": 4780 }, { "epoch": 0.7082962962962963, "grad_norm": 1.5416643619537354, "learning_rate": 5.8532246108228314e-05, "loss": 0.8081, "step": 4781 }, { "epoch": 0.7084444444444444, "grad_norm": 2.0428013801574707, "learning_rate": 5.850259451445516e-05, "loss": 0.9032, "step": 4782 }, { "epoch": 0.7085925925925926, "grad_norm": 1.3764290809631348, "learning_rate": 5.8472942920681997e-05, "loss": 0.8764, "step": 4783 }, { "epoch": 0.7087407407407408, "grad_norm": 1.7580024003982544, "learning_rate": 5.844329132690882e-05, "loss": 1.0703, "step": 4784 }, { "epoch": 0.7088888888888889, "grad_norm": 1.3402817249298096, "learning_rate": 5.841363973313566e-05, "loss": 
0.7716, "step": 4785 }, { "epoch": 0.709037037037037, "grad_norm": 1.4462758302688599, "learning_rate": 5.838398813936249e-05, "loss": 0.8883, "step": 4786 }, { "epoch": 0.7091851851851851, "grad_norm": 1.529545783996582, "learning_rate": 5.835433654558933e-05, "loss": 0.7961, "step": 4787 }, { "epoch": 0.7093333333333334, "grad_norm": 1.3867297172546387, "learning_rate": 5.8324684951816165e-05, "loss": 0.9369, "step": 4788 }, { "epoch": 0.7094814814814815, "grad_norm": 1.3662264347076416, "learning_rate": 5.8295033358042996e-05, "loss": 0.9777, "step": 4789 }, { "epoch": 0.7096296296296296, "grad_norm": 1.5619665384292603, "learning_rate": 5.826538176426983e-05, "loss": 0.9971, "step": 4790 }, { "epoch": 0.7097777777777777, "grad_norm": 1.6321423053741455, "learning_rate": 5.823573017049667e-05, "loss": 1.0788, "step": 4791 }, { "epoch": 0.709925925925926, "grad_norm": 1.869868278503418, "learning_rate": 5.82060785767235e-05, "loss": 1.0693, "step": 4792 }, { "epoch": 0.7100740740740741, "grad_norm": 3.2114758491516113, "learning_rate": 5.817642698295034e-05, "loss": 1.0596, "step": 4793 }, { "epoch": 0.7102222222222222, "grad_norm": 1.7647440433502197, "learning_rate": 5.8146775389177164e-05, "loss": 1.1664, "step": 4794 }, { "epoch": 0.7103703703703703, "grad_norm": 1.788858413696289, "learning_rate": 5.811712379540401e-05, "loss": 0.7981, "step": 4795 }, { "epoch": 0.7105185185185185, "grad_norm": 1.276236891746521, "learning_rate": 5.8087472201630846e-05, "loss": 0.952, "step": 4796 }, { "epoch": 0.7106666666666667, "grad_norm": 1.2361873388290405, "learning_rate": 5.805782060785767e-05, "loss": 0.8385, "step": 4797 }, { "epoch": 0.7108148148148148, "grad_norm": 1.7445262670516968, "learning_rate": 5.802816901408451e-05, "loss": 1.2577, "step": 4798 }, { "epoch": 0.7109629629629629, "grad_norm": 3.5595784187316895, "learning_rate": 5.799851742031135e-05, "loss": 1.1013, "step": 4799 }, { "epoch": 0.7111111111111111, "grad_norm": 1.8771253824234009, 
"learning_rate": 5.7968865826538176e-05, "loss": 1.3519, "step": 4800 }, { "epoch": 0.7112592592592593, "grad_norm": 2.0168356895446777, "learning_rate": 5.7939214232765014e-05, "loss": 0.8908, "step": 4801 }, { "epoch": 0.7114074074074074, "grad_norm": 1.7869441509246826, "learning_rate": 5.7909562638991845e-05, "loss": 0.9439, "step": 4802 }, { "epoch": 0.7115555555555556, "grad_norm": 1.5168068408966064, "learning_rate": 5.787991104521868e-05, "loss": 1.1086, "step": 4803 }, { "epoch": 0.7117037037037037, "grad_norm": 1.6673247814178467, "learning_rate": 5.785025945144552e-05, "loss": 1.1323, "step": 4804 }, { "epoch": 0.7118518518518518, "grad_norm": 2.1463546752929688, "learning_rate": 5.782060785767235e-05, "loss": 0.9427, "step": 4805 }, { "epoch": 0.712, "grad_norm": 2.2018017768859863, "learning_rate": 5.779095626389919e-05, "loss": 0.7993, "step": 4806 }, { "epoch": 0.7121481481481482, "grad_norm": 2.1906869411468506, "learning_rate": 5.7761304670126026e-05, "loss": 0.9726, "step": 4807 }, { "epoch": 0.7122962962962963, "grad_norm": 1.5149343013763428, "learning_rate": 5.773165307635286e-05, "loss": 0.8789, "step": 4808 }, { "epoch": 0.7124444444444444, "grad_norm": 2.22306489944458, "learning_rate": 5.7702001482579695e-05, "loss": 1.1814, "step": 4809 }, { "epoch": 0.7125925925925926, "grad_norm": 1.7001208066940308, "learning_rate": 5.767234988880652e-05, "loss": 1.3092, "step": 4810 }, { "epoch": 0.7127407407407408, "grad_norm": 1.7391821146011353, "learning_rate": 5.764269829503336e-05, "loss": 1.0056, "step": 4811 }, { "epoch": 0.7128888888888889, "grad_norm": 1.6737895011901855, "learning_rate": 5.76130467012602e-05, "loss": 1.1267, "step": 4812 }, { "epoch": 0.713037037037037, "grad_norm": 1.665744662284851, "learning_rate": 5.7583395107487025e-05, "loss": 1.0543, "step": 4813 }, { "epoch": 0.7131851851851851, "grad_norm": 1.5450365543365479, "learning_rate": 5.755374351371386e-05, "loss": 0.9675, "step": 4814 }, { "epoch": 0.7133333333333334, 
"grad_norm": 2.015901565551758, "learning_rate": 5.75240919199407e-05, "loss": 1.0007, "step": 4815 }, { "epoch": 0.7134814814814815, "grad_norm": 1.0717864036560059, "learning_rate": 5.749444032616753e-05, "loss": 1.3662, "step": 4816 }, { "epoch": 0.7136296296296296, "grad_norm": 1.3023358583450317, "learning_rate": 5.746478873239437e-05, "loss": 0.8571, "step": 4817 }, { "epoch": 0.7137777777777777, "grad_norm": 2.133415460586548, "learning_rate": 5.743513713862121e-05, "loss": 1.063, "step": 4818 }, { "epoch": 0.713925925925926, "grad_norm": 3.2209153175354004, "learning_rate": 5.740548554484804e-05, "loss": 0.9373, "step": 4819 }, { "epoch": 0.7140740740740741, "grad_norm": 1.4295662641525269, "learning_rate": 5.7375833951074876e-05, "loss": 0.9626, "step": 4820 }, { "epoch": 0.7142222222222222, "grad_norm": 1.9351325035095215, "learning_rate": 5.7346182357301707e-05, "loss": 0.9861, "step": 4821 }, { "epoch": 0.7143703703703703, "grad_norm": 1.9480867385864258, "learning_rate": 5.7316530763528544e-05, "loss": 0.8906, "step": 4822 }, { "epoch": 0.7145185185185186, "grad_norm": 2.4652979373931885, "learning_rate": 5.728687916975538e-05, "loss": 1.1842, "step": 4823 }, { "epoch": 0.7146666666666667, "grad_norm": 1.3195927143096924, "learning_rate": 5.7257227575982206e-05, "loss": 1.0483, "step": 4824 }, { "epoch": 0.7148148148148148, "grad_norm": 2.3855903148651123, "learning_rate": 5.722757598220905e-05, "loss": 1.0235, "step": 4825 }, { "epoch": 0.7149629629629629, "grad_norm": 2.136852979660034, "learning_rate": 5.719792438843589e-05, "loss": 0.9468, "step": 4826 }, { "epoch": 0.7151111111111111, "grad_norm": 2.072896718978882, "learning_rate": 5.716827279466271e-05, "loss": 0.9382, "step": 4827 }, { "epoch": 0.7152592592592593, "grad_norm": 1.2670269012451172, "learning_rate": 5.713862120088955e-05, "loss": 0.927, "step": 4828 }, { "epoch": 0.7154074074074074, "grad_norm": 3.4418578147888184, "learning_rate": 5.710896960711638e-05, "loss": 0.8586, "step": 
4829 }, { "epoch": 0.7155555555555555, "grad_norm": 2.2086219787597656, "learning_rate": 5.707931801334322e-05, "loss": 1.1675, "step": 4830 }, { "epoch": 0.7157037037037037, "grad_norm": 1.7778724431991577, "learning_rate": 5.7049666419570056e-05, "loss": 1.0195, "step": 4831 }, { "epoch": 0.7158518518518519, "grad_norm": 1.8800742626190186, "learning_rate": 5.702001482579689e-05, "loss": 1.103, "step": 4832 }, { "epoch": 0.716, "grad_norm": 1.665619969367981, "learning_rate": 5.6990363232023725e-05, "loss": 1.0806, "step": 4833 }, { "epoch": 0.7161481481481482, "grad_norm": 1.8878470659255981, "learning_rate": 5.696071163825056e-05, "loss": 0.9141, "step": 4834 }, { "epoch": 0.7162962962962963, "grad_norm": 2.0529701709747314, "learning_rate": 5.6931060044477393e-05, "loss": 0.9595, "step": 4835 }, { "epoch": 0.7164444444444444, "grad_norm": 1.6207361221313477, "learning_rate": 5.690140845070423e-05, "loss": 0.8366, "step": 4836 }, { "epoch": 0.7165925925925926, "grad_norm": 2.2214808464050293, "learning_rate": 5.6871756856931055e-05, "loss": 1.1933, "step": 4837 }, { "epoch": 0.7167407407407408, "grad_norm": 1.9758164882659912, "learning_rate": 5.68421052631579e-05, "loss": 1.0578, "step": 4838 }, { "epoch": 0.7168888888888889, "grad_norm": 2.119447946548462, "learning_rate": 5.681245366938474e-05, "loss": 0.7454, "step": 4839 }, { "epoch": 0.717037037037037, "grad_norm": 3.2046563625335693, "learning_rate": 5.678280207561156e-05, "loss": 1.031, "step": 4840 }, { "epoch": 0.7171851851851851, "grad_norm": 1.210294246673584, "learning_rate": 5.67531504818384e-05, "loss": 1.1647, "step": 4841 }, { "epoch": 0.7173333333333334, "grad_norm": 1.4997482299804688, "learning_rate": 5.6723498888065244e-05, "loss": 1.0107, "step": 4842 }, { "epoch": 0.7174814814814815, "grad_norm": 1.4815231561660767, "learning_rate": 5.669384729429207e-05, "loss": 0.9828, "step": 4843 }, { "epoch": 0.7176296296296296, "grad_norm": 7.08305549621582, "learning_rate": 5.6664195700518906e-05, 
"loss": 1.1005, "step": 4844 }, { "epoch": 0.7177777777777777, "grad_norm": 1.646203875541687, "learning_rate": 5.6634544106745736e-05, "loss": 0.9126, "step": 4845 }, { "epoch": 0.717925925925926, "grad_norm": 2.0197436809539795, "learning_rate": 5.6604892512972574e-05, "loss": 0.8148, "step": 4846 }, { "epoch": 0.7180740740740741, "grad_norm": 2.079904794692993, "learning_rate": 5.657524091919941e-05, "loss": 1.1894, "step": 4847 }, { "epoch": 0.7182222222222222, "grad_norm": 2.2810299396514893, "learning_rate": 5.654558932542624e-05, "loss": 0.9773, "step": 4848 }, { "epoch": 0.7183703703703703, "grad_norm": 1.8330743312835693, "learning_rate": 5.651593773165308e-05, "loss": 1.3854, "step": 4849 }, { "epoch": 0.7185185185185186, "grad_norm": 1.503127098083496, "learning_rate": 5.648628613787992e-05, "loss": 0.8601, "step": 4850 }, { "epoch": 0.7186666666666667, "grad_norm": 2.749389410018921, "learning_rate": 5.645663454410675e-05, "loss": 0.9774, "step": 4851 }, { "epoch": 0.7188148148148148, "grad_norm": 1.6506550312042236, "learning_rate": 5.642698295033359e-05, "loss": 0.8976, "step": 4852 }, { "epoch": 0.7189629629629629, "grad_norm": 1.4076423645019531, "learning_rate": 5.6397331356560424e-05, "loss": 1.0072, "step": 4853 }, { "epoch": 0.7191111111111111, "grad_norm": 4.1458892822265625, "learning_rate": 5.636767976278725e-05, "loss": 0.7843, "step": 4854 }, { "epoch": 0.7192592592592593, "grad_norm": 1.7543416023254395, "learning_rate": 5.633802816901409e-05, "loss": 0.8265, "step": 4855 }, { "epoch": 0.7194074074074074, "grad_norm": 3.1243369579315186, "learning_rate": 5.630837657524092e-05, "loss": 1.2424, "step": 4856 }, { "epoch": 0.7195555555555555, "grad_norm": 1.600679636001587, "learning_rate": 5.6278724981467755e-05, "loss": 1.0629, "step": 4857 }, { "epoch": 0.7197037037037037, "grad_norm": 1.5085700750350952, "learning_rate": 5.624907338769459e-05, "loss": 1.0591, "step": 4858 }, { "epoch": 0.7198518518518519, "grad_norm": 1.312741994857788, 
"learning_rate": 5.621942179392142e-05, "loss": 0.9902, "step": 4859 }, { "epoch": 0.72, "grad_norm": 1.5295848846435547, "learning_rate": 5.618977020014826e-05, "loss": 1.2123, "step": 4860 }, { "epoch": 0.7201481481481481, "grad_norm": 1.4847466945648193, "learning_rate": 5.61601186063751e-05, "loss": 1.0278, "step": 4861 }, { "epoch": 0.7202962962962963, "grad_norm": 2.524733781814575, "learning_rate": 5.613046701260193e-05, "loss": 0.8715, "step": 4862 }, { "epoch": 0.7204444444444444, "grad_norm": 2.0416817665100098, "learning_rate": 5.610081541882877e-05, "loss": 0.8527, "step": 4863 }, { "epoch": 0.7205925925925926, "grad_norm": 1.7584664821624756, "learning_rate": 5.60711638250556e-05, "loss": 1.1136, "step": 4864 }, { "epoch": 0.7207407407407408, "grad_norm": 1.4379719495773315, "learning_rate": 5.6041512231282436e-05, "loss": 0.9294, "step": 4865 }, { "epoch": 0.7208888888888889, "grad_norm": 1.8781688213348389, "learning_rate": 5.6011860637509274e-05, "loss": 0.9797, "step": 4866 }, { "epoch": 0.721037037037037, "grad_norm": 1.8690986633300781, "learning_rate": 5.59822090437361e-05, "loss": 1.0365, "step": 4867 }, { "epoch": 0.7211851851851852, "grad_norm": 1.4605828523635864, "learning_rate": 5.595255744996294e-05, "loss": 0.9485, "step": 4868 }, { "epoch": 0.7213333333333334, "grad_norm": 2.2668371200561523, "learning_rate": 5.592290585618978e-05, "loss": 1.24, "step": 4869 }, { "epoch": 0.7214814814814815, "grad_norm": 1.3561054468154907, "learning_rate": 5.5893254262416604e-05, "loss": 1.0381, "step": 4870 }, { "epoch": 0.7216296296296296, "grad_norm": 1.8758090734481812, "learning_rate": 5.586360266864344e-05, "loss": 0.8211, "step": 4871 }, { "epoch": 0.7217777777777777, "grad_norm": 1.7644171714782715, "learning_rate": 5.583395107487027e-05, "loss": 0.8471, "step": 4872 }, { "epoch": 0.721925925925926, "grad_norm": 1.2087984085083008, "learning_rate": 5.580429948109711e-05, "loss": 1.0755, "step": 4873 }, { "epoch": 0.7220740740740741, 
"grad_norm": 2.2735400199890137, "learning_rate": 5.577464788732395e-05, "loss": 1.2577, "step": 4874 }, { "epoch": 0.7222222222222222, "grad_norm": 1.866197109222412, "learning_rate": 5.574499629355078e-05, "loss": 1.0508, "step": 4875 }, { "epoch": 0.7223703703703703, "grad_norm": 2.100440502166748, "learning_rate": 5.5715344699777617e-05, "loss": 1.067, "step": 4876 }, { "epoch": 0.7225185185185186, "grad_norm": 1.6745151281356812, "learning_rate": 5.5685693106004454e-05, "loss": 1.0073, "step": 4877 }, { "epoch": 0.7226666666666667, "grad_norm": 2.121823787689209, "learning_rate": 5.5656041512231285e-05, "loss": 0.9013, "step": 4878 }, { "epoch": 0.7228148148148148, "grad_norm": 1.6000807285308838, "learning_rate": 5.562638991845812e-05, "loss": 0.971, "step": 4879 }, { "epoch": 0.7229629629629629, "grad_norm": 2.799863576889038, "learning_rate": 5.559673832468495e-05, "loss": 1.2896, "step": 4880 }, { "epoch": 0.7231111111111111, "grad_norm": 1.652538776397705, "learning_rate": 5.556708673091179e-05, "loss": 1.116, "step": 4881 }, { "epoch": 0.7232592592592593, "grad_norm": 2.3919286727905273, "learning_rate": 5.553743513713863e-05, "loss": 1.0388, "step": 4882 }, { "epoch": 0.7234074074074074, "grad_norm": 2.370417594909668, "learning_rate": 5.550778354336545e-05, "loss": 0.9581, "step": 4883 }, { "epoch": 0.7235555555555555, "grad_norm": 1.2086527347564697, "learning_rate": 5.547813194959229e-05, "loss": 0.8202, "step": 4884 }, { "epoch": 0.7237037037037037, "grad_norm": 1.8177130222320557, "learning_rate": 5.5448480355819135e-05, "loss": 1.1192, "step": 4885 }, { "epoch": 0.7238518518518519, "grad_norm": 6.591041088104248, "learning_rate": 5.541882876204596e-05, "loss": 1.0015, "step": 4886 }, { "epoch": 0.724, "grad_norm": 2.110539436340332, "learning_rate": 5.53891771682728e-05, "loss": 0.9016, "step": 4887 }, { "epoch": 0.7241481481481481, "grad_norm": 3.109020948410034, "learning_rate": 5.535952557449963e-05, "loss": 1.0737, "step": 4888 }, { "epoch": 
0.7242962962962963, "grad_norm": 1.3220057487487793, "learning_rate": 5.5329873980726466e-05, "loss": 0.973, "step": 4889 }, { "epoch": 0.7244444444444444, "grad_norm": 1.1820323467254639, "learning_rate": 5.5300222386953303e-05, "loss": 1.0008, "step": 4890 }, { "epoch": 0.7245925925925926, "grad_norm": 2.1216866970062256, "learning_rate": 5.5270570793180134e-05, "loss": 0.7249, "step": 4891 }, { "epoch": 0.7247407407407407, "grad_norm": 1.6888575553894043, "learning_rate": 5.524091919940697e-05, "loss": 1.4341, "step": 4892 }, { "epoch": 0.7248888888888889, "grad_norm": 2.9108896255493164, "learning_rate": 5.521126760563381e-05, "loss": 0.9926, "step": 4893 }, { "epoch": 0.725037037037037, "grad_norm": 1.7559731006622314, "learning_rate": 5.518161601186064e-05, "loss": 1.1029, "step": 4894 }, { "epoch": 0.7251851851851852, "grad_norm": 1.4509018659591675, "learning_rate": 5.515196441808748e-05, "loss": 1.0668, "step": 4895 }, { "epoch": 0.7253333333333334, "grad_norm": 1.8988780975341797, "learning_rate": 5.5122312824314316e-05, "loss": 1.0963, "step": 4896 }, { "epoch": 0.7254814814814815, "grad_norm": 3.0174896717071533, "learning_rate": 5.509266123054114e-05, "loss": 1.0151, "step": 4897 }, { "epoch": 0.7256296296296296, "grad_norm": 1.4957520961761475, "learning_rate": 5.5063009636767985e-05, "loss": 0.9085, "step": 4898 }, { "epoch": 0.7257777777777777, "grad_norm": 2.035078525543213, "learning_rate": 5.503335804299481e-05, "loss": 0.8713, "step": 4899 }, { "epoch": 0.725925925925926, "grad_norm": 0.9935551285743713, "learning_rate": 5.5003706449221646e-05, "loss": 1.0032, "step": 4900 }, { "epoch": 0.7260740740740741, "grad_norm": 2.0780887603759766, "learning_rate": 5.4974054855448484e-05, "loss": 1.0751, "step": 4901 }, { "epoch": 0.7262222222222222, "grad_norm": 1.6350312232971191, "learning_rate": 5.4944403261675315e-05, "loss": 1.1934, "step": 4902 }, { "epoch": 0.7263703703703703, "grad_norm": 2.609414577484131, "learning_rate": 5.491475166790215e-05, 
"loss": 1.1323, "step": 4903 }, { "epoch": 0.7265185185185186, "grad_norm": 1.2709413766860962, "learning_rate": 5.488510007412899e-05, "loss": 0.8645, "step": 4904 }, { "epoch": 0.7266666666666667, "grad_norm": 1.2477498054504395, "learning_rate": 5.485544848035582e-05, "loss": 1.012, "step": 4905 }, { "epoch": 0.7268148148148148, "grad_norm": 1.6356146335601807, "learning_rate": 5.482579688658266e-05, "loss": 1.3078, "step": 4906 }, { "epoch": 0.7269629629629629, "grad_norm": 1.3068426847457886, "learning_rate": 5.479614529280949e-05, "loss": 1.3083, "step": 4907 }, { "epoch": 0.7271111111111112, "grad_norm": 1.8441948890686035, "learning_rate": 5.476649369903633e-05, "loss": 1.1332, "step": 4908 }, { "epoch": 0.7272592592592593, "grad_norm": 2.111903429031372, "learning_rate": 5.4736842105263165e-05, "loss": 1.0196, "step": 4909 }, { "epoch": 0.7274074074074074, "grad_norm": 1.3523560762405396, "learning_rate": 5.470719051148999e-05, "loss": 1.0159, "step": 4910 }, { "epoch": 0.7275555555555555, "grad_norm": 2.855959892272949, "learning_rate": 5.4677538917716834e-05, "loss": 1.2127, "step": 4911 }, { "epoch": 0.7277037037037037, "grad_norm": 1.5005217790603638, "learning_rate": 5.464788732394367e-05, "loss": 1.0609, "step": 4912 }, { "epoch": 0.7278518518518519, "grad_norm": 1.4840768575668335, "learning_rate": 5.4618235730170496e-05, "loss": 0.9953, "step": 4913 }, { "epoch": 0.728, "grad_norm": 1.0789657831192017, "learning_rate": 5.458858413639733e-05, "loss": 1.1345, "step": 4914 }, { "epoch": 0.7281481481481481, "grad_norm": 2.3142013549804688, "learning_rate": 5.4558932542624164e-05, "loss": 0.9493, "step": 4915 }, { "epoch": 0.7282962962962963, "grad_norm": 4.36397647857666, "learning_rate": 5.4529280948851e-05, "loss": 0.88, "step": 4916 }, { "epoch": 0.7284444444444444, "grad_norm": 1.8086189031600952, "learning_rate": 5.449962935507784e-05, "loss": 1.2032, "step": 4917 }, { "epoch": 0.7285925925925926, "grad_norm": 1.501092553138733, "learning_rate": 
5.446997776130467e-05, "loss": 0.7264, "step": 4918 }, { "epoch": 0.7287407407407407, "grad_norm": 1.809766411781311, "learning_rate": 5.444032616753151e-05, "loss": 1.1109, "step": 4919 }, { "epoch": 0.7288888888888889, "grad_norm": 1.3491125106811523, "learning_rate": 5.4410674573758346e-05, "loss": 0.7823, "step": 4920 }, { "epoch": 0.729037037037037, "grad_norm": 1.5708109140396118, "learning_rate": 5.438102297998518e-05, "loss": 0.829, "step": 4921 }, { "epoch": 0.7291851851851852, "grad_norm": 2.049161911010742, "learning_rate": 5.4351371386212014e-05, "loss": 0.8991, "step": 4922 }, { "epoch": 0.7293333333333333, "grad_norm": 1.0794745683670044, "learning_rate": 5.432171979243884e-05, "loss": 0.7913, "step": 4923 }, { "epoch": 0.7294814814814815, "grad_norm": 2.5652902126312256, "learning_rate": 5.429206819866568e-05, "loss": 1.0493, "step": 4924 }, { "epoch": 0.7296296296296296, "grad_norm": 2.240591287612915, "learning_rate": 5.426241660489252e-05, "loss": 1.0168, "step": 4925 }, { "epoch": 0.7297777777777777, "grad_norm": 1.9119607210159302, "learning_rate": 5.4232765011119345e-05, "loss": 1.2101, "step": 4926 }, { "epoch": 0.729925925925926, "grad_norm": 1.4905898571014404, "learning_rate": 5.420311341734618e-05, "loss": 0.9353, "step": 4927 }, { "epoch": 0.7300740740740741, "grad_norm": 2.004554510116577, "learning_rate": 5.417346182357303e-05, "loss": 1.2032, "step": 4928 }, { "epoch": 0.7302222222222222, "grad_norm": 1.0485835075378418, "learning_rate": 5.414381022979985e-05, "loss": 0.8296, "step": 4929 }, { "epoch": 0.7303703703703703, "grad_norm": 2.900482654571533, "learning_rate": 5.411415863602669e-05, "loss": 0.9702, "step": 4930 }, { "epoch": 0.7305185185185186, "grad_norm": 1.6167924404144287, "learning_rate": 5.408450704225352e-05, "loss": 1.0082, "step": 4931 }, { "epoch": 0.7306666666666667, "grad_norm": 2.0141680240631104, "learning_rate": 5.405485544848036e-05, "loss": 1.0995, "step": 4932 }, { "epoch": 0.7308148148148148, "grad_norm": 
2.588238000869751, "learning_rate": 5.4025203854707195e-05, "loss": 1.202, "step": 4933 }, { "epoch": 0.7309629629629629, "grad_norm": 1.3934060335159302, "learning_rate": 5.3995552260934026e-05, "loss": 1.0188, "step": 4934 }, { "epoch": 0.7311111111111112, "grad_norm": 1.9247220754623413, "learning_rate": 5.3965900667160864e-05, "loss": 0.8479, "step": 4935 }, { "epoch": 0.7312592592592593, "grad_norm": 1.7972164154052734, "learning_rate": 5.39362490733877e-05, "loss": 1.2798, "step": 4936 }, { "epoch": 0.7314074074074074, "grad_norm": 1.7413814067840576, "learning_rate": 5.390659747961453e-05, "loss": 1.172, "step": 4937 }, { "epoch": 0.7315555555555555, "grad_norm": 1.755491852760315, "learning_rate": 5.387694588584137e-05, "loss": 0.814, "step": 4938 }, { "epoch": 0.7317037037037037, "grad_norm": 1.4138622283935547, "learning_rate": 5.384729429206821e-05, "loss": 1.023, "step": 4939 }, { "epoch": 0.7318518518518519, "grad_norm": 2.6964871883392334, "learning_rate": 5.381764269829503e-05, "loss": 1.2712, "step": 4940 }, { "epoch": 0.732, "grad_norm": 1.6311568021774292, "learning_rate": 5.3787991104521876e-05, "loss": 0.9323, "step": 4941 }, { "epoch": 0.7321481481481481, "grad_norm": 2.1600868701934814, "learning_rate": 5.37583395107487e-05, "loss": 0.9687, "step": 4942 }, { "epoch": 0.7322962962962963, "grad_norm": 2.4418132305145264, "learning_rate": 5.372868791697554e-05, "loss": 0.9305, "step": 4943 }, { "epoch": 0.7324444444444445, "grad_norm": 2.5369813442230225, "learning_rate": 5.3699036323202376e-05, "loss": 0.8592, "step": 4944 }, { "epoch": 0.7325925925925926, "grad_norm": 2.060513734817505, "learning_rate": 5.366938472942921e-05, "loss": 0.8562, "step": 4945 }, { "epoch": 0.7327407407407407, "grad_norm": 1.7541512250900269, "learning_rate": 5.3639733135656044e-05, "loss": 1.1362, "step": 4946 }, { "epoch": 0.7328888888888889, "grad_norm": 1.4201487302780151, "learning_rate": 5.361008154188288e-05, "loss": 1.021, "step": 4947 }, { "epoch": 
0.733037037037037, "grad_norm": 1.720343828201294, "learning_rate": 5.358042994810971e-05, "loss": 0.8417, "step": 4948 }, { "epoch": 0.7331851851851852, "grad_norm": 2.402940034866333, "learning_rate": 5.355077835433655e-05, "loss": 0.8907, "step": 4949 }, { "epoch": 0.7333333333333333, "grad_norm": 1.6902525424957275, "learning_rate": 5.352112676056338e-05, "loss": 1.0181, "step": 4950 }, { "epoch": 0.7334814814814815, "grad_norm": 1.8172622919082642, "learning_rate": 5.349147516679022e-05, "loss": 1.1631, "step": 4951 }, { "epoch": 0.7336296296296296, "grad_norm": 1.879783272743225, "learning_rate": 5.346182357301706e-05, "loss": 0.9498, "step": 4952 }, { "epoch": 0.7337777777777778, "grad_norm": 1.7294676303863525, "learning_rate": 5.343217197924388e-05, "loss": 1.0644, "step": 4953 }, { "epoch": 0.7339259259259259, "grad_norm": 1.4138739109039307, "learning_rate": 5.3402520385470725e-05, "loss": 0.9301, "step": 4954 }, { "epoch": 0.7340740740740741, "grad_norm": 1.3349261283874512, "learning_rate": 5.337286879169756e-05, "loss": 0.9506, "step": 4955 }, { "epoch": 0.7342222222222222, "grad_norm": 2.168971061706543, "learning_rate": 5.334321719792439e-05, "loss": 0.9874, "step": 4956 }, { "epoch": 0.7343703703703703, "grad_norm": 1.384792685508728, "learning_rate": 5.3313565604151225e-05, "loss": 1.1012, "step": 4957 }, { "epoch": 0.7345185185185186, "grad_norm": 2.2706778049468994, "learning_rate": 5.3283914010378056e-05, "loss": 1.1663, "step": 4958 }, { "epoch": 0.7346666666666667, "grad_norm": 1.9906309843063354, "learning_rate": 5.3254262416604894e-05, "loss": 0.9603, "step": 4959 }, { "epoch": 0.7348148148148148, "grad_norm": 1.9914016723632812, "learning_rate": 5.322461082283173e-05, "loss": 1.12, "step": 4960 }, { "epoch": 0.7349629629629629, "grad_norm": 2.031702995300293, "learning_rate": 5.319495922905856e-05, "loss": 0.9065, "step": 4961 }, { "epoch": 0.7351111111111112, "grad_norm": 2.2742061614990234, "learning_rate": 5.31653076352854e-05, "loss": 
1.196, "step": 4962 }, { "epoch": 0.7352592592592593, "grad_norm": 1.26792573928833, "learning_rate": 5.313565604151224e-05, "loss": 1.3298, "step": 4963 }, { "epoch": 0.7354074074074074, "grad_norm": 1.838092565536499, "learning_rate": 5.310600444773907e-05, "loss": 1.0595, "step": 4964 }, { "epoch": 0.7355555555555555, "grad_norm": 1.5691438913345337, "learning_rate": 5.3076352853965906e-05, "loss": 1.1053, "step": 4965 }, { "epoch": 0.7357037037037037, "grad_norm": 1.9591392278671265, "learning_rate": 5.304670126019273e-05, "loss": 1.0963, "step": 4966 }, { "epoch": 0.7358518518518519, "grad_norm": 2.119932174682617, "learning_rate": 5.3017049666419575e-05, "loss": 0.9933, "step": 4967 }, { "epoch": 0.736, "grad_norm": 1.7089087963104248, "learning_rate": 5.298739807264641e-05, "loss": 0.8262, "step": 4968 }, { "epoch": 0.7361481481481481, "grad_norm": 1.9580737352371216, "learning_rate": 5.2957746478873237e-05, "loss": 0.9954, "step": 4969 }, { "epoch": 0.7362962962962963, "grad_norm": 2.169576406478882, "learning_rate": 5.2928094885100074e-05, "loss": 0.9911, "step": 4970 }, { "epoch": 0.7364444444444445, "grad_norm": 1.5147404670715332, "learning_rate": 5.289844329132692e-05, "loss": 1.0696, "step": 4971 }, { "epoch": 0.7365925925925926, "grad_norm": 1.6818712949752808, "learning_rate": 5.286879169755374e-05, "loss": 0.8472, "step": 4972 }, { "epoch": 0.7367407407407407, "grad_norm": 1.5663869380950928, "learning_rate": 5.283914010378058e-05, "loss": 0.9923, "step": 4973 }, { "epoch": 0.7368888888888889, "grad_norm": 2.421029806137085, "learning_rate": 5.280948851000741e-05, "loss": 1.0343, "step": 4974 }, { "epoch": 0.737037037037037, "grad_norm": 3.366656541824341, "learning_rate": 5.277983691623425e-05, "loss": 1.1295, "step": 4975 }, { "epoch": 0.7371851851851852, "grad_norm": 1.3032071590423584, "learning_rate": 5.275018532246109e-05, "loss": 0.9873, "step": 4976 }, { "epoch": 0.7373333333333333, "grad_norm": 3.0822577476501465, "learning_rate": 
5.272053372868792e-05, "loss": 0.773, "step": 4977 }, { "epoch": 0.7374814814814815, "grad_norm": 1.7527302503585815, "learning_rate": 5.2690882134914755e-05, "loss": 1.1594, "step": 4978 }, { "epoch": 0.7376296296296296, "grad_norm": 1.6602507829666138, "learning_rate": 5.266123054114159e-05, "loss": 0.9835, "step": 4979 }, { "epoch": 0.7377777777777778, "grad_norm": 1.6826786994934082, "learning_rate": 5.2631578947368424e-05, "loss": 1.1101, "step": 4980 }, { "epoch": 0.7379259259259259, "grad_norm": 1.423226237297058, "learning_rate": 5.260192735359526e-05, "loss": 1.0795, "step": 4981 }, { "epoch": 0.7380740740740741, "grad_norm": 1.8139948844909668, "learning_rate": 5.25722757598221e-05, "loss": 0.901, "step": 4982 }, { "epoch": 0.7382222222222222, "grad_norm": 1.323270320892334, "learning_rate": 5.2542624166048923e-05, "loss": 0.9716, "step": 4983 }, { "epoch": 0.7383703703703703, "grad_norm": 1.5968650579452515, "learning_rate": 5.251297257227577e-05, "loss": 1.0551, "step": 4984 }, { "epoch": 0.7385185185185185, "grad_norm": 4.135926723480225, "learning_rate": 5.248332097850259e-05, "loss": 0.9754, "step": 4985 }, { "epoch": 0.7386666666666667, "grad_norm": 1.9808924198150635, "learning_rate": 5.245366938472943e-05, "loss": 1.1565, "step": 4986 }, { "epoch": 0.7388148148148148, "grad_norm": 3.052340030670166, "learning_rate": 5.242401779095627e-05, "loss": 1.1365, "step": 4987 }, { "epoch": 0.7389629629629629, "grad_norm": 1.6291524171829224, "learning_rate": 5.23943661971831e-05, "loss": 0.9795, "step": 4988 }, { "epoch": 0.7391111111111112, "grad_norm": 2.475485324859619, "learning_rate": 5.2364714603409936e-05, "loss": 1.048, "step": 4989 }, { "epoch": 0.7392592592592593, "grad_norm": 1.5522637367248535, "learning_rate": 5.2335063009636774e-05, "loss": 1.0012, "step": 4990 }, { "epoch": 0.7394074074074074, "grad_norm": 1.6295976638793945, "learning_rate": 5.2305411415863605e-05, "loss": 1.1546, "step": 4991 }, { "epoch": 0.7395555555555555, "grad_norm": 
1.7080334424972534, "learning_rate": 5.227575982209044e-05, "loss": 1.04, "step": 4992 }, { "epoch": 0.7397037037037038, "grad_norm": 2.2442939281463623, "learning_rate": 5.224610822831727e-05, "loss": 0.9557, "step": 4993 }, { "epoch": 0.7398518518518519, "grad_norm": 1.4062635898590088, "learning_rate": 5.221645663454411e-05, "loss": 0.9776, "step": 4994 }, { "epoch": 0.74, "grad_norm": 1.5093679428100586, "learning_rate": 5.218680504077095e-05, "loss": 1.0569, "step": 4995 }, { "epoch": 0.7401481481481481, "grad_norm": 5.3266496658325195, "learning_rate": 5.215715344699777e-05, "loss": 1.0382, "step": 4996 }, { "epoch": 0.7402962962962963, "grad_norm": 1.4864649772644043, "learning_rate": 5.212750185322462e-05, "loss": 1.3798, "step": 4997 }, { "epoch": 0.7404444444444445, "grad_norm": 2.1344094276428223, "learning_rate": 5.2097850259451455e-05, "loss": 0.8893, "step": 4998 }, { "epoch": 0.7405925925925926, "grad_norm": 2.4697370529174805, "learning_rate": 5.206819866567828e-05, "loss": 0.9593, "step": 4999 }, { "epoch": 0.7407407407407407, "grad_norm": 3.9260923862457275, "learning_rate": 5.203854707190512e-05, "loss": 1.1272, "step": 5000 }, { "epoch": 0.7408888888888889, "grad_norm": 1.388735055923462, "learning_rate": 5.200889547813195e-05, "loss": 0.6663, "step": 5001 }, { "epoch": 0.741037037037037, "grad_norm": 1.714282512664795, "learning_rate": 5.1979243884358785e-05, "loss": 1.2451, "step": 5002 }, { "epoch": 0.7411851851851852, "grad_norm": 2.0393123626708984, "learning_rate": 5.194959229058562e-05, "loss": 0.9522, "step": 5003 }, { "epoch": 0.7413333333333333, "grad_norm": 1.7348084449768066, "learning_rate": 5.1919940696812454e-05, "loss": 1.0637, "step": 5004 }, { "epoch": 0.7414814814814815, "grad_norm": 3.50278639793396, "learning_rate": 5.189028910303929e-05, "loss": 1.1513, "step": 5005 }, { "epoch": 0.7416296296296296, "grad_norm": 3.855315923690796, "learning_rate": 5.186063750926613e-05, "loss": 0.9822, "step": 5006 }, { "epoch": 
0.7417777777777778, "grad_norm": 4.46833610534668, "learning_rate": 5.183098591549296e-05, "loss": 0.9115, "step": 5007 }, { "epoch": 0.7419259259259259, "grad_norm": 1.7555344104766846, "learning_rate": 5.18013343217198e-05, "loss": 1.2394, "step": 5008 }, { "epoch": 0.7420740740740741, "grad_norm": 2.125119924545288, "learning_rate": 5.177168272794662e-05, "loss": 1.2223, "step": 5009 }, { "epoch": 0.7422222222222222, "grad_norm": 3.150367021560669, "learning_rate": 5.1742031134173466e-05, "loss": 0.9966, "step": 5010 }, { "epoch": 0.7423703703703703, "grad_norm": 1.2428815364837646, "learning_rate": 5.1712379540400304e-05, "loss": 1.015, "step": 5011 }, { "epoch": 0.7425185185185185, "grad_norm": 1.4209166765213013, "learning_rate": 5.168272794662713e-05, "loss": 0.9191, "step": 5012 }, { "epoch": 0.7426666666666667, "grad_norm": 2.137342691421509, "learning_rate": 5.1653076352853966e-05, "loss": 1.1639, "step": 5013 }, { "epoch": 0.7428148148148148, "grad_norm": 1.7810817956924438, "learning_rate": 5.162342475908081e-05, "loss": 1.041, "step": 5014 }, { "epoch": 0.7429629629629629, "grad_norm": 1.5036296844482422, "learning_rate": 5.1593773165307634e-05, "loss": 1.0331, "step": 5015 }, { "epoch": 0.7431111111111111, "grad_norm": 2.7676284313201904, "learning_rate": 5.156412157153447e-05, "loss": 0.9642, "step": 5016 }, { "epoch": 0.7432592592592593, "grad_norm": 1.8751683235168457, "learning_rate": 5.15344699777613e-05, "loss": 1.1439, "step": 5017 }, { "epoch": 0.7434074074074074, "grad_norm": 1.4921735525131226, "learning_rate": 5.150481838398814e-05, "loss": 1.1156, "step": 5018 }, { "epoch": 0.7435555555555555, "grad_norm": 2.4699594974517822, "learning_rate": 5.147516679021498e-05, "loss": 1.0008, "step": 5019 }, { "epoch": 0.7437037037037038, "grad_norm": 2.1117660999298096, "learning_rate": 5.144551519644181e-05, "loss": 1.1055, "step": 5020 }, { "epoch": 0.7438518518518519, "grad_norm": 1.7587753534317017, "learning_rate": 5.141586360266865e-05, "loss": 
1.1196, "step": 5021 }, { "epoch": 0.744, "grad_norm": 3.3482227325439453, "learning_rate": 5.1386212008895485e-05, "loss": 1.0338, "step": 5022 }, { "epoch": 0.7441481481481481, "grad_norm": 1.7586042881011963, "learning_rate": 5.1356560415122316e-05, "loss": 1.1843, "step": 5023 }, { "epoch": 0.7442962962962963, "grad_norm": 1.3716119527816772, "learning_rate": 5.132690882134915e-05, "loss": 0.7945, "step": 5024 }, { "epoch": 0.7444444444444445, "grad_norm": 2.298292636871338, "learning_rate": 5.129725722757599e-05, "loss": 1.1102, "step": 5025 }, { "epoch": 0.7445925925925926, "grad_norm": 3.0868923664093018, "learning_rate": 5.1267605633802815e-05, "loss": 1.0216, "step": 5026 }, { "epoch": 0.7447407407407407, "grad_norm": 6.296951770782471, "learning_rate": 5.123795404002966e-05, "loss": 0.9318, "step": 5027 }, { "epoch": 0.7448888888888889, "grad_norm": 1.2797033786773682, "learning_rate": 5.1208302446256484e-05, "loss": 0.7388, "step": 5028 }, { "epoch": 0.745037037037037, "grad_norm": 2.2225050926208496, "learning_rate": 5.117865085248332e-05, "loss": 1.038, "step": 5029 }, { "epoch": 0.7451851851851852, "grad_norm": 3.8540756702423096, "learning_rate": 5.114899925871016e-05, "loss": 1.445, "step": 5030 }, { "epoch": 0.7453333333333333, "grad_norm": 1.381791114807129, "learning_rate": 5.111934766493699e-05, "loss": 1.0029, "step": 5031 }, { "epoch": 0.7454814814814815, "grad_norm": 1.36771821975708, "learning_rate": 5.108969607116383e-05, "loss": 0.7636, "step": 5032 }, { "epoch": 0.7456296296296296, "grad_norm": 2.076235771179199, "learning_rate": 5.1060044477390665e-05, "loss": 0.7626, "step": 5033 }, { "epoch": 0.7457777777777778, "grad_norm": 1.1504572629928589, "learning_rate": 5.1030392883617496e-05, "loss": 0.925, "step": 5034 }, { "epoch": 0.7459259259259259, "grad_norm": 1.3281841278076172, "learning_rate": 5.1000741289844334e-05, "loss": 1.2024, "step": 5035 }, { "epoch": 0.7460740740740741, "grad_norm": 1.8246177434921265, "learning_rate": 
5.0971089696071165e-05, "loss": 1.3113, "step": 5036 }, { "epoch": 0.7462222222222222, "grad_norm": 2.447537660598755, "learning_rate": 5.0941438102298e-05, "loss": 1.0149, "step": 5037 }, { "epoch": 0.7463703703703704, "grad_norm": 1.6020809412002563, "learning_rate": 5.091178650852484e-05, "loss": 0.8735, "step": 5038 }, { "epoch": 0.7465185185185185, "grad_norm": 1.535174012184143, "learning_rate": 5.0882134914751664e-05, "loss": 1.0452, "step": 5039 }, { "epoch": 0.7466666666666667, "grad_norm": 1.951108694076538, "learning_rate": 5.085248332097851e-05, "loss": 1.1868, "step": 5040 }, { "epoch": 0.7468148148148148, "grad_norm": 3.4173529148101807, "learning_rate": 5.0822831727205346e-05, "loss": 0.9471, "step": 5041 }, { "epoch": 0.7469629629629629, "grad_norm": 1.3767805099487305, "learning_rate": 5.079318013343217e-05, "loss": 0.9748, "step": 5042 }, { "epoch": 0.7471111111111111, "grad_norm": 1.9710142612457275, "learning_rate": 5.076352853965901e-05, "loss": 0.9426, "step": 5043 }, { "epoch": 0.7472592592592593, "grad_norm": 1.3733680248260498, "learning_rate": 5.073387694588584e-05, "loss": 0.8834, "step": 5044 }, { "epoch": 0.7474074074074074, "grad_norm": 1.5866197347640991, "learning_rate": 5.070422535211268e-05, "loss": 1.1177, "step": 5045 }, { "epoch": 0.7475555555555555, "grad_norm": 1.3883211612701416, "learning_rate": 5.0674573758339515e-05, "loss": 1.0757, "step": 5046 }, { "epoch": 0.7477037037037036, "grad_norm": 1.562939167022705, "learning_rate": 5.0644922164566345e-05, "loss": 0.9694, "step": 5047 }, { "epoch": 0.7478518518518519, "grad_norm": 1.792899250984192, "learning_rate": 5.061527057079318e-05, "loss": 0.9668, "step": 5048 }, { "epoch": 0.748, "grad_norm": 1.1375911235809326, "learning_rate": 5.058561897702002e-05, "loss": 1.0665, "step": 5049 }, { "epoch": 0.7481481481481481, "grad_norm": 1.2301177978515625, "learning_rate": 5.055596738324685e-05, "loss": 1.1568, "step": 5050 }, { "epoch": 0.7482962962962963, "grad_norm": 
2.1505887508392334, "learning_rate": 5.052631578947369e-05, "loss": 1.0903, "step": 5051 }, { "epoch": 0.7484444444444445, "grad_norm": 1.9880132675170898, "learning_rate": 5.0496664195700514e-05, "loss": 0.9612, "step": 5052 }, { "epoch": 0.7485925925925926, "grad_norm": 1.5665372610092163, "learning_rate": 5.046701260192736e-05, "loss": 1.0377, "step": 5053 }, { "epoch": 0.7487407407407407, "grad_norm": 2.266237258911133, "learning_rate": 5.0437361008154196e-05, "loss": 1.2228, "step": 5054 }, { "epoch": 0.7488888888888889, "grad_norm": 2.041421890258789, "learning_rate": 5.040770941438102e-05, "loss": 1.0174, "step": 5055 }, { "epoch": 0.7490370370370371, "grad_norm": 1.5641827583312988, "learning_rate": 5.037805782060786e-05, "loss": 0.8587, "step": 5056 }, { "epoch": 0.7491851851851852, "grad_norm": 2.854626417160034, "learning_rate": 5.03484062268347e-05, "loss": 1.1644, "step": 5057 }, { "epoch": 0.7493333333333333, "grad_norm": 3.9217967987060547, "learning_rate": 5.0318754633061526e-05, "loss": 0.7622, "step": 5058 }, { "epoch": 0.7494814814814815, "grad_norm": 2.209564208984375, "learning_rate": 5.0289103039288364e-05, "loss": 0.9619, "step": 5059 }, { "epoch": 0.7496296296296296, "grad_norm": 1.823884129524231, "learning_rate": 5.02594514455152e-05, "loss": 1.0101, "step": 5060 }, { "epoch": 0.7497777777777778, "grad_norm": 2.5315563678741455, "learning_rate": 5.022979985174203e-05, "loss": 1.1403, "step": 5061 }, { "epoch": 0.7499259259259259, "grad_norm": 1.169373869895935, "learning_rate": 5.020014825796887e-05, "loss": 0.9472, "step": 5062 }, { "epoch": 0.7500740740740741, "grad_norm": 2.5863282680511475, "learning_rate": 5.01704966641957e-05, "loss": 1.276, "step": 5063 }, { "epoch": 0.7502222222222222, "grad_norm": 3.2997329235076904, "learning_rate": 5.014084507042254e-05, "loss": 0.9122, "step": 5064 }, { "epoch": 0.7503703703703704, "grad_norm": 1.424874186515808, "learning_rate": 5.0111193476649376e-05, "loss": 1.0295, "step": 5065 }, { 
"epoch": 0.7505185185185185, "grad_norm": 2.382889747619629, "learning_rate": 5.008154188287621e-05, "loss": 1.1297, "step": 5066 }, { "epoch": 0.7506666666666667, "grad_norm": 1.185091257095337, "learning_rate": 5.0051890289103045e-05, "loss": 0.8786, "step": 5067 }, { "epoch": 0.7508148148148148, "grad_norm": 2.00325083732605, "learning_rate": 5.002223869532988e-05, "loss": 1.042, "step": 5068 }, { "epoch": 0.7509629629629629, "grad_norm": 3.6229701042175293, "learning_rate": 4.999258710155671e-05, "loss": 0.9377, "step": 5069 }, { "epoch": 0.7511111111111111, "grad_norm": 1.0859620571136475, "learning_rate": 4.996293550778355e-05, "loss": 1.1823, "step": 5070 }, { "epoch": 0.7512592592592593, "grad_norm": 1.863409399986267, "learning_rate": 4.993328391401038e-05, "loss": 1.1228, "step": 5071 }, { "epoch": 0.7514074074074074, "grad_norm": 1.8272866010665894, "learning_rate": 4.990363232023721e-05, "loss": 1.0643, "step": 5072 }, { "epoch": 0.7515555555555555, "grad_norm": 1.7983300685882568, "learning_rate": 4.987398072646405e-05, "loss": 1.169, "step": 5073 }, { "epoch": 0.7517037037037037, "grad_norm": 2.3321971893310547, "learning_rate": 4.984432913269089e-05, "loss": 1.075, "step": 5074 }, { "epoch": 0.7518518518518519, "grad_norm": 1.8782232999801636, "learning_rate": 4.981467753891772e-05, "loss": 1.0492, "step": 5075 }, { "epoch": 0.752, "grad_norm": 2.8510990142822266, "learning_rate": 4.978502594514455e-05, "loss": 1.2358, "step": 5076 }, { "epoch": 0.7521481481481481, "grad_norm": 2.659990072250366, "learning_rate": 4.975537435137139e-05, "loss": 1.0693, "step": 5077 }, { "epoch": 0.7522962962962964, "grad_norm": 1.8440874814987183, "learning_rate": 4.9725722757598226e-05, "loss": 1.1364, "step": 5078 }, { "epoch": 0.7524444444444445, "grad_norm": 1.1493724584579468, "learning_rate": 4.9696071163825056e-05, "loss": 1.1714, "step": 5079 }, { "epoch": 0.7525925925925926, "grad_norm": 1.7375032901763916, "learning_rate": 4.9666419570051894e-05, "loss": 
1.1045, "step": 5080 }, { "epoch": 0.7527407407407407, "grad_norm": 2.885653257369995, "learning_rate": 4.9636767976278725e-05, "loss": 1.0792, "step": 5081 }, { "epoch": 0.7528888888888889, "grad_norm": 1.5603786706924438, "learning_rate": 4.960711638250556e-05, "loss": 0.9486, "step": 5082 }, { "epoch": 0.7530370370370371, "grad_norm": 1.6088101863861084, "learning_rate": 4.95774647887324e-05, "loss": 1.081, "step": 5083 }, { "epoch": 0.7531851851851852, "grad_norm": 2.091627359390259, "learning_rate": 4.954781319495923e-05, "loss": 0.9934, "step": 5084 }, { "epoch": 0.7533333333333333, "grad_norm": 1.691460371017456, "learning_rate": 4.951816160118606e-05, "loss": 1.2726, "step": 5085 }, { "epoch": 0.7534814814814815, "grad_norm": 1.6092997789382935, "learning_rate": 4.94885100074129e-05, "loss": 1.0323, "step": 5086 }, { "epoch": 0.7536296296296296, "grad_norm": 1.2982960939407349, "learning_rate": 4.945885841363974e-05, "loss": 0.7403, "step": 5087 }, { "epoch": 0.7537777777777778, "grad_norm": 1.68495774269104, "learning_rate": 4.942920681986657e-05, "loss": 0.9302, "step": 5088 }, { "epoch": 0.7539259259259259, "grad_norm": 1.6897817850112915, "learning_rate": 4.93995552260934e-05, "loss": 0.9577, "step": 5089 }, { "epoch": 0.7540740740740741, "grad_norm": 1.0711270570755005, "learning_rate": 4.9369903632320244e-05, "loss": 0.8916, "step": 5090 }, { "epoch": 0.7542222222222222, "grad_norm": 4.649448394775391, "learning_rate": 4.9340252038547075e-05, "loss": 0.9868, "step": 5091 }, { "epoch": 0.7543703703703704, "grad_norm": 1.4439083337783813, "learning_rate": 4.9310600444773906e-05, "loss": 0.8525, "step": 5092 }, { "epoch": 0.7545185185185185, "grad_norm": 2.0443906784057617, "learning_rate": 4.928094885100074e-05, "loss": 1.2524, "step": 5093 }, { "epoch": 0.7546666666666667, "grad_norm": 1.8707892894744873, "learning_rate": 4.925129725722758e-05, "loss": 1.1176, "step": 5094 }, { "epoch": 0.7548148148148148, "grad_norm": 2.2765345573425293, 
"learning_rate": 4.922164566345441e-05, "loss": 1.0497, "step": 5095 }, { "epoch": 0.754962962962963, "grad_norm": 3.6682441234588623, "learning_rate": 4.919199406968125e-05, "loss": 1.0128, "step": 5096 }, { "epoch": 0.7551111111111111, "grad_norm": 2.4716155529022217, "learning_rate": 4.916234247590808e-05, "loss": 1.0322, "step": 5097 }, { "epoch": 0.7552592592592593, "grad_norm": 2.391040563583374, "learning_rate": 4.913269088213492e-05, "loss": 0.7442, "step": 5098 }, { "epoch": 0.7554074074074074, "grad_norm": 3.0249578952789307, "learning_rate": 4.910303928836175e-05, "loss": 1.1544, "step": 5099 }, { "epoch": 0.7555555555555555, "grad_norm": 1.9137699604034424, "learning_rate": 4.907338769458859e-05, "loss": 1.298, "step": 5100 }, { "epoch": 0.7557037037037037, "grad_norm": 1.8166754245758057, "learning_rate": 4.9043736100815425e-05, "loss": 1.0571, "step": 5101 }, { "epoch": 0.7558518518518519, "grad_norm": 5.292375087738037, "learning_rate": 4.9014084507042255e-05, "loss": 1.0935, "step": 5102 }, { "epoch": 0.756, "grad_norm": 1.5703907012939453, "learning_rate": 4.898443291326909e-05, "loss": 0.9633, "step": 5103 }, { "epoch": 0.7561481481481481, "grad_norm": 2.736743450164795, "learning_rate": 4.8954781319495924e-05, "loss": 0.9147, "step": 5104 }, { "epoch": 0.7562962962962962, "grad_norm": 5.090847969055176, "learning_rate": 4.892512972572276e-05, "loss": 0.8958, "step": 5105 }, { "epoch": 0.7564444444444445, "grad_norm": 1.9331525564193726, "learning_rate": 4.889547813194959e-05, "loss": 0.8684, "step": 5106 }, { "epoch": 0.7565925925925926, "grad_norm": 1.776873230934143, "learning_rate": 4.886582653817643e-05, "loss": 1.1317, "step": 5107 }, { "epoch": 0.7567407407407407, "grad_norm": 2.454066276550293, "learning_rate": 4.883617494440326e-05, "loss": 1.0814, "step": 5108 }, { "epoch": 0.7568888888888889, "grad_norm": 1.4756650924682617, "learning_rate": 4.88065233506301e-05, "loss": 0.8534, "step": 5109 }, { "epoch": 0.7570370370370371, 
"grad_norm": 1.317214846611023, "learning_rate": 4.8776871756856937e-05, "loss": 0.8166, "step": 5110 }, { "epoch": 0.7571851851851852, "grad_norm": 2.033379077911377, "learning_rate": 4.874722016308377e-05, "loss": 1.0837, "step": 5111 }, { "epoch": 0.7573333333333333, "grad_norm": 2.4382426738739014, "learning_rate": 4.87175685693106e-05, "loss": 0.8166, "step": 5112 }, { "epoch": 0.7574814814814815, "grad_norm": 1.879291296005249, "learning_rate": 4.868791697553744e-05, "loss": 1.1031, "step": 5113 }, { "epoch": 0.7576296296296297, "grad_norm": 1.4954347610473633, "learning_rate": 4.8658265381764274e-05, "loss": 0.9674, "step": 5114 }, { "epoch": 0.7577777777777778, "grad_norm": 2.026719570159912, "learning_rate": 4.8628613787991105e-05, "loss": 1.0438, "step": 5115 }, { "epoch": 0.7579259259259259, "grad_norm": 2.028395652770996, "learning_rate": 4.859896219421794e-05, "loss": 0.9114, "step": 5116 }, { "epoch": 0.7580740740740741, "grad_norm": 2.310157060623169, "learning_rate": 4.856931060044478e-05, "loss": 0.9937, "step": 5117 }, { "epoch": 0.7582222222222222, "grad_norm": 1.5251635313034058, "learning_rate": 4.853965900667161e-05, "loss": 1.048, "step": 5118 }, { "epoch": 0.7583703703703704, "grad_norm": 1.486611008644104, "learning_rate": 4.851000741289844e-05, "loss": 0.9944, "step": 5119 }, { "epoch": 0.7585185185185185, "grad_norm": 2.200636148452759, "learning_rate": 4.848035581912528e-05, "loss": 1.0271, "step": 5120 }, { "epoch": 0.7586666666666667, "grad_norm": 2.43915057182312, "learning_rate": 4.845070422535212e-05, "loss": 0.9229, "step": 5121 }, { "epoch": 0.7588148148148148, "grad_norm": 1.2409038543701172, "learning_rate": 4.842105263157895e-05, "loss": 0.8342, "step": 5122 }, { "epoch": 0.758962962962963, "grad_norm": 2.0774729251861572, "learning_rate": 4.8391401037805786e-05, "loss": 1.0365, "step": 5123 }, { "epoch": 0.7591111111111111, "grad_norm": 2.0544731616973877, "learning_rate": 4.836174944403262e-05, "loss": 0.9629, "step": 5124 }, 
{ "epoch": 0.7592592592592593, "grad_norm": 1.7318483591079712, "learning_rate": 4.8332097850259454e-05, "loss": 1.2772, "step": 5125 }, { "epoch": 0.7594074074074074, "grad_norm": 8.01085090637207, "learning_rate": 4.830244625648629e-05, "loss": 1.1203, "step": 5126 }, { "epoch": 0.7595555555555555, "grad_norm": 3.245711088180542, "learning_rate": 4.827279466271312e-05, "loss": 1.0204, "step": 5127 }, { "epoch": 0.7597037037037037, "grad_norm": 1.6004467010498047, "learning_rate": 4.8243143068939954e-05, "loss": 0.8611, "step": 5128 }, { "epoch": 0.7598518518518519, "grad_norm": 1.5613590478897095, "learning_rate": 4.821349147516679e-05, "loss": 0.9004, "step": 5129 }, { "epoch": 0.76, "grad_norm": 1.1147972345352173, "learning_rate": 4.818383988139363e-05, "loss": 1.1883, "step": 5130 }, { "epoch": 0.7601481481481481, "grad_norm": 1.9375214576721191, "learning_rate": 4.815418828762046e-05, "loss": 1.055, "step": 5131 }, { "epoch": 0.7602962962962962, "grad_norm": 1.498977780342102, "learning_rate": 4.812453669384729e-05, "loss": 0.9044, "step": 5132 }, { "epoch": 0.7604444444444445, "grad_norm": 1.9106816053390503, "learning_rate": 4.8094885100074136e-05, "loss": 1.0633, "step": 5133 }, { "epoch": 0.7605925925925926, "grad_norm": 2.038706064224243, "learning_rate": 4.8065233506300966e-05, "loss": 1.1961, "step": 5134 }, { "epoch": 0.7607407407407407, "grad_norm": 3.4296905994415283, "learning_rate": 4.80355819125278e-05, "loss": 0.913, "step": 5135 }, { "epoch": 0.7608888888888888, "grad_norm": 4.157522678375244, "learning_rate": 4.8005930318754635e-05, "loss": 1.1286, "step": 5136 }, { "epoch": 0.7610370370370371, "grad_norm": 2.291372060775757, "learning_rate": 4.797627872498147e-05, "loss": 0.8568, "step": 5137 }, { "epoch": 0.7611851851851852, "grad_norm": 1.864700436592102, "learning_rate": 4.7946627131208304e-05, "loss": 0.8264, "step": 5138 }, { "epoch": 0.7613333333333333, "grad_norm": 1.610752820968628, "learning_rate": 4.791697553743514e-05, "loss": 
1.0348, "step": 5139 }, { "epoch": 0.7614814814814815, "grad_norm": 4.244185447692871, "learning_rate": 4.788732394366197e-05, "loss": 1.1272, "step": 5140 }, { "epoch": 0.7616296296296297, "grad_norm": 1.9835176467895508, "learning_rate": 4.785767234988881e-05, "loss": 0.8225, "step": 5141 }, { "epoch": 0.7617777777777778, "grad_norm": 1.8490839004516602, "learning_rate": 4.782802075611564e-05, "loss": 1.0496, "step": 5142 }, { "epoch": 0.7619259259259259, "grad_norm": 3.377185583114624, "learning_rate": 4.779836916234248e-05, "loss": 0.9867, "step": 5143 }, { "epoch": 0.7620740740740741, "grad_norm": 1.957334280014038, "learning_rate": 4.7768717568569316e-05, "loss": 0.9881, "step": 5144 }, { "epoch": 0.7622222222222222, "grad_norm": 1.9648994207382202, "learning_rate": 4.773906597479615e-05, "loss": 1.0724, "step": 5145 }, { "epoch": 0.7623703703703704, "grad_norm": 2.500462055206299, "learning_rate": 4.7709414381022985e-05, "loss": 0.9337, "step": 5146 }, { "epoch": 0.7625185185185185, "grad_norm": 2.3736724853515625, "learning_rate": 4.7679762787249816e-05, "loss": 1.1224, "step": 5147 }, { "epoch": 0.7626666666666667, "grad_norm": 1.6987513303756714, "learning_rate": 4.765011119347665e-05, "loss": 0.9495, "step": 5148 }, { "epoch": 0.7628148148148148, "grad_norm": 1.900909423828125, "learning_rate": 4.7620459599703484e-05, "loss": 0.9714, "step": 5149 }, { "epoch": 0.762962962962963, "grad_norm": 2.466947317123413, "learning_rate": 4.759080800593032e-05, "loss": 1.1185, "step": 5150 }, { "epoch": 0.7631111111111111, "grad_norm": 1.8361475467681885, "learning_rate": 4.756115641215715e-05, "loss": 1.0755, "step": 5151 }, { "epoch": 0.7632592592592593, "grad_norm": 1.4487415552139282, "learning_rate": 4.753150481838399e-05, "loss": 0.9984, "step": 5152 }, { "epoch": 0.7634074074074074, "grad_norm": 1.4057559967041016, "learning_rate": 4.750185322461083e-05, "loss": 0.6848, "step": 5153 }, { "epoch": 0.7635555555555555, "grad_norm": 2.509324789047241, 
"learning_rate": 4.747220163083766e-05, "loss": 1.1268, "step": 5154 }, { "epoch": 0.7637037037037037, "grad_norm": 2.1801083087921143, "learning_rate": 4.744255003706449e-05, "loss": 1.0042, "step": 5155 }, { "epoch": 0.7638518518518519, "grad_norm": 1.8679866790771484, "learning_rate": 4.7412898443291334e-05, "loss": 0.8824, "step": 5156 }, { "epoch": 0.764, "grad_norm": 2.1970086097717285, "learning_rate": 4.7383246849518165e-05, "loss": 1.0381, "step": 5157 }, { "epoch": 0.7641481481481481, "grad_norm": 1.4073742628097534, "learning_rate": 4.7353595255744996e-05, "loss": 0.8238, "step": 5158 }, { "epoch": 0.7642962962962963, "grad_norm": 1.780596137046814, "learning_rate": 4.7323943661971834e-05, "loss": 1.1141, "step": 5159 }, { "epoch": 0.7644444444444445, "grad_norm": 1.9689271450042725, "learning_rate": 4.729429206819867e-05, "loss": 1.0593, "step": 5160 }, { "epoch": 0.7645925925925926, "grad_norm": 2.2552754878997803, "learning_rate": 4.72646404744255e-05, "loss": 0.9337, "step": 5161 }, { "epoch": 0.7647407407407407, "grad_norm": 1.968701958656311, "learning_rate": 4.7234988880652333e-05, "loss": 0.9357, "step": 5162 }, { "epoch": 0.7648888888888888, "grad_norm": 1.7369651794433594, "learning_rate": 4.720533728687917e-05, "loss": 1.2335, "step": 5163 }, { "epoch": 0.7650370370370371, "grad_norm": 1.7453725337982178, "learning_rate": 4.717568569310601e-05, "loss": 0.8022, "step": 5164 }, { "epoch": 0.7651851851851852, "grad_norm": 1.5386406183242798, "learning_rate": 4.714603409933284e-05, "loss": 1.0569, "step": 5165 }, { "epoch": 0.7653333333333333, "grad_norm": 2.168109178543091, "learning_rate": 4.711638250555968e-05, "loss": 0.9496, "step": 5166 }, { "epoch": 0.7654814814814814, "grad_norm": 1.2597980499267578, "learning_rate": 4.708673091178651e-05, "loss": 0.8692, "step": 5167 }, { "epoch": 0.7656296296296297, "grad_norm": 1.890368938446045, "learning_rate": 4.7057079318013346e-05, "loss": 1.074, "step": 5168 }, { "epoch": 0.7657777777777778, 
"grad_norm": 1.8125121593475342, "learning_rate": 4.7027427724240184e-05, "loss": 0.8905, "step": 5169 }, { "epoch": 0.7659259259259259, "grad_norm": 1.288940668106079, "learning_rate": 4.6997776130467015e-05, "loss": 1.0225, "step": 5170 }, { "epoch": 0.7660740740740741, "grad_norm": 1.973691463470459, "learning_rate": 4.6968124536693846e-05, "loss": 1.0235, "step": 5171 }, { "epoch": 0.7662222222222222, "grad_norm": 2.004398822784424, "learning_rate": 4.693847294292068e-05, "loss": 1.1091, "step": 5172 }, { "epoch": 0.7663703703703704, "grad_norm": 1.7750011682510376, "learning_rate": 4.690882134914752e-05, "loss": 1.0812, "step": 5173 }, { "epoch": 0.7665185185185185, "grad_norm": 1.1326333284378052, "learning_rate": 4.687916975537435e-05, "loss": 1.0171, "step": 5174 }, { "epoch": 0.7666666666666667, "grad_norm": 1.296269416809082, "learning_rate": 4.684951816160118e-05, "loss": 1.0817, "step": 5175 }, { "epoch": 0.7668148148148148, "grad_norm": 1.1539825201034546, "learning_rate": 4.681986656782803e-05, "loss": 0.8657, "step": 5176 }, { "epoch": 0.766962962962963, "grad_norm": 1.676647663116455, "learning_rate": 4.679021497405486e-05, "loss": 0.9272, "step": 5177 }, { "epoch": 0.7671111111111111, "grad_norm": 4.465587139129639, "learning_rate": 4.676056338028169e-05, "loss": 0.8286, "step": 5178 }, { "epoch": 0.7672592592592593, "grad_norm": 7.314817428588867, "learning_rate": 4.673091178650853e-05, "loss": 0.9234, "step": 5179 }, { "epoch": 0.7674074074074074, "grad_norm": 1.8242179155349731, "learning_rate": 4.6701260192735364e-05, "loss": 1.2321, "step": 5180 }, { "epoch": 0.7675555555555555, "grad_norm": 1.985378384590149, "learning_rate": 4.6671608598962195e-05, "loss": 1.0672, "step": 5181 }, { "epoch": 0.7677037037037037, "grad_norm": 1.7461580038070679, "learning_rate": 4.664195700518903e-05, "loss": 1.1942, "step": 5182 }, { "epoch": 0.7678518518518519, "grad_norm": 2.392956018447876, "learning_rate": 4.661230541141587e-05, "loss": 0.9936, "step": 
5183 }, { "epoch": 0.768, "grad_norm": 1.471274971961975, "learning_rate": 4.65826538176427e-05, "loss": 0.8899, "step": 5184 }, { "epoch": 0.7681481481481481, "grad_norm": 1.7198207378387451, "learning_rate": 4.655300222386953e-05, "loss": 1.1507, "step": 5185 }, { "epoch": 0.7682962962962963, "grad_norm": 1.4129610061645508, "learning_rate": 4.652335063009637e-05, "loss": 0.8701, "step": 5186 }, { "epoch": 0.7684444444444445, "grad_norm": 1.2829008102416992, "learning_rate": 4.649369903632321e-05, "loss": 0.9653, "step": 5187 }, { "epoch": 0.7685925925925926, "grad_norm": 2.027888298034668, "learning_rate": 4.646404744255004e-05, "loss": 1.0169, "step": 5188 }, { "epoch": 0.7687407407407407, "grad_norm": 1.609615683555603, "learning_rate": 4.6434395848776876e-05, "loss": 1.2845, "step": 5189 }, { "epoch": 0.7688888888888888, "grad_norm": 1.939034342765808, "learning_rate": 4.640474425500371e-05, "loss": 1.0692, "step": 5190 }, { "epoch": 0.7690370370370371, "grad_norm": 1.922580599784851, "learning_rate": 4.6375092661230545e-05, "loss": 0.7777, "step": 5191 }, { "epoch": 0.7691851851851852, "grad_norm": 2.872612476348877, "learning_rate": 4.6345441067457376e-05, "loss": 1.0708, "step": 5192 }, { "epoch": 0.7693333333333333, "grad_norm": 1.4507546424865723, "learning_rate": 4.6315789473684214e-05, "loss": 1.0895, "step": 5193 }, { "epoch": 0.7694814814814814, "grad_norm": 1.2573904991149902, "learning_rate": 4.6286137879911045e-05, "loss": 0.8701, "step": 5194 }, { "epoch": 0.7696296296296297, "grad_norm": 1.58791184425354, "learning_rate": 4.625648628613788e-05, "loss": 1.0827, "step": 5195 }, { "epoch": 0.7697777777777778, "grad_norm": 2.590338945388794, "learning_rate": 4.622683469236472e-05, "loss": 1.3173, "step": 5196 }, { "epoch": 0.7699259259259259, "grad_norm": 2.0621609687805176, "learning_rate": 4.619718309859155e-05, "loss": 0.9686, "step": 5197 }, { "epoch": 0.770074074074074, "grad_norm": 3.1093332767486572, "learning_rate": 4.616753150481838e-05, 
"loss": 0.8499, "step": 5198 }, { "epoch": 0.7702222222222223, "grad_norm": 3.72360897064209, "learning_rate": 4.6137879911045226e-05, "loss": 0.9188, "step": 5199 }, { "epoch": 0.7703703703703704, "grad_norm": 1.468069314956665, "learning_rate": 4.610822831727206e-05, "loss": 0.6625, "step": 5200 }, { "epoch": 0.7705185185185185, "grad_norm": 1.898732304573059, "learning_rate": 4.607857672349889e-05, "loss": 0.9545, "step": 5201 }, { "epoch": 0.7706666666666667, "grad_norm": 1.5424686670303345, "learning_rate": 4.6048925129725726e-05, "loss": 0.9572, "step": 5202 }, { "epoch": 0.7708148148148148, "grad_norm": 1.6728442907333374, "learning_rate": 4.601927353595256e-05, "loss": 0.9951, "step": 5203 }, { "epoch": 0.770962962962963, "grad_norm": 1.8453505039215088, "learning_rate": 4.5989621942179394e-05, "loss": 1.1395, "step": 5204 }, { "epoch": 0.7711111111111111, "grad_norm": 1.8295652866363525, "learning_rate": 4.5959970348406225e-05, "loss": 0.9264, "step": 5205 }, { "epoch": 0.7712592592592593, "grad_norm": 2.741732597351074, "learning_rate": 4.593031875463306e-05, "loss": 0.9526, "step": 5206 }, { "epoch": 0.7714074074074074, "grad_norm": 1.5087363719940186, "learning_rate": 4.59006671608599e-05, "loss": 1.1452, "step": 5207 }, { "epoch": 0.7715555555555556, "grad_norm": 1.7406902313232422, "learning_rate": 4.587101556708673e-05, "loss": 0.9727, "step": 5208 }, { "epoch": 0.7717037037037037, "grad_norm": 2.977414846420288, "learning_rate": 4.584136397331357e-05, "loss": 1.2625, "step": 5209 }, { "epoch": 0.7718518518518519, "grad_norm": 1.4112823009490967, "learning_rate": 4.58117123795404e-05, "loss": 0.7912, "step": 5210 }, { "epoch": 0.772, "grad_norm": 2.0663907527923584, "learning_rate": 4.578206078576724e-05, "loss": 1.0525, "step": 5211 }, { "epoch": 0.7721481481481481, "grad_norm": 2.6327972412109375, "learning_rate": 4.5752409191994075e-05, "loss": 1.111, "step": 5212 }, { "epoch": 0.7722962962962963, "grad_norm": 1.619528889656067, "learning_rate": 
4.5722757598220906e-05, "loss": 0.9059, "step": 5213 }, { "epoch": 0.7724444444444445, "grad_norm": 2.2156741619110107, "learning_rate": 4.569310600444774e-05, "loss": 1.0346, "step": 5214 }, { "epoch": 0.7725925925925926, "grad_norm": 1.7784485816955566, "learning_rate": 4.5663454410674575e-05, "loss": 0.9527, "step": 5215 }, { "epoch": 0.7727407407407407, "grad_norm": 1.4141141176223755, "learning_rate": 4.563380281690141e-05, "loss": 0.9136, "step": 5216 }, { "epoch": 0.7728888888888888, "grad_norm": 3.6373491287231445, "learning_rate": 4.5604151223128243e-05, "loss": 1.0312, "step": 5217 }, { "epoch": 0.7730370370370371, "grad_norm": 5.496160507202148, "learning_rate": 4.5574499629355074e-05, "loss": 1.1849, "step": 5218 }, { "epoch": 0.7731851851851852, "grad_norm": 1.7929624319076538, "learning_rate": 4.554484803558192e-05, "loss": 1.132, "step": 5219 }, { "epoch": 0.7733333333333333, "grad_norm": 2.1400399208068848, "learning_rate": 4.551519644180875e-05, "loss": 0.5993, "step": 5220 }, { "epoch": 0.7734814814814814, "grad_norm": 1.5654683113098145, "learning_rate": 4.548554484803558e-05, "loss": 0.8757, "step": 5221 }, { "epoch": 0.7736296296296297, "grad_norm": 2.1138947010040283, "learning_rate": 4.545589325426242e-05, "loss": 0.8961, "step": 5222 }, { "epoch": 0.7737777777777778, "grad_norm": 1.4625893831253052, "learning_rate": 4.5426241660489256e-05, "loss": 1.0103, "step": 5223 }, { "epoch": 0.7739259259259259, "grad_norm": 1.2665773630142212, "learning_rate": 4.539659006671609e-05, "loss": 1.0971, "step": 5224 }, { "epoch": 0.774074074074074, "grad_norm": 1.2460824251174927, "learning_rate": 4.5366938472942925e-05, "loss": 0.9117, "step": 5225 }, { "epoch": 0.7742222222222223, "grad_norm": 2.2900710105895996, "learning_rate": 4.533728687916976e-05, "loss": 0.9117, "step": 5226 }, { "epoch": 0.7743703703703704, "grad_norm": 2.224123001098633, "learning_rate": 4.530763528539659e-05, "loss": 1.0584, "step": 5227 }, { "epoch": 0.7745185185185185, 
"grad_norm": 1.5715810060501099, "learning_rate": 4.5277983691623424e-05, "loss": 1.1295, "step": 5228 }, { "epoch": 0.7746666666666666, "grad_norm": 2.7693135738372803, "learning_rate": 4.524833209785026e-05, "loss": 1.0575, "step": 5229 }, { "epoch": 0.7748148148148148, "grad_norm": 1.9409888982772827, "learning_rate": 4.52186805040771e-05, "loss": 1.0668, "step": 5230 }, { "epoch": 0.774962962962963, "grad_norm": 1.4197139739990234, "learning_rate": 4.518902891030393e-05, "loss": 1.1552, "step": 5231 }, { "epoch": 0.7751111111111111, "grad_norm": 1.9229507446289062, "learning_rate": 4.515937731653077e-05, "loss": 1.0031, "step": 5232 }, { "epoch": 0.7752592592592593, "grad_norm": 1.7542647123336792, "learning_rate": 4.51297257227576e-05, "loss": 1.0647, "step": 5233 }, { "epoch": 0.7754074074074074, "grad_norm": 1.3256491422653198, "learning_rate": 4.510007412898444e-05, "loss": 0.8055, "step": 5234 }, { "epoch": 0.7755555555555556, "grad_norm": 1.463727355003357, "learning_rate": 4.507042253521127e-05, "loss": 1.0222, "step": 5235 }, { "epoch": 0.7757037037037037, "grad_norm": 1.7977197170257568, "learning_rate": 4.5040770941438105e-05, "loss": 0.9904, "step": 5236 }, { "epoch": 0.7758518518518519, "grad_norm": 1.1428083181381226, "learning_rate": 4.5011119347664936e-05, "loss": 1.0521, "step": 5237 }, { "epoch": 0.776, "grad_norm": 1.4261960983276367, "learning_rate": 4.4981467753891774e-05, "loss": 0.8669, "step": 5238 }, { "epoch": 0.7761481481481481, "grad_norm": 2.0403456687927246, "learning_rate": 4.495181616011861e-05, "loss": 1.1753, "step": 5239 }, { "epoch": 0.7762962962962963, "grad_norm": 2.254314422607422, "learning_rate": 4.492216456634544e-05, "loss": 1.1025, "step": 5240 }, { "epoch": 0.7764444444444445, "grad_norm": 1.2379026412963867, "learning_rate": 4.489251297257227e-05, "loss": 1.2099, "step": 5241 }, { "epoch": 0.7765925925925926, "grad_norm": 1.7444937229156494, "learning_rate": 4.486286137879912e-05, "loss": 1.0509, "step": 5242 }, { 
"epoch": 0.7767407407407407, "grad_norm": 2.0687057971954346, "learning_rate": 4.483320978502595e-05, "loss": 0.6284, "step": 5243 }, { "epoch": 0.7768888888888889, "grad_norm": 2.0481202602386475, "learning_rate": 4.480355819125278e-05, "loss": 0.9988, "step": 5244 }, { "epoch": 0.7770370370370371, "grad_norm": 2.4136269092559814, "learning_rate": 4.477390659747962e-05, "loss": 1.2496, "step": 5245 }, { "epoch": 0.7771851851851852, "grad_norm": 1.8915942907333374, "learning_rate": 4.4744255003706455e-05, "loss": 1.0732, "step": 5246 }, { "epoch": 0.7773333333333333, "grad_norm": 1.8578381538391113, "learning_rate": 4.4714603409933286e-05, "loss": 0.9857, "step": 5247 }, { "epoch": 0.7774814814814814, "grad_norm": 2.6110923290252686, "learning_rate": 4.468495181616012e-05, "loss": 1.0259, "step": 5248 }, { "epoch": 0.7776296296296297, "grad_norm": 1.603898048400879, "learning_rate": 4.4655300222386954e-05, "loss": 0.8033, "step": 5249 }, { "epoch": 0.7777777777777778, "grad_norm": 1.5251612663269043, "learning_rate": 4.462564862861379e-05, "loss": 1.0749, "step": 5250 }, { "epoch": 0.7779259259259259, "grad_norm": 3.3131825923919678, "learning_rate": 4.459599703484062e-05, "loss": 1.0672, "step": 5251 }, { "epoch": 0.778074074074074, "grad_norm": 2.589365005493164, "learning_rate": 4.456634544106746e-05, "loss": 1.011, "step": 5252 }, { "epoch": 0.7782222222222223, "grad_norm": 2.332076072692871, "learning_rate": 4.453669384729429e-05, "loss": 1.2628, "step": 5253 }, { "epoch": 0.7783703703703704, "grad_norm": 1.7138726711273193, "learning_rate": 4.450704225352113e-05, "loss": 1.3278, "step": 5254 }, { "epoch": 0.7785185185185185, "grad_norm": 2.4788360595703125, "learning_rate": 4.447739065974797e-05, "loss": 1.4061, "step": 5255 }, { "epoch": 0.7786666666666666, "grad_norm": 1.6783311367034912, "learning_rate": 4.44477390659748e-05, "loss": 0.9061, "step": 5256 }, { "epoch": 0.7788148148148148, "grad_norm": 1.3091166019439697, "learning_rate": 
4.441808747220163e-05, "loss": 1.2006, "step": 5257 }, { "epoch": 0.778962962962963, "grad_norm": 1.4785174131393433, "learning_rate": 4.4388435878428467e-05, "loss": 1.1297, "step": 5258 }, { "epoch": 0.7791111111111111, "grad_norm": 2.0996644496917725, "learning_rate": 4.4358784284655304e-05, "loss": 1.217, "step": 5259 }, { "epoch": 0.7792592592592592, "grad_norm": 1.4377645254135132, "learning_rate": 4.4329132690882135e-05, "loss": 0.8621, "step": 5260 }, { "epoch": 0.7794074074074074, "grad_norm": 1.9455523490905762, "learning_rate": 4.4299481097108966e-05, "loss": 1.0878, "step": 5261 }, { "epoch": 0.7795555555555556, "grad_norm": 1.466994285583496, "learning_rate": 4.426982950333581e-05, "loss": 1.1907, "step": 5262 }, { "epoch": 0.7797037037037037, "grad_norm": 1.3471636772155762, "learning_rate": 4.424017790956264e-05, "loss": 1.0218, "step": 5263 }, { "epoch": 0.7798518518518519, "grad_norm": 1.427681565284729, "learning_rate": 4.421052631578947e-05, "loss": 1.0729, "step": 5264 }, { "epoch": 0.78, "grad_norm": 1.1461193561553955, "learning_rate": 4.418087472201631e-05, "loss": 1.1489, "step": 5265 }, { "epoch": 0.7801481481481481, "grad_norm": 1.0958290100097656, "learning_rate": 4.415122312824315e-05, "loss": 1.0562, "step": 5266 }, { "epoch": 0.7802962962962963, "grad_norm": 2.9291634559631348, "learning_rate": 4.412157153446998e-05, "loss": 1.2814, "step": 5267 }, { "epoch": 0.7804444444444445, "grad_norm": 2.461829423904419, "learning_rate": 4.4091919940696816e-05, "loss": 0.9668, "step": 5268 }, { "epoch": 0.7805925925925926, "grad_norm": 1.934011459350586, "learning_rate": 4.4062268346923654e-05, "loss": 1.0245, "step": 5269 }, { "epoch": 0.7807407407407407, "grad_norm": 1.38313889503479, "learning_rate": 4.4032616753150485e-05, "loss": 0.9068, "step": 5270 }, { "epoch": 0.7808888888888889, "grad_norm": 1.3687999248504639, "learning_rate": 4.4002965159377316e-05, "loss": 1.0288, "step": 5271 }, { "epoch": 0.7810370370370371, "grad_norm": 
1.9752943515777588, "learning_rate": 4.3973313565604153e-05, "loss": 0.8695, "step": 5272 }, { "epoch": 0.7811851851851852, "grad_norm": 1.7185872793197632, "learning_rate": 4.394366197183099e-05, "loss": 1.0999, "step": 5273 }, { "epoch": 0.7813333333333333, "grad_norm": 1.1522598266601562, "learning_rate": 4.391401037805782e-05, "loss": 0.9312, "step": 5274 }, { "epoch": 0.7814814814814814, "grad_norm": 1.209738850593567, "learning_rate": 4.388435878428466e-05, "loss": 1.2068, "step": 5275 }, { "epoch": 0.7816296296296297, "grad_norm": 1.4488725662231445, "learning_rate": 4.385470719051149e-05, "loss": 1.1119, "step": 5276 }, { "epoch": 0.7817777777777778, "grad_norm": 1.1559313535690308, "learning_rate": 4.382505559673833e-05, "loss": 1.0124, "step": 5277 }, { "epoch": 0.7819259259259259, "grad_norm": 2.1997835636138916, "learning_rate": 4.379540400296516e-05, "loss": 0.9524, "step": 5278 }, { "epoch": 0.782074074074074, "grad_norm": 4.906120777130127, "learning_rate": 4.3765752409192e-05, "loss": 0.9424, "step": 5279 }, { "epoch": 0.7822222222222223, "grad_norm": 2.4310436248779297, "learning_rate": 4.373610081541883e-05, "loss": 1.1148, "step": 5280 }, { "epoch": 0.7823703703703704, "grad_norm": 1.2778759002685547, "learning_rate": 4.3706449221645665e-05, "loss": 0.9099, "step": 5281 }, { "epoch": 0.7825185185185185, "grad_norm": 5.306490421295166, "learning_rate": 4.36767976278725e-05, "loss": 1.1062, "step": 5282 }, { "epoch": 0.7826666666666666, "grad_norm": 2.4609696865081787, "learning_rate": 4.3647146034099334e-05, "loss": 0.8321, "step": 5283 }, { "epoch": 0.7828148148148149, "grad_norm": 1.3912984132766724, "learning_rate": 4.3617494440326165e-05, "loss": 0.9438, "step": 5284 }, { "epoch": 0.782962962962963, "grad_norm": 1.6022518873214722, "learning_rate": 4.358784284655301e-05, "loss": 0.9162, "step": 5285 }, { "epoch": 0.7831111111111111, "grad_norm": 1.3555049896240234, "learning_rate": 4.355819125277984e-05, "loss": 0.8939, "step": 5286 }, { 
"epoch": 0.7832592592592592, "grad_norm": 1.3263565301895142, "learning_rate": 4.352853965900667e-05, "loss": 1.1853, "step": 5287 }, { "epoch": 0.7834074074074074, "grad_norm": 2.257020950317383, "learning_rate": 4.349888806523351e-05, "loss": 0.967, "step": 5288 }, { "epoch": 0.7835555555555556, "grad_norm": 1.8871673345565796, "learning_rate": 4.346923647146035e-05, "loss": 0.9221, "step": 5289 }, { "epoch": 0.7837037037037037, "grad_norm": 3.959899663925171, "learning_rate": 4.343958487768718e-05, "loss": 0.8704, "step": 5290 }, { "epoch": 0.7838518518518518, "grad_norm": 1.3531171083450317, "learning_rate": 4.340993328391401e-05, "loss": 1.0575, "step": 5291 }, { "epoch": 0.784, "grad_norm": 2.5105092525482178, "learning_rate": 4.3380281690140846e-05, "loss": 1.2141, "step": 5292 }, { "epoch": 0.7841481481481481, "grad_norm": 4.388306617736816, "learning_rate": 4.3350630096367684e-05, "loss": 1.1923, "step": 5293 }, { "epoch": 0.7842962962962963, "grad_norm": 1.7040736675262451, "learning_rate": 4.3320978502594515e-05, "loss": 0.9041, "step": 5294 }, { "epoch": 0.7844444444444445, "grad_norm": 1.3910716772079468, "learning_rate": 4.329132690882135e-05, "loss": 0.9643, "step": 5295 }, { "epoch": 0.7845925925925926, "grad_norm": 1.3138338327407837, "learning_rate": 4.326167531504818e-05, "loss": 1.0421, "step": 5296 }, { "epoch": 0.7847407407407407, "grad_norm": 1.597644329071045, "learning_rate": 4.323202372127502e-05, "loss": 1.064, "step": 5297 }, { "epoch": 0.7848888888888889, "grad_norm": 3.17818021774292, "learning_rate": 4.320237212750186e-05, "loss": 1.0256, "step": 5298 }, { "epoch": 0.7850370370370371, "grad_norm": 3.4625067710876465, "learning_rate": 4.317272053372869e-05, "loss": 1.111, "step": 5299 }, { "epoch": 0.7851851851851852, "grad_norm": 1.9711130857467651, "learning_rate": 4.314306893995552e-05, "loss": 1.065, "step": 5300 }, { "epoch": 0.7853333333333333, "grad_norm": 1.9203976392745972, "learning_rate": 4.311341734618236e-05, "loss": 
1.0593, "step": 5301 }, { "epoch": 0.7854814814814814, "grad_norm": 1.304276943206787, "learning_rate": 4.3083765752409196e-05, "loss": 0.9218, "step": 5302 }, { "epoch": 0.7856296296296297, "grad_norm": 1.7462506294250488, "learning_rate": 4.305411415863603e-05, "loss": 1.2338, "step": 5303 }, { "epoch": 0.7857777777777778, "grad_norm": 1.6708916425704956, "learning_rate": 4.302446256486286e-05, "loss": 1.1153, "step": 5304 }, { "epoch": 0.7859259259259259, "grad_norm": 1.3915983438491821, "learning_rate": 4.29948109710897e-05, "loss": 1.0623, "step": 5305 }, { "epoch": 0.786074074074074, "grad_norm": 2.6342339515686035, "learning_rate": 4.296515937731653e-05, "loss": 0.8774, "step": 5306 }, { "epoch": 0.7862222222222223, "grad_norm": 1.4199789762496948, "learning_rate": 4.2935507783543364e-05, "loss": 1.0367, "step": 5307 }, { "epoch": 0.7863703703703704, "grad_norm": 1.8340078592300415, "learning_rate": 4.29058561897702e-05, "loss": 0.784, "step": 5308 }, { "epoch": 0.7865185185185185, "grad_norm": 2.203857660293579, "learning_rate": 4.287620459599704e-05, "loss": 1.0225, "step": 5309 }, { "epoch": 0.7866666666666666, "grad_norm": 2.1654696464538574, "learning_rate": 4.284655300222387e-05, "loss": 1.0628, "step": 5310 }, { "epoch": 0.7868148148148149, "grad_norm": 1.800679087638855, "learning_rate": 4.281690140845071e-05, "loss": 1.1444, "step": 5311 }, { "epoch": 0.786962962962963, "grad_norm": 1.3590725660324097, "learning_rate": 4.2787249814677546e-05, "loss": 1.074, "step": 5312 }, { "epoch": 0.7871111111111111, "grad_norm": 1.4780471324920654, "learning_rate": 4.2757598220904377e-05, "loss": 1.0316, "step": 5313 }, { "epoch": 0.7872592592592592, "grad_norm": 1.3855935335159302, "learning_rate": 4.272794662713121e-05, "loss": 0.933, "step": 5314 }, { "epoch": 0.7874074074074074, "grad_norm": 2.180665969848633, "learning_rate": 4.2698295033358045e-05, "loss": 1.3261, "step": 5315 }, { "epoch": 0.7875555555555556, "grad_norm": 2.058462142944336, 
"learning_rate": 4.266864343958488e-05, "loss": 1.0398, "step": 5316 }, { "epoch": 0.7877037037037037, "grad_norm": 1.1582828760147095, "learning_rate": 4.2638991845811714e-05, "loss": 0.9359, "step": 5317 }, { "epoch": 0.7878518518518518, "grad_norm": 1.454142451286316, "learning_rate": 4.260934025203855e-05, "loss": 1.0378, "step": 5318 }, { "epoch": 0.788, "grad_norm": 1.7095205783843994, "learning_rate": 4.257968865826538e-05, "loss": 0.8838, "step": 5319 }, { "epoch": 0.7881481481481482, "grad_norm": 1.327888011932373, "learning_rate": 4.255003706449222e-05, "loss": 1.0082, "step": 5320 }, { "epoch": 0.7882962962962963, "grad_norm": 1.7250101566314697, "learning_rate": 4.252038547071905e-05, "loss": 1.0791, "step": 5321 }, { "epoch": 0.7884444444444444, "grad_norm": 3.891110420227051, "learning_rate": 4.249073387694589e-05, "loss": 1.0821, "step": 5322 }, { "epoch": 0.7885925925925926, "grad_norm": 2.3465235233306885, "learning_rate": 4.246108228317272e-05, "loss": 0.8379, "step": 5323 }, { "epoch": 0.7887407407407407, "grad_norm": 1.1320641040802002, "learning_rate": 4.243143068939956e-05, "loss": 0.9779, "step": 5324 }, { "epoch": 0.7888888888888889, "grad_norm": 1.3223503828048706, "learning_rate": 4.2401779095626395e-05, "loss": 0.7022, "step": 5325 }, { "epoch": 0.7890370370370371, "grad_norm": 1.7182470560073853, "learning_rate": 4.2372127501853226e-05, "loss": 1.0736, "step": 5326 }, { "epoch": 0.7891851851851852, "grad_norm": 1.766697645187378, "learning_rate": 4.234247590808006e-05, "loss": 0.936, "step": 5327 }, { "epoch": 0.7893333333333333, "grad_norm": 1.171360731124878, "learning_rate": 4.23128243143069e-05, "loss": 0.962, "step": 5328 }, { "epoch": 0.7894814814814815, "grad_norm": 2.025193452835083, "learning_rate": 4.228317272053373e-05, "loss": 1.0565, "step": 5329 }, { "epoch": 0.7896296296296297, "grad_norm": 1.9869000911712646, "learning_rate": 4.225352112676056e-05, "loss": 1.1233, "step": 5330 }, { "epoch": 0.7897777777777778, 
"grad_norm": 5.825942516326904, "learning_rate": 4.22238695329874e-05, "loss": 0.867, "step": 5331 }, { "epoch": 0.7899259259259259, "grad_norm": 1.7098500728607178, "learning_rate": 4.219421793921424e-05, "loss": 0.8757, "step": 5332 }, { "epoch": 0.790074074074074, "grad_norm": 2.2363123893737793, "learning_rate": 4.216456634544107e-05, "loss": 0.9139, "step": 5333 }, { "epoch": 0.7902222222222223, "grad_norm": 1.6317927837371826, "learning_rate": 4.21349147516679e-05, "loss": 0.9388, "step": 5334 }, { "epoch": 0.7903703703703704, "grad_norm": 1.8629355430603027, "learning_rate": 4.210526315789474e-05, "loss": 0.9761, "step": 5335 }, { "epoch": 0.7905185185185185, "grad_norm": 2.1115009784698486, "learning_rate": 4.2075611564121575e-05, "loss": 1.0465, "step": 5336 }, { "epoch": 0.7906666666666666, "grad_norm": 1.4415923357009888, "learning_rate": 4.2045959970348406e-05, "loss": 0.9209, "step": 5337 }, { "epoch": 0.7908148148148149, "grad_norm": 1.5301687717437744, "learning_rate": 4.2016308376575244e-05, "loss": 1.1599, "step": 5338 }, { "epoch": 0.790962962962963, "grad_norm": 2.5030784606933594, "learning_rate": 4.1986656782802075e-05, "loss": 0.9027, "step": 5339 }, { "epoch": 0.7911111111111111, "grad_norm": 1.829357385635376, "learning_rate": 4.195700518902891e-05, "loss": 0.9675, "step": 5340 }, { "epoch": 0.7912592592592592, "grad_norm": 2.309213638305664, "learning_rate": 4.192735359525575e-05, "loss": 1.0948, "step": 5341 }, { "epoch": 0.7914074074074074, "grad_norm": 1.3838952779769897, "learning_rate": 4.189770200148258e-05, "loss": 0.9765, "step": 5342 }, { "epoch": 0.7915555555555556, "grad_norm": 1.4355117082595825, "learning_rate": 4.186805040770941e-05, "loss": 1.0487, "step": 5343 }, { "epoch": 0.7917037037037037, "grad_norm": 2.5563371181488037, "learning_rate": 4.183839881393625e-05, "loss": 1.1888, "step": 5344 }, { "epoch": 0.7918518518518518, "grad_norm": 2.01492977142334, "learning_rate": 4.180874722016309e-05, "loss": 0.9264, "step": 5345 
}, { "epoch": 0.792, "grad_norm": 1.8180270195007324, "learning_rate": 4.177909562638992e-05, "loss": 0.924, "step": 5346 }, { "epoch": 0.7921481481481482, "grad_norm": 1.8011095523834229, "learning_rate": 4.174944403261675e-05, "loss": 1.1091, "step": 5347 }, { "epoch": 0.7922962962962963, "grad_norm": 1.3663522005081177, "learning_rate": 4.1719792438843594e-05, "loss": 1.04, "step": 5348 }, { "epoch": 0.7924444444444444, "grad_norm": 1.7330594062805176, "learning_rate": 4.1690140845070425e-05, "loss": 1.0136, "step": 5349 }, { "epoch": 0.7925925925925926, "grad_norm": 1.355904221534729, "learning_rate": 4.1660489251297256e-05, "loss": 0.9136, "step": 5350 }, { "epoch": 0.7927407407407407, "grad_norm": 1.5312200784683228, "learning_rate": 4.163083765752409e-05, "loss": 0.7906, "step": 5351 }, { "epoch": 0.7928888888888889, "grad_norm": 1.8225631713867188, "learning_rate": 4.160118606375093e-05, "loss": 1.2035, "step": 5352 }, { "epoch": 0.793037037037037, "grad_norm": 1.6775257587432861, "learning_rate": 4.157153446997776e-05, "loss": 0.9928, "step": 5353 }, { "epoch": 0.7931851851851852, "grad_norm": 1.468770980834961, "learning_rate": 4.15418828762046e-05, "loss": 1.1287, "step": 5354 }, { "epoch": 0.7933333333333333, "grad_norm": 1.8372886180877686, "learning_rate": 4.151223128243144e-05, "loss": 0.9269, "step": 5355 }, { "epoch": 0.7934814814814815, "grad_norm": 1.3729956150054932, "learning_rate": 4.148257968865827e-05, "loss": 0.9081, "step": 5356 }, { "epoch": 0.7936296296296297, "grad_norm": 3.273314952850342, "learning_rate": 4.14529280948851e-05, "loss": 1.2108, "step": 5357 }, { "epoch": 0.7937777777777778, "grad_norm": 1.243962287902832, "learning_rate": 4.142327650111194e-05, "loss": 0.9835, "step": 5358 }, { "epoch": 0.7939259259259259, "grad_norm": 1.414330244064331, "learning_rate": 4.1393624907338774e-05, "loss": 0.9602, "step": 5359 }, { "epoch": 0.794074074074074, "grad_norm": 3.1392345428466797, "learning_rate": 4.1363973313565605e-05, "loss": 
1.1034, "step": 5360 }, { "epoch": 0.7942222222222223, "grad_norm": 1.39045250415802, "learning_rate": 4.133432171979244e-05, "loss": 1.0339, "step": 5361 }, { "epoch": 0.7943703703703704, "grad_norm": 2.301767349243164, "learning_rate": 4.1304670126019274e-05, "loss": 0.9389, "step": 5362 }, { "epoch": 0.7945185185185185, "grad_norm": 1.390378713607788, "learning_rate": 4.127501853224611e-05, "loss": 0.8781, "step": 5363 }, { "epoch": 0.7946666666666666, "grad_norm": 3.492128849029541, "learning_rate": 4.124536693847294e-05, "loss": 0.9743, "step": 5364 }, { "epoch": 0.7948148148148149, "grad_norm": 2.7725274562835693, "learning_rate": 4.121571534469978e-05, "loss": 0.8988, "step": 5365 }, { "epoch": 0.794962962962963, "grad_norm": 1.6059094667434692, "learning_rate": 4.118606375092661e-05, "loss": 0.8354, "step": 5366 }, { "epoch": 0.7951111111111111, "grad_norm": 1.6514225006103516, "learning_rate": 4.115641215715345e-05, "loss": 1.2253, "step": 5367 }, { "epoch": 0.7952592592592592, "grad_norm": 1.6473922729492188, "learning_rate": 4.1126760563380286e-05, "loss": 1.0169, "step": 5368 }, { "epoch": 0.7954074074074075, "grad_norm": 1.531722903251648, "learning_rate": 4.109710896960712e-05, "loss": 1.1131, "step": 5369 }, { "epoch": 0.7955555555555556, "grad_norm": 4.171046257019043, "learning_rate": 4.106745737583395e-05, "loss": 0.9861, "step": 5370 }, { "epoch": 0.7957037037037037, "grad_norm": 10.06808090209961, "learning_rate": 4.103780578206079e-05, "loss": 1.0757, "step": 5371 }, { "epoch": 0.7958518518518518, "grad_norm": 1.311013102531433, "learning_rate": 4.1008154188287624e-05, "loss": 0.9478, "step": 5372 }, { "epoch": 0.796, "grad_norm": 1.3929322957992554, "learning_rate": 4.0978502594514455e-05, "loss": 0.7379, "step": 5373 }, { "epoch": 0.7961481481481482, "grad_norm": 1.1322258710861206, "learning_rate": 4.094885100074129e-05, "loss": 0.8733, "step": 5374 }, { "epoch": 0.7962962962962963, "grad_norm": 1.8504239320755005, "learning_rate": 
4.091919940696813e-05, "loss": 1.0918, "step": 5375 }, { "epoch": 0.7964444444444444, "grad_norm": 1.780629277229309, "learning_rate": 4.088954781319496e-05, "loss": 1.0471, "step": 5376 }, { "epoch": 0.7965925925925926, "grad_norm": 3.333953619003296, "learning_rate": 4.085989621942179e-05, "loss": 1.2132, "step": 5377 }, { "epoch": 0.7967407407407407, "grad_norm": 2.2474868297576904, "learning_rate": 4.083024462564863e-05, "loss": 0.9178, "step": 5378 }, { "epoch": 0.7968888888888889, "grad_norm": 1.3540128469467163, "learning_rate": 4.080059303187547e-05, "loss": 1.519, "step": 5379 }, { "epoch": 0.797037037037037, "grad_norm": 1.1776254177093506, "learning_rate": 4.07709414381023e-05, "loss": 1.0202, "step": 5380 }, { "epoch": 0.7971851851851852, "grad_norm": 1.7784807682037354, "learning_rate": 4.0741289844329136e-05, "loss": 0.9476, "step": 5381 }, { "epoch": 0.7973333333333333, "grad_norm": 1.7054754495620728, "learning_rate": 4.071163825055597e-05, "loss": 0.9141, "step": 5382 }, { "epoch": 0.7974814814814815, "grad_norm": 1.2991890907287598, "learning_rate": 4.0681986656782804e-05, "loss": 0.8789, "step": 5383 }, { "epoch": 0.7976296296296296, "grad_norm": 1.6967389583587646, "learning_rate": 4.065233506300964e-05, "loss": 1.1697, "step": 5384 }, { "epoch": 0.7977777777777778, "grad_norm": 1.8050689697265625, "learning_rate": 4.062268346923647e-05, "loss": 1.0018, "step": 5385 }, { "epoch": 0.7979259259259259, "grad_norm": 5.1262431144714355, "learning_rate": 4.0593031875463304e-05, "loss": 0.8984, "step": 5386 }, { "epoch": 0.798074074074074, "grad_norm": 1.3281807899475098, "learning_rate": 4.056338028169014e-05, "loss": 1.0359, "step": 5387 }, { "epoch": 0.7982222222222223, "grad_norm": 2.2558517456054688, "learning_rate": 4.053372868791698e-05, "loss": 1.1214, "step": 5388 }, { "epoch": 0.7983703703703704, "grad_norm": 2.250946044921875, "learning_rate": 4.050407709414381e-05, "loss": 0.9764, "step": 5389 }, { "epoch": 0.7985185185185185, "grad_norm": 
1.5796802043914795, "learning_rate": 4.047442550037065e-05, "loss": 0.9587, "step": 5390 }, { "epoch": 0.7986666666666666, "grad_norm": 1.8032429218292236, "learning_rate": 4.0444773906597485e-05, "loss": 0.9545, "step": 5391 }, { "epoch": 0.7988148148148149, "grad_norm": 2.0286922454833984, "learning_rate": 4.0415122312824316e-05, "loss": 1.2246, "step": 5392 }, { "epoch": 0.798962962962963, "grad_norm": 1.4904719591140747, "learning_rate": 4.038547071905115e-05, "loss": 0.9967, "step": 5393 }, { "epoch": 0.7991111111111111, "grad_norm": 2.3682022094726562, "learning_rate": 4.0355819125277985e-05, "loss": 1.2159, "step": 5394 }, { "epoch": 0.7992592592592592, "grad_norm": 1.8080886602401733, "learning_rate": 4.032616753150482e-05, "loss": 1.0301, "step": 5395 }, { "epoch": 0.7994074074074075, "grad_norm": 2.6648478507995605, "learning_rate": 4.0296515937731654e-05, "loss": 0.9757, "step": 5396 }, { "epoch": 0.7995555555555556, "grad_norm": 2.8013088703155518, "learning_rate": 4.026686434395849e-05, "loss": 1.1824, "step": 5397 }, { "epoch": 0.7997037037037037, "grad_norm": 1.606271743774414, "learning_rate": 4.023721275018533e-05, "loss": 0.8274, "step": 5398 }, { "epoch": 0.7998518518518518, "grad_norm": 2.0751864910125732, "learning_rate": 4.020756115641216e-05, "loss": 1.0437, "step": 5399 }, { "epoch": 0.8, "grad_norm": 1.7119338512420654, "learning_rate": 4.017790956263899e-05, "loss": 0.8679, "step": 5400 }, { "epoch": 0.8001481481481482, "grad_norm": 4.252305030822754, "learning_rate": 4.014825796886583e-05, "loss": 1.0427, "step": 5401 }, { "epoch": 0.8002962962962963, "grad_norm": 1.4674803018569946, "learning_rate": 4.0118606375092666e-05, "loss": 0.8695, "step": 5402 }, { "epoch": 0.8004444444444444, "grad_norm": 1.4978564977645874, "learning_rate": 4.00889547813195e-05, "loss": 1.0836, "step": 5403 }, { "epoch": 0.8005925925925926, "grad_norm": 1.1871086359024048, "learning_rate": 4.0059303187546335e-05, "loss": 0.8869, "step": 5404 }, { "epoch": 
0.8007407407407408, "grad_norm": 1.090904951095581, "learning_rate": 4.0029651593773166e-05, "loss": 1.0713, "step": 5405 }, { "epoch": 0.8008888888888889, "grad_norm": 1.7516885995864868, "learning_rate": 4e-05, "loss": 1.0477, "step": 5406 }, { "epoch": 0.801037037037037, "grad_norm": 1.5126169919967651, "learning_rate": 3.9970348406226834e-05, "loss": 0.8397, "step": 5407 }, { "epoch": 0.8011851851851852, "grad_norm": 1.9686980247497559, "learning_rate": 3.994069681245367e-05, "loss": 0.9915, "step": 5408 }, { "epoch": 0.8013333333333333, "grad_norm": 1.5845431089401245, "learning_rate": 3.99110452186805e-05, "loss": 1.0025, "step": 5409 }, { "epoch": 0.8014814814814815, "grad_norm": 2.3498854637145996, "learning_rate": 3.988139362490734e-05, "loss": 1.2884, "step": 5410 }, { "epoch": 0.8016296296296296, "grad_norm": 1.541824460029602, "learning_rate": 3.985174203113418e-05, "loss": 0.9402, "step": 5411 }, { "epoch": 0.8017777777777778, "grad_norm": 1.6967843770980835, "learning_rate": 3.982209043736101e-05, "loss": 0.9696, "step": 5412 }, { "epoch": 0.8019259259259259, "grad_norm": 1.4138332605361938, "learning_rate": 3.979243884358784e-05, "loss": 1.0194, "step": 5413 }, { "epoch": 0.802074074074074, "grad_norm": 1.6247764825820923, "learning_rate": 3.9762787249814684e-05, "loss": 0.9082, "step": 5414 }, { "epoch": 0.8022222222222222, "grad_norm": 2.2672598361968994, "learning_rate": 3.9733135656041515e-05, "loss": 1.1271, "step": 5415 }, { "epoch": 0.8023703703703704, "grad_norm": 1.7656632661819458, "learning_rate": 3.9703484062268346e-05, "loss": 1.0607, "step": 5416 }, { "epoch": 0.8025185185185185, "grad_norm": 1.7334779500961304, "learning_rate": 3.9673832468495184e-05, "loss": 1.3095, "step": 5417 }, { "epoch": 0.8026666666666666, "grad_norm": 1.2259182929992676, "learning_rate": 3.964418087472202e-05, "loss": 0.824, "step": 5418 }, { "epoch": 0.8028148148148149, "grad_norm": 1.4198298454284668, "learning_rate": 3.961452928094885e-05, "loss": 0.9926, 
"step": 5419 }, { "epoch": 0.802962962962963, "grad_norm": 1.5182987451553345, "learning_rate": 3.9584877687175683e-05, "loss": 1.0254, "step": 5420 }, { "epoch": 0.8031111111111111, "grad_norm": 2.0433640480041504, "learning_rate": 3.955522609340252e-05, "loss": 1.0753, "step": 5421 }, { "epoch": 0.8032592592592592, "grad_norm": 1.9819378852844238, "learning_rate": 3.952557449962936e-05, "loss": 0.9146, "step": 5422 }, { "epoch": 0.8034074074074075, "grad_norm": 1.3722615242004395, "learning_rate": 3.949592290585619e-05, "loss": 1.079, "step": 5423 }, { "epoch": 0.8035555555555556, "grad_norm": 1.2907549142837524, "learning_rate": 3.946627131208303e-05, "loss": 0.916, "step": 5424 }, { "epoch": 0.8037037037037037, "grad_norm": 2.0529794692993164, "learning_rate": 3.943661971830986e-05, "loss": 1.0675, "step": 5425 }, { "epoch": 0.8038518518518518, "grad_norm": 1.6150699853897095, "learning_rate": 3.9406968124536696e-05, "loss": 0.8458, "step": 5426 }, { "epoch": 0.804, "grad_norm": 1.9195685386657715, "learning_rate": 3.9377316530763534e-05, "loss": 0.9998, "step": 5427 }, { "epoch": 0.8041481481481482, "grad_norm": 1.518667221069336, "learning_rate": 3.9347664936990365e-05, "loss": 1.0308, "step": 5428 }, { "epoch": 0.8042962962962963, "grad_norm": 1.4057791233062744, "learning_rate": 3.9318013343217195e-05, "loss": 1.1398, "step": 5429 }, { "epoch": 0.8044444444444444, "grad_norm": 1.8205337524414062, "learning_rate": 3.928836174944403e-05, "loss": 0.8375, "step": 5430 }, { "epoch": 0.8045925925925926, "grad_norm": 1.9404771327972412, "learning_rate": 3.925871015567087e-05, "loss": 0.864, "step": 5431 }, { "epoch": 0.8047407407407408, "grad_norm": 1.4498867988586426, "learning_rate": 3.92290585618977e-05, "loss": 0.9772, "step": 5432 }, { "epoch": 0.8048888888888889, "grad_norm": 1.7233918905258179, "learning_rate": 3.919940696812454e-05, "loss": 0.9626, "step": 5433 }, { "epoch": 0.805037037037037, "grad_norm": 2.1639041900634766, "learning_rate": 
3.916975537435138e-05, "loss": 0.8474, "step": 5434 }, { "epoch": 0.8051851851851852, "grad_norm": 2.4917683601379395, "learning_rate": 3.914010378057821e-05, "loss": 1.0292, "step": 5435 }, { "epoch": 0.8053333333333333, "grad_norm": 2.330807685852051, "learning_rate": 3.911045218680504e-05, "loss": 1.1002, "step": 5436 }, { "epoch": 0.8054814814814815, "grad_norm": 1.7886220216751099, "learning_rate": 3.9080800593031877e-05, "loss": 1.2997, "step": 5437 }, { "epoch": 0.8056296296296296, "grad_norm": 2.5931851863861084, "learning_rate": 3.9051148999258714e-05, "loss": 0.7829, "step": 5438 }, { "epoch": 0.8057777777777778, "grad_norm": 2.1554958820343018, "learning_rate": 3.9021497405485545e-05, "loss": 1.044, "step": 5439 }, { "epoch": 0.8059259259259259, "grad_norm": 1.3633235692977905, "learning_rate": 3.899184581171238e-05, "loss": 0.8883, "step": 5440 }, { "epoch": 0.806074074074074, "grad_norm": 1.4236547946929932, "learning_rate": 3.896219421793922e-05, "loss": 1.1795, "step": 5441 }, { "epoch": 0.8062222222222222, "grad_norm": 1.8070131540298462, "learning_rate": 3.893254262416605e-05, "loss": 1.0315, "step": 5442 }, { "epoch": 0.8063703703703704, "grad_norm": 1.392898678779602, "learning_rate": 3.890289103039288e-05, "loss": 0.8974, "step": 5443 }, { "epoch": 0.8065185185185185, "grad_norm": 2.12119197845459, "learning_rate": 3.887323943661972e-05, "loss": 0.9822, "step": 5444 }, { "epoch": 0.8066666666666666, "grad_norm": 2.1227033138275146, "learning_rate": 3.884358784284656e-05, "loss": 1.1702, "step": 5445 }, { "epoch": 0.8068148148148148, "grad_norm": 1.5095551013946533, "learning_rate": 3.881393624907339e-05, "loss": 0.9813, "step": 5446 }, { "epoch": 0.806962962962963, "grad_norm": 1.4496660232543945, "learning_rate": 3.8784284655300226e-05, "loss": 0.8225, "step": 5447 }, { "epoch": 0.8071111111111111, "grad_norm": 1.5028917789459229, "learning_rate": 3.875463306152706e-05, "loss": 0.8998, "step": 5448 }, { "epoch": 0.8072592592592592, "grad_norm": 
1.9945577383041382, "learning_rate": 3.8724981467753895e-05, "loss": 0.9351, "step": 5449 }, { "epoch": 0.8074074074074075, "grad_norm": 1.7238104343414307, "learning_rate": 3.8695329873980726e-05, "loss": 1.0828, "step": 5450 }, { "epoch": 0.8075555555555556, "grad_norm": 1.9189610481262207, "learning_rate": 3.8665678280207564e-05, "loss": 1.192, "step": 5451 }, { "epoch": 0.8077037037037037, "grad_norm": 1.6241904497146606, "learning_rate": 3.8636026686434394e-05, "loss": 1.0557, "step": 5452 }, { "epoch": 0.8078518518518518, "grad_norm": 1.2917290925979614, "learning_rate": 3.860637509266123e-05, "loss": 1.1501, "step": 5453 }, { "epoch": 0.808, "grad_norm": 1.5646109580993652, "learning_rate": 3.857672349888807e-05, "loss": 0.787, "step": 5454 }, { "epoch": 0.8081481481481482, "grad_norm": 2.36950421333313, "learning_rate": 3.85470719051149e-05, "loss": 1.2892, "step": 5455 }, { "epoch": 0.8082962962962963, "grad_norm": 1.3422205448150635, "learning_rate": 3.851742031134173e-05, "loss": 1.1428, "step": 5456 }, { "epoch": 0.8084444444444444, "grad_norm": 1.5275359153747559, "learning_rate": 3.8487768717568576e-05, "loss": 1.2389, "step": 5457 }, { "epoch": 0.8085925925925926, "grad_norm": 1.90117609500885, "learning_rate": 3.845811712379541e-05, "loss": 1.161, "step": 5458 }, { "epoch": 0.8087407407407408, "grad_norm": 2.4937429428100586, "learning_rate": 3.842846553002224e-05, "loss": 1.1525, "step": 5459 }, { "epoch": 0.8088888888888889, "grad_norm": 2.7077555656433105, "learning_rate": 3.8398813936249076e-05, "loss": 1.2007, "step": 5460 }, { "epoch": 0.809037037037037, "grad_norm": 2.1760613918304443, "learning_rate": 3.836916234247591e-05, "loss": 1.022, "step": 5461 }, { "epoch": 0.8091851851851852, "grad_norm": 1.5264531373977661, "learning_rate": 3.8339510748702744e-05, "loss": 1.05, "step": 5462 }, { "epoch": 0.8093333333333333, "grad_norm": 1.820239543914795, "learning_rate": 3.8309859154929575e-05, "loss": 0.8786, "step": 5463 }, { "epoch": 
0.8094814814814815, "grad_norm": 1.18095064163208, "learning_rate": 3.828020756115641e-05, "loss": 1.2188, "step": 5464 }, { "epoch": 0.8096296296296296, "grad_norm": 1.9990864992141724, "learning_rate": 3.825055596738325e-05, "loss": 1.0489, "step": 5465 }, { "epoch": 0.8097777777777778, "grad_norm": 1.3547680377960205, "learning_rate": 3.822090437361008e-05, "loss": 1.2582, "step": 5466 }, { "epoch": 0.8099259259259259, "grad_norm": 1.4557368755340576, "learning_rate": 3.819125277983692e-05, "loss": 1.0094, "step": 5467 }, { "epoch": 0.8100740740740741, "grad_norm": 1.4652191400527954, "learning_rate": 3.816160118606375e-05, "loss": 1.101, "step": 5468 }, { "epoch": 0.8102222222222222, "grad_norm": 1.6693017482757568, "learning_rate": 3.813194959229059e-05, "loss": 1.0293, "step": 5469 }, { "epoch": 0.8103703703703704, "grad_norm": 1.4987443685531616, "learning_rate": 3.8102297998517425e-05, "loss": 0.9499, "step": 5470 }, { "epoch": 0.8105185185185185, "grad_norm": 2.044036388397217, "learning_rate": 3.8072646404744256e-05, "loss": 1.0191, "step": 5471 }, { "epoch": 0.8106666666666666, "grad_norm": 1.643293857574463, "learning_rate": 3.804299481097109e-05, "loss": 1.036, "step": 5472 }, { "epoch": 0.8108148148148148, "grad_norm": 1.6891175508499146, "learning_rate": 3.8013343217197925e-05, "loss": 0.938, "step": 5473 }, { "epoch": 0.810962962962963, "grad_norm": 1.3681107759475708, "learning_rate": 3.798369162342476e-05, "loss": 0.947, "step": 5474 }, { "epoch": 0.8111111111111111, "grad_norm": 1.179011344909668, "learning_rate": 3.795404002965159e-05, "loss": 0.8874, "step": 5475 }, { "epoch": 0.8112592592592592, "grad_norm": 1.443042516708374, "learning_rate": 3.792438843587843e-05, "loss": 1.1634, "step": 5476 }, { "epoch": 0.8114074074074074, "grad_norm": 1.5776867866516113, "learning_rate": 3.789473684210527e-05, "loss": 0.8601, "step": 5477 }, { "epoch": 0.8115555555555556, "grad_norm": 2.202869176864624, "learning_rate": 3.78650852483321e-05, "loss": 
0.9741, "step": 5478 }, { "epoch": 0.8117037037037037, "grad_norm": 1.7886111736297607, "learning_rate": 3.783543365455893e-05, "loss": 0.9143, "step": 5479 }, { "epoch": 0.8118518518518518, "grad_norm": 1.6400786638259888, "learning_rate": 3.780578206078577e-05, "loss": 0.9915, "step": 5480 }, { "epoch": 0.812, "grad_norm": 1.8931728601455688, "learning_rate": 3.7776130467012606e-05, "loss": 1.1405, "step": 5481 }, { "epoch": 0.8121481481481482, "grad_norm": 1.3592950105667114, "learning_rate": 3.774647887323944e-05, "loss": 0.8755, "step": 5482 }, { "epoch": 0.8122962962962963, "grad_norm": 3.140119791030884, "learning_rate": 3.7716827279466275e-05, "loss": 1.1823, "step": 5483 }, { "epoch": 0.8124444444444444, "grad_norm": 1.6572941541671753, "learning_rate": 3.768717568569311e-05, "loss": 1.0694, "step": 5484 }, { "epoch": 0.8125925925925926, "grad_norm": 2.7264232635498047, "learning_rate": 3.765752409191994e-05, "loss": 0.8261, "step": 5485 }, { "epoch": 0.8127407407407408, "grad_norm": 1.2844897508621216, "learning_rate": 3.7627872498146774e-05, "loss": 1.0877, "step": 5486 }, { "epoch": 0.8128888888888889, "grad_norm": 2.3487370014190674, "learning_rate": 3.759822090437361e-05, "loss": 1.0698, "step": 5487 }, { "epoch": 0.813037037037037, "grad_norm": 2.817286491394043, "learning_rate": 3.756856931060045e-05, "loss": 1.1748, "step": 5488 }, { "epoch": 0.8131851851851852, "grad_norm": 9.444596290588379, "learning_rate": 3.753891771682728e-05, "loss": 0.7979, "step": 5489 }, { "epoch": 0.8133333333333334, "grad_norm": 1.417769193649292, "learning_rate": 3.750926612305412e-05, "loss": 1.0666, "step": 5490 }, { "epoch": 0.8134814814814815, "grad_norm": 1.8905470371246338, "learning_rate": 3.747961452928095e-05, "loss": 0.8707, "step": 5491 }, { "epoch": 0.8136296296296296, "grad_norm": 3.3869268894195557, "learning_rate": 3.7449962935507787e-05, "loss": 1.081, "step": 5492 }, { "epoch": 0.8137777777777778, "grad_norm": 2.0902247428894043, "learning_rate": 
3.742031134173462e-05, "loss": 0.9779, "step": 5493 }, { "epoch": 0.8139259259259259, "grad_norm": 1.871923565864563, "learning_rate": 3.7390659747961455e-05, "loss": 1.0492, "step": 5494 }, { "epoch": 0.8140740740740741, "grad_norm": 1.3906363248825073, "learning_rate": 3.7361008154188286e-05, "loss": 1.0126, "step": 5495 }, { "epoch": 0.8142222222222222, "grad_norm": 1.8805983066558838, "learning_rate": 3.7331356560415124e-05, "loss": 0.9645, "step": 5496 }, { "epoch": 0.8143703703703704, "grad_norm": 1.46416437625885, "learning_rate": 3.730170496664196e-05, "loss": 0.9676, "step": 5497 }, { "epoch": 0.8145185185185185, "grad_norm": 1.4247856140136719, "learning_rate": 3.727205337286879e-05, "loss": 0.9509, "step": 5498 }, { "epoch": 0.8146666666666667, "grad_norm": 1.200612187385559, "learning_rate": 3.724240177909562e-05, "loss": 0.8609, "step": 5499 }, { "epoch": 0.8148148148148148, "grad_norm": 1.1644580364227295, "learning_rate": 3.721275018532247e-05, "loss": 0.9188, "step": 5500 }, { "epoch": 0.814962962962963, "grad_norm": 1.4518518447875977, "learning_rate": 3.71830985915493e-05, "loss": 0.8037, "step": 5501 }, { "epoch": 0.8151111111111111, "grad_norm": 2.069011926651001, "learning_rate": 3.715344699777613e-05, "loss": 1.1069, "step": 5502 }, { "epoch": 0.8152592592592592, "grad_norm": 1.531945824623108, "learning_rate": 3.712379540400297e-05, "loss": 1.172, "step": 5503 }, { "epoch": 0.8154074074074074, "grad_norm": 1.9932469129562378, "learning_rate": 3.7094143810229805e-05, "loss": 0.7403, "step": 5504 }, { "epoch": 0.8155555555555556, "grad_norm": 1.6678223609924316, "learning_rate": 3.7064492216456636e-05, "loss": 1.1614, "step": 5505 }, { "epoch": 0.8157037037037037, "grad_norm": 1.8684024810791016, "learning_rate": 3.703484062268347e-05, "loss": 0.9379, "step": 5506 }, { "epoch": 0.8158518518518518, "grad_norm": 3.2789533138275146, "learning_rate": 3.7005189028910304e-05, "loss": 0.853, "step": 5507 }, { "epoch": 0.816, "grad_norm": 
2.372007369995117, "learning_rate": 3.697553743513714e-05, "loss": 0.9628, "step": 5508 }, { "epoch": 0.8161481481481482, "grad_norm": 1.2470200061798096, "learning_rate": 3.694588584136397e-05, "loss": 1.1039, "step": 5509 }, { "epoch": 0.8162962962962963, "grad_norm": 1.9533486366271973, "learning_rate": 3.691623424759081e-05, "loss": 1.2456, "step": 5510 }, { "epoch": 0.8164444444444444, "grad_norm": 1.8412435054779053, "learning_rate": 3.688658265381764e-05, "loss": 1.1617, "step": 5511 }, { "epoch": 0.8165925925925926, "grad_norm": 2.08192777633667, "learning_rate": 3.685693106004448e-05, "loss": 1.027, "step": 5512 }, { "epoch": 0.8167407407407408, "grad_norm": 1.6966136693954468, "learning_rate": 3.682727946627132e-05, "loss": 1.2062, "step": 5513 }, { "epoch": 0.8168888888888889, "grad_norm": 1.702400803565979, "learning_rate": 3.679762787249815e-05, "loss": 1.1495, "step": 5514 }, { "epoch": 0.817037037037037, "grad_norm": 1.328120231628418, "learning_rate": 3.6767976278724986e-05, "loss": 0.865, "step": 5515 }, { "epoch": 0.8171851851851852, "grad_norm": 2.2423410415649414, "learning_rate": 3.6738324684951816e-05, "loss": 0.7895, "step": 5516 }, { "epoch": 0.8173333333333334, "grad_norm": 1.627179503440857, "learning_rate": 3.6708673091178654e-05, "loss": 1.073, "step": 5517 }, { "epoch": 0.8174814814814815, "grad_norm": 1.8343318700790405, "learning_rate": 3.6679021497405485e-05, "loss": 0.9913, "step": 5518 }, { "epoch": 0.8176296296296296, "grad_norm": 2.3681259155273438, "learning_rate": 3.664936990363232e-05, "loss": 1.0483, "step": 5519 }, { "epoch": 0.8177777777777778, "grad_norm": 1.7481133937835693, "learning_rate": 3.661971830985916e-05, "loss": 1.1003, "step": 5520 }, { "epoch": 0.817925925925926, "grad_norm": 1.617566466331482, "learning_rate": 3.659006671608599e-05, "loss": 1.1357, "step": 5521 }, { "epoch": 0.8180740740740741, "grad_norm": 1.6094902753829956, "learning_rate": 3.656041512231282e-05, "loss": 0.9548, "step": 5522 }, { "epoch": 
0.8182222222222222, "grad_norm": 2.6640186309814453, "learning_rate": 3.653076352853966e-05, "loss": 1.1334, "step": 5523 }, { "epoch": 0.8183703703703704, "grad_norm": 2.049234390258789, "learning_rate": 3.65011119347665e-05, "loss": 1.0645, "step": 5524 }, { "epoch": 0.8185185185185185, "grad_norm": 1.8995856046676636, "learning_rate": 3.647146034099333e-05, "loss": 1.1411, "step": 5525 }, { "epoch": 0.8186666666666667, "grad_norm": 3.3620994091033936, "learning_rate": 3.6441808747220166e-05, "loss": 0.9616, "step": 5526 }, { "epoch": 0.8188148148148148, "grad_norm": 1.9591317176818848, "learning_rate": 3.6412157153447004e-05, "loss": 1.0467, "step": 5527 }, { "epoch": 0.818962962962963, "grad_norm": 3.109315872192383, "learning_rate": 3.6382505559673835e-05, "loss": 1.1829, "step": 5528 }, { "epoch": 0.8191111111111111, "grad_norm": 2.5587923526763916, "learning_rate": 3.6352853965900666e-05, "loss": 0.8522, "step": 5529 }, { "epoch": 0.8192592592592592, "grad_norm": 1.3240567445755005, "learning_rate": 3.63232023721275e-05, "loss": 0.9416, "step": 5530 }, { "epoch": 0.8194074074074074, "grad_norm": 1.5940901041030884, "learning_rate": 3.629355077835434e-05, "loss": 1.0305, "step": 5531 }, { "epoch": 0.8195555555555556, "grad_norm": 4.036149501800537, "learning_rate": 3.626389918458117e-05, "loss": 0.7898, "step": 5532 }, { "epoch": 0.8197037037037037, "grad_norm": 1.9516942501068115, "learning_rate": 3.623424759080801e-05, "loss": 0.9664, "step": 5533 }, { "epoch": 0.8198518518518518, "grad_norm": 1.7062143087387085, "learning_rate": 3.620459599703484e-05, "loss": 0.7753, "step": 5534 }, { "epoch": 0.82, "grad_norm": 2.762611150741577, "learning_rate": 3.617494440326168e-05, "loss": 0.9052, "step": 5535 }, { "epoch": 0.8201481481481482, "grad_norm": 1.630911111831665, "learning_rate": 3.614529280948851e-05, "loss": 1.1295, "step": 5536 }, { "epoch": 0.8202962962962963, "grad_norm": 2.24279522895813, "learning_rate": 3.611564121571535e-05, "loss": 0.791, "step": 
5537 }, { "epoch": 0.8204444444444444, "grad_norm": 1.067190170288086, "learning_rate": 3.608598962194218e-05, "loss": 0.9947, "step": 5538 }, { "epoch": 0.8205925925925925, "grad_norm": 1.8940973281860352, "learning_rate": 3.6056338028169015e-05, "loss": 1.1676, "step": 5539 }, { "epoch": 0.8207407407407408, "grad_norm": 3.7762491703033447, "learning_rate": 3.602668643439585e-05, "loss": 1.045, "step": 5540 }, { "epoch": 0.8208888888888889, "grad_norm": 1.4663925170898438, "learning_rate": 3.5997034840622684e-05, "loss": 1.0359, "step": 5541 }, { "epoch": 0.821037037037037, "grad_norm": 2.0131566524505615, "learning_rate": 3.5967383246849515e-05, "loss": 0.9535, "step": 5542 }, { "epoch": 0.8211851851851852, "grad_norm": 3.2547378540039062, "learning_rate": 3.593773165307636e-05, "loss": 1.099, "step": 5543 }, { "epoch": 0.8213333333333334, "grad_norm": 1.8051725625991821, "learning_rate": 3.590808005930319e-05, "loss": 1.1414, "step": 5544 }, { "epoch": 0.8214814814814815, "grad_norm": 1.619515299797058, "learning_rate": 3.587842846553002e-05, "loss": 1.141, "step": 5545 }, { "epoch": 0.8216296296296296, "grad_norm": 3.431809425354004, "learning_rate": 3.584877687175686e-05, "loss": 1.1316, "step": 5546 }, { "epoch": 0.8217777777777778, "grad_norm": 1.6146504878997803, "learning_rate": 3.5819125277983697e-05, "loss": 1.1646, "step": 5547 }, { "epoch": 0.821925925925926, "grad_norm": 1.579978346824646, "learning_rate": 3.578947368421053e-05, "loss": 0.9278, "step": 5548 }, { "epoch": 0.8220740740740741, "grad_norm": 1.1909756660461426, "learning_rate": 3.575982209043736e-05, "loss": 0.8999, "step": 5549 }, { "epoch": 0.8222222222222222, "grad_norm": 2.0035927295684814, "learning_rate": 3.5730170496664196e-05, "loss": 0.8428, "step": 5550 }, { "epoch": 0.8223703703703704, "grad_norm": 2.4635682106018066, "learning_rate": 3.5700518902891034e-05, "loss": 0.8511, "step": 5551 }, { "epoch": 0.8225185185185185, "grad_norm": 1.6868245601654053, "learning_rate": 
3.5670867309117865e-05, "loss": 0.9326, "step": 5552 }, { "epoch": 0.8226666666666667, "grad_norm": 1.6198513507843018, "learning_rate": 3.56412157153447e-05, "loss": 1.1399, "step": 5553 }, { "epoch": 0.8228148148148148, "grad_norm": 1.3706104755401611, "learning_rate": 3.561156412157153e-05, "loss": 0.8828, "step": 5554 }, { "epoch": 0.822962962962963, "grad_norm": 1.4087867736816406, "learning_rate": 3.558191252779837e-05, "loss": 1.1209, "step": 5555 }, { "epoch": 0.8231111111111111, "grad_norm": 1.3651924133300781, "learning_rate": 3.555226093402521e-05, "loss": 0.9406, "step": 5556 }, { "epoch": 0.8232592592592592, "grad_norm": 1.256922960281372, "learning_rate": 3.552260934025204e-05, "loss": 0.8475, "step": 5557 }, { "epoch": 0.8234074074074074, "grad_norm": 2.030674695968628, "learning_rate": 3.549295774647888e-05, "loss": 0.9721, "step": 5558 }, { "epoch": 0.8235555555555556, "grad_norm": 2.2394938468933105, "learning_rate": 3.546330615270571e-05, "loss": 0.7677, "step": 5559 }, { "epoch": 0.8237037037037037, "grad_norm": 6.911998748779297, "learning_rate": 3.5433654558932546e-05, "loss": 0.9324, "step": 5560 }, { "epoch": 0.8238518518518518, "grad_norm": 1.8028950691223145, "learning_rate": 3.540400296515938e-05, "loss": 0.8779, "step": 5561 }, { "epoch": 0.824, "grad_norm": 1.6942514181137085, "learning_rate": 3.5374351371386214e-05, "loss": 1.045, "step": 5562 }, { "epoch": 0.8241481481481482, "grad_norm": 1.9035639762878418, "learning_rate": 3.534469977761305e-05, "loss": 1.0837, "step": 5563 }, { "epoch": 0.8242962962962963, "grad_norm": 1.3376104831695557, "learning_rate": 3.531504818383988e-05, "loss": 0.9566, "step": 5564 }, { "epoch": 0.8244444444444444, "grad_norm": 1.4366275072097778, "learning_rate": 3.5285396590066714e-05, "loss": 0.8443, "step": 5565 }, { "epoch": 0.8245925925925925, "grad_norm": 1.142834186553955, "learning_rate": 3.525574499629355e-05, "loss": 1.1526, "step": 5566 }, { "epoch": 0.8247407407407408, "grad_norm": 
1.9914871454238892, "learning_rate": 3.522609340252039e-05, "loss": 1.0796, "step": 5567 }, { "epoch": 0.8248888888888889, "grad_norm": 1.3143497705459595, "learning_rate": 3.519644180874722e-05, "loss": 1.0918, "step": 5568 }, { "epoch": 0.825037037037037, "grad_norm": 2.031128406524658, "learning_rate": 3.516679021497406e-05, "loss": 1.2729, "step": 5569 }, { "epoch": 0.8251851851851851, "grad_norm": 2.464761972427368, "learning_rate": 3.5137138621200896e-05, "loss": 0.9552, "step": 5570 }, { "epoch": 0.8253333333333334, "grad_norm": 1.7345706224441528, "learning_rate": 3.5107487027427726e-05, "loss": 0.848, "step": 5571 }, { "epoch": 0.8254814814814815, "grad_norm": 1.703432559967041, "learning_rate": 3.507783543365456e-05, "loss": 1.0549, "step": 5572 }, { "epoch": 0.8256296296296296, "grad_norm": 1.3836984634399414, "learning_rate": 3.5048183839881395e-05, "loss": 0.9967, "step": 5573 }, { "epoch": 0.8257777777777778, "grad_norm": 1.334790825843811, "learning_rate": 3.501853224610823e-05, "loss": 1.0518, "step": 5574 }, { "epoch": 0.825925925925926, "grad_norm": 4.566990852355957, "learning_rate": 3.4988880652335064e-05, "loss": 0.9782, "step": 5575 }, { "epoch": 0.8260740740740741, "grad_norm": 2.2013661861419678, "learning_rate": 3.49592290585619e-05, "loss": 1.0255, "step": 5576 }, { "epoch": 0.8262222222222222, "grad_norm": 2.3910906314849854, "learning_rate": 3.492957746478873e-05, "loss": 1.0668, "step": 5577 }, { "epoch": 0.8263703703703704, "grad_norm": 1.7936630249023438, "learning_rate": 3.489992587101557e-05, "loss": 0.9094, "step": 5578 }, { "epoch": 0.8265185185185185, "grad_norm": 1.5428414344787598, "learning_rate": 3.48702742772424e-05, "loss": 1.031, "step": 5579 }, { "epoch": 0.8266666666666667, "grad_norm": 1.2280282974243164, "learning_rate": 3.484062268346924e-05, "loss": 0.8246, "step": 5580 }, { "epoch": 0.8268148148148148, "grad_norm": 2.28139328956604, "learning_rate": 3.481097108969607e-05, "loss": 0.8762, "step": 5581 }, { "epoch": 
0.826962962962963, "grad_norm": 2.3486006259918213, "learning_rate": 3.478131949592291e-05, "loss": 1.0251, "step": 5582 }, { "epoch": 0.8271111111111111, "grad_norm": 1.8135404586791992, "learning_rate": 3.4751667902149745e-05, "loss": 1.0686, "step": 5583 }, { "epoch": 0.8272592592592592, "grad_norm": 2.6683990955352783, "learning_rate": 3.4722016308376576e-05, "loss": 1.0136, "step": 5584 }, { "epoch": 0.8274074074074074, "grad_norm": 1.5761295557022095, "learning_rate": 3.4692364714603407e-05, "loss": 1.1173, "step": 5585 }, { "epoch": 0.8275555555555556, "grad_norm": 17.536306381225586, "learning_rate": 3.466271312083025e-05, "loss": 1.2514, "step": 5586 }, { "epoch": 0.8277037037037037, "grad_norm": 2.289813756942749, "learning_rate": 3.463306152705708e-05, "loss": 0.9197, "step": 5587 }, { "epoch": 0.8278518518518518, "grad_norm": 1.5273847579956055, "learning_rate": 3.460340993328391e-05, "loss": 0.8554, "step": 5588 }, { "epoch": 0.828, "grad_norm": 1.2904846668243408, "learning_rate": 3.457375833951075e-05, "loss": 1.046, "step": 5589 }, { "epoch": 0.8281481481481482, "grad_norm": 2.67862606048584, "learning_rate": 3.454410674573759e-05, "loss": 0.9925, "step": 5590 }, { "epoch": 0.8282962962962963, "grad_norm": 1.4206843376159668, "learning_rate": 3.451445515196442e-05, "loss": 0.9479, "step": 5591 }, { "epoch": 0.8284444444444444, "grad_norm": 1.2101153135299683, "learning_rate": 3.448480355819125e-05, "loss": 1.1607, "step": 5592 }, { "epoch": 0.8285925925925925, "grad_norm": 1.2494651079177856, "learning_rate": 3.445515196441809e-05, "loss": 0.8645, "step": 5593 }, { "epoch": 0.8287407407407408, "grad_norm": 2.1751744747161865, "learning_rate": 3.4425500370644925e-05, "loss": 1.1682, "step": 5594 }, { "epoch": 0.8288888888888889, "grad_norm": 1.934248685836792, "learning_rate": 3.4395848776871756e-05, "loss": 0.9544, "step": 5595 }, { "epoch": 0.829037037037037, "grad_norm": 8.274798393249512, "learning_rate": 3.4366197183098594e-05, "loss": 1.1841, 
"step": 5596 }, { "epoch": 0.8291851851851851, "grad_norm": 2.040048599243164, "learning_rate": 3.433654558932543e-05, "loss": 0.9006, "step": 5597 }, { "epoch": 0.8293333333333334, "grad_norm": 2.306729316711426, "learning_rate": 3.430689399555226e-05, "loss": 1.1027, "step": 5598 }, { "epoch": 0.8294814814814815, "grad_norm": 1.608792781829834, "learning_rate": 3.42772424017791e-05, "loss": 0.9009, "step": 5599 }, { "epoch": 0.8296296296296296, "grad_norm": 2.4428153038024902, "learning_rate": 3.424759080800593e-05, "loss": 1.1927, "step": 5600 }, { "epoch": 0.8297777777777777, "grad_norm": 1.4491654634475708, "learning_rate": 3.421793921423277e-05, "loss": 0.9686, "step": 5601 }, { "epoch": 0.829925925925926, "grad_norm": 2.759342670440674, "learning_rate": 3.41882876204596e-05, "loss": 1.0876, "step": 5602 }, { "epoch": 0.8300740740740741, "grad_norm": 1.6981993913650513, "learning_rate": 3.415863602668644e-05, "loss": 1.1029, "step": 5603 }, { "epoch": 0.8302222222222222, "grad_norm": 1.3702118396759033, "learning_rate": 3.412898443291327e-05, "loss": 0.9498, "step": 5604 }, { "epoch": 0.8303703703703704, "grad_norm": 1.777405023574829, "learning_rate": 3.4099332839140106e-05, "loss": 0.9104, "step": 5605 }, { "epoch": 0.8305185185185185, "grad_norm": 1.8026494979858398, "learning_rate": 3.4069681245366944e-05, "loss": 1.0487, "step": 5606 }, { "epoch": 0.8306666666666667, "grad_norm": 2.668658971786499, "learning_rate": 3.4040029651593775e-05, "loss": 1.1804, "step": 5607 }, { "epoch": 0.8308148148148148, "grad_norm": 1.5562937259674072, "learning_rate": 3.4010378057820606e-05, "loss": 0.9062, "step": 5608 }, { "epoch": 0.830962962962963, "grad_norm": 1.4961464405059814, "learning_rate": 3.398072646404744e-05, "loss": 1.0791, "step": 5609 }, { "epoch": 0.8311111111111111, "grad_norm": 3.3875136375427246, "learning_rate": 3.395107487027428e-05, "loss": 1.0595, "step": 5610 }, { "epoch": 0.8312592592592593, "grad_norm": 1.272215723991394, "learning_rate": 
3.392142327650111e-05, "loss": 1.0394, "step": 5611 }, { "epoch": 0.8314074074074074, "grad_norm": 14.303399085998535, "learning_rate": 3.389177168272795e-05, "loss": 0.9272, "step": 5612 }, { "epoch": 0.8315555555555556, "grad_norm": 3.60304856300354, "learning_rate": 3.386212008895479e-05, "loss": 1.0269, "step": 5613 }, { "epoch": 0.8317037037037037, "grad_norm": 1.3587753772735596, "learning_rate": 3.383246849518162e-05, "loss": 1.4079, "step": 5614 }, { "epoch": 0.8318518518518518, "grad_norm": 2.1113405227661133, "learning_rate": 3.380281690140845e-05, "loss": 1.1314, "step": 5615 }, { "epoch": 0.832, "grad_norm": 1.4930996894836426, "learning_rate": 3.377316530763529e-05, "loss": 1.156, "step": 5616 }, { "epoch": 0.8321481481481482, "grad_norm": 1.3228946924209595, "learning_rate": 3.3743513713862124e-05, "loss": 1.1645, "step": 5617 }, { "epoch": 0.8322962962962963, "grad_norm": 1.929006814956665, "learning_rate": 3.3713862120088955e-05, "loss": 1.0037, "step": 5618 }, { "epoch": 0.8324444444444444, "grad_norm": 1.6608818769454956, "learning_rate": 3.368421052631579e-05, "loss": 1.1946, "step": 5619 }, { "epoch": 0.8325925925925926, "grad_norm": 2.0891196727752686, "learning_rate": 3.3654558932542624e-05, "loss": 1.1041, "step": 5620 }, { "epoch": 0.8327407407407408, "grad_norm": 1.4596067667007446, "learning_rate": 3.362490733876946e-05, "loss": 1.1719, "step": 5621 }, { "epoch": 0.8328888888888889, "grad_norm": 3.2699577808380127, "learning_rate": 3.359525574499629e-05, "loss": 1.3013, "step": 5622 }, { "epoch": 0.833037037037037, "grad_norm": 1.3315544128417969, "learning_rate": 3.356560415122313e-05, "loss": 0.9505, "step": 5623 }, { "epoch": 0.8331851851851851, "grad_norm": 1.5666136741638184, "learning_rate": 3.353595255744996e-05, "loss": 0.9916, "step": 5624 }, { "epoch": 0.8333333333333334, "grad_norm": 1.4195020198822021, "learning_rate": 3.35063009636768e-05, "loss": 0.8547, "step": 5625 }, { "epoch": 0.8334814814814815, "grad_norm": 
1.1384927034378052, "learning_rate": 3.3476649369903636e-05, "loss": 0.9563, "step": 5626 }, { "epoch": 0.8336296296296296, "grad_norm": 2.0354249477386475, "learning_rate": 3.344699777613047e-05, "loss": 0.9357, "step": 5627 }, { "epoch": 0.8337777777777777, "grad_norm": 1.5699747800827026, "learning_rate": 3.34173461823573e-05, "loss": 1.2679, "step": 5628 }, { "epoch": 0.833925925925926, "grad_norm": 1.2100545167922974, "learning_rate": 3.338769458858414e-05, "loss": 0.9108, "step": 5629 }, { "epoch": 0.8340740740740741, "grad_norm": 1.406457781791687, "learning_rate": 3.3358042994810974e-05, "loss": 0.8079, "step": 5630 }, { "epoch": 0.8342222222222222, "grad_norm": 1.5774122476577759, "learning_rate": 3.3328391401037804e-05, "loss": 1.158, "step": 5631 }, { "epoch": 0.8343703703703703, "grad_norm": 1.589264988899231, "learning_rate": 3.329873980726464e-05, "loss": 1.0718, "step": 5632 }, { "epoch": 0.8345185185185185, "grad_norm": 1.7401427030563354, "learning_rate": 3.326908821349148e-05, "loss": 1.12, "step": 5633 }, { "epoch": 0.8346666666666667, "grad_norm": 1.2324284315109253, "learning_rate": 3.323943661971831e-05, "loss": 1.0212, "step": 5634 }, { "epoch": 0.8348148148148148, "grad_norm": 2.1774914264678955, "learning_rate": 3.320978502594514e-05, "loss": 1.0251, "step": 5635 }, { "epoch": 0.834962962962963, "grad_norm": 1.6775509119033813, "learning_rate": 3.318013343217198e-05, "loss": 1.1762, "step": 5636 }, { "epoch": 0.8351111111111111, "grad_norm": 1.5039470195770264, "learning_rate": 3.315048183839882e-05, "loss": 1.084, "step": 5637 }, { "epoch": 0.8352592592592593, "grad_norm": 1.2579706907272339, "learning_rate": 3.312083024462565e-05, "loss": 0.851, "step": 5638 }, { "epoch": 0.8354074074074074, "grad_norm": 1.8096531629562378, "learning_rate": 3.3091178650852486e-05, "loss": 0.8298, "step": 5639 }, { "epoch": 0.8355555555555556, "grad_norm": 1.6695431470870972, "learning_rate": 3.306152705707932e-05, "loss": 1.0724, "step": 5640 }, { 
"epoch": 0.8357037037037037, "grad_norm": 0.89354407787323, "learning_rate": 3.3031875463306154e-05, "loss": 0.6487, "step": 5641 }, { "epoch": 0.8358518518518518, "grad_norm": 1.5002517700195312, "learning_rate": 3.300222386953299e-05, "loss": 0.8566, "step": 5642 }, { "epoch": 0.836, "grad_norm": 1.8203537464141846, "learning_rate": 3.297257227575982e-05, "loss": 1.044, "step": 5643 }, { "epoch": 0.8361481481481482, "grad_norm": 2.73797607421875, "learning_rate": 3.294292068198666e-05, "loss": 0.9902, "step": 5644 }, { "epoch": 0.8362962962962963, "grad_norm": 1.4074082374572754, "learning_rate": 3.291326908821349e-05, "loss": 0.967, "step": 5645 }, { "epoch": 0.8364444444444444, "grad_norm": 1.300226092338562, "learning_rate": 3.288361749444033e-05, "loss": 0.9087, "step": 5646 }, { "epoch": 0.8365925925925926, "grad_norm": 2.106065273284912, "learning_rate": 3.285396590066716e-05, "loss": 1.0165, "step": 5647 }, { "epoch": 0.8367407407407408, "grad_norm": 2.0137596130371094, "learning_rate": 3.2824314306894e-05, "loss": 0.9612, "step": 5648 }, { "epoch": 0.8368888888888889, "grad_norm": 1.530579924583435, "learning_rate": 3.2794662713120835e-05, "loss": 1.0016, "step": 5649 }, { "epoch": 0.837037037037037, "grad_norm": 1.4735548496246338, "learning_rate": 3.2765011119347666e-05, "loss": 0.8761, "step": 5650 }, { "epoch": 0.8371851851851851, "grad_norm": 2.0071825981140137, "learning_rate": 3.27353595255745e-05, "loss": 1.1396, "step": 5651 }, { "epoch": 0.8373333333333334, "grad_norm": 1.5885924100875854, "learning_rate": 3.2705707931801335e-05, "loss": 0.9224, "step": 5652 }, { "epoch": 0.8374814814814815, "grad_norm": 1.7556943893432617, "learning_rate": 3.267605633802817e-05, "loss": 1.0015, "step": 5653 }, { "epoch": 0.8376296296296296, "grad_norm": 1.3980185985565186, "learning_rate": 3.2646404744255003e-05, "loss": 0.887, "step": 5654 }, { "epoch": 0.8377777777777777, "grad_norm": 2.194582223892212, "learning_rate": 3.261675315048184e-05, "loss": 0.9047, 
"step": 5655 }, { "epoch": 0.837925925925926, "grad_norm": 1.9248734712600708, "learning_rate": 3.258710155670868e-05, "loss": 0.9987, "step": 5656 }, { "epoch": 0.8380740740740741, "grad_norm": 1.244874358177185, "learning_rate": 3.255744996293551e-05, "loss": 0.7646, "step": 5657 }, { "epoch": 0.8382222222222222, "grad_norm": 1.1763979196548462, "learning_rate": 3.252779836916234e-05, "loss": 0.9621, "step": 5658 }, { "epoch": 0.8383703703703703, "grad_norm": 2.883955955505371, "learning_rate": 3.249814677538918e-05, "loss": 1.2885, "step": 5659 }, { "epoch": 0.8385185185185186, "grad_norm": 1.3345526456832886, "learning_rate": 3.2468495181616016e-05, "loss": 1.2323, "step": 5660 }, { "epoch": 0.8386666666666667, "grad_norm": 1.8527706861495972, "learning_rate": 3.243884358784285e-05, "loss": 0.8831, "step": 5661 }, { "epoch": 0.8388148148148148, "grad_norm": 1.5353264808654785, "learning_rate": 3.2409191994069685e-05, "loss": 1.1162, "step": 5662 }, { "epoch": 0.8389629629629629, "grad_norm": 2.0129382610321045, "learning_rate": 3.2379540400296516e-05, "loss": 0.9302, "step": 5663 }, { "epoch": 0.8391111111111111, "grad_norm": 2.3754312992095947, "learning_rate": 3.234988880652335e-05, "loss": 1.0311, "step": 5664 }, { "epoch": 0.8392592592592593, "grad_norm": 2.907285451889038, "learning_rate": 3.2320237212750184e-05, "loss": 0.9833, "step": 5665 }, { "epoch": 0.8394074074074074, "grad_norm": 1.2558186054229736, "learning_rate": 3.229058561897702e-05, "loss": 0.8794, "step": 5666 }, { "epoch": 0.8395555555555556, "grad_norm": 2.4155330657958984, "learning_rate": 3.226093402520385e-05, "loss": 0.9888, "step": 5667 }, { "epoch": 0.8397037037037037, "grad_norm": 1.5688879489898682, "learning_rate": 3.223128243143069e-05, "loss": 1.0269, "step": 5668 }, { "epoch": 0.8398518518518518, "grad_norm": 3.373100757598877, "learning_rate": 3.220163083765753e-05, "loss": 1.1238, "step": 5669 }, { "epoch": 0.84, "grad_norm": 1.8652637004852295, "learning_rate": 
3.217197924388436e-05, "loss": 0.9321, "step": 5670 }, { "epoch": 0.8401481481481482, "grad_norm": 1.1480289697647095, "learning_rate": 3.214232765011119e-05, "loss": 0.9629, "step": 5671 }, { "epoch": 0.8402962962962963, "grad_norm": 1.8617579936981201, "learning_rate": 3.2112676056338034e-05, "loss": 1.0839, "step": 5672 }, { "epoch": 0.8404444444444444, "grad_norm": 1.8071740865707397, "learning_rate": 3.2083024462564865e-05, "loss": 1.2537, "step": 5673 }, { "epoch": 0.8405925925925926, "grad_norm": 2.2916259765625, "learning_rate": 3.2053372868791696e-05, "loss": 1.0515, "step": 5674 }, { "epoch": 0.8407407407407408, "grad_norm": 2.1010496616363525, "learning_rate": 3.2023721275018534e-05, "loss": 1.1199, "step": 5675 }, { "epoch": 0.8408888888888889, "grad_norm": 1.3914610147476196, "learning_rate": 3.199406968124537e-05, "loss": 1.0279, "step": 5676 }, { "epoch": 0.841037037037037, "grad_norm": 1.1546776294708252, "learning_rate": 3.19644180874722e-05, "loss": 0.8512, "step": 5677 }, { "epoch": 0.8411851851851851, "grad_norm": 1.5754714012145996, "learning_rate": 3.193476649369903e-05, "loss": 1.0605, "step": 5678 }, { "epoch": 0.8413333333333334, "grad_norm": 1.2197717428207397, "learning_rate": 3.190511489992587e-05, "loss": 0.8575, "step": 5679 }, { "epoch": 0.8414814814814815, "grad_norm": 1.5923357009887695, "learning_rate": 3.187546330615271e-05, "loss": 0.8947, "step": 5680 }, { "epoch": 0.8416296296296296, "grad_norm": 1.3048105239868164, "learning_rate": 3.184581171237954e-05, "loss": 1.2803, "step": 5681 }, { "epoch": 0.8417777777777777, "grad_norm": 4.217629909515381, "learning_rate": 3.181616011860638e-05, "loss": 1.2179, "step": 5682 }, { "epoch": 0.841925925925926, "grad_norm": 1.3836694955825806, "learning_rate": 3.1786508524833215e-05, "loss": 0.7614, "step": 5683 }, { "epoch": 0.8420740740740741, "grad_norm": 1.9527873992919922, "learning_rate": 3.1756856931060046e-05, "loss": 1.1067, "step": 5684 }, { "epoch": 0.8422222222222222, 
"grad_norm": 1.5900622606277466, "learning_rate": 3.1727205337286884e-05, "loss": 1.0342, "step": 5685 }, { "epoch": 0.8423703703703703, "grad_norm": 1.4586294889450073, "learning_rate": 3.1697553743513714e-05, "loss": 0.9812, "step": 5686 }, { "epoch": 0.8425185185185186, "grad_norm": 1.678809642791748, "learning_rate": 3.166790214974055e-05, "loss": 1.04, "step": 5687 }, { "epoch": 0.8426666666666667, "grad_norm": 1.2695871591567993, "learning_rate": 3.163825055596738e-05, "loss": 0.836, "step": 5688 }, { "epoch": 0.8428148148148148, "grad_norm": 1.399864912033081, "learning_rate": 3.160859896219422e-05, "loss": 0.8723, "step": 5689 }, { "epoch": 0.8429629629629629, "grad_norm": 1.1795293092727661, "learning_rate": 3.157894736842105e-05, "loss": 1.058, "step": 5690 }, { "epoch": 0.8431111111111111, "grad_norm": 1.6496189832687378, "learning_rate": 3.154929577464789e-05, "loss": 1.133, "step": 5691 }, { "epoch": 0.8432592592592593, "grad_norm": 1.6981966495513916, "learning_rate": 3.151964418087473e-05, "loss": 0.8794, "step": 5692 }, { "epoch": 0.8434074074074074, "grad_norm": 1.2521898746490479, "learning_rate": 3.148999258710156e-05, "loss": 0.9448, "step": 5693 }, { "epoch": 0.8435555555555555, "grad_norm": 1.4220592975616455, "learning_rate": 3.146034099332839e-05, "loss": 1.217, "step": 5694 }, { "epoch": 0.8437037037037037, "grad_norm": 4.723443508148193, "learning_rate": 3.1430689399555227e-05, "loss": 0.9995, "step": 5695 }, { "epoch": 0.8438518518518519, "grad_norm": 2.358628988265991, "learning_rate": 3.1401037805782064e-05, "loss": 1.1478, "step": 5696 }, { "epoch": 0.844, "grad_norm": 1.4681414365768433, "learning_rate": 3.1371386212008895e-05, "loss": 1.104, "step": 5697 }, { "epoch": 0.8441481481481482, "grad_norm": 1.7765322923660278, "learning_rate": 3.134173461823573e-05, "loss": 1.0113, "step": 5698 }, { "epoch": 0.8442962962962963, "grad_norm": 1.6960357427597046, "learning_rate": 3.131208302446257e-05, "loss": 1.0569, "step": 5699 }, { 
"epoch": 0.8444444444444444, "grad_norm": 3.481264591217041, "learning_rate": 3.12824314306894e-05, "loss": 0.9691, "step": 5700 }, { "epoch": 0.8445925925925926, "grad_norm": 1.8465306758880615, "learning_rate": 3.125277983691623e-05, "loss": 0.9778, "step": 5701 }, { "epoch": 0.8447407407407408, "grad_norm": 1.3541700839996338, "learning_rate": 3.122312824314307e-05, "loss": 1.0824, "step": 5702 }, { "epoch": 0.8448888888888889, "grad_norm": 4.360558986663818, "learning_rate": 3.119347664936991e-05, "loss": 0.8039, "step": 5703 }, { "epoch": 0.845037037037037, "grad_norm": 1.89080810546875, "learning_rate": 3.116382505559674e-05, "loss": 0.9669, "step": 5704 }, { "epoch": 0.8451851851851852, "grad_norm": 1.776655673980713, "learning_rate": 3.1134173461823576e-05, "loss": 1.0633, "step": 5705 }, { "epoch": 0.8453333333333334, "grad_norm": 1.923953890800476, "learning_rate": 3.110452186805041e-05, "loss": 1.054, "step": 5706 }, { "epoch": 0.8454814814814815, "grad_norm": 3.0366568565368652, "learning_rate": 3.1074870274277245e-05, "loss": 1.03, "step": 5707 }, { "epoch": 0.8456296296296296, "grad_norm": 1.6078895330429077, "learning_rate": 3.1045218680504076e-05, "loss": 1.0269, "step": 5708 }, { "epoch": 0.8457777777777777, "grad_norm": 3.041707992553711, "learning_rate": 3.1015567086730913e-05, "loss": 1.0908, "step": 5709 }, { "epoch": 0.845925925925926, "grad_norm": 9.95754623413086, "learning_rate": 3.0985915492957744e-05, "loss": 1.151, "step": 5710 }, { "epoch": 0.8460740740740741, "grad_norm": 1.799899935722351, "learning_rate": 3.095626389918458e-05, "loss": 1.1803, "step": 5711 }, { "epoch": 0.8462222222222222, "grad_norm": 1.8792227506637573, "learning_rate": 3.092661230541142e-05, "loss": 0.8073, "step": 5712 }, { "epoch": 0.8463703703703703, "grad_norm": 1.6597542762756348, "learning_rate": 3.089696071163825e-05, "loss": 1.1491, "step": 5713 }, { "epoch": 0.8465185185185186, "grad_norm": 2.065408945083618, "learning_rate": 3.086730911786508e-05, 
"loss": 0.9161, "step": 5714 }, { "epoch": 0.8466666666666667, "grad_norm": 2.674891948699951, "learning_rate": 3.0837657524091926e-05, "loss": 1.0146, "step": 5715 }, { "epoch": 0.8468148148148148, "grad_norm": 1.1968002319335938, "learning_rate": 3.080800593031876e-05, "loss": 0.9344, "step": 5716 }, { "epoch": 0.8469629629629629, "grad_norm": 1.9078673124313354, "learning_rate": 3.077835433654559e-05, "loss": 1.2448, "step": 5717 }, { "epoch": 0.8471111111111111, "grad_norm": 1.4666813611984253, "learning_rate": 3.0748702742772425e-05, "loss": 1.085, "step": 5718 }, { "epoch": 0.8472592592592593, "grad_norm": 1.6281707286834717, "learning_rate": 3.071905114899926e-05, "loss": 1.2932, "step": 5719 }, { "epoch": 0.8474074074074074, "grad_norm": 2.0884618759155273, "learning_rate": 3.0689399555226094e-05, "loss": 1.0515, "step": 5720 }, { "epoch": 0.8475555555555555, "grad_norm": 1.8514748811721802, "learning_rate": 3.0659747961452925e-05, "loss": 0.9292, "step": 5721 }, { "epoch": 0.8477037037037037, "grad_norm": 1.4120756387710571, "learning_rate": 3.063009636767977e-05, "loss": 0.9051, "step": 5722 }, { "epoch": 0.8478518518518519, "grad_norm": 1.574057936668396, "learning_rate": 3.06004447739066e-05, "loss": 0.9985, "step": 5723 }, { "epoch": 0.848, "grad_norm": 1.3220384120941162, "learning_rate": 3.057079318013343e-05, "loss": 0.9563, "step": 5724 }, { "epoch": 0.8481481481481481, "grad_norm": 1.1498291492462158, "learning_rate": 3.054114158636027e-05, "loss": 0.8015, "step": 5725 }, { "epoch": 0.8482962962962963, "grad_norm": 1.3494188785552979, "learning_rate": 3.0511489992587107e-05, "loss": 1.2001, "step": 5726 }, { "epoch": 0.8484444444444444, "grad_norm": 2.747594118118286, "learning_rate": 3.0481838398813938e-05, "loss": 1.2624, "step": 5727 }, { "epoch": 0.8485925925925926, "grad_norm": 2.4447944164276123, "learning_rate": 3.0452186805040772e-05, "loss": 1.0625, "step": 5728 }, { "epoch": 0.8487407407407408, "grad_norm": 1.318755865097046, 
"learning_rate": 3.0422535211267606e-05, "loss": 0.952, "step": 5729 }, { "epoch": 0.8488888888888889, "grad_norm": 1.4050874710083008, "learning_rate": 3.0392883617494444e-05, "loss": 1.0785, "step": 5730 }, { "epoch": 0.849037037037037, "grad_norm": 1.4720213413238525, "learning_rate": 3.0363232023721278e-05, "loss": 0.8125, "step": 5731 }, { "epoch": 0.8491851851851852, "grad_norm": 1.9184999465942383, "learning_rate": 3.0333580429948112e-05, "loss": 0.934, "step": 5732 }, { "epoch": 0.8493333333333334, "grad_norm": 1.6016595363616943, "learning_rate": 3.0303928836174943e-05, "loss": 1.0236, "step": 5733 }, { "epoch": 0.8494814814814815, "grad_norm": 1.7323766946792603, "learning_rate": 3.0274277242401784e-05, "loss": 0.8992, "step": 5734 }, { "epoch": 0.8496296296296296, "grad_norm": 1.5857726335525513, "learning_rate": 3.0244625648628615e-05, "loss": 0.9172, "step": 5735 }, { "epoch": 0.8497777777777777, "grad_norm": 1.6110647916793823, "learning_rate": 3.021497405485545e-05, "loss": 1.0478, "step": 5736 }, { "epoch": 0.849925925925926, "grad_norm": 1.5153177976608276, "learning_rate": 3.0185322461082284e-05, "loss": 0.8538, "step": 5737 }, { "epoch": 0.8500740740740741, "grad_norm": 1.643136739730835, "learning_rate": 3.015567086730912e-05, "loss": 1.0162, "step": 5738 }, { "epoch": 0.8502222222222222, "grad_norm": 1.9467730522155762, "learning_rate": 3.0126019273535956e-05, "loss": 1.2033, "step": 5739 }, { "epoch": 0.8503703703703703, "grad_norm": 1.8516819477081299, "learning_rate": 3.0096367679762787e-05, "loss": 1.0439, "step": 5740 }, { "epoch": 0.8505185185185186, "grad_norm": 1.1998674869537354, "learning_rate": 3.006671608598962e-05, "loss": 0.8788, "step": 5741 }, { "epoch": 0.8506666666666667, "grad_norm": 1.9696849584579468, "learning_rate": 3.003706449221646e-05, "loss": 0.9554, "step": 5742 }, { "epoch": 0.8508148148148148, "grad_norm": 1.493667483329773, "learning_rate": 3.0007412898443293e-05, "loss": 1.047, "step": 5743 }, { "epoch": 
0.8509629629629629, "grad_norm": 1.3543988466262817, "learning_rate": 2.9977761304670127e-05, "loss": 0.9608, "step": 5744 }, { "epoch": 0.8511111111111112, "grad_norm": 1.2830764055252075, "learning_rate": 2.994810971089696e-05, "loss": 0.9455, "step": 5745 }, { "epoch": 0.8512592592592593, "grad_norm": 2.551621675491333, "learning_rate": 2.99184581171238e-05, "loss": 1.0836, "step": 5746 }, { "epoch": 0.8514074074074074, "grad_norm": 1.251689076423645, "learning_rate": 2.9888806523350634e-05, "loss": 1.2618, "step": 5747 }, { "epoch": 0.8515555555555555, "grad_norm": 2.0711042881011963, "learning_rate": 2.9859154929577465e-05, "loss": 1.0205, "step": 5748 }, { "epoch": 0.8517037037037037, "grad_norm": 9.725003242492676, "learning_rate": 2.98295033358043e-05, "loss": 0.9207, "step": 5749 }, { "epoch": 0.8518518518518519, "grad_norm": 1.4738051891326904, "learning_rate": 2.9799851742031136e-05, "loss": 0.9769, "step": 5750 }, { "epoch": 0.852, "grad_norm": 1.4541064500808716, "learning_rate": 2.977020014825797e-05, "loss": 1.1133, "step": 5751 }, { "epoch": 0.8521481481481481, "grad_norm": 1.4564684629440308, "learning_rate": 2.9740548554484805e-05, "loss": 1.2501, "step": 5752 }, { "epoch": 0.8522962962962963, "grad_norm": 2.8018648624420166, "learning_rate": 2.9710896960711636e-05, "loss": 0.8952, "step": 5753 }, { "epoch": 0.8524444444444444, "grad_norm": 4.907220840454102, "learning_rate": 2.9681245366938477e-05, "loss": 0.8738, "step": 5754 }, { "epoch": 0.8525925925925926, "grad_norm": 1.850071907043457, "learning_rate": 2.9651593773165308e-05, "loss": 1.0036, "step": 5755 }, { "epoch": 0.8527407407407407, "grad_norm": 1.1384518146514893, "learning_rate": 2.9621942179392142e-05, "loss": 1.1831, "step": 5756 }, { "epoch": 0.8528888888888889, "grad_norm": 1.965689778327942, "learning_rate": 2.9592290585618977e-05, "loss": 1.1209, "step": 5757 }, { "epoch": 0.853037037037037, "grad_norm": 1.3774926662445068, "learning_rate": 2.9562638991845814e-05, "loss": 
0.8957, "step": 5758 }, { "epoch": 0.8531851851851852, "grad_norm": 2.0560662746429443, "learning_rate": 2.953298739807265e-05, "loss": 0.8915, "step": 5759 }, { "epoch": 0.8533333333333334, "grad_norm": 1.6404497623443604, "learning_rate": 2.9503335804299483e-05, "loss": 1.0045, "step": 5760 }, { "epoch": 0.8534814814814815, "grad_norm": 2.8091065883636475, "learning_rate": 2.9473684210526314e-05, "loss": 1.231, "step": 5761 }, { "epoch": 0.8536296296296296, "grad_norm": 1.8522920608520508, "learning_rate": 2.9444032616753155e-05, "loss": 0.8325, "step": 5762 }, { "epoch": 0.8537777777777777, "grad_norm": 1.9119120836257935, "learning_rate": 2.9414381022979986e-05, "loss": 1.1189, "step": 5763 }, { "epoch": 0.853925925925926, "grad_norm": 2.7033612728118896, "learning_rate": 2.938472942920682e-05, "loss": 0.9057, "step": 5764 }, { "epoch": 0.8540740740740741, "grad_norm": 2.591933012008667, "learning_rate": 2.9355077835433658e-05, "loss": 0.9156, "step": 5765 }, { "epoch": 0.8542222222222222, "grad_norm": 1.5084139108657837, "learning_rate": 2.9325426241660492e-05, "loss": 0.7997, "step": 5766 }, { "epoch": 0.8543703703703703, "grad_norm": 1.986857295036316, "learning_rate": 2.9295774647887326e-05, "loss": 1.1218, "step": 5767 }, { "epoch": 0.8545185185185186, "grad_norm": 1.9872719049453735, "learning_rate": 2.9266123054114157e-05, "loss": 0.936, "step": 5768 }, { "epoch": 0.8546666666666667, "grad_norm": 1.8185993432998657, "learning_rate": 2.9236471460340998e-05, "loss": 0.9352, "step": 5769 }, { "epoch": 0.8548148148148148, "grad_norm": 1.3699370622634888, "learning_rate": 2.920681986656783e-05, "loss": 1.0854, "step": 5770 }, { "epoch": 0.8549629629629629, "grad_norm": 1.5002169609069824, "learning_rate": 2.9177168272794663e-05, "loss": 1.1115, "step": 5771 }, { "epoch": 0.8551111111111112, "grad_norm": 1.9244588613510132, "learning_rate": 2.9147516679021498e-05, "loss": 0.9015, "step": 5772 }, { "epoch": 0.8552592592592593, "grad_norm": 1.6107194423675537, 
"learning_rate": 2.9117865085248335e-05, "loss": 1.0424, "step": 5773 }, { "epoch": 0.8554074074074074, "grad_norm": 1.9779789447784424, "learning_rate": 2.908821349147517e-05, "loss": 1.0823, "step": 5774 }, { "epoch": 0.8555555555555555, "grad_norm": 1.4399076700210571, "learning_rate": 2.9058561897702004e-05, "loss": 1.0582, "step": 5775 }, { "epoch": 0.8557037037037037, "grad_norm": 1.1391663551330566, "learning_rate": 2.9028910303928835e-05, "loss": 0.9975, "step": 5776 }, { "epoch": 0.8558518518518519, "grad_norm": 1.6162272691726685, "learning_rate": 2.8999258710155676e-05, "loss": 0.6114, "step": 5777 }, { "epoch": 0.856, "grad_norm": 1.2094829082489014, "learning_rate": 2.8969607116382507e-05, "loss": 1.0004, "step": 5778 }, { "epoch": 0.8561481481481481, "grad_norm": 1.7080848217010498, "learning_rate": 2.893995552260934e-05, "loss": 1.1552, "step": 5779 }, { "epoch": 0.8562962962962963, "grad_norm": 3.8012313842773438, "learning_rate": 2.8910303928836176e-05, "loss": 0.909, "step": 5780 }, { "epoch": 0.8564444444444445, "grad_norm": 2.1891396045684814, "learning_rate": 2.8880652335063013e-05, "loss": 0.7945, "step": 5781 }, { "epoch": 0.8565925925925926, "grad_norm": 3.9642629623413086, "learning_rate": 2.8851000741289848e-05, "loss": 0.9749, "step": 5782 }, { "epoch": 0.8567407407407407, "grad_norm": 1.8733007907867432, "learning_rate": 2.882134914751668e-05, "loss": 1.1285, "step": 5783 }, { "epoch": 0.8568888888888889, "grad_norm": 3.807276964187622, "learning_rate": 2.8791697553743513e-05, "loss": 1.1222, "step": 5784 }, { "epoch": 0.857037037037037, "grad_norm": 2.5071189403533936, "learning_rate": 2.876204595997035e-05, "loss": 1.2052, "step": 5785 }, { "epoch": 0.8571851851851852, "grad_norm": 1.5023587942123413, "learning_rate": 2.8732394366197185e-05, "loss": 0.981, "step": 5786 }, { "epoch": 0.8573333333333333, "grad_norm": 1.3831145763397217, "learning_rate": 2.870274277242402e-05, "loss": 0.9075, "step": 5787 }, { "epoch": 0.8574814814814815, 
"grad_norm": 1.405256748199463, "learning_rate": 2.8673091178650853e-05, "loss": 1.1884, "step": 5788 }, { "epoch": 0.8576296296296296, "grad_norm": 1.0629494190216064, "learning_rate": 2.864343958487769e-05, "loss": 0.8273, "step": 5789 }, { "epoch": 0.8577777777777778, "grad_norm": 1.526007056236267, "learning_rate": 2.8613787991104525e-05, "loss": 0.8813, "step": 5790 }, { "epoch": 0.857925925925926, "grad_norm": 2.1729397773742676, "learning_rate": 2.8584136397331356e-05, "loss": 1.1614, "step": 5791 }, { "epoch": 0.8580740740740741, "grad_norm": 1.637261152267456, "learning_rate": 2.855448480355819e-05, "loss": 0.9632, "step": 5792 }, { "epoch": 0.8582222222222222, "grad_norm": 1.7835174798965454, "learning_rate": 2.8524833209785028e-05, "loss": 0.7737, "step": 5793 }, { "epoch": 0.8583703703703703, "grad_norm": 1.6379073858261108, "learning_rate": 2.8495181616011862e-05, "loss": 1.1138, "step": 5794 }, { "epoch": 0.8585185185185186, "grad_norm": 1.29437255859375, "learning_rate": 2.8465530022238697e-05, "loss": 0.9225, "step": 5795 }, { "epoch": 0.8586666666666667, "grad_norm": 1.3875452280044556, "learning_rate": 2.8435878428465528e-05, "loss": 1.1082, "step": 5796 }, { "epoch": 0.8588148148148148, "grad_norm": 2.2187299728393555, "learning_rate": 2.840622683469237e-05, "loss": 1.4328, "step": 5797 }, { "epoch": 0.8589629629629629, "grad_norm": 1.1596548557281494, "learning_rate": 2.83765752409192e-05, "loss": 0.9621, "step": 5798 }, { "epoch": 0.8591111111111112, "grad_norm": 1.5259015560150146, "learning_rate": 2.8346923647146034e-05, "loss": 0.9985, "step": 5799 }, { "epoch": 0.8592592592592593, "grad_norm": 1.3542827367782593, "learning_rate": 2.8317272053372868e-05, "loss": 0.8894, "step": 5800 }, { "epoch": 0.8594074074074074, "grad_norm": 3.5634617805480957, "learning_rate": 2.8287620459599706e-05, "loss": 1.1314, "step": 5801 }, { "epoch": 0.8595555555555555, "grad_norm": 1.1892551183700562, "learning_rate": 2.825796886582654e-05, "loss": 0.994, 
"step": 5802 }, { "epoch": 0.8597037037037037, "grad_norm": 2.7257864475250244, "learning_rate": 2.8228317272053374e-05, "loss": 1.0517, "step": 5803 }, { "epoch": 0.8598518518518519, "grad_norm": 1.1691776514053345, "learning_rate": 2.8198665678280212e-05, "loss": 1.0076, "step": 5804 }, { "epoch": 0.86, "grad_norm": 2.0199103355407715, "learning_rate": 2.8169014084507046e-05, "loss": 0.8931, "step": 5805 }, { "epoch": 0.8601481481481481, "grad_norm": 2.6088061332702637, "learning_rate": 2.8139362490733877e-05, "loss": 1.0654, "step": 5806 }, { "epoch": 0.8602962962962963, "grad_norm": 2.0341060161590576, "learning_rate": 2.810971089696071e-05, "loss": 1.0152, "step": 5807 }, { "epoch": 0.8604444444444445, "grad_norm": 1.8954989910125732, "learning_rate": 2.808005930318755e-05, "loss": 1.0351, "step": 5808 }, { "epoch": 0.8605925925925926, "grad_norm": 1.435793161392212, "learning_rate": 2.8050407709414384e-05, "loss": 1.1166, "step": 5809 }, { "epoch": 0.8607407407407407, "grad_norm": 1.5432322025299072, "learning_rate": 2.8020756115641218e-05, "loss": 0.9407, "step": 5810 }, { "epoch": 0.8608888888888889, "grad_norm": 1.739850640296936, "learning_rate": 2.799110452186805e-05, "loss": 0.9118, "step": 5811 }, { "epoch": 0.861037037037037, "grad_norm": 2.085766553878784, "learning_rate": 2.796145292809489e-05, "loss": 1.0833, "step": 5812 }, { "epoch": 0.8611851851851852, "grad_norm": 1.909110188484192, "learning_rate": 2.793180133432172e-05, "loss": 1.1908, "step": 5813 }, { "epoch": 0.8613333333333333, "grad_norm": 1.3825736045837402, "learning_rate": 2.7902149740548555e-05, "loss": 0.9794, "step": 5814 }, { "epoch": 0.8614814814814815, "grad_norm": 1.6004115343093872, "learning_rate": 2.787249814677539e-05, "loss": 0.8425, "step": 5815 }, { "epoch": 0.8616296296296296, "grad_norm": 1.5785696506500244, "learning_rate": 2.7842846553002227e-05, "loss": 0.7433, "step": 5816 }, { "epoch": 0.8617777777777778, "grad_norm": 2.8138880729675293, "learning_rate": 
2.781319495922906e-05, "loss": 1.0487, "step": 5817 }, { "epoch": 0.8619259259259259, "grad_norm": 6.772069931030273, "learning_rate": 2.7783543365455896e-05, "loss": 1.1717, "step": 5818 }, { "epoch": 0.8620740740740741, "grad_norm": 1.6086390018463135, "learning_rate": 2.7753891771682727e-05, "loss": 1.0606, "step": 5819 }, { "epoch": 0.8622222222222222, "grad_norm": 3.6303298473358154, "learning_rate": 2.7724240177909568e-05, "loss": 0.8553, "step": 5820 }, { "epoch": 0.8623703703703703, "grad_norm": 1.7301232814788818, "learning_rate": 2.76945885841364e-05, "loss": 0.9192, "step": 5821 }, { "epoch": 0.8625185185185186, "grad_norm": 1.4979515075683594, "learning_rate": 2.7664936990363233e-05, "loss": 1.1056, "step": 5822 }, { "epoch": 0.8626666666666667, "grad_norm": 3.1390674114227295, "learning_rate": 2.7635285396590067e-05, "loss": 0.9098, "step": 5823 }, { "epoch": 0.8628148148148148, "grad_norm": 1.3511881828308105, "learning_rate": 2.7605633802816905e-05, "loss": 0.7656, "step": 5824 }, { "epoch": 0.8629629629629629, "grad_norm": 1.104941964149475, "learning_rate": 2.757598220904374e-05, "loss": 0.9358, "step": 5825 }, { "epoch": 0.8631111111111112, "grad_norm": 2.2065224647521973, "learning_rate": 2.754633061527057e-05, "loss": 1.312, "step": 5826 }, { "epoch": 0.8632592592592593, "grad_norm": 1.4837945699691772, "learning_rate": 2.7516679021497404e-05, "loss": 1.1161, "step": 5827 }, { "epoch": 0.8634074074074074, "grad_norm": 1.9617424011230469, "learning_rate": 2.7487027427724242e-05, "loss": 0.8527, "step": 5828 }, { "epoch": 0.8635555555555555, "grad_norm": 1.9595869779586792, "learning_rate": 2.7457375833951076e-05, "loss": 0.9336, "step": 5829 }, { "epoch": 0.8637037037037038, "grad_norm": 2.233665704727173, "learning_rate": 2.742772424017791e-05, "loss": 0.8778, "step": 5830 }, { "epoch": 0.8638518518518519, "grad_norm": 2.3654582500457764, "learning_rate": 2.7398072646404745e-05, "loss": 0.753, "step": 5831 }, { "epoch": 0.864, "grad_norm": 
1.868675708770752, "learning_rate": 2.7368421052631583e-05, "loss": 0.9666, "step": 5832 }, { "epoch": 0.8641481481481481, "grad_norm": 2.269033193588257, "learning_rate": 2.7338769458858417e-05, "loss": 1.0268, "step": 5833 }, { "epoch": 0.8642962962962963, "grad_norm": 1.8759225606918335, "learning_rate": 2.7309117865085248e-05, "loss": 0.8583, "step": 5834 }, { "epoch": 0.8644444444444445, "grad_norm": 1.870804786682129, "learning_rate": 2.7279466271312082e-05, "loss": 1.0527, "step": 5835 }, { "epoch": 0.8645925925925926, "grad_norm": 1.6852681636810303, "learning_rate": 2.724981467753892e-05, "loss": 0.9133, "step": 5836 }, { "epoch": 0.8647407407407407, "grad_norm": 2.536195755004883, "learning_rate": 2.7220163083765754e-05, "loss": 1.1145, "step": 5837 }, { "epoch": 0.8648888888888889, "grad_norm": 2.8536055088043213, "learning_rate": 2.719051148999259e-05, "loss": 1.2185, "step": 5838 }, { "epoch": 0.865037037037037, "grad_norm": 2.1045289039611816, "learning_rate": 2.716085989621942e-05, "loss": 1.0956, "step": 5839 }, { "epoch": 0.8651851851851852, "grad_norm": 1.3994905948638916, "learning_rate": 2.713120830244626e-05, "loss": 0.9142, "step": 5840 }, { "epoch": 0.8653333333333333, "grad_norm": 1.5658845901489258, "learning_rate": 2.710155670867309e-05, "loss": 0.8722, "step": 5841 }, { "epoch": 0.8654814814814815, "grad_norm": 1.4372714757919312, "learning_rate": 2.7071905114899926e-05, "loss": 1.0431, "step": 5842 }, { "epoch": 0.8656296296296296, "grad_norm": 1.67061185836792, "learning_rate": 2.704225352112676e-05, "loss": 1.0053, "step": 5843 }, { "epoch": 0.8657777777777778, "grad_norm": 1.4704614877700806, "learning_rate": 2.7012601927353598e-05, "loss": 1.0393, "step": 5844 }, { "epoch": 0.8659259259259259, "grad_norm": 1.3923258781433105, "learning_rate": 2.6982950333580432e-05, "loss": 0.6711, "step": 5845 }, { "epoch": 0.8660740740740741, "grad_norm": 1.9425890445709229, "learning_rate": 2.6953298739807266e-05, "loss": 0.9177, "step": 5846 }, { 
"epoch": 0.8662222222222222, "grad_norm": 1.7985904216766357, "learning_rate": 2.6923647146034104e-05, "loss": 1.1153, "step": 5847 }, { "epoch": 0.8663703703703703, "grad_norm": 2.1547863483428955, "learning_rate": 2.6893995552260938e-05, "loss": 1.1189, "step": 5848 }, { "epoch": 0.8665185185185185, "grad_norm": 2.150320291519165, "learning_rate": 2.686434395848777e-05, "loss": 0.8659, "step": 5849 }, { "epoch": 0.8666666666666667, "grad_norm": 1.5663039684295654, "learning_rate": 2.6834692364714603e-05, "loss": 0.9349, "step": 5850 }, { "epoch": 0.8668148148148148, "grad_norm": 1.7574503421783447, "learning_rate": 2.680504077094144e-05, "loss": 1.018, "step": 5851 }, { "epoch": 0.8669629629629629, "grad_norm": 1.6053471565246582, "learning_rate": 2.6775389177168275e-05, "loss": 0.9127, "step": 5852 }, { "epoch": 0.8671111111111112, "grad_norm": 1.1548551321029663, "learning_rate": 2.674573758339511e-05, "loss": 0.9632, "step": 5853 }, { "epoch": 0.8672592592592593, "grad_norm": 2.2477433681488037, "learning_rate": 2.671608598962194e-05, "loss": 1.0174, "step": 5854 }, { "epoch": 0.8674074074074074, "grad_norm": 1.5156891345977783, "learning_rate": 2.668643439584878e-05, "loss": 1.2504, "step": 5855 }, { "epoch": 0.8675555555555555, "grad_norm": 1.7786145210266113, "learning_rate": 2.6656782802075612e-05, "loss": 0.9207, "step": 5856 }, { "epoch": 0.8677037037037038, "grad_norm": 1.5490436553955078, "learning_rate": 2.6627131208302447e-05, "loss": 1.1374, "step": 5857 }, { "epoch": 0.8678518518518519, "grad_norm": 2.0994009971618652, "learning_rate": 2.659747961452928e-05, "loss": 0.818, "step": 5858 }, { "epoch": 0.868, "grad_norm": 1.4977202415466309, "learning_rate": 2.656782802075612e-05, "loss": 0.89, "step": 5859 }, { "epoch": 0.8681481481481481, "grad_norm": 2.5819857120513916, "learning_rate": 2.6538176426982953e-05, "loss": 1.037, "step": 5860 }, { "epoch": 0.8682962962962963, "grad_norm": 1.7890233993530273, "learning_rate": 2.6508524833209787e-05, 
"loss": 0.8482, "step": 5861 }, { "epoch": 0.8684444444444445, "grad_norm": 1.3251336812973022, "learning_rate": 2.6478873239436618e-05, "loss": 1.1135, "step": 5862 }, { "epoch": 0.8685925925925926, "grad_norm": 1.270087718963623, "learning_rate": 2.644922164566346e-05, "loss": 0.9451, "step": 5863 }, { "epoch": 0.8687407407407407, "grad_norm": 2.885849714279175, "learning_rate": 2.641957005189029e-05, "loss": 0.9745, "step": 5864 }, { "epoch": 0.8688888888888889, "grad_norm": 1.460054874420166, "learning_rate": 2.6389918458117125e-05, "loss": 0.8953, "step": 5865 }, { "epoch": 0.869037037037037, "grad_norm": 1.209641695022583, "learning_rate": 2.636026686434396e-05, "loss": 1.2016, "step": 5866 }, { "epoch": 0.8691851851851852, "grad_norm": 1.343348741531372, "learning_rate": 2.6330615270570797e-05, "loss": 0.9878, "step": 5867 }, { "epoch": 0.8693333333333333, "grad_norm": 1.8933838605880737, "learning_rate": 2.630096367679763e-05, "loss": 0.7703, "step": 5868 }, { "epoch": 0.8694814814814815, "grad_norm": 1.4754409790039062, "learning_rate": 2.6271312083024462e-05, "loss": 1.235, "step": 5869 }, { "epoch": 0.8696296296296296, "grad_norm": 1.4059656858444214, "learning_rate": 2.6241660489251296e-05, "loss": 1.0932, "step": 5870 }, { "epoch": 0.8697777777777778, "grad_norm": 1.4099831581115723, "learning_rate": 2.6212008895478134e-05, "loss": 0.8421, "step": 5871 }, { "epoch": 0.8699259259259259, "grad_norm": 1.7192801237106323, "learning_rate": 2.6182357301704968e-05, "loss": 1.0589, "step": 5872 }, { "epoch": 0.8700740740740741, "grad_norm": 1.7932615280151367, "learning_rate": 2.6152705707931802e-05, "loss": 0.9618, "step": 5873 }, { "epoch": 0.8702222222222222, "grad_norm": 1.4482512474060059, "learning_rate": 2.6123054114158637e-05, "loss": 0.9546, "step": 5874 }, { "epoch": 0.8703703703703703, "grad_norm": 1.6058628559112549, "learning_rate": 2.6093402520385474e-05, "loss": 1.1576, "step": 5875 }, { "epoch": 0.8705185185185185, "grad_norm": 
1.2644472122192383, "learning_rate": 2.606375092661231e-05, "loss": 1.1664, "step": 5876 }, { "epoch": 0.8706666666666667, "grad_norm": 2.2129993438720703, "learning_rate": 2.603409933283914e-05, "loss": 0.8426, "step": 5877 }, { "epoch": 0.8708148148148148, "grad_norm": 1.7647929191589355, "learning_rate": 2.6004447739065974e-05, "loss": 0.8964, "step": 5878 }, { "epoch": 0.8709629629629629, "grad_norm": 2.011066436767578, "learning_rate": 2.597479614529281e-05, "loss": 0.9673, "step": 5879 }, { "epoch": 0.8711111111111111, "grad_norm": 2.7519032955169678, "learning_rate": 2.5945144551519646e-05, "loss": 0.8212, "step": 5880 }, { "epoch": 0.8712592592592593, "grad_norm": 1.8849031925201416, "learning_rate": 2.591549295774648e-05, "loss": 1.0938, "step": 5881 }, { "epoch": 0.8714074074074074, "grad_norm": 2.9380102157592773, "learning_rate": 2.588584136397331e-05, "loss": 1.1205, "step": 5882 }, { "epoch": 0.8715555555555555, "grad_norm": 1.3827793598175049, "learning_rate": 2.5856189770200152e-05, "loss": 0.8481, "step": 5883 }, { "epoch": 0.8717037037037038, "grad_norm": 1.1526957750320435, "learning_rate": 2.5826538176426983e-05, "loss": 0.828, "step": 5884 }, { "epoch": 0.8718518518518519, "grad_norm": 2.1674742698669434, "learning_rate": 2.5796886582653817e-05, "loss": 0.9764, "step": 5885 }, { "epoch": 0.872, "grad_norm": 3.5550127029418945, "learning_rate": 2.576723498888065e-05, "loss": 0.9491, "step": 5886 }, { "epoch": 0.8721481481481481, "grad_norm": 1.5558884143829346, "learning_rate": 2.573758339510749e-05, "loss": 1.2718, "step": 5887 }, { "epoch": 0.8722962962962963, "grad_norm": 1.6453651189804077, "learning_rate": 2.5707931801334323e-05, "loss": 0.8926, "step": 5888 }, { "epoch": 0.8724444444444445, "grad_norm": 2.0549097061157227, "learning_rate": 2.5678280207561158e-05, "loss": 1.0412, "step": 5889 }, { "epoch": 0.8725925925925926, "grad_norm": 1.3418998718261719, "learning_rate": 2.5648628613787995e-05, "loss": 0.9795, "step": 5890 }, { "epoch": 
0.8727407407407407, "grad_norm": 1.3054873943328857, "learning_rate": 2.561897702001483e-05, "loss": 1.0158, "step": 5891 }, { "epoch": 0.8728888888888889, "grad_norm": 1.6010377407073975, "learning_rate": 2.558932542624166e-05, "loss": 1.0346, "step": 5892 }, { "epoch": 0.8730370370370371, "grad_norm": 1.4372097253799438, "learning_rate": 2.5559673832468495e-05, "loss": 1.0819, "step": 5893 }, { "epoch": 0.8731851851851852, "grad_norm": 1.422444462776184, "learning_rate": 2.5530022238695333e-05, "loss": 0.9573, "step": 5894 }, { "epoch": 0.8733333333333333, "grad_norm": 1.6464009284973145, "learning_rate": 2.5500370644922167e-05, "loss": 1.107, "step": 5895 }, { "epoch": 0.8734814814814815, "grad_norm": 1.4371873140335083, "learning_rate": 2.5470719051149e-05, "loss": 0.9562, "step": 5896 }, { "epoch": 0.8736296296296296, "grad_norm": 1.4094585180282593, "learning_rate": 2.5441067457375832e-05, "loss": 0.9187, "step": 5897 }, { "epoch": 0.8737777777777778, "grad_norm": 1.658033013343811, "learning_rate": 2.5411415863602673e-05, "loss": 1.1806, "step": 5898 }, { "epoch": 0.8739259259259259, "grad_norm": 1.6226048469543457, "learning_rate": 2.5381764269829504e-05, "loss": 0.93, "step": 5899 }, { "epoch": 0.8740740740740741, "grad_norm": 1.6565712690353394, "learning_rate": 2.535211267605634e-05, "loss": 1.1747, "step": 5900 }, { "epoch": 0.8742222222222222, "grad_norm": 3.0640833377838135, "learning_rate": 2.5322461082283173e-05, "loss": 1.0731, "step": 5901 }, { "epoch": 0.8743703703703704, "grad_norm": 1.7662317752838135, "learning_rate": 2.529280948851001e-05, "loss": 1.0931, "step": 5902 }, { "epoch": 0.8745185185185185, "grad_norm": 1.674347996711731, "learning_rate": 2.5263157894736845e-05, "loss": 1.0951, "step": 5903 }, { "epoch": 0.8746666666666667, "grad_norm": 1.3502625226974487, "learning_rate": 2.523350630096368e-05, "loss": 1.0802, "step": 5904 }, { "epoch": 0.8748148148148148, "grad_norm": 3.889312267303467, "learning_rate": 2.520385470719051e-05, 
"loss": 0.9884, "step": 5905 }, { "epoch": 0.8749629629629629, "grad_norm": 1.6618272066116333, "learning_rate": 2.517420311341735e-05, "loss": 0.9486, "step": 5906 }, { "epoch": 0.8751111111111111, "grad_norm": 2.9989893436431885, "learning_rate": 2.5144551519644182e-05, "loss": 0.9962, "step": 5907 }, { "epoch": 0.8752592592592593, "grad_norm": 1.588752269744873, "learning_rate": 2.5114899925871016e-05, "loss": 0.9569, "step": 5908 }, { "epoch": 0.8754074074074074, "grad_norm": 1.2568234205245972, "learning_rate": 2.508524833209785e-05, "loss": 0.886, "step": 5909 }, { "epoch": 0.8755555555555555, "grad_norm": 2.087700128555298, "learning_rate": 2.5055596738324688e-05, "loss": 0.9015, "step": 5910 }, { "epoch": 0.8757037037037037, "grad_norm": 1.795215129852295, "learning_rate": 2.5025945144551522e-05, "loss": 0.8777, "step": 5911 }, { "epoch": 0.8758518518518519, "grad_norm": 1.0963869094848633, "learning_rate": 2.4996293550778353e-05, "loss": 1.1008, "step": 5912 }, { "epoch": 0.876, "grad_norm": 1.1991289854049683, "learning_rate": 2.496664195700519e-05, "loss": 0.9306, "step": 5913 }, { "epoch": 0.8761481481481481, "grad_norm": 1.3601399660110474, "learning_rate": 2.4936990363232025e-05, "loss": 1.1353, "step": 5914 }, { "epoch": 0.8762962962962964, "grad_norm": 1.3399142026901245, "learning_rate": 2.490733876945886e-05, "loss": 0.8172, "step": 5915 }, { "epoch": 0.8764444444444445, "grad_norm": 2.9210901260375977, "learning_rate": 2.4877687175685694e-05, "loss": 0.868, "step": 5916 }, { "epoch": 0.8765925925925926, "grad_norm": 1.5540348291397095, "learning_rate": 2.4848035581912528e-05, "loss": 1.0308, "step": 5917 }, { "epoch": 0.8767407407407407, "grad_norm": 1.3015390634536743, "learning_rate": 2.4818383988139363e-05, "loss": 0.9215, "step": 5918 }, { "epoch": 0.8768888888888889, "grad_norm": 2.2629032135009766, "learning_rate": 2.47887323943662e-05, "loss": 1.2229, "step": 5919 }, { "epoch": 0.8770370370370371, "grad_norm": 2.0165510177612305, 
"learning_rate": 2.475908080059303e-05, "loss": 1.0342, "step": 5920 }, { "epoch": 0.8771851851851852, "grad_norm": 1.203102946281433, "learning_rate": 2.472942920681987e-05, "loss": 0.7615, "step": 5921 }, { "epoch": 0.8773333333333333, "grad_norm": 1.7544171810150146, "learning_rate": 2.46997776130467e-05, "loss": 0.9569, "step": 5922 }, { "epoch": 0.8774814814814815, "grad_norm": 1.2334192991256714, "learning_rate": 2.4670126019273537e-05, "loss": 1.0773, "step": 5923 }, { "epoch": 0.8776296296296296, "grad_norm": 1.4766578674316406, "learning_rate": 2.464047442550037e-05, "loss": 0.9939, "step": 5924 }, { "epoch": 0.8777777777777778, "grad_norm": 1.7966846227645874, "learning_rate": 2.4610822831727206e-05, "loss": 1.0282, "step": 5925 }, { "epoch": 0.8779259259259259, "grad_norm": 4.310170650482178, "learning_rate": 2.458117123795404e-05, "loss": 1.1213, "step": 5926 }, { "epoch": 0.8780740740740741, "grad_norm": 2.1789815425872803, "learning_rate": 2.4551519644180875e-05, "loss": 0.927, "step": 5927 }, { "epoch": 0.8782222222222222, "grad_norm": 1.471701741218567, "learning_rate": 2.4521868050407712e-05, "loss": 0.9364, "step": 5928 }, { "epoch": 0.8783703703703704, "grad_norm": 1.8487812280654907, "learning_rate": 2.4492216456634547e-05, "loss": 1.0671, "step": 5929 }, { "epoch": 0.8785185185185185, "grad_norm": 1.4639313220977783, "learning_rate": 2.446256486286138e-05, "loss": 1.2213, "step": 5930 }, { "epoch": 0.8786666666666667, "grad_norm": 1.207892656326294, "learning_rate": 2.4432913269088215e-05, "loss": 0.8711, "step": 5931 }, { "epoch": 0.8788148148148148, "grad_norm": 1.275417447090149, "learning_rate": 2.440326167531505e-05, "loss": 0.8091, "step": 5932 }, { "epoch": 0.878962962962963, "grad_norm": 1.6400730609893799, "learning_rate": 2.4373610081541884e-05, "loss": 0.9387, "step": 5933 }, { "epoch": 0.8791111111111111, "grad_norm": 2.544860363006592, "learning_rate": 2.434395848776872e-05, "loss": 0.7733, "step": 5934 }, { "epoch": 
0.8792592592592593, "grad_norm": 2.9975552558898926, "learning_rate": 2.4314306893995552e-05, "loss": 0.992, "step": 5935 }, { "epoch": 0.8794074074074074, "grad_norm": 4.002200603485107, "learning_rate": 2.428465530022239e-05, "loss": 0.9922, "step": 5936 }, { "epoch": 0.8795555555555555, "grad_norm": 1.1059911251068115, "learning_rate": 2.425500370644922e-05, "loss": 0.9983, "step": 5937 }, { "epoch": 0.8797037037037037, "grad_norm": 1.6267253160476685, "learning_rate": 2.422535211267606e-05, "loss": 0.8535, "step": 5938 }, { "epoch": 0.8798518518518519, "grad_norm": 1.4666725397109985, "learning_rate": 2.4195700518902893e-05, "loss": 1.1576, "step": 5939 }, { "epoch": 0.88, "grad_norm": 1.4765816926956177, "learning_rate": 2.4166048925129727e-05, "loss": 0.882, "step": 5940 }, { "epoch": 0.8801481481481481, "grad_norm": 1.4435561895370483, "learning_rate": 2.413639733135656e-05, "loss": 1.1641, "step": 5941 }, { "epoch": 0.8802962962962962, "grad_norm": 2.0997135639190674, "learning_rate": 2.4106745737583396e-05, "loss": 0.8, "step": 5942 }, { "epoch": 0.8804444444444445, "grad_norm": 2.8202171325683594, "learning_rate": 2.407709414381023e-05, "loss": 0.8823, "step": 5943 }, { "epoch": 0.8805925925925926, "grad_norm": 2.149346113204956, "learning_rate": 2.4047442550037068e-05, "loss": 0.9647, "step": 5944 }, { "epoch": 0.8807407407407407, "grad_norm": 1.2742054462432861, "learning_rate": 2.40177909562639e-05, "loss": 0.8836, "step": 5945 }, { "epoch": 0.8808888888888889, "grad_norm": 1.5382885932922363, "learning_rate": 2.3988139362490736e-05, "loss": 1.3162, "step": 5946 }, { "epoch": 0.8810370370370371, "grad_norm": 1.5108246803283691, "learning_rate": 2.395848776871757e-05, "loss": 1.0424, "step": 5947 }, { "epoch": 0.8811851851851852, "grad_norm": 1.398380994796753, "learning_rate": 2.3928836174944405e-05, "loss": 0.8191, "step": 5948 }, { "epoch": 0.8813333333333333, "grad_norm": 1.1411432027816772, "learning_rate": 2.389918458117124e-05, "loss": 0.9616, 
"step": 5949 }, { "epoch": 0.8814814814814815, "grad_norm": 2.4727354049682617, "learning_rate": 2.3869532987398074e-05, "loss": 0.9446, "step": 5950 }, { "epoch": 0.8816296296296297, "grad_norm": 1.8627889156341553, "learning_rate": 2.3839881393624908e-05, "loss": 1.0192, "step": 5951 }, { "epoch": 0.8817777777777778, "grad_norm": 1.4378087520599365, "learning_rate": 2.3810229799851742e-05, "loss": 0.9181, "step": 5952 }, { "epoch": 0.8819259259259259, "grad_norm": 1.4933042526245117, "learning_rate": 2.3780578206078576e-05, "loss": 1.2257, "step": 5953 }, { "epoch": 0.8820740740740741, "grad_norm": 1.9550085067749023, "learning_rate": 2.3750926612305414e-05, "loss": 0.8789, "step": 5954 }, { "epoch": 0.8822222222222222, "grad_norm": 1.4890142679214478, "learning_rate": 2.3721275018532245e-05, "loss": 0.998, "step": 5955 }, { "epoch": 0.8823703703703704, "grad_norm": 2.3651130199432373, "learning_rate": 2.3691623424759083e-05, "loss": 1.2468, "step": 5956 }, { "epoch": 0.8825185185185185, "grad_norm": 4.208375930786133, "learning_rate": 2.3661971830985917e-05, "loss": 1.0645, "step": 5957 }, { "epoch": 0.8826666666666667, "grad_norm": 1.6968998908996582, "learning_rate": 2.363232023721275e-05, "loss": 0.8437, "step": 5958 }, { "epoch": 0.8828148148148148, "grad_norm": 2.545179843902588, "learning_rate": 2.3602668643439586e-05, "loss": 1.2186, "step": 5959 }, { "epoch": 0.882962962962963, "grad_norm": 1.222191572189331, "learning_rate": 2.357301704966642e-05, "loss": 1.1235, "step": 5960 }, { "epoch": 0.8831111111111111, "grad_norm": 1.57589852809906, "learning_rate": 2.3543365455893254e-05, "loss": 1.2609, "step": 5961 }, { "epoch": 0.8832592592592593, "grad_norm": 1.8529330492019653, "learning_rate": 2.3513713862120092e-05, "loss": 1.0748, "step": 5962 }, { "epoch": 0.8834074074074074, "grad_norm": 1.7854204177856445, "learning_rate": 2.3484062268346923e-05, "loss": 1.2563, "step": 5963 }, { "epoch": 0.8835555555555555, "grad_norm": 3.024566173553467, 
"learning_rate": 2.345441067457376e-05, "loss": 0.9598, "step": 5964 }, { "epoch": 0.8837037037037037, "grad_norm": 1.2156152725219727, "learning_rate": 2.342475908080059e-05, "loss": 1.0619, "step": 5965 }, { "epoch": 0.8838518518518519, "grad_norm": 1.3999546766281128, "learning_rate": 2.339510748702743e-05, "loss": 0.8424, "step": 5966 }, { "epoch": 0.884, "grad_norm": 1.7890654802322388, "learning_rate": 2.3365455893254263e-05, "loss": 0.9101, "step": 5967 }, { "epoch": 0.8841481481481481, "grad_norm": 2.8123888969421387, "learning_rate": 2.3335804299481098e-05, "loss": 1.2402, "step": 5968 }, { "epoch": 0.8842962962962962, "grad_norm": 1.3883906602859497, "learning_rate": 2.3306152705707935e-05, "loss": 0.8954, "step": 5969 }, { "epoch": 0.8844444444444445, "grad_norm": 1.5781482458114624, "learning_rate": 2.3276501111934766e-05, "loss": 0.9034, "step": 5970 }, { "epoch": 0.8845925925925926, "grad_norm": 1.5325167179107666, "learning_rate": 2.3246849518161604e-05, "loss": 0.9061, "step": 5971 }, { "epoch": 0.8847407407407407, "grad_norm": 1.437687873840332, "learning_rate": 2.3217197924388438e-05, "loss": 1.0718, "step": 5972 }, { "epoch": 0.8848888888888888, "grad_norm": 1.120303988456726, "learning_rate": 2.3187546330615273e-05, "loss": 0.955, "step": 5973 }, { "epoch": 0.8850370370370371, "grad_norm": 2.4920289516448975, "learning_rate": 2.3157894736842107e-05, "loss": 1.0021, "step": 5974 }, { "epoch": 0.8851851851851852, "grad_norm": 1.659559726715088, "learning_rate": 2.312824314306894e-05, "loss": 0.9747, "step": 5975 }, { "epoch": 0.8853333333333333, "grad_norm": 2.555039405822754, "learning_rate": 2.3098591549295775e-05, "loss": 0.788, "step": 5976 }, { "epoch": 0.8854814814814815, "grad_norm": 1.2507604360580444, "learning_rate": 2.3068939955522613e-05, "loss": 1.0211, "step": 5977 }, { "epoch": 0.8856296296296297, "grad_norm": 1.7848315238952637, "learning_rate": 2.3039288361749444e-05, "loss": 0.9813, "step": 5978 }, { "epoch": 0.8857777777777778, 
"grad_norm": 1.4910756349563599, "learning_rate": 2.300963676797628e-05, "loss": 0.9126, "step": 5979 }, { "epoch": 0.8859259259259259, "grad_norm": 2.5473949909210205, "learning_rate": 2.2979985174203113e-05, "loss": 1.2652, "step": 5980 }, { "epoch": 0.8860740740740741, "grad_norm": 1.586308479309082, "learning_rate": 2.295033358042995e-05, "loss": 0.8096, "step": 5981 }, { "epoch": 0.8862222222222222, "grad_norm": 1.499227523803711, "learning_rate": 2.2920681986656785e-05, "loss": 1.1573, "step": 5982 }, { "epoch": 0.8863703703703704, "grad_norm": 1.1961390972137451, "learning_rate": 2.289103039288362e-05, "loss": 1.1098, "step": 5983 }, { "epoch": 0.8865185185185185, "grad_norm": 2.57843279838562, "learning_rate": 2.2861378799110453e-05, "loss": 0.9151, "step": 5984 }, { "epoch": 0.8866666666666667, "grad_norm": 1.743828296661377, "learning_rate": 2.2831727205337287e-05, "loss": 1.0712, "step": 5985 }, { "epoch": 0.8868148148148148, "grad_norm": 1.6296634674072266, "learning_rate": 2.2802075611564122e-05, "loss": 0.8652, "step": 5986 }, { "epoch": 0.886962962962963, "grad_norm": 1.3595842123031616, "learning_rate": 2.277242401779096e-05, "loss": 1.0706, "step": 5987 }, { "epoch": 0.8871111111111111, "grad_norm": 1.528507947921753, "learning_rate": 2.274277242401779e-05, "loss": 0.9773, "step": 5988 }, { "epoch": 0.8872592592592593, "grad_norm": 2.0428545475006104, "learning_rate": 2.2713120830244628e-05, "loss": 0.9653, "step": 5989 }, { "epoch": 0.8874074074074074, "grad_norm": 1.4239282608032227, "learning_rate": 2.2683469236471462e-05, "loss": 1.0398, "step": 5990 }, { "epoch": 0.8875555555555555, "grad_norm": 1.3365000486373901, "learning_rate": 2.2653817642698297e-05, "loss": 0.8188, "step": 5991 }, { "epoch": 0.8877037037037037, "grad_norm": 2.038303852081299, "learning_rate": 2.262416604892513e-05, "loss": 1.1364, "step": 5992 }, { "epoch": 0.8878518518518519, "grad_norm": 1.3759441375732422, "learning_rate": 2.2594514455151965e-05, "loss": 1.0479, 
"step": 5993 }, { "epoch": 0.888, "grad_norm": 1.4477378129959106, "learning_rate": 2.25648628613788e-05, "loss": 1.0599, "step": 5994 }, { "epoch": 0.8881481481481481, "grad_norm": 2.6113638877868652, "learning_rate": 2.2535211267605634e-05, "loss": 0.9383, "step": 5995 }, { "epoch": 0.8882962962962963, "grad_norm": 1.367432951927185, "learning_rate": 2.2505559673832468e-05, "loss": 0.9804, "step": 5996 }, { "epoch": 0.8884444444444445, "grad_norm": 1.3314729928970337, "learning_rate": 2.2475908080059306e-05, "loss": 0.9013, "step": 5997 }, { "epoch": 0.8885925925925926, "grad_norm": 2.2478277683258057, "learning_rate": 2.2446256486286137e-05, "loss": 1.0514, "step": 5998 }, { "epoch": 0.8887407407407407, "grad_norm": 1.8422318696975708, "learning_rate": 2.2416604892512974e-05, "loss": 0.7727, "step": 5999 }, { "epoch": 0.8888888888888888, "grad_norm": 2.4531776905059814, "learning_rate": 2.238695329873981e-05, "loss": 0.9298, "step": 6000 }, { "epoch": 0.8890370370370371, "grad_norm": 2.0126442909240723, "learning_rate": 2.2357301704966643e-05, "loss": 1.0887, "step": 6001 }, { "epoch": 0.8891851851851852, "grad_norm": 1.7374687194824219, "learning_rate": 2.2327650111193477e-05, "loss": 1.0529, "step": 6002 }, { "epoch": 0.8893333333333333, "grad_norm": 1.3899040222167969, "learning_rate": 2.229799851742031e-05, "loss": 0.9656, "step": 6003 }, { "epoch": 0.8894814814814814, "grad_norm": 2.106459617614746, "learning_rate": 2.2268346923647146e-05, "loss": 0.965, "step": 6004 }, { "epoch": 0.8896296296296297, "grad_norm": 1.8286856412887573, "learning_rate": 2.2238695329873984e-05, "loss": 1.2328, "step": 6005 }, { "epoch": 0.8897777777777778, "grad_norm": 1.7465448379516602, "learning_rate": 2.2209043736100814e-05, "loss": 0.8868, "step": 6006 }, { "epoch": 0.8899259259259259, "grad_norm": 1.3104788064956665, "learning_rate": 2.2179392142327652e-05, "loss": 0.9981, "step": 6007 }, { "epoch": 0.8900740740740741, "grad_norm": 1.680923342704773, "learning_rate": 
2.2149740548554483e-05, "loss": 1.0375, "step": 6008 }, { "epoch": 0.8902222222222222, "grad_norm": 1.623653769493103, "learning_rate": 2.212008895478132e-05, "loss": 0.9158, "step": 6009 }, { "epoch": 0.8903703703703704, "grad_norm": 1.3549065589904785, "learning_rate": 2.2090437361008155e-05, "loss": 0.9938, "step": 6010 }, { "epoch": 0.8905185185185185, "grad_norm": 1.6504497528076172, "learning_rate": 2.206078576723499e-05, "loss": 1.1917, "step": 6011 }, { "epoch": 0.8906666666666667, "grad_norm": 1.3264446258544922, "learning_rate": 2.2031134173461827e-05, "loss": 1.1057, "step": 6012 }, { "epoch": 0.8908148148148148, "grad_norm": 1.9469859600067139, "learning_rate": 2.2001482579688658e-05, "loss": 1.2426, "step": 6013 }, { "epoch": 0.890962962962963, "grad_norm": 1.6104439496994019, "learning_rate": 2.1971830985915496e-05, "loss": 0.9436, "step": 6014 }, { "epoch": 0.8911111111111111, "grad_norm": 2.565323829650879, "learning_rate": 2.194217939214233e-05, "loss": 0.8797, "step": 6015 }, { "epoch": 0.8912592592592593, "grad_norm": 1.6997771263122559, "learning_rate": 2.1912527798369164e-05, "loss": 0.8972, "step": 6016 }, { "epoch": 0.8914074074074074, "grad_norm": 1.7999343872070312, "learning_rate": 2.1882876204596e-05, "loss": 1.093, "step": 6017 }, { "epoch": 0.8915555555555555, "grad_norm": 2.178159236907959, "learning_rate": 2.1853224610822833e-05, "loss": 1.004, "step": 6018 }, { "epoch": 0.8917037037037037, "grad_norm": 2.5334293842315674, "learning_rate": 2.1823573017049667e-05, "loss": 1.084, "step": 6019 }, { "epoch": 0.8918518518518519, "grad_norm": 1.747208595275879, "learning_rate": 2.1793921423276505e-05, "loss": 1.0382, "step": 6020 }, { "epoch": 0.892, "grad_norm": 1.4712276458740234, "learning_rate": 2.1764269829503336e-05, "loss": 0.9342, "step": 6021 }, { "epoch": 0.8921481481481481, "grad_norm": 2.3379294872283936, "learning_rate": 2.1734618235730173e-05, "loss": 1.0192, "step": 6022 }, { "epoch": 0.8922962962962963, "grad_norm": 
1.1035223007202148, "learning_rate": 2.1704966641957004e-05, "loss": 0.775, "step": 6023 }, { "epoch": 0.8924444444444445, "grad_norm": 2.642238140106201, "learning_rate": 2.1675315048183842e-05, "loss": 1.1903, "step": 6024 }, { "epoch": 0.8925925925925926, "grad_norm": 1.4049618244171143, "learning_rate": 2.1645663454410676e-05, "loss": 0.8349, "step": 6025 }, { "epoch": 0.8927407407407407, "grad_norm": 1.7419692277908325, "learning_rate": 2.161601186063751e-05, "loss": 0.8475, "step": 6026 }, { "epoch": 0.8928888888888888, "grad_norm": 1.5502527952194214, "learning_rate": 2.1586360266864345e-05, "loss": 0.9261, "step": 6027 }, { "epoch": 0.8930370370370371, "grad_norm": 1.383769154548645, "learning_rate": 2.155670867309118e-05, "loss": 0.9649, "step": 6028 }, { "epoch": 0.8931851851851852, "grad_norm": 1.8883402347564697, "learning_rate": 2.1527057079318013e-05, "loss": 1.1961, "step": 6029 }, { "epoch": 0.8933333333333333, "grad_norm": 2.3370683193206787, "learning_rate": 2.149740548554485e-05, "loss": 0.9132, "step": 6030 }, { "epoch": 0.8934814814814814, "grad_norm": 1.52423095703125, "learning_rate": 2.1467753891771682e-05, "loss": 1.336, "step": 6031 }, { "epoch": 0.8936296296296297, "grad_norm": 1.5249754190444946, "learning_rate": 2.143810229799852e-05, "loss": 1.1204, "step": 6032 }, { "epoch": 0.8937777777777778, "grad_norm": 1.9380327463150024, "learning_rate": 2.1408450704225354e-05, "loss": 1.0009, "step": 6033 }, { "epoch": 0.8939259259259259, "grad_norm": 1.273807168006897, "learning_rate": 2.1378799110452188e-05, "loss": 1.0166, "step": 6034 }, { "epoch": 0.894074074074074, "grad_norm": 4.784523010253906, "learning_rate": 2.1349147516679023e-05, "loss": 1.0915, "step": 6035 }, { "epoch": 0.8942222222222223, "grad_norm": 1.5389429330825806, "learning_rate": 2.1319495922905857e-05, "loss": 0.9364, "step": 6036 }, { "epoch": 0.8943703703703704, "grad_norm": 2.0165131092071533, "learning_rate": 2.128984432913269e-05, "loss": 0.8758, "step": 6037 }, { 
"epoch": 0.8945185185185185, "grad_norm": 3.4345901012420654, "learning_rate": 2.1260192735359525e-05, "loss": 1.1543, "step": 6038 }, { "epoch": 0.8946666666666667, "grad_norm": 1.4130337238311768, "learning_rate": 2.123054114158636e-05, "loss": 1.0309, "step": 6039 }, { "epoch": 0.8948148148148148, "grad_norm": 2.438339948654175, "learning_rate": 2.1200889547813197e-05, "loss": 0.6411, "step": 6040 }, { "epoch": 0.894962962962963, "grad_norm": 1.5909713506698608, "learning_rate": 2.117123795404003e-05, "loss": 0.9461, "step": 6041 }, { "epoch": 0.8951111111111111, "grad_norm": 1.5332297086715698, "learning_rate": 2.1141586360266866e-05, "loss": 1.2626, "step": 6042 }, { "epoch": 0.8952592592592593, "grad_norm": 1.4497228860855103, "learning_rate": 2.11119347664937e-05, "loss": 1.1115, "step": 6043 }, { "epoch": 0.8954074074074074, "grad_norm": 1.4441673755645752, "learning_rate": 2.1082283172720535e-05, "loss": 1.0415, "step": 6044 }, { "epoch": 0.8955555555555555, "grad_norm": 1.511989951133728, "learning_rate": 2.105263157894737e-05, "loss": 1.0469, "step": 6045 }, { "epoch": 0.8957037037037037, "grad_norm": 1.5368791818618774, "learning_rate": 2.1022979985174203e-05, "loss": 0.8377, "step": 6046 }, { "epoch": 0.8958518518518519, "grad_norm": 1.3114882707595825, "learning_rate": 2.0993328391401037e-05, "loss": 0.8663, "step": 6047 }, { "epoch": 0.896, "grad_norm": 1.5364413261413574, "learning_rate": 2.0963676797627875e-05, "loss": 0.9263, "step": 6048 }, { "epoch": 0.8961481481481481, "grad_norm": 1.2767763137817383, "learning_rate": 2.0934025203854706e-05, "loss": 0.7743, "step": 6049 }, { "epoch": 0.8962962962962963, "grad_norm": 1.5056242942810059, "learning_rate": 2.0904373610081544e-05, "loss": 0.915, "step": 6050 }, { "epoch": 0.8964444444444445, "grad_norm": 1.2498618364334106, "learning_rate": 2.0874722016308375e-05, "loss": 0.8222, "step": 6051 }, { "epoch": 0.8965925925925926, "grad_norm": 1.6661354303359985, "learning_rate": 2.0845070422535212e-05, 
"loss": 0.997, "step": 6052 }, { "epoch": 0.8967407407407407, "grad_norm": 11.182560920715332, "learning_rate": 2.0815418828762047e-05, "loss": 1.1014, "step": 6053 }, { "epoch": 0.8968888888888888, "grad_norm": 1.9369115829467773, "learning_rate": 2.078576723498888e-05, "loss": 1.2052, "step": 6054 }, { "epoch": 0.8970370370370371, "grad_norm": 1.5227136611938477, "learning_rate": 2.075611564121572e-05, "loss": 1.2471, "step": 6055 }, { "epoch": 0.8971851851851852, "grad_norm": 1.391202449798584, "learning_rate": 2.072646404744255e-05, "loss": 0.8846, "step": 6056 }, { "epoch": 0.8973333333333333, "grad_norm": 1.3359544277191162, "learning_rate": 2.0696812453669387e-05, "loss": 1.0103, "step": 6057 }, { "epoch": 0.8974814814814814, "grad_norm": 2.266838550567627, "learning_rate": 2.066716085989622e-05, "loss": 1.0017, "step": 6058 }, { "epoch": 0.8976296296296297, "grad_norm": 1.3277329206466675, "learning_rate": 2.0637509266123056e-05, "loss": 1.0146, "step": 6059 }, { "epoch": 0.8977777777777778, "grad_norm": 3.2451281547546387, "learning_rate": 2.060785767234989e-05, "loss": 1.2078, "step": 6060 }, { "epoch": 0.8979259259259259, "grad_norm": 1.8383342027664185, "learning_rate": 2.0578206078576724e-05, "loss": 1.0267, "step": 6061 }, { "epoch": 0.898074074074074, "grad_norm": 1.8881727457046509, "learning_rate": 2.054855448480356e-05, "loss": 0.9216, "step": 6062 }, { "epoch": 0.8982222222222223, "grad_norm": 1.3337632417678833, "learning_rate": 2.0518902891030396e-05, "loss": 1.0229, "step": 6063 }, { "epoch": 0.8983703703703704, "grad_norm": 2.845303773880005, "learning_rate": 2.0489251297257227e-05, "loss": 0.766, "step": 6064 }, { "epoch": 0.8985185185185185, "grad_norm": 3.4624359607696533, "learning_rate": 2.0459599703484065e-05, "loss": 0.9487, "step": 6065 }, { "epoch": 0.8986666666666666, "grad_norm": 2.0220963954925537, "learning_rate": 2.0429948109710896e-05, "loss": 0.7419, "step": 6066 }, { "epoch": 0.8988148148148148, "grad_norm": 
2.0012857913970947, "learning_rate": 2.0400296515937734e-05, "loss": 1.1056, "step": 6067 }, { "epoch": 0.898962962962963, "grad_norm": 2.571730375289917, "learning_rate": 2.0370644922164568e-05, "loss": 1.0651, "step": 6068 }, { "epoch": 0.8991111111111111, "grad_norm": 2.7319865226745605, "learning_rate": 2.0340993328391402e-05, "loss": 1.0422, "step": 6069 }, { "epoch": 0.8992592592592593, "grad_norm": 2.19328236579895, "learning_rate": 2.0311341734618236e-05, "loss": 0.8098, "step": 6070 }, { "epoch": 0.8994074074074074, "grad_norm": 3.327632427215576, "learning_rate": 2.028169014084507e-05, "loss": 0.8891, "step": 6071 }, { "epoch": 0.8995555555555556, "grad_norm": 1.7550420761108398, "learning_rate": 2.0252038547071905e-05, "loss": 0.9928, "step": 6072 }, { "epoch": 0.8997037037037037, "grad_norm": 1.1450306177139282, "learning_rate": 2.0222386953298743e-05, "loss": 1.0184, "step": 6073 }, { "epoch": 0.8998518518518519, "grad_norm": 2.0636534690856934, "learning_rate": 2.0192735359525574e-05, "loss": 0.9704, "step": 6074 }, { "epoch": 0.9, "grad_norm": 2.0672996044158936, "learning_rate": 2.016308376575241e-05, "loss": 0.925, "step": 6075 }, { "epoch": 0.9001481481481481, "grad_norm": 2.060155153274536, "learning_rate": 2.0133432171979246e-05, "loss": 1.0096, "step": 6076 }, { "epoch": 0.9002962962962963, "grad_norm": 1.3767861127853394, "learning_rate": 2.010378057820608e-05, "loss": 0.8819, "step": 6077 }, { "epoch": 0.9004444444444445, "grad_norm": 1.256442666053772, "learning_rate": 2.0074128984432914e-05, "loss": 1.0233, "step": 6078 }, { "epoch": 0.9005925925925926, "grad_norm": 2.882657289505005, "learning_rate": 2.004447739065975e-05, "loss": 1.1429, "step": 6079 }, { "epoch": 0.9007407407407407, "grad_norm": 1.7089165449142456, "learning_rate": 2.0014825796886583e-05, "loss": 0.923, "step": 6080 }, { "epoch": 0.9008888888888889, "grad_norm": 1.8851364850997925, "learning_rate": 1.9985174203113417e-05, "loss": 0.8472, "step": 6081 }, { "epoch": 
0.9010370370370371, "grad_norm": 3.659064292907715, "learning_rate": 1.995552260934025e-05, "loss": 1.1521, "step": 6082 }, { "epoch": 0.9011851851851852, "grad_norm": 1.409136414527893, "learning_rate": 1.992587101556709e-05, "loss": 0.8831, "step": 6083 }, { "epoch": 0.9013333333333333, "grad_norm": 1.80681574344635, "learning_rate": 1.989621942179392e-05, "loss": 1.0433, "step": 6084 }, { "epoch": 0.9014814814814814, "grad_norm": 1.6147675514221191, "learning_rate": 1.9866567828020758e-05, "loss": 1.0592, "step": 6085 }, { "epoch": 0.9016296296296297, "grad_norm": 2.0292410850524902, "learning_rate": 1.9836916234247592e-05, "loss": 0.8812, "step": 6086 }, { "epoch": 0.9017777777777778, "grad_norm": 1.399593472480774, "learning_rate": 1.9807264640474426e-05, "loss": 0.8079, "step": 6087 }, { "epoch": 0.9019259259259259, "grad_norm": 1.9720758199691772, "learning_rate": 1.977761304670126e-05, "loss": 0.9161, "step": 6088 }, { "epoch": 0.902074074074074, "grad_norm": 2.0885941982269287, "learning_rate": 1.9747961452928095e-05, "loss": 1.1102, "step": 6089 }, { "epoch": 0.9022222222222223, "grad_norm": 1.5641653537750244, "learning_rate": 1.971830985915493e-05, "loss": 0.7803, "step": 6090 }, { "epoch": 0.9023703703703704, "grad_norm": 1.7609938383102417, "learning_rate": 1.9688658265381767e-05, "loss": 1.0168, "step": 6091 }, { "epoch": 0.9025185185185185, "grad_norm": 1.8712331056594849, "learning_rate": 1.9659006671608598e-05, "loss": 0.9302, "step": 6092 }, { "epoch": 0.9026666666666666, "grad_norm": 1.8043031692504883, "learning_rate": 1.9629355077835435e-05, "loss": 1.2285, "step": 6093 }, { "epoch": 0.9028148148148148, "grad_norm": 1.5237916707992554, "learning_rate": 1.959970348406227e-05, "loss": 1.1949, "step": 6094 }, { "epoch": 0.902962962962963, "grad_norm": 2.2680342197418213, "learning_rate": 1.9570051890289104e-05, "loss": 0.938, "step": 6095 }, { "epoch": 0.9031111111111111, "grad_norm": 1.5735719203948975, "learning_rate": 1.9540400296515938e-05, 
"loss": 0.8851, "step": 6096 }, { "epoch": 0.9032592592592592, "grad_norm": 1.5011521577835083, "learning_rate": 1.9510748702742773e-05, "loss": 0.9529, "step": 6097 }, { "epoch": 0.9034074074074074, "grad_norm": 1.7351727485656738, "learning_rate": 1.948109710896961e-05, "loss": 1.1232, "step": 6098 }, { "epoch": 0.9035555555555556, "grad_norm": 1.602634310722351, "learning_rate": 1.945144551519644e-05, "loss": 1.0819, "step": 6099 }, { "epoch": 0.9037037037037037, "grad_norm": 1.820095419883728, "learning_rate": 1.942179392142328e-05, "loss": 1.0187, "step": 6100 }, { "epoch": 0.9038518518518519, "grad_norm": 1.6920205354690552, "learning_rate": 1.9392142327650113e-05, "loss": 1.3522, "step": 6101 }, { "epoch": 0.904, "grad_norm": 1.2265843152999878, "learning_rate": 1.9362490733876947e-05, "loss": 1.071, "step": 6102 }, { "epoch": 0.9041481481481481, "grad_norm": 2.0982909202575684, "learning_rate": 1.9332839140103782e-05, "loss": 1.1262, "step": 6103 }, { "epoch": 0.9042962962962963, "grad_norm": 2.1813714504241943, "learning_rate": 1.9303187546330616e-05, "loss": 0.9635, "step": 6104 }, { "epoch": 0.9044444444444445, "grad_norm": 1.8678935766220093, "learning_rate": 1.927353595255745e-05, "loss": 0.9363, "step": 6105 }, { "epoch": 0.9045925925925926, "grad_norm": 1.7868329286575317, "learning_rate": 1.9243884358784288e-05, "loss": 1.2062, "step": 6106 }, { "epoch": 0.9047407407407407, "grad_norm": 1.2214263677597046, "learning_rate": 1.921423276501112e-05, "loss": 0.8393, "step": 6107 }, { "epoch": 0.9048888888888889, "grad_norm": 1.5382243394851685, "learning_rate": 1.9184581171237957e-05, "loss": 1.1659, "step": 6108 }, { "epoch": 0.9050370370370371, "grad_norm": 3.6227638721466064, "learning_rate": 1.9154929577464788e-05, "loss": 1.1544, "step": 6109 }, { "epoch": 0.9051851851851852, "grad_norm": 1.6020253896713257, "learning_rate": 1.9125277983691625e-05, "loss": 1.0659, "step": 6110 }, { "epoch": 0.9053333333333333, "grad_norm": 1.2939000129699707, 
"learning_rate": 1.909562638991846e-05, "loss": 0.7358, "step": 6111 }, { "epoch": 0.9054814814814814, "grad_norm": 2.0559980869293213, "learning_rate": 1.9065974796145294e-05, "loss": 1.007, "step": 6112 }, { "epoch": 0.9056296296296297, "grad_norm": 1.1219966411590576, "learning_rate": 1.9036323202372128e-05, "loss": 0.7504, "step": 6113 }, { "epoch": 0.9057777777777778, "grad_norm": 2.00435733795166, "learning_rate": 1.9006671608598962e-05, "loss": 0.8884, "step": 6114 }, { "epoch": 0.9059259259259259, "grad_norm": 1.2631276845932007, "learning_rate": 1.8977020014825797e-05, "loss": 1.11, "step": 6115 }, { "epoch": 0.906074074074074, "grad_norm": 1.498734474182129, "learning_rate": 1.8947368421052634e-05, "loss": 1.1082, "step": 6116 }, { "epoch": 0.9062222222222223, "grad_norm": 2.8208577632904053, "learning_rate": 1.8917716827279465e-05, "loss": 0.8463, "step": 6117 }, { "epoch": 0.9063703703703704, "grad_norm": 1.2041140794754028, "learning_rate": 1.8888065233506303e-05, "loss": 0.8351, "step": 6118 }, { "epoch": 0.9065185185185185, "grad_norm": 1.9682738780975342, "learning_rate": 1.8858413639733137e-05, "loss": 0.9793, "step": 6119 }, { "epoch": 0.9066666666666666, "grad_norm": 1.3063067197799683, "learning_rate": 1.882876204595997e-05, "loss": 1.034, "step": 6120 }, { "epoch": 0.9068148148148149, "grad_norm": 1.426026701927185, "learning_rate": 1.8799110452186806e-05, "loss": 1.1258, "step": 6121 }, { "epoch": 0.906962962962963, "grad_norm": 1.9268131256103516, "learning_rate": 1.876945885841364e-05, "loss": 1.0602, "step": 6122 }, { "epoch": 0.9071111111111111, "grad_norm": 1.168556571006775, "learning_rate": 1.8739807264640474e-05, "loss": 0.9043, "step": 6123 }, { "epoch": 0.9072592592592592, "grad_norm": 1.3348885774612427, "learning_rate": 1.871015567086731e-05, "loss": 0.9788, "step": 6124 }, { "epoch": 0.9074074074074074, "grad_norm": 2.003084182739258, "learning_rate": 1.8680504077094143e-05, "loss": 1.0414, "step": 6125 }, { "epoch": 
0.9075555555555556, "grad_norm": 1.5825577974319458, "learning_rate": 1.865085248332098e-05, "loss": 0.898, "step": 6126 }, { "epoch": 0.9077037037037037, "grad_norm": 1.6305758953094482, "learning_rate": 1.862120088954781e-05, "loss": 1.076, "step": 6127 }, { "epoch": 0.9078518518518518, "grad_norm": 1.466801643371582, "learning_rate": 1.859154929577465e-05, "loss": 1.0831, "step": 6128 }, { "epoch": 0.908, "grad_norm": 1.4597145318984985, "learning_rate": 1.8561897702001484e-05, "loss": 0.8688, "step": 6129 }, { "epoch": 0.9081481481481481, "grad_norm": 1.4790699481964111, "learning_rate": 1.8532246108228318e-05, "loss": 0.9855, "step": 6130 }, { "epoch": 0.9082962962962963, "grad_norm": 3.157147169113159, "learning_rate": 1.8502594514455152e-05, "loss": 1.111, "step": 6131 }, { "epoch": 0.9084444444444445, "grad_norm": 1.674553394317627, "learning_rate": 1.8472942920681986e-05, "loss": 1.0364, "step": 6132 }, { "epoch": 0.9085925925925926, "grad_norm": 1.1325260400772095, "learning_rate": 1.844329132690882e-05, "loss": 0.8898, "step": 6133 }, { "epoch": 0.9087407407407407, "grad_norm": 1.7265822887420654, "learning_rate": 1.841363973313566e-05, "loss": 1.1837, "step": 6134 }, { "epoch": 0.9088888888888889, "grad_norm": 1.8278820514678955, "learning_rate": 1.8383988139362493e-05, "loss": 1.0625, "step": 6135 }, { "epoch": 0.9090370370370371, "grad_norm": 1.437045931816101, "learning_rate": 1.8354336545589327e-05, "loss": 0.9, "step": 6136 }, { "epoch": 0.9091851851851852, "grad_norm": 2.230379819869995, "learning_rate": 1.832468495181616e-05, "loss": 0.9827, "step": 6137 }, { "epoch": 0.9093333333333333, "grad_norm": 2.0429258346557617, "learning_rate": 1.8295033358042996e-05, "loss": 0.7465, "step": 6138 }, { "epoch": 0.9094814814814814, "grad_norm": 2.3619439601898193, "learning_rate": 1.826538176426983e-05, "loss": 0.9752, "step": 6139 }, { "epoch": 0.9096296296296297, "grad_norm": 1.613619327545166, "learning_rate": 1.8235730170496664e-05, "loss": 0.9935, 
"step": 6140 }, { "epoch": 0.9097777777777778, "grad_norm": 1.7696385383605957, "learning_rate": 1.8206078576723502e-05, "loss": 0.8633, "step": 6141 }, { "epoch": 0.9099259259259259, "grad_norm": 1.6603771448135376, "learning_rate": 1.8176426982950333e-05, "loss": 0.9566, "step": 6142 }, { "epoch": 0.910074074074074, "grad_norm": 2.8760082721710205, "learning_rate": 1.814677538917717e-05, "loss": 1.0243, "step": 6143 }, { "epoch": 0.9102222222222223, "grad_norm": 1.4774494171142578, "learning_rate": 1.8117123795404005e-05, "loss": 1.1372, "step": 6144 }, { "epoch": 0.9103703703703704, "grad_norm": 2.1504828929901123, "learning_rate": 1.808747220163084e-05, "loss": 1.1725, "step": 6145 }, { "epoch": 0.9105185185185185, "grad_norm": 1.656188726425171, "learning_rate": 1.8057820607857673e-05, "loss": 1.1643, "step": 6146 }, { "epoch": 0.9106666666666666, "grad_norm": 1.4865316152572632, "learning_rate": 1.8028169014084508e-05, "loss": 1.1018, "step": 6147 }, { "epoch": 0.9108148148148149, "grad_norm": 1.9200849533081055, "learning_rate": 1.7998517420311342e-05, "loss": 0.932, "step": 6148 }, { "epoch": 0.910962962962963, "grad_norm": 2.5810506343841553, "learning_rate": 1.796886582653818e-05, "loss": 1.149, "step": 6149 }, { "epoch": 0.9111111111111111, "grad_norm": 1.8590203523635864, "learning_rate": 1.793921423276501e-05, "loss": 0.9868, "step": 6150 }, { "epoch": 0.9112592592592592, "grad_norm": 1.795326828956604, "learning_rate": 1.7909562638991848e-05, "loss": 1.1829, "step": 6151 }, { "epoch": 0.9114074074074074, "grad_norm": 1.2344616651535034, "learning_rate": 1.787991104521868e-05, "loss": 0.9823, "step": 6152 }, { "epoch": 0.9115555555555556, "grad_norm": 1.5617588758468628, "learning_rate": 1.7850259451445517e-05, "loss": 1.1079, "step": 6153 }, { "epoch": 0.9117037037037037, "grad_norm": 1.4923650026321411, "learning_rate": 1.782060785767235e-05, "loss": 1.062, "step": 6154 }, { "epoch": 0.9118518518518518, "grad_norm": 4.258939266204834, 
"learning_rate": 1.7790956263899185e-05, "loss": 1.036, "step": 6155 }, { "epoch": 0.912, "grad_norm": 3.625718593597412, "learning_rate": 1.776130467012602e-05, "loss": 1.1777, "step": 6156 }, { "epoch": 0.9121481481481482, "grad_norm": 1.7118535041809082, "learning_rate": 1.7731653076352854e-05, "loss": 0.9751, "step": 6157 }, { "epoch": 0.9122962962962963, "grad_norm": 1.1163208484649658, "learning_rate": 1.770200148257969e-05, "loss": 0.867, "step": 6158 }, { "epoch": 0.9124444444444444, "grad_norm": 1.6679993867874146, "learning_rate": 1.7672349888806526e-05, "loss": 1.1601, "step": 6159 }, { "epoch": 0.9125925925925926, "grad_norm": 1.2561055421829224, "learning_rate": 1.7642698295033357e-05, "loss": 0.9182, "step": 6160 }, { "epoch": 0.9127407407407407, "grad_norm": 3.2386274337768555, "learning_rate": 1.7613046701260195e-05, "loss": 1.125, "step": 6161 }, { "epoch": 0.9128888888888889, "grad_norm": 1.2529643774032593, "learning_rate": 1.758339510748703e-05, "loss": 0.7451, "step": 6162 }, { "epoch": 0.9130370370370371, "grad_norm": 1.0242217779159546, "learning_rate": 1.7553743513713863e-05, "loss": 0.7875, "step": 6163 }, { "epoch": 0.9131851851851852, "grad_norm": 1.5616753101348877, "learning_rate": 1.7524091919940698e-05, "loss": 0.9641, "step": 6164 }, { "epoch": 0.9133333333333333, "grad_norm": 3.3702902793884277, "learning_rate": 1.7494440326167532e-05, "loss": 0.8851, "step": 6165 }, { "epoch": 0.9134814814814814, "grad_norm": 2.8215017318725586, "learning_rate": 1.7464788732394366e-05, "loss": 1.1532, "step": 6166 }, { "epoch": 0.9136296296296297, "grad_norm": 1.7007137537002563, "learning_rate": 1.74351371386212e-05, "loss": 0.9779, "step": 6167 }, { "epoch": 0.9137777777777778, "grad_norm": 3.317284107208252, "learning_rate": 1.7405485544848035e-05, "loss": 0.8305, "step": 6168 }, { "epoch": 0.9139259259259259, "grad_norm": 1.1211053133010864, "learning_rate": 1.7375833951074872e-05, "loss": 0.9122, "step": 6169 }, { "epoch": 0.914074074074074, 
"grad_norm": 1.5618693828582764, "learning_rate": 1.7346182357301703e-05, "loss": 1.0456, "step": 6170 }, { "epoch": 0.9142222222222223, "grad_norm": 1.43927800655365, "learning_rate": 1.731653076352854e-05, "loss": 0.8483, "step": 6171 }, { "epoch": 0.9143703703703704, "grad_norm": 1.8553197383880615, "learning_rate": 1.7286879169755375e-05, "loss": 0.8967, "step": 6172 }, { "epoch": 0.9145185185185185, "grad_norm": 2.838083505630493, "learning_rate": 1.725722757598221e-05, "loss": 1.0936, "step": 6173 }, { "epoch": 0.9146666666666666, "grad_norm": 1.672043800354004, "learning_rate": 1.7227575982209044e-05, "loss": 1.0261, "step": 6174 }, { "epoch": 0.9148148148148149, "grad_norm": 1.1843780279159546, "learning_rate": 1.7197924388435878e-05, "loss": 0.893, "step": 6175 }, { "epoch": 0.914962962962963, "grad_norm": 2.937354326248169, "learning_rate": 1.7168272794662716e-05, "loss": 0.9888, "step": 6176 }, { "epoch": 0.9151111111111111, "grad_norm": 2.701634168624878, "learning_rate": 1.713862120088955e-05, "loss": 1.089, "step": 6177 }, { "epoch": 0.9152592592592592, "grad_norm": 1.9226313829421997, "learning_rate": 1.7108969607116384e-05, "loss": 1.1289, "step": 6178 }, { "epoch": 0.9154074074074074, "grad_norm": 1.6943758726119995, "learning_rate": 1.707931801334322e-05, "loss": 0.7995, "step": 6179 }, { "epoch": 0.9155555555555556, "grad_norm": 1.6026709079742432, "learning_rate": 1.7049666419570053e-05, "loss": 0.8349, "step": 6180 }, { "epoch": 0.9157037037037037, "grad_norm": 3.4536123275756836, "learning_rate": 1.7020014825796887e-05, "loss": 1.1635, "step": 6181 }, { "epoch": 0.9158518518518518, "grad_norm": 1.6723133325576782, "learning_rate": 1.699036323202372e-05, "loss": 0.9406, "step": 6182 }, { "epoch": 0.916, "grad_norm": 1.5767154693603516, "learning_rate": 1.6960711638250556e-05, "loss": 0.9219, "step": 6183 }, { "epoch": 0.9161481481481482, "grad_norm": 1.8430118560791016, "learning_rate": 1.6931060044477394e-05, "loss": 1.1754, "step": 6184 }, { 
"epoch": 0.9162962962962963, "grad_norm": 1.6995733976364136, "learning_rate": 1.6901408450704224e-05, "loss": 1.5034, "step": 6185 }, { "epoch": 0.9164444444444444, "grad_norm": 1.59422767162323, "learning_rate": 1.6871756856931062e-05, "loss": 1.2148, "step": 6186 }, { "epoch": 0.9165925925925926, "grad_norm": 1.5773909091949463, "learning_rate": 1.6842105263157896e-05, "loss": 0.9517, "step": 6187 }, { "epoch": 0.9167407407407407, "grad_norm": 1.9356043338775635, "learning_rate": 1.681245366938473e-05, "loss": 0.8084, "step": 6188 }, { "epoch": 0.9168888888888889, "grad_norm": 2.1239495277404785, "learning_rate": 1.6782802075611565e-05, "loss": 0.9879, "step": 6189 }, { "epoch": 0.917037037037037, "grad_norm": 1.4792206287384033, "learning_rate": 1.67531504818384e-05, "loss": 0.8791, "step": 6190 }, { "epoch": 0.9171851851851852, "grad_norm": 1.8909317255020142, "learning_rate": 1.6723498888065234e-05, "loss": 1.0974, "step": 6191 }, { "epoch": 0.9173333333333333, "grad_norm": 1.6568989753723145, "learning_rate": 1.669384729429207e-05, "loss": 1.0819, "step": 6192 }, { "epoch": 0.9174814814814815, "grad_norm": 1.7282850742340088, "learning_rate": 1.6664195700518902e-05, "loss": 1.0477, "step": 6193 }, { "epoch": 0.9176296296296297, "grad_norm": 3.181680679321289, "learning_rate": 1.663454410674574e-05, "loss": 0.9178, "step": 6194 }, { "epoch": 0.9177777777777778, "grad_norm": 1.6595582962036133, "learning_rate": 1.660489251297257e-05, "loss": 0.9918, "step": 6195 }, { "epoch": 0.9179259259259259, "grad_norm": 2.2920784950256348, "learning_rate": 1.657524091919941e-05, "loss": 0.932, "step": 6196 }, { "epoch": 0.918074074074074, "grad_norm": 1.3185635805130005, "learning_rate": 1.6545589325426243e-05, "loss": 0.7316, "step": 6197 }, { "epoch": 0.9182222222222223, "grad_norm": 1.3961563110351562, "learning_rate": 1.6515937731653077e-05, "loss": 0.7957, "step": 6198 }, { "epoch": 0.9183703703703704, "grad_norm": 5.240330219268799, "learning_rate": 
1.648628613787991e-05, "loss": 0.9162, "step": 6199 }, { "epoch": 0.9185185185185185, "grad_norm": 2.126986265182495, "learning_rate": 1.6456634544106746e-05, "loss": 1.1555, "step": 6200 }, { "epoch": 0.9186666666666666, "grad_norm": 1.5535329580307007, "learning_rate": 1.642698295033358e-05, "loss": 1.0565, "step": 6201 }, { "epoch": 0.9188148148148149, "grad_norm": 1.3964463472366333, "learning_rate": 1.6397331356560418e-05, "loss": 0.8956, "step": 6202 }, { "epoch": 0.918962962962963, "grad_norm": 1.6732627153396606, "learning_rate": 1.636767976278725e-05, "loss": 1.0568, "step": 6203 }, { "epoch": 0.9191111111111111, "grad_norm": 1.8250062465667725, "learning_rate": 1.6338028169014086e-05, "loss": 0.9854, "step": 6204 }, { "epoch": 0.9192592592592592, "grad_norm": 1.4552170038223267, "learning_rate": 1.630837657524092e-05, "loss": 0.9542, "step": 6205 }, { "epoch": 0.9194074074074075, "grad_norm": 4.43630313873291, "learning_rate": 1.6278724981467755e-05, "loss": 0.9096, "step": 6206 }, { "epoch": 0.9195555555555556, "grad_norm": 2.3064446449279785, "learning_rate": 1.624907338769459e-05, "loss": 0.9531, "step": 6207 }, { "epoch": 0.9197037037037037, "grad_norm": 1.996294617652893, "learning_rate": 1.6219421793921423e-05, "loss": 0.9177, "step": 6208 }, { "epoch": 0.9198518518518518, "grad_norm": 1.2575515508651733, "learning_rate": 1.6189770200148258e-05, "loss": 1.0533, "step": 6209 }, { "epoch": 0.92, "grad_norm": 1.8964482545852661, "learning_rate": 1.6160118606375092e-05, "loss": 1.0478, "step": 6210 }, { "epoch": 0.9201481481481482, "grad_norm": 1.3098604679107666, "learning_rate": 1.6130467012601926e-05, "loss": 0.8478, "step": 6211 }, { "epoch": 0.9202962962962963, "grad_norm": 1.1962435245513916, "learning_rate": 1.6100815418828764e-05, "loss": 1.0145, "step": 6212 }, { "epoch": 0.9204444444444444, "grad_norm": 1.5967687368392944, "learning_rate": 1.6071163825055595e-05, "loss": 0.98, "step": 6213 }, { "epoch": 0.9205925925925926, "grad_norm": 
1.894650936126709, "learning_rate": 1.6041512231282433e-05, "loss": 0.9342, "step": 6214 }, { "epoch": 0.9207407407407407, "grad_norm": 2.727250337600708, "learning_rate": 1.6011860637509267e-05, "loss": 1.2005, "step": 6215 }, { "epoch": 0.9208888888888889, "grad_norm": 1.2794644832611084, "learning_rate": 1.59822090437361e-05, "loss": 0.9428, "step": 6216 }, { "epoch": 0.921037037037037, "grad_norm": 1.8853224515914917, "learning_rate": 1.5952557449962936e-05, "loss": 1.0673, "step": 6217 }, { "epoch": 0.9211851851851852, "grad_norm": 1.2796276807785034, "learning_rate": 1.592290585618977e-05, "loss": 1.0564, "step": 6218 }, { "epoch": 0.9213333333333333, "grad_norm": 2.350651264190674, "learning_rate": 1.5893254262416607e-05, "loss": 1.097, "step": 6219 }, { "epoch": 0.9214814814814815, "grad_norm": 1.7367689609527588, "learning_rate": 1.5863602668643442e-05, "loss": 1.0272, "step": 6220 }, { "epoch": 0.9216296296296296, "grad_norm": 1.8030822277069092, "learning_rate": 1.5833951074870276e-05, "loss": 1.0805, "step": 6221 }, { "epoch": 0.9217777777777778, "grad_norm": 1.5849909782409668, "learning_rate": 1.580429948109711e-05, "loss": 1.0965, "step": 6222 }, { "epoch": 0.9219259259259259, "grad_norm": 7.2515668869018555, "learning_rate": 1.5774647887323945e-05, "loss": 1.2613, "step": 6223 }, { "epoch": 0.922074074074074, "grad_norm": 1.6564565896987915, "learning_rate": 1.574499629355078e-05, "loss": 0.9734, "step": 6224 }, { "epoch": 0.9222222222222223, "grad_norm": 2.6008694171905518, "learning_rate": 1.5715344699777613e-05, "loss": 1.0323, "step": 6225 }, { "epoch": 0.9223703703703704, "grad_norm": 1.9936848878860474, "learning_rate": 1.5685693106004448e-05, "loss": 0.8073, "step": 6226 }, { "epoch": 0.9225185185185185, "grad_norm": 2.0086851119995117, "learning_rate": 1.5656041512231285e-05, "loss": 1.1109, "step": 6227 }, { "epoch": 0.9226666666666666, "grad_norm": 2.9048655033111572, "learning_rate": 1.5626389918458116e-05, "loss": 1.0379, "step": 6228 }, 
{ "epoch": 0.9228148148148149, "grad_norm": 2.1338531970977783, "learning_rate": 1.5596738324684954e-05, "loss": 1.0836, "step": 6229 }, { "epoch": 0.922962962962963, "grad_norm": 1.5965696573257446, "learning_rate": 1.5567086730911788e-05, "loss": 1.143, "step": 6230 }, { "epoch": 0.9231111111111111, "grad_norm": 2.2106244564056396, "learning_rate": 1.5537435137138622e-05, "loss": 1.0348, "step": 6231 }, { "epoch": 0.9232592592592592, "grad_norm": 1.9293761253356934, "learning_rate": 1.5507783543365457e-05, "loss": 1.2752, "step": 6232 }, { "epoch": 0.9234074074074075, "grad_norm": 1.8146687746047974, "learning_rate": 1.547813194959229e-05, "loss": 0.8558, "step": 6233 }, { "epoch": 0.9235555555555556, "grad_norm": 1.446638584136963, "learning_rate": 1.5448480355819125e-05, "loss": 1.0929, "step": 6234 }, { "epoch": 0.9237037037037037, "grad_norm": 2.2936630249023438, "learning_rate": 1.5418828762045963e-05, "loss": 0.9336, "step": 6235 }, { "epoch": 0.9238518518518518, "grad_norm": 2.1772422790527344, "learning_rate": 1.5389177168272794e-05, "loss": 0.9225, "step": 6236 }, { "epoch": 0.924, "grad_norm": 2.055823802947998, "learning_rate": 1.535952557449963e-05, "loss": 1.1602, "step": 6237 }, { "epoch": 0.9241481481481482, "grad_norm": 1.4553918838500977, "learning_rate": 1.5329873980726462e-05, "loss": 0.9859, "step": 6238 }, { "epoch": 0.9242962962962963, "grad_norm": 3.1852595806121826, "learning_rate": 1.53002223869533e-05, "loss": 0.8281, "step": 6239 }, { "epoch": 0.9244444444444444, "grad_norm": 1.612755298614502, "learning_rate": 1.5270570793180134e-05, "loss": 0.7845, "step": 6240 }, { "epoch": 0.9245925925925926, "grad_norm": 1.8736701011657715, "learning_rate": 1.5240919199406969e-05, "loss": 0.8355, "step": 6241 }, { "epoch": 0.9247407407407408, "grad_norm": 1.7704135179519653, "learning_rate": 1.5211267605633803e-05, "loss": 1.0393, "step": 6242 }, { "epoch": 0.9248888888888889, "grad_norm": 1.5719643831253052, "learning_rate": 
1.5181616011860639e-05, "loss": 1.0683, "step": 6243 }, { "epoch": 0.925037037037037, "grad_norm": 1.3279207944869995, "learning_rate": 1.5151964418087472e-05, "loss": 0.8243, "step": 6244 }, { "epoch": 0.9251851851851852, "grad_norm": 1.4003965854644775, "learning_rate": 1.5122312824314308e-05, "loss": 0.9751, "step": 6245 }, { "epoch": 0.9253333333333333, "grad_norm": 3.054779529571533, "learning_rate": 1.5092661230541142e-05, "loss": 1.2106, "step": 6246 }, { "epoch": 0.9254814814814815, "grad_norm": 1.6408637762069702, "learning_rate": 1.5063009636767978e-05, "loss": 1.0453, "step": 6247 }, { "epoch": 0.9256296296296296, "grad_norm": 1.2181177139282227, "learning_rate": 1.503335804299481e-05, "loss": 1.0361, "step": 6248 }, { "epoch": 0.9257777777777778, "grad_norm": 1.3258213996887207, "learning_rate": 1.5003706449221647e-05, "loss": 1.1213, "step": 6249 }, { "epoch": 0.9259259259259259, "grad_norm": 2.0105180740356445, "learning_rate": 1.497405485544848e-05, "loss": 1.0037, "step": 6250 }, { "epoch": 0.926074074074074, "grad_norm": 1.7296967506408691, "learning_rate": 1.4944403261675317e-05, "loss": 0.9131, "step": 6251 }, { "epoch": 0.9262222222222222, "grad_norm": 1.6838256120681763, "learning_rate": 1.491475166790215e-05, "loss": 1.1037, "step": 6252 }, { "epoch": 0.9263703703703704, "grad_norm": 2.1008238792419434, "learning_rate": 1.4885100074128985e-05, "loss": 1.1618, "step": 6253 }, { "epoch": 0.9265185185185185, "grad_norm": 2.8668837547302246, "learning_rate": 1.4855448480355818e-05, "loss": 1.0182, "step": 6254 }, { "epoch": 0.9266666666666666, "grad_norm": 1.3735758066177368, "learning_rate": 1.4825796886582654e-05, "loss": 0.7997, "step": 6255 }, { "epoch": 0.9268148148148149, "grad_norm": 3.844618797302246, "learning_rate": 1.4796145292809488e-05, "loss": 0.8231, "step": 6256 }, { "epoch": 0.926962962962963, "grad_norm": 1.8267629146575928, "learning_rate": 1.4766493699036324e-05, "loss": 0.9104, "step": 6257 }, { "epoch": 0.9271111111111111, 
"grad_norm": 2.0057625770568848, "learning_rate": 1.4736842105263157e-05, "loss": 0.8926, "step": 6258 }, { "epoch": 0.9272592592592592, "grad_norm": 2.0294721126556396, "learning_rate": 1.4707190511489993e-05, "loss": 0.9732, "step": 6259 }, { "epoch": 0.9274074074074075, "grad_norm": 1.9177600145339966, "learning_rate": 1.4677538917716829e-05, "loss": 0.8258, "step": 6260 }, { "epoch": 0.9275555555555556, "grad_norm": 3.415085792541504, "learning_rate": 1.4647887323943663e-05, "loss": 1.1299, "step": 6261 }, { "epoch": 0.9277037037037037, "grad_norm": 2.0719778537750244, "learning_rate": 1.4618235730170499e-05, "loss": 1.1079, "step": 6262 }, { "epoch": 0.9278518518518518, "grad_norm": 1.4281821250915527, "learning_rate": 1.4588584136397332e-05, "loss": 0.9103, "step": 6263 }, { "epoch": 0.928, "grad_norm": 3.8981099128723145, "learning_rate": 1.4558932542624168e-05, "loss": 1.0281, "step": 6264 }, { "epoch": 0.9281481481481482, "grad_norm": 1.7960991859436035, "learning_rate": 1.4529280948851002e-05, "loss": 1.0856, "step": 6265 }, { "epoch": 0.9282962962962963, "grad_norm": 1.6068756580352783, "learning_rate": 1.4499629355077838e-05, "loss": 1.0139, "step": 6266 }, { "epoch": 0.9284444444444444, "grad_norm": 2.6951966285705566, "learning_rate": 1.446997776130467e-05, "loss": 0.8411, "step": 6267 }, { "epoch": 0.9285925925925926, "grad_norm": 1.224673867225647, "learning_rate": 1.4440326167531507e-05, "loss": 1.0133, "step": 6268 }, { "epoch": 0.9287407407407408, "grad_norm": 1.793215036392212, "learning_rate": 1.441067457375834e-05, "loss": 1.0426, "step": 6269 }, { "epoch": 0.9288888888888889, "grad_norm": 1.3081490993499756, "learning_rate": 1.4381022979985175e-05, "loss": 0.8096, "step": 6270 }, { "epoch": 0.929037037037037, "grad_norm": 1.283671259880066, "learning_rate": 1.435137138621201e-05, "loss": 0.8562, "step": 6271 }, { "epoch": 0.9291851851851852, "grad_norm": 1.2067022323608398, "learning_rate": 1.4321719792438845e-05, "loss": 1.0622, "step": 6272 
}, { "epoch": 0.9293333333333333, "grad_norm": 2.639707326889038, "learning_rate": 1.4292068198665678e-05, "loss": 1.0466, "step": 6273 }, { "epoch": 0.9294814814814815, "grad_norm": 1.2013615369796753, "learning_rate": 1.4262416604892514e-05, "loss": 1.2571, "step": 6274 }, { "epoch": 0.9296296296296296, "grad_norm": 1.916764259338379, "learning_rate": 1.4232765011119348e-05, "loss": 0.9469, "step": 6275 }, { "epoch": 0.9297777777777778, "grad_norm": 1.9235259294509888, "learning_rate": 1.4203113417346184e-05, "loss": 1.121, "step": 6276 }, { "epoch": 0.9299259259259259, "grad_norm": 1.2695482969284058, "learning_rate": 1.4173461823573017e-05, "loss": 0.937, "step": 6277 }, { "epoch": 0.930074074074074, "grad_norm": 1.1932200193405151, "learning_rate": 1.4143810229799853e-05, "loss": 0.8635, "step": 6278 }, { "epoch": 0.9302222222222222, "grad_norm": 1.7852343320846558, "learning_rate": 1.4114158636026687e-05, "loss": 1.2502, "step": 6279 }, { "epoch": 0.9303703703703704, "grad_norm": 1.09291672706604, "learning_rate": 1.4084507042253523e-05, "loss": 0.9238, "step": 6280 }, { "epoch": 0.9305185185185185, "grad_norm": 1.2748316526412964, "learning_rate": 1.4054855448480356e-05, "loss": 1.2238, "step": 6281 }, { "epoch": 0.9306666666666666, "grad_norm": 1.7004750967025757, "learning_rate": 1.4025203854707192e-05, "loss": 1.1988, "step": 6282 }, { "epoch": 0.9308148148148148, "grad_norm": 1.8087915182113647, "learning_rate": 1.3995552260934024e-05, "loss": 1.1623, "step": 6283 }, { "epoch": 0.930962962962963, "grad_norm": 3.6730096340179443, "learning_rate": 1.396590066716086e-05, "loss": 1.2439, "step": 6284 }, { "epoch": 0.9311111111111111, "grad_norm": 1.895340919494629, "learning_rate": 1.3936249073387695e-05, "loss": 1.0449, "step": 6285 }, { "epoch": 0.9312592592592592, "grad_norm": 2.183992624282837, "learning_rate": 1.390659747961453e-05, "loss": 1.0405, "step": 6286 }, { "epoch": 0.9314074074074075, "grad_norm": 1.5544954538345337, "learning_rate": 
1.3876945885841363e-05, "loss": 0.7898, "step": 6287 }, { "epoch": 0.9315555555555556, "grad_norm": 1.2439019680023193, "learning_rate": 1.38472942920682e-05, "loss": 1.0442, "step": 6288 }, { "epoch": 0.9317037037037037, "grad_norm": 1.3818901777267456, "learning_rate": 1.3817642698295034e-05, "loss": 1.1079, "step": 6289 }, { "epoch": 0.9318518518518518, "grad_norm": 1.4408797025680542, "learning_rate": 1.378799110452187e-05, "loss": 1.0344, "step": 6290 }, { "epoch": 0.932, "grad_norm": 1.2716048955917358, "learning_rate": 1.3758339510748702e-05, "loss": 1.0635, "step": 6291 }, { "epoch": 0.9321481481481482, "grad_norm": 1.6171166896820068, "learning_rate": 1.3728687916975538e-05, "loss": 1.0549, "step": 6292 }, { "epoch": 0.9322962962962963, "grad_norm": 1.4237630367279053, "learning_rate": 1.3699036323202372e-05, "loss": 0.8729, "step": 6293 }, { "epoch": 0.9324444444444444, "grad_norm": 1.8324460983276367, "learning_rate": 1.3669384729429208e-05, "loss": 0.882, "step": 6294 }, { "epoch": 0.9325925925925926, "grad_norm": 1.6489802598953247, "learning_rate": 1.3639733135656041e-05, "loss": 1.0537, "step": 6295 }, { "epoch": 0.9327407407407408, "grad_norm": 1.4294698238372803, "learning_rate": 1.3610081541882877e-05, "loss": 0.9593, "step": 6296 }, { "epoch": 0.9328888888888889, "grad_norm": 1.6796156167984009, "learning_rate": 1.358042994810971e-05, "loss": 1.1417, "step": 6297 }, { "epoch": 0.933037037037037, "grad_norm": 2.3616726398468018, "learning_rate": 1.3550778354336546e-05, "loss": 1.104, "step": 6298 }, { "epoch": 0.9331851851851852, "grad_norm": 1.9400267601013184, "learning_rate": 1.352112676056338e-05, "loss": 1.1002, "step": 6299 }, { "epoch": 0.9333333333333333, "grad_norm": 1.6310405731201172, "learning_rate": 1.3491475166790216e-05, "loss": 0.9469, "step": 6300 }, { "epoch": 0.9334814814814815, "grad_norm": 1.4283984899520874, "learning_rate": 1.3461823573017052e-05, "loss": 1.0257, "step": 6301 }, { "epoch": 0.9336296296296296, "grad_norm": 
1.5619159936904907, "learning_rate": 1.3432171979243885e-05, "loss": 1.0525, "step": 6302 }, { "epoch": 0.9337777777777778, "grad_norm": 1.9005515575408936, "learning_rate": 1.340252038547072e-05, "loss": 1.2217, "step": 6303 }, { "epoch": 0.9339259259259259, "grad_norm": 2.2778842449188232, "learning_rate": 1.3372868791697555e-05, "loss": 1.0038, "step": 6304 }, { "epoch": 0.9340740740740741, "grad_norm": 2.3246541023254395, "learning_rate": 1.334321719792439e-05, "loss": 0.8599, "step": 6305 }, { "epoch": 0.9342222222222222, "grad_norm": 1.6047136783599854, "learning_rate": 1.3313565604151223e-05, "loss": 0.995, "step": 6306 }, { "epoch": 0.9343703703703704, "grad_norm": 1.4049116373062134, "learning_rate": 1.328391401037806e-05, "loss": 1.2271, "step": 6307 }, { "epoch": 0.9345185185185185, "grad_norm": 2.139193058013916, "learning_rate": 1.3254262416604894e-05, "loss": 1.2563, "step": 6308 }, { "epoch": 0.9346666666666666, "grad_norm": 2.2409770488739014, "learning_rate": 1.322461082283173e-05, "loss": 1.1317, "step": 6309 }, { "epoch": 0.9348148148148148, "grad_norm": 1.7860808372497559, "learning_rate": 1.3194959229058562e-05, "loss": 0.943, "step": 6310 }, { "epoch": 0.934962962962963, "grad_norm": 1.4653562307357788, "learning_rate": 1.3165307635285398e-05, "loss": 1.0892, "step": 6311 }, { "epoch": 0.9351111111111111, "grad_norm": 2.659821033477783, "learning_rate": 1.3135656041512231e-05, "loss": 1.0764, "step": 6312 }, { "epoch": 0.9352592592592592, "grad_norm": 6.066284656524658, "learning_rate": 1.3106004447739067e-05, "loss": 1.1168, "step": 6313 }, { "epoch": 0.9354074074074074, "grad_norm": 1.884269118309021, "learning_rate": 1.3076352853965901e-05, "loss": 0.9515, "step": 6314 }, { "epoch": 0.9355555555555556, "grad_norm": 2.44565486907959, "learning_rate": 1.3046701260192737e-05, "loss": 0.9311, "step": 6315 }, { "epoch": 0.9357037037037037, "grad_norm": 2.0083768367767334, "learning_rate": 1.301704966641957e-05, "loss": 0.9074, "step": 6316 }, { 
"epoch": 0.9358518518518518, "grad_norm": 1.552828311920166, "learning_rate": 1.2987398072646406e-05, "loss": 1.0611, "step": 6317 }, { "epoch": 0.936, "grad_norm": 1.6066118478775024, "learning_rate": 1.295774647887324e-05, "loss": 0.9488, "step": 6318 }, { "epoch": 0.9361481481481482, "grad_norm": 2.312607526779175, "learning_rate": 1.2928094885100076e-05, "loss": 1.0095, "step": 6319 }, { "epoch": 0.9362962962962963, "grad_norm": 1.71018648147583, "learning_rate": 1.2898443291326909e-05, "loss": 1.0762, "step": 6320 }, { "epoch": 0.9364444444444444, "grad_norm": 1.7650734186172485, "learning_rate": 1.2868791697553745e-05, "loss": 1.0212, "step": 6321 }, { "epoch": 0.9365925925925926, "grad_norm": 2.5213658809661865, "learning_rate": 1.2839140103780579e-05, "loss": 0.7357, "step": 6322 }, { "epoch": 0.9367407407407408, "grad_norm": 1.4773727655410767, "learning_rate": 1.2809488510007415e-05, "loss": 1.0724, "step": 6323 }, { "epoch": 0.9368888888888889, "grad_norm": 3.468384027481079, "learning_rate": 1.2779836916234247e-05, "loss": 1.1406, "step": 6324 }, { "epoch": 0.937037037037037, "grad_norm": 0.9928172826766968, "learning_rate": 1.2750185322461083e-05, "loss": 0.8181, "step": 6325 }, { "epoch": 0.9371851851851852, "grad_norm": 1.7566633224487305, "learning_rate": 1.2720533728687916e-05, "loss": 1.0395, "step": 6326 }, { "epoch": 0.9373333333333334, "grad_norm": 2.067307233810425, "learning_rate": 1.2690882134914752e-05, "loss": 1.0468, "step": 6327 }, { "epoch": 0.9374814814814815, "grad_norm": 1.3327713012695312, "learning_rate": 1.2661230541141586e-05, "loss": 0.8396, "step": 6328 }, { "epoch": 0.9376296296296296, "grad_norm": 2.0988290309906006, "learning_rate": 1.2631578947368422e-05, "loss": 0.7821, "step": 6329 }, { "epoch": 0.9377777777777778, "grad_norm": 2.2182228565216064, "learning_rate": 1.2601927353595255e-05, "loss": 0.8587, "step": 6330 }, { "epoch": 0.9379259259259259, "grad_norm": 2.2985942363739014, "learning_rate": 1.2572275759822091e-05, 
"loss": 0.9524, "step": 6331 }, { "epoch": 0.9380740740740741, "grad_norm": 1.6669297218322754, "learning_rate": 1.2542624166048925e-05, "loss": 1.1466, "step": 6332 }, { "epoch": 0.9382222222222222, "grad_norm": 1.1697335243225098, "learning_rate": 1.2512972572275761e-05, "loss": 0.8193, "step": 6333 }, { "epoch": 0.9383703703703704, "grad_norm": 1.3751897811889648, "learning_rate": 1.2483320978502596e-05, "loss": 1.1782, "step": 6334 }, { "epoch": 0.9385185185185185, "grad_norm": 1.8882319927215576, "learning_rate": 1.245366938472943e-05, "loss": 1.0286, "step": 6335 }, { "epoch": 0.9386666666666666, "grad_norm": 1.8155957460403442, "learning_rate": 1.2424017790956264e-05, "loss": 1.4968, "step": 6336 }, { "epoch": 0.9388148148148148, "grad_norm": 1.2545281648635864, "learning_rate": 1.23943661971831e-05, "loss": 0.9662, "step": 6337 }, { "epoch": 0.938962962962963, "grad_norm": 1.09188973903656, "learning_rate": 1.2364714603409934e-05, "loss": 1.0145, "step": 6338 }, { "epoch": 0.9391111111111111, "grad_norm": 1.5590113401412964, "learning_rate": 1.2335063009636769e-05, "loss": 0.9294, "step": 6339 }, { "epoch": 0.9392592592592592, "grad_norm": 1.930653691291809, "learning_rate": 1.2305411415863603e-05, "loss": 1.0176, "step": 6340 }, { "epoch": 0.9394074074074074, "grad_norm": 1.7895466089248657, "learning_rate": 1.2275759822090437e-05, "loss": 1.1676, "step": 6341 }, { "epoch": 0.9395555555555556, "grad_norm": 2.135303020477295, "learning_rate": 1.2246108228317273e-05, "loss": 1.0209, "step": 6342 }, { "epoch": 0.9397037037037037, "grad_norm": 1.345557689666748, "learning_rate": 1.2216456634544108e-05, "loss": 0.8773, "step": 6343 }, { "epoch": 0.9398518518518518, "grad_norm": 1.3942506313323975, "learning_rate": 1.2186805040770942e-05, "loss": 0.8741, "step": 6344 }, { "epoch": 0.94, "grad_norm": 1.980754017829895, "learning_rate": 1.2157153446997776e-05, "loss": 0.9258, "step": 6345 }, { "epoch": 0.9401481481481482, "grad_norm": 1.8694645166397095, 
"learning_rate": 1.212750185322461e-05, "loss": 0.9663, "step": 6346 }, { "epoch": 0.9402962962962963, "grad_norm": 1.7359546422958374, "learning_rate": 1.2097850259451446e-05, "loss": 1.0104, "step": 6347 }, { "epoch": 0.9404444444444444, "grad_norm": 1.3269741535186768, "learning_rate": 1.206819866567828e-05, "loss": 0.8816, "step": 6348 }, { "epoch": 0.9405925925925926, "grad_norm": 2.214254140853882, "learning_rate": 1.2038547071905115e-05, "loss": 1.0563, "step": 6349 }, { "epoch": 0.9407407407407408, "grad_norm": 1.2962080240249634, "learning_rate": 1.200889547813195e-05, "loss": 0.9694, "step": 6350 }, { "epoch": 0.9408888888888889, "grad_norm": 2.1471807956695557, "learning_rate": 1.1979243884358785e-05, "loss": 0.9636, "step": 6351 }, { "epoch": 0.941037037037037, "grad_norm": 1.5202158689498901, "learning_rate": 1.194959229058562e-05, "loss": 0.9733, "step": 6352 }, { "epoch": 0.9411851851851852, "grad_norm": 2.029224395751953, "learning_rate": 1.1919940696812454e-05, "loss": 0.8564, "step": 6353 }, { "epoch": 0.9413333333333334, "grad_norm": 1.591579556465149, "learning_rate": 1.1890289103039288e-05, "loss": 0.9636, "step": 6354 }, { "epoch": 0.9414814814814815, "grad_norm": 1.2714380025863647, "learning_rate": 1.1860637509266123e-05, "loss": 0.8864, "step": 6355 }, { "epoch": 0.9416296296296296, "grad_norm": 2.172243356704712, "learning_rate": 1.1830985915492958e-05, "loss": 1.2668, "step": 6356 }, { "epoch": 0.9417777777777778, "grad_norm": 1.522011160850525, "learning_rate": 1.1801334321719793e-05, "loss": 0.9845, "step": 6357 }, { "epoch": 0.9419259259259259, "grad_norm": 1.4941858053207397, "learning_rate": 1.1771682727946627e-05, "loss": 1.0767, "step": 6358 }, { "epoch": 0.9420740740740741, "grad_norm": 1.603853464126587, "learning_rate": 1.1742031134173461e-05, "loss": 1.0759, "step": 6359 }, { "epoch": 0.9422222222222222, "grad_norm": 3.413689613342285, "learning_rate": 1.1712379540400296e-05, "loss": 1.0716, "step": 6360 }, { "epoch": 
0.9423703703703704, "grad_norm": 2.329671859741211, "learning_rate": 1.1682727946627132e-05, "loss": 0.7877, "step": 6361 }, { "epoch": 0.9425185185185185, "grad_norm": 1.9882824420928955, "learning_rate": 1.1653076352853968e-05, "loss": 0.8958, "step": 6362 }, { "epoch": 0.9426666666666667, "grad_norm": 2.55271577835083, "learning_rate": 1.1623424759080802e-05, "loss": 1.2704, "step": 6363 }, { "epoch": 0.9428148148148148, "grad_norm": 1.7175090312957764, "learning_rate": 1.1593773165307636e-05, "loss": 0.8766, "step": 6364 }, { "epoch": 0.942962962962963, "grad_norm": 1.5329500436782837, "learning_rate": 1.156412157153447e-05, "loss": 1.2447, "step": 6365 }, { "epoch": 0.9431111111111111, "grad_norm": 2.170001745223999, "learning_rate": 1.1534469977761307e-05, "loss": 0.9401, "step": 6366 }, { "epoch": 0.9432592592592592, "grad_norm": 1.2377965450286865, "learning_rate": 1.150481838398814e-05, "loss": 0.9982, "step": 6367 }, { "epoch": 0.9434074074074074, "grad_norm": 2.794631242752075, "learning_rate": 1.1475166790214975e-05, "loss": 0.8746, "step": 6368 }, { "epoch": 0.9435555555555556, "grad_norm": 1.6014835834503174, "learning_rate": 1.144551519644181e-05, "loss": 0.9898, "step": 6369 }, { "epoch": 0.9437037037037037, "grad_norm": 1.6563359498977661, "learning_rate": 1.1415863602668644e-05, "loss": 1.2244, "step": 6370 }, { "epoch": 0.9438518518518518, "grad_norm": 1.6768234968185425, "learning_rate": 1.138621200889548e-05, "loss": 0.9442, "step": 6371 }, { "epoch": 0.944, "grad_norm": 1.246340274810791, "learning_rate": 1.1356560415122314e-05, "loss": 1.1951, "step": 6372 }, { "epoch": 0.9441481481481482, "grad_norm": 1.406167984008789, "learning_rate": 1.1326908821349148e-05, "loss": 0.6989, "step": 6373 }, { "epoch": 0.9442962962962963, "grad_norm": 5.012592315673828, "learning_rate": 1.1297257227575983e-05, "loss": 0.9495, "step": 6374 }, { "epoch": 0.9444444444444444, "grad_norm": 1.3904874324798584, "learning_rate": 1.1267605633802817e-05, "loss": 
1.0901, "step": 6375 }, { "epoch": 0.9445925925925925, "grad_norm": 1.3849149942398071, "learning_rate": 1.1237954040029653e-05, "loss": 1.1584, "step": 6376 }, { "epoch": 0.9447407407407408, "grad_norm": 1.42518150806427, "learning_rate": 1.1208302446256487e-05, "loss": 1.0418, "step": 6377 }, { "epoch": 0.9448888888888889, "grad_norm": 2.9414753913879395, "learning_rate": 1.1178650852483321e-05, "loss": 1.0448, "step": 6378 }, { "epoch": 0.945037037037037, "grad_norm": 1.3405108451843262, "learning_rate": 1.1148999258710156e-05, "loss": 1.059, "step": 6379 }, { "epoch": 0.9451851851851852, "grad_norm": 2.4102418422698975, "learning_rate": 1.1119347664936992e-05, "loss": 0.9772, "step": 6380 }, { "epoch": 0.9453333333333334, "grad_norm": 2.4661507606506348, "learning_rate": 1.1089696071163826e-05, "loss": 1.1724, "step": 6381 }, { "epoch": 0.9454814814814815, "grad_norm": 1.4191091060638428, "learning_rate": 1.106004447739066e-05, "loss": 0.8019, "step": 6382 }, { "epoch": 0.9456296296296296, "grad_norm": 1.429807186126709, "learning_rate": 1.1030392883617495e-05, "loss": 1.0792, "step": 6383 }, { "epoch": 0.9457777777777778, "grad_norm": 1.7942218780517578, "learning_rate": 1.1000741289844329e-05, "loss": 0.9689, "step": 6384 }, { "epoch": 0.945925925925926, "grad_norm": 2.545766592025757, "learning_rate": 1.0971089696071165e-05, "loss": 1.0841, "step": 6385 }, { "epoch": 0.9460740740740741, "grad_norm": 1.7510018348693848, "learning_rate": 1.0941438102298e-05, "loss": 1.0998, "step": 6386 }, { "epoch": 0.9462222222222222, "grad_norm": 1.6062695980072021, "learning_rate": 1.0911786508524834e-05, "loss": 0.9178, "step": 6387 }, { "epoch": 0.9463703703703704, "grad_norm": 1.8828139305114746, "learning_rate": 1.0882134914751668e-05, "loss": 1.0608, "step": 6388 }, { "epoch": 0.9465185185185185, "grad_norm": 1.3656530380249023, "learning_rate": 1.0852483320978502e-05, "loss": 0.8773, "step": 6389 }, { "epoch": 0.9466666666666667, "grad_norm": 2.1898579597473145, 
"learning_rate": 1.0822831727205338e-05, "loss": 0.9648, "step": 6390 }, { "epoch": 0.9468148148148148, "grad_norm": 1.3358759880065918, "learning_rate": 1.0793180133432172e-05, "loss": 1.0784, "step": 6391 }, { "epoch": 0.946962962962963, "grad_norm": 2.2030200958251953, "learning_rate": 1.0763528539659007e-05, "loss": 1.0226, "step": 6392 }, { "epoch": 0.9471111111111111, "grad_norm": 1.8959722518920898, "learning_rate": 1.0733876945885841e-05, "loss": 1.0063, "step": 6393 }, { "epoch": 0.9472592592592592, "grad_norm": 1.3742667436599731, "learning_rate": 1.0704225352112677e-05, "loss": 1.0494, "step": 6394 }, { "epoch": 0.9474074074074074, "grad_norm": 1.5060292482376099, "learning_rate": 1.0674573758339511e-05, "loss": 0.9241, "step": 6395 }, { "epoch": 0.9475555555555556, "grad_norm": 1.7580757141113281, "learning_rate": 1.0644922164566346e-05, "loss": 1.1803, "step": 6396 }, { "epoch": 0.9477037037037037, "grad_norm": 2.092602252960205, "learning_rate": 1.061527057079318e-05, "loss": 1.1633, "step": 6397 }, { "epoch": 0.9478518518518518, "grad_norm": 2.8467867374420166, "learning_rate": 1.0585618977020014e-05, "loss": 1.1381, "step": 6398 }, { "epoch": 0.948, "grad_norm": 1.915920376777649, "learning_rate": 1.055596738324685e-05, "loss": 0.9824, "step": 6399 }, { "epoch": 0.9481481481481482, "grad_norm": 1.6160054206848145, "learning_rate": 1.0526315789473684e-05, "loss": 0.9829, "step": 6400 }, { "epoch": 0.9482962962962963, "grad_norm": 1.2259273529052734, "learning_rate": 1.0496664195700519e-05, "loss": 1.0014, "step": 6401 }, { "epoch": 0.9484444444444444, "grad_norm": 1.5422176122665405, "learning_rate": 1.0467012601927353e-05, "loss": 0.8151, "step": 6402 }, { "epoch": 0.9485925925925925, "grad_norm": 2.2120344638824463, "learning_rate": 1.0437361008154187e-05, "loss": 1.3064, "step": 6403 }, { "epoch": 0.9487407407407408, "grad_norm": 1.8899518251419067, "learning_rate": 1.0407709414381023e-05, "loss": 1.0727, "step": 6404 }, { "epoch": 
0.9488888888888889, "grad_norm": 2.15838885307312, "learning_rate": 1.037805782060786e-05, "loss": 1.0804, "step": 6405 }, { "epoch": 0.949037037037037, "grad_norm": 2.040877342224121, "learning_rate": 1.0348406226834694e-05, "loss": 0.8583, "step": 6406 }, { "epoch": 0.9491851851851851, "grad_norm": 1.3041592836380005, "learning_rate": 1.0318754633061528e-05, "loss": 1.0614, "step": 6407 }, { "epoch": 0.9493333333333334, "grad_norm": 3.521601676940918, "learning_rate": 1.0289103039288362e-05, "loss": 0.7696, "step": 6408 }, { "epoch": 0.9494814814814815, "grad_norm": 2.875967264175415, "learning_rate": 1.0259451445515198e-05, "loss": 0.9629, "step": 6409 }, { "epoch": 0.9496296296296296, "grad_norm": 3.0974297523498535, "learning_rate": 1.0229799851742032e-05, "loss": 0.7914, "step": 6410 }, { "epoch": 0.9497777777777778, "grad_norm": 3.428769588470459, "learning_rate": 1.0200148257968867e-05, "loss": 0.8564, "step": 6411 }, { "epoch": 0.949925925925926, "grad_norm": 1.9870706796646118, "learning_rate": 1.0170496664195701e-05, "loss": 1.0374, "step": 6412 }, { "epoch": 0.9500740740740741, "grad_norm": 1.303742527961731, "learning_rate": 1.0140845070422535e-05, "loss": 0.8516, "step": 6413 }, { "epoch": 0.9502222222222222, "grad_norm": 1.4214708805084229, "learning_rate": 1.0111193476649371e-05, "loss": 0.8222, "step": 6414 }, { "epoch": 0.9503703703703704, "grad_norm": 3.44465708732605, "learning_rate": 1.0081541882876206e-05, "loss": 1.1527, "step": 6415 }, { "epoch": 0.9505185185185185, "grad_norm": 1.6411240100860596, "learning_rate": 1.005189028910304e-05, "loss": 1.0337, "step": 6416 }, { "epoch": 0.9506666666666667, "grad_norm": 1.5941029787063599, "learning_rate": 1.0022238695329874e-05, "loss": 0.9292, "step": 6417 }, { "epoch": 0.9508148148148148, "grad_norm": 1.6715425252914429, "learning_rate": 9.992587101556709e-06, "loss": 0.9052, "step": 6418 }, { "epoch": 0.950962962962963, "grad_norm": 1.9611213207244873, "learning_rate": 9.962935507783545e-06, 
"loss": 1.0738, "step": 6419 }, { "epoch": 0.9511111111111111, "grad_norm": 1.359175682067871, "learning_rate": 9.933283914010379e-06, "loss": 1.0809, "step": 6420 }, { "epoch": 0.9512592592592592, "grad_norm": 31.894559860229492, "learning_rate": 9.903632320237213e-06, "loss": 0.8927, "step": 6421 }, { "epoch": 0.9514074074074074, "grad_norm": 2.469003915786743, "learning_rate": 9.873980726464047e-06, "loss": 1.1294, "step": 6422 }, { "epoch": 0.9515555555555556, "grad_norm": 1.718002200126648, "learning_rate": 9.844329132690883e-06, "loss": 1.1614, "step": 6423 }, { "epoch": 0.9517037037037037, "grad_norm": 1.6662921905517578, "learning_rate": 9.814677538917718e-06, "loss": 0.952, "step": 6424 }, { "epoch": 0.9518518518518518, "grad_norm": 1.274985909461975, "learning_rate": 9.785025945144552e-06, "loss": 0.9689, "step": 6425 }, { "epoch": 0.952, "grad_norm": 1.6987491846084595, "learning_rate": 9.755374351371386e-06, "loss": 0.9865, "step": 6426 }, { "epoch": 0.9521481481481482, "grad_norm": 1.5471701622009277, "learning_rate": 9.72572275759822e-06, "loss": 1.0109, "step": 6427 }, { "epoch": 0.9522962962962963, "grad_norm": 1.410084843635559, "learning_rate": 9.696071163825057e-06, "loss": 1.0172, "step": 6428 }, { "epoch": 0.9524444444444444, "grad_norm": 2.285048246383667, "learning_rate": 9.666419570051891e-06, "loss": 0.8758, "step": 6429 }, { "epoch": 0.9525925925925925, "grad_norm": 1.4201016426086426, "learning_rate": 9.636767976278725e-06, "loss": 1.2978, "step": 6430 }, { "epoch": 0.9527407407407408, "grad_norm": 1.587431788444519, "learning_rate": 9.60711638250556e-06, "loss": 1.215, "step": 6431 }, { "epoch": 0.9528888888888889, "grad_norm": 1.606855034828186, "learning_rate": 9.577464788732394e-06, "loss": 1.0183, "step": 6432 }, { "epoch": 0.953037037037037, "grad_norm": 2.1990363597869873, "learning_rate": 9.54781319495923e-06, "loss": 0.9638, "step": 6433 }, { "epoch": 0.9531851851851851, "grad_norm": 2.6111652851104736, "learning_rate": 
9.518161601186064e-06, "loss": 0.7105, "step": 6434 }, { "epoch": 0.9533333333333334, "grad_norm": 1.7862600088119507, "learning_rate": 9.488510007412898e-06, "loss": 0.9702, "step": 6435 }, { "epoch": 0.9534814814814815, "grad_norm": 1.4127826690673828, "learning_rate": 9.458858413639733e-06, "loss": 0.9044, "step": 6436 }, { "epoch": 0.9536296296296296, "grad_norm": 2.47554087638855, "learning_rate": 9.429206819866569e-06, "loss": 1.1324, "step": 6437 }, { "epoch": 0.9537777777777777, "grad_norm": 1.462254524230957, "learning_rate": 9.399555226093403e-06, "loss": 0.9682, "step": 6438 }, { "epoch": 0.953925925925926, "grad_norm": 1.5132200717926025, "learning_rate": 9.369903632320237e-06, "loss": 1.1639, "step": 6439 }, { "epoch": 0.9540740740740741, "grad_norm": 1.4542734622955322, "learning_rate": 9.340252038547072e-06, "loss": 1.045, "step": 6440 }, { "epoch": 0.9542222222222222, "grad_norm": 1.4217482805252075, "learning_rate": 9.310600444773906e-06, "loss": 0.9905, "step": 6441 }, { "epoch": 0.9543703703703704, "grad_norm": 2.0644896030426025, "learning_rate": 9.280948851000742e-06, "loss": 0.958, "step": 6442 }, { "epoch": 0.9545185185185185, "grad_norm": 1.2777775526046753, "learning_rate": 9.251297257227576e-06, "loss": 0.9854, "step": 6443 }, { "epoch": 0.9546666666666667, "grad_norm": 1.7243410348892212, "learning_rate": 9.22164566345441e-06, "loss": 1.0892, "step": 6444 }, { "epoch": 0.9548148148148148, "grad_norm": 1.822312593460083, "learning_rate": 9.191994069681246e-06, "loss": 0.9043, "step": 6445 }, { "epoch": 0.954962962962963, "grad_norm": 2.3563039302825928, "learning_rate": 9.16234247590808e-06, "loss": 1.0392, "step": 6446 }, { "epoch": 0.9551111111111111, "grad_norm": 1.7175642251968384, "learning_rate": 9.132690882134915e-06, "loss": 1.0496, "step": 6447 }, { "epoch": 0.9552592592592593, "grad_norm": 2.303910970687866, "learning_rate": 9.103039288361751e-06, "loss": 1.0964, "step": 6448 }, { "epoch": 0.9554074074074074, "grad_norm": 
1.6031699180603027, "learning_rate": 9.073387694588585e-06, "loss": 0.9436, "step": 6449 }, { "epoch": 0.9555555555555556, "grad_norm": 1.137147307395935, "learning_rate": 9.04373610081542e-06, "loss": 0.7867, "step": 6450 }, { "epoch": 0.9557037037037037, "grad_norm": 1.563591480255127, "learning_rate": 9.014084507042254e-06, "loss": 0.8269, "step": 6451 }, { "epoch": 0.9558518518518518, "grad_norm": 2.2050209045410156, "learning_rate": 8.98443291326909e-06, "loss": 1.0443, "step": 6452 }, { "epoch": 0.956, "grad_norm": 2.460078001022339, "learning_rate": 8.954781319495924e-06, "loss": 0.7234, "step": 6453 }, { "epoch": 0.9561481481481482, "grad_norm": 2.6189513206481934, "learning_rate": 8.925129725722758e-06, "loss": 0.9367, "step": 6454 }, { "epoch": 0.9562962962962963, "grad_norm": 1.8325825929641724, "learning_rate": 8.895478131949593e-06, "loss": 0.8708, "step": 6455 }, { "epoch": 0.9564444444444444, "grad_norm": 1.6920523643493652, "learning_rate": 8.865826538176427e-06, "loss": 0.8271, "step": 6456 }, { "epoch": 0.9565925925925925, "grad_norm": 5.583585262298584, "learning_rate": 8.836174944403263e-06, "loss": 1.0079, "step": 6457 }, { "epoch": 0.9567407407407408, "grad_norm": 2.426647901535034, "learning_rate": 8.806523350630097e-06, "loss": 0.9096, "step": 6458 }, { "epoch": 0.9568888888888889, "grad_norm": 3.149855852127075, "learning_rate": 8.776871756856932e-06, "loss": 0.8714, "step": 6459 }, { "epoch": 0.957037037037037, "grad_norm": 1.0600141286849976, "learning_rate": 8.747220163083766e-06, "loss": 0.8498, "step": 6460 }, { "epoch": 0.9571851851851851, "grad_norm": 1.5460138320922852, "learning_rate": 8.7175685693106e-06, "loss": 1.2422, "step": 6461 }, { "epoch": 0.9573333333333334, "grad_norm": 2.473261833190918, "learning_rate": 8.687916975537436e-06, "loss": 0.9405, "step": 6462 }, { "epoch": 0.9574814814814815, "grad_norm": 1.4824777841567993, "learning_rate": 8.65826538176427e-06, "loss": 0.8999, "step": 6463 }, { "epoch": 
0.9576296296296296, "grad_norm": 2.313084602355957, "learning_rate": 8.628613787991105e-06, "loss": 1.0752, "step": 6464 }, { "epoch": 0.9577777777777777, "grad_norm": 4.357772350311279, "learning_rate": 8.598962194217939e-06, "loss": 0.7528, "step": 6465 }, { "epoch": 0.957925925925926, "grad_norm": 1.278823733329773, "learning_rate": 8.569310600444775e-06, "loss": 0.9011, "step": 6466 }, { "epoch": 0.9580740740740741, "grad_norm": 1.3825360536575317, "learning_rate": 8.53965900667161e-06, "loss": 1.2506, "step": 6467 }, { "epoch": 0.9582222222222222, "grad_norm": 1.978281021118164, "learning_rate": 8.510007412898444e-06, "loss": 0.9997, "step": 6468 }, { "epoch": 0.9583703703703703, "grad_norm": 1.4772392511367798, "learning_rate": 8.480355819125278e-06, "loss": 0.9156, "step": 6469 }, { "epoch": 0.9585185185185185, "grad_norm": 1.7920863628387451, "learning_rate": 8.450704225352112e-06, "loss": 0.8526, "step": 6470 }, { "epoch": 0.9586666666666667, "grad_norm": 1.5702614784240723, "learning_rate": 8.421052631578948e-06, "loss": 0.8537, "step": 6471 }, { "epoch": 0.9588148148148148, "grad_norm": 2.078935146331787, "learning_rate": 8.391401037805783e-06, "loss": 0.877, "step": 6472 }, { "epoch": 0.958962962962963, "grad_norm": 1.4247301816940308, "learning_rate": 8.361749444032617e-06, "loss": 0.9224, "step": 6473 }, { "epoch": 0.9591111111111111, "grad_norm": 2.6739819049835205, "learning_rate": 8.332097850259451e-06, "loss": 1.047, "step": 6474 }, { "epoch": 0.9592592592592593, "grad_norm": 7.315777778625488, "learning_rate": 8.302446256486285e-06, "loss": 0.881, "step": 6475 }, { "epoch": 0.9594074074074074, "grad_norm": 1.5592920780181885, "learning_rate": 8.272794662713121e-06, "loss": 1.1478, "step": 6476 }, { "epoch": 0.9595555555555556, "grad_norm": 1.735410451889038, "learning_rate": 8.243143068939956e-06, "loss": 1.0698, "step": 6477 }, { "epoch": 0.9597037037037037, "grad_norm": 1.1631207466125488, "learning_rate": 8.21349147516679e-06, "loss": 0.9357, 
"step": 6478 }, { "epoch": 0.9598518518518518, "grad_norm": 1.8847887516021729, "learning_rate": 8.183839881393624e-06, "loss": 1.1412, "step": 6479 }, { "epoch": 0.96, "grad_norm": 2.4948129653930664, "learning_rate": 8.15418828762046e-06, "loss": 1.012, "step": 6480 }, { "epoch": 0.9601481481481482, "grad_norm": 1.49558424949646, "learning_rate": 8.124536693847295e-06, "loss": 0.9518, "step": 6481 }, { "epoch": 0.9602962962962963, "grad_norm": 1.9432427883148193, "learning_rate": 8.094885100074129e-06, "loss": 0.8645, "step": 6482 }, { "epoch": 0.9604444444444444, "grad_norm": 1.3539835214614868, "learning_rate": 8.065233506300963e-06, "loss": 0.814, "step": 6483 }, { "epoch": 0.9605925925925926, "grad_norm": 2.094672918319702, "learning_rate": 8.035581912527797e-06, "loss": 0.9301, "step": 6484 }, { "epoch": 0.9607407407407408, "grad_norm": 2.508594512939453, "learning_rate": 8.005930318754633e-06, "loss": 1.0625, "step": 6485 }, { "epoch": 0.9608888888888889, "grad_norm": 1.4947229623794556, "learning_rate": 7.976278724981468e-06, "loss": 0.9313, "step": 6486 }, { "epoch": 0.961037037037037, "grad_norm": 1.365053415298462, "learning_rate": 7.946627131208304e-06, "loss": 0.9352, "step": 6487 }, { "epoch": 0.9611851851851851, "grad_norm": 1.3837653398513794, "learning_rate": 7.916975537435138e-06, "loss": 0.7481, "step": 6488 }, { "epoch": 0.9613333333333334, "grad_norm": 2.1290810108184814, "learning_rate": 7.887323943661972e-06, "loss": 1.0015, "step": 6489 }, { "epoch": 0.9614814814814815, "grad_norm": 1.3843375444412231, "learning_rate": 7.857672349888807e-06, "loss": 0.8474, "step": 6490 }, { "epoch": 0.9616296296296296, "grad_norm": 1.7854416370391846, "learning_rate": 7.828020756115643e-06, "loss": 0.819, "step": 6491 }, { "epoch": 0.9617777777777777, "grad_norm": 1.7904064655303955, "learning_rate": 7.798369162342477e-06, "loss": 0.8276, "step": 6492 }, { "epoch": 0.961925925925926, "grad_norm": 8.385926246643066, "learning_rate": 7.768717568569311e-06, 
"loss": 0.6376, "step": 6493 }, { "epoch": 0.9620740740740741, "grad_norm": 1.8393787145614624, "learning_rate": 7.739065974796146e-06, "loss": 0.8887, "step": 6494 }, { "epoch": 0.9622222222222222, "grad_norm": 1.6400352716445923, "learning_rate": 7.709414381022981e-06, "loss": 0.9952, "step": 6495 }, { "epoch": 0.9623703703703703, "grad_norm": 1.658842921257019, "learning_rate": 7.679762787249816e-06, "loss": 1.0377, "step": 6496 }, { "epoch": 0.9625185185185186, "grad_norm": 2.5278728008270264, "learning_rate": 7.65011119347665e-06, "loss": 0.8351, "step": 6497 }, { "epoch": 0.9626666666666667, "grad_norm": 1.5374990701675415, "learning_rate": 7.620459599703484e-06, "loss": 0.8171, "step": 6498 }, { "epoch": 0.9628148148148148, "grad_norm": 2.5066142082214355, "learning_rate": 7.5908080059303195e-06, "loss": 1.0571, "step": 6499 }, { "epoch": 0.9629629629629629, "grad_norm": 1.5100359916687012, "learning_rate": 7.561156412157154e-06, "loss": 1.1588, "step": 6500 }, { "epoch": 0.9631111111111111, "grad_norm": 2.0300960540771484, "learning_rate": 7.531504818383989e-06, "loss": 1.0068, "step": 6501 }, { "epoch": 0.9632592592592593, "grad_norm": 1.3035870790481567, "learning_rate": 7.501853224610823e-06, "loss": 0.9973, "step": 6502 }, { "epoch": 0.9634074074074074, "grad_norm": 2.154204845428467, "learning_rate": 7.472201630837658e-06, "loss": 0.7465, "step": 6503 }, { "epoch": 0.9635555555555556, "grad_norm": 1.118698000907898, "learning_rate": 7.442550037064493e-06, "loss": 1.0132, "step": 6504 }, { "epoch": 0.9637037037037037, "grad_norm": 1.9799890518188477, "learning_rate": 7.412898443291327e-06, "loss": 0.9759, "step": 6505 }, { "epoch": 0.9638518518518518, "grad_norm": 1.299014925956726, "learning_rate": 7.383246849518162e-06, "loss": 0.9128, "step": 6506 }, { "epoch": 0.964, "grad_norm": 1.2748594284057617, "learning_rate": 7.3535952557449964e-06, "loss": 0.9606, "step": 6507 }, { "epoch": 0.9641481481481482, "grad_norm": 1.5896930694580078, 
"learning_rate": 7.3239436619718316e-06, "loss": 0.7978, "step": 6508 }, { "epoch": 0.9642962962962963, "grad_norm": 1.6602082252502441, "learning_rate": 7.294292068198666e-06, "loss": 0.8388, "step": 6509 }, { "epoch": 0.9644444444444444, "grad_norm": 4.57015323638916, "learning_rate": 7.264640474425501e-06, "loss": 0.8386, "step": 6510 }, { "epoch": 0.9645925925925926, "grad_norm": 1.690658688545227, "learning_rate": 7.234988880652335e-06, "loss": 0.9881, "step": 6511 }, { "epoch": 0.9647407407407408, "grad_norm": 2.0589194297790527, "learning_rate": 7.20533728687917e-06, "loss": 0.7598, "step": 6512 }, { "epoch": 0.9648888888888889, "grad_norm": 1.7922649383544922, "learning_rate": 7.175685693106005e-06, "loss": 0.9423, "step": 6513 }, { "epoch": 0.965037037037037, "grad_norm": 1.9171276092529297, "learning_rate": 7.146034099332839e-06, "loss": 0.9521, "step": 6514 }, { "epoch": 0.9651851851851851, "grad_norm": 1.361095666885376, "learning_rate": 7.116382505559674e-06, "loss": 0.9834, "step": 6515 }, { "epoch": 0.9653333333333334, "grad_norm": 2.308819532394409, "learning_rate": 7.0867309117865085e-06, "loss": 1.1278, "step": 6516 }, { "epoch": 0.9654814814814815, "grad_norm": 1.7899411916732788, "learning_rate": 7.057079318013344e-06, "loss": 1.1066, "step": 6517 }, { "epoch": 0.9656296296296296, "grad_norm": 1.8420344591140747, "learning_rate": 7.027427724240178e-06, "loss": 1.0559, "step": 6518 }, { "epoch": 0.9657777777777777, "grad_norm": 2.107468605041504, "learning_rate": 6.997776130467012e-06, "loss": 0.8807, "step": 6519 }, { "epoch": 0.965925925925926, "grad_norm": 2.2763946056365967, "learning_rate": 6.968124536693847e-06, "loss": 0.9905, "step": 6520 }, { "epoch": 0.9660740740740741, "grad_norm": 2.3293261528015137, "learning_rate": 6.938472942920682e-06, "loss": 1.0107, "step": 6521 }, { "epoch": 0.9662222222222222, "grad_norm": 2.9471616744995117, "learning_rate": 6.908821349147517e-06, "loss": 0.9852, "step": 6522 }, { "epoch": 0.9663703703703703, 
"grad_norm": 1.65193772315979, "learning_rate": 6.879169755374351e-06, "loss": 0.9008, "step": 6523 }, { "epoch": 0.9665185185185186, "grad_norm": 1.3680131435394287, "learning_rate": 6.849518161601186e-06, "loss": 0.8921, "step": 6524 }, { "epoch": 0.9666666666666667, "grad_norm": 1.1072235107421875, "learning_rate": 6.8198665678280205e-06, "loss": 1.0373, "step": 6525 }, { "epoch": 0.9668148148148148, "grad_norm": 1.5350090265274048, "learning_rate": 6.790214974054855e-06, "loss": 0.8268, "step": 6526 }, { "epoch": 0.9669629629629629, "grad_norm": 1.4053906202316284, "learning_rate": 6.76056338028169e-06, "loss": 0.8095, "step": 6527 }, { "epoch": 0.9671111111111111, "grad_norm": 2.2893593311309814, "learning_rate": 6.730911786508526e-06, "loss": 1.1104, "step": 6528 }, { "epoch": 0.9672592592592593, "grad_norm": 1.2980014085769653, "learning_rate": 6.70126019273536e-06, "loss": 0.8806, "step": 6529 }, { "epoch": 0.9674074074074074, "grad_norm": 2.058032989501953, "learning_rate": 6.671608598962195e-06, "loss": 1.0578, "step": 6530 }, { "epoch": 0.9675555555555555, "grad_norm": 2.80718994140625, "learning_rate": 6.64195700518903e-06, "loss": 0.8415, "step": 6531 }, { "epoch": 0.9677037037037037, "grad_norm": 2.3966293334960938, "learning_rate": 6.612305411415865e-06, "loss": 0.9529, "step": 6532 }, { "epoch": 0.9678518518518519, "grad_norm": 1.762076735496521, "learning_rate": 6.582653817642699e-06, "loss": 1.025, "step": 6533 }, { "epoch": 0.968, "grad_norm": 1.6157422065734863, "learning_rate": 6.5530022238695334e-06, "loss": 0.8749, "step": 6534 }, { "epoch": 0.9681481481481482, "grad_norm": 1.8800804615020752, "learning_rate": 6.5233506300963686e-06, "loss": 1.1532, "step": 6535 }, { "epoch": 0.9682962962962963, "grad_norm": 3.605458974838257, "learning_rate": 6.493699036323203e-06, "loss": 0.9089, "step": 6536 }, { "epoch": 0.9684444444444444, "grad_norm": 1.5120714902877808, "learning_rate": 6.464047442550038e-06, "loss": 1.0531, "step": 6537 }, { "epoch": 
0.9685925925925926, "grad_norm": 2.1298978328704834, "learning_rate": 6.434395848776872e-06, "loss": 1.0819, "step": 6538 }, { "epoch": 0.9687407407407408, "grad_norm": 1.5408835411071777, "learning_rate": 6.4047442550037074e-06, "loss": 1.3253, "step": 6539 }, { "epoch": 0.9688888888888889, "grad_norm": 3.161810874938965, "learning_rate": 6.375092661230542e-06, "loss": 1.0546, "step": 6540 }, { "epoch": 0.969037037037037, "grad_norm": 5.96724271774292, "learning_rate": 6.345441067457376e-06, "loss": 0.8363, "step": 6541 }, { "epoch": 0.9691851851851851, "grad_norm": 1.551337718963623, "learning_rate": 6.315789473684211e-06, "loss": 0.7753, "step": 6542 }, { "epoch": 0.9693333333333334, "grad_norm": 2.6459968090057373, "learning_rate": 6.2861378799110455e-06, "loss": 0.8285, "step": 6543 }, { "epoch": 0.9694814814814815, "grad_norm": 1.5539460182189941, "learning_rate": 6.256486286137881e-06, "loss": 1.2449, "step": 6544 }, { "epoch": 0.9696296296296296, "grad_norm": 2.9508183002471924, "learning_rate": 6.226834692364715e-06, "loss": 0.9232, "step": 6545 }, { "epoch": 0.9697777777777777, "grad_norm": 1.4745116233825684, "learning_rate": 6.19718309859155e-06, "loss": 0.8183, "step": 6546 }, { "epoch": 0.969925925925926, "grad_norm": 1.5062710046768188, "learning_rate": 6.167531504818384e-06, "loss": 0.8961, "step": 6547 }, { "epoch": 0.9700740740740741, "grad_norm": 1.6228721141815186, "learning_rate": 6.137879911045219e-06, "loss": 1.0116, "step": 6548 }, { "epoch": 0.9702222222222222, "grad_norm": 1.65943443775177, "learning_rate": 6.108228317272054e-06, "loss": 0.8478, "step": 6549 }, { "epoch": 0.9703703703703703, "grad_norm": 3.417999744415283, "learning_rate": 6.078576723498888e-06, "loss": 0.925, "step": 6550 }, { "epoch": 0.9705185185185186, "grad_norm": 1.63390052318573, "learning_rate": 6.048925129725723e-06, "loss": 1.072, "step": 6551 }, { "epoch": 0.9706666666666667, "grad_norm": 1.3300889730453491, "learning_rate": 6.0192735359525575e-06, "loss": 
0.9626, "step": 6552 }, { "epoch": 0.9708148148148148, "grad_norm": 1.8411054611206055, "learning_rate": 5.989621942179393e-06, "loss": 1.0189, "step": 6553 }, { "epoch": 0.9709629629629629, "grad_norm": 2.7753472328186035, "learning_rate": 5.959970348406227e-06, "loss": 0.9686, "step": 6554 }, { "epoch": 0.9711111111111111, "grad_norm": 2.478764295578003, "learning_rate": 5.930318754633061e-06, "loss": 0.8573, "step": 6555 }, { "epoch": 0.9712592592592593, "grad_norm": 1.5853458642959595, "learning_rate": 5.900667160859896e-06, "loss": 0.7584, "step": 6556 }, { "epoch": 0.9714074074074074, "grad_norm": 1.3515368700027466, "learning_rate": 5.871015567086731e-06, "loss": 1.1072, "step": 6557 }, { "epoch": 0.9715555555555555, "grad_norm": 1.7907657623291016, "learning_rate": 5.841363973313566e-06, "loss": 0.9754, "step": 6558 }, { "epoch": 0.9717037037037037, "grad_norm": 3.0642828941345215, "learning_rate": 5.811712379540401e-06, "loss": 0.8858, "step": 6559 }, { "epoch": 0.9718518518518519, "grad_norm": 1.4035335779190063, "learning_rate": 5.782060785767235e-06, "loss": 1.2295, "step": 6560 }, { "epoch": 0.972, "grad_norm": 1.6747664213180542, "learning_rate": 5.75240919199407e-06, "loss": 0.9296, "step": 6561 }, { "epoch": 0.9721481481481481, "grad_norm": 2.323160409927368, "learning_rate": 5.722757598220905e-06, "loss": 1.0252, "step": 6562 }, { "epoch": 0.9722962962962963, "grad_norm": 1.5858911275863647, "learning_rate": 5.69310600444774e-06, "loss": 0.7889, "step": 6563 }, { "epoch": 0.9724444444444444, "grad_norm": 1.511087417602539, "learning_rate": 5.663454410674574e-06, "loss": 1.0915, "step": 6564 }, { "epoch": 0.9725925925925926, "grad_norm": 1.9369533061981201, "learning_rate": 5.6338028169014084e-06, "loss": 0.9701, "step": 6565 }, { "epoch": 0.9727407407407408, "grad_norm": 1.9651539325714111, "learning_rate": 5.604151223128244e-06, "loss": 1.0313, "step": 6566 }, { "epoch": 0.9728888888888889, "grad_norm": 1.8601475954055786, "learning_rate": 
5.574499629355078e-06, "loss": 0.8278, "step": 6567 }, { "epoch": 0.973037037037037, "grad_norm": 2.7122087478637695, "learning_rate": 5.544848035581913e-06, "loss": 0.9094, "step": 6568 }, { "epoch": 0.9731851851851852, "grad_norm": 1.3325860500335693, "learning_rate": 5.515196441808747e-06, "loss": 1.1443, "step": 6569 }, { "epoch": 0.9733333333333334, "grad_norm": 3.21024489402771, "learning_rate": 5.4855448480355825e-06, "loss": 0.8182, "step": 6570 }, { "epoch": 0.9734814814814815, "grad_norm": 1.379136562347412, "learning_rate": 5.455893254262417e-06, "loss": 1.0857, "step": 6571 }, { "epoch": 0.9736296296296296, "grad_norm": 1.7300907373428345, "learning_rate": 5.426241660489251e-06, "loss": 1.0925, "step": 6572 }, { "epoch": 0.9737777777777777, "grad_norm": 1.1760673522949219, "learning_rate": 5.396590066716086e-06, "loss": 0.8552, "step": 6573 }, { "epoch": 0.973925925925926, "grad_norm": 1.8063416481018066, "learning_rate": 5.3669384729429205e-06, "loss": 0.8869, "step": 6574 }, { "epoch": 0.9740740740740741, "grad_norm": 1.8884493112564087, "learning_rate": 5.337286879169756e-06, "loss": 0.8914, "step": 6575 }, { "epoch": 0.9742222222222222, "grad_norm": 2.1271440982818604, "learning_rate": 5.30763528539659e-06, "loss": 0.852, "step": 6576 }, { "epoch": 0.9743703703703703, "grad_norm": 1.5887964963912964, "learning_rate": 5.277983691623425e-06, "loss": 0.8127, "step": 6577 }, { "epoch": 0.9745185185185186, "grad_norm": 1.3158174753189087, "learning_rate": 5.248332097850259e-06, "loss": 0.7721, "step": 6578 }, { "epoch": 0.9746666666666667, "grad_norm": 2.63740611076355, "learning_rate": 5.218680504077094e-06, "loss": 0.7826, "step": 6579 }, { "epoch": 0.9748148148148148, "grad_norm": 1.204813003540039, "learning_rate": 5.18902891030393e-06, "loss": 0.7437, "step": 6580 }, { "epoch": 0.9749629629629629, "grad_norm": 1.3193559646606445, "learning_rate": 5.159377316530764e-06, "loss": 1.0686, "step": 6581 }, { "epoch": 0.9751111111111112, "grad_norm": 
1.195284128189087, "learning_rate": 5.129725722757599e-06, "loss": 1.1198, "step": 6582 }, { "epoch": 0.9752592592592593, "grad_norm": 1.8917425870895386, "learning_rate": 5.100074128984433e-06, "loss": 1.034, "step": 6583 }, { "epoch": 0.9754074074074074, "grad_norm": 2.3464300632476807, "learning_rate": 5.070422535211268e-06, "loss": 1.0442, "step": 6584 }, { "epoch": 0.9755555555555555, "grad_norm": 2.2270100116729736, "learning_rate": 5.040770941438103e-06, "loss": 1.0523, "step": 6585 }, { "epoch": 0.9757037037037037, "grad_norm": 1.4755874872207642, "learning_rate": 5.011119347664937e-06, "loss": 0.959, "step": 6586 }, { "epoch": 0.9758518518518519, "grad_norm": 3.1815688610076904, "learning_rate": 4.981467753891772e-06, "loss": 0.7292, "step": 6587 }, { "epoch": 0.976, "grad_norm": 1.4934961795806885, "learning_rate": 4.9518161601186066e-06, "loss": 0.9943, "step": 6588 }, { "epoch": 0.9761481481481481, "grad_norm": 1.389280080795288, "learning_rate": 4.922164566345442e-06, "loss": 0.9042, "step": 6589 }, { "epoch": 0.9762962962962963, "grad_norm": 1.9162555932998657, "learning_rate": 4.892512972572276e-06, "loss": 0.9948, "step": 6590 }, { "epoch": 0.9764444444444444, "grad_norm": 1.6919996738433838, "learning_rate": 4.86286137879911e-06, "loss": 0.842, "step": 6591 }, { "epoch": 0.9765925925925926, "grad_norm": 1.2007420063018799, "learning_rate": 4.8332097850259454e-06, "loss": 0.6764, "step": 6592 }, { "epoch": 0.9767407407407407, "grad_norm": 2.5459723472595215, "learning_rate": 4.80355819125278e-06, "loss": 0.8209, "step": 6593 }, { "epoch": 0.9768888888888889, "grad_norm": 2.35369873046875, "learning_rate": 4.773906597479615e-06, "loss": 0.8723, "step": 6594 }, { "epoch": 0.977037037037037, "grad_norm": 1.2594969272613525, "learning_rate": 4.744255003706449e-06, "loss": 0.9956, "step": 6595 }, { "epoch": 0.9771851851851852, "grad_norm": 1.4785923957824707, "learning_rate": 4.714603409933284e-06, "loss": 1.1136, "step": 6596 }, { "epoch": 
0.9773333333333334, "grad_norm": 1.6019283533096313, "learning_rate": 4.684951816160119e-06, "loss": 0.9231, "step": 6597 }, { "epoch": 0.9774814814814815, "grad_norm": 1.6392041444778442, "learning_rate": 4.655300222386953e-06, "loss": 0.9903, "step": 6598 }, { "epoch": 0.9776296296296296, "grad_norm": 2.194157361984253, "learning_rate": 4.625648628613788e-06, "loss": 1.0814, "step": 6599 }, { "epoch": 0.9777777777777777, "grad_norm": 1.9493076801300049, "learning_rate": 4.595997034840623e-06, "loss": 1.0672, "step": 6600 }, { "epoch": 0.977925925925926, "grad_norm": 1.8517451286315918, "learning_rate": 4.5663454410674575e-06, "loss": 0.7373, "step": 6601 }, { "epoch": 0.9780740740740741, "grad_norm": 1.6022025346755981, "learning_rate": 4.536693847294293e-06, "loss": 0.8975, "step": 6602 }, { "epoch": 0.9782222222222222, "grad_norm": 1.5256716012954712, "learning_rate": 4.507042253521127e-06, "loss": 1.0167, "step": 6603 }, { "epoch": 0.9783703703703703, "grad_norm": 1.8543671369552612, "learning_rate": 4.477390659747962e-06, "loss": 0.8798, "step": 6604 }, { "epoch": 0.9785185185185186, "grad_norm": 1.1587783098220825, "learning_rate": 4.447739065974796e-06, "loss": 0.939, "step": 6605 }, { "epoch": 0.9786666666666667, "grad_norm": 1.7228477001190186, "learning_rate": 4.4180874722016315e-06, "loss": 0.8886, "step": 6606 }, { "epoch": 0.9788148148148148, "grad_norm": 1.776406168937683, "learning_rate": 4.388435878428466e-06, "loss": 1.0077, "step": 6607 }, { "epoch": 0.9789629629629629, "grad_norm": 2.580406427383423, "learning_rate": 4.3587842846553e-06, "loss": 1.0536, "step": 6608 }, { "epoch": 0.9791111111111112, "grad_norm": 1.3938255310058594, "learning_rate": 4.329132690882135e-06, "loss": 0.8188, "step": 6609 }, { "epoch": 0.9792592592592593, "grad_norm": 1.5968995094299316, "learning_rate": 4.2994810971089695e-06, "loss": 1.1257, "step": 6610 }, { "epoch": 0.9794074074074074, "grad_norm": 1.3835958242416382, "learning_rate": 4.269829503335805e-06, 
"loss": 1.1758, "step": 6611 }, { "epoch": 0.9795555555555555, "grad_norm": 2.4501984119415283, "learning_rate": 4.240177909562639e-06, "loss": 1.0919, "step": 6612 }, { "epoch": 0.9797037037037037, "grad_norm": 1.728827714920044, "learning_rate": 4.210526315789474e-06, "loss": 0.9941, "step": 6613 }, { "epoch": 0.9798518518518519, "grad_norm": 1.5663871765136719, "learning_rate": 4.180874722016308e-06, "loss": 0.7862, "step": 6614 }, { "epoch": 0.98, "grad_norm": 1.8939645290374756, "learning_rate": 4.151223128243143e-06, "loss": 0.9582, "step": 6615 }, { "epoch": 0.9801481481481481, "grad_norm": 1.8436479568481445, "learning_rate": 4.121571534469978e-06, "loss": 1.1677, "step": 6616 }, { "epoch": 0.9802962962962963, "grad_norm": 2.3548879623413086, "learning_rate": 4.091919940696812e-06, "loss": 0.9571, "step": 6617 }, { "epoch": 0.9804444444444445, "grad_norm": 1.6790826320648193, "learning_rate": 4.062268346923647e-06, "loss": 1.2313, "step": 6618 }, { "epoch": 0.9805925925925926, "grad_norm": 1.4376988410949707, "learning_rate": 4.032616753150482e-06, "loss": 0.926, "step": 6619 }, { "epoch": 0.9807407407407407, "grad_norm": 2.4382681846618652, "learning_rate": 4.002965159377317e-06, "loss": 1.1405, "step": 6620 }, { "epoch": 0.9808888888888889, "grad_norm": 1.5281683206558228, "learning_rate": 3.973313565604152e-06, "loss": 0.981, "step": 6621 }, { "epoch": 0.981037037037037, "grad_norm": 1.5083342790603638, "learning_rate": 3.943661971830986e-06, "loss": 0.9166, "step": 6622 }, { "epoch": 0.9811851851851852, "grad_norm": 1.549145221710205, "learning_rate": 3.914010378057821e-06, "loss": 0.9484, "step": 6623 }, { "epoch": 0.9813333333333333, "grad_norm": 1.4674689769744873, "learning_rate": 3.884358784284656e-06, "loss": 0.8851, "step": 6624 }, { "epoch": 0.9814814814814815, "grad_norm": 1.2259571552276611, "learning_rate": 3.854707190511491e-06, "loss": 1.0165, "step": 6625 }, { "epoch": 0.9816296296296296, "grad_norm": 1.5182334184646606, "learning_rate": 
3.825055596738325e-06, "loss": 0.9455, "step": 6626 }, { "epoch": 0.9817777777777777, "grad_norm": 1.8458360433578491, "learning_rate": 3.7954040029651598e-06, "loss": 1.0414, "step": 6627 }, { "epoch": 0.981925925925926, "grad_norm": 1.4810439348220825, "learning_rate": 3.7657524091919945e-06, "loss": 0.9363, "step": 6628 }, { "epoch": 0.9820740740740741, "grad_norm": 1.8782624006271362, "learning_rate": 3.736100815418829e-06, "loss": 0.9122, "step": 6629 }, { "epoch": 0.9822222222222222, "grad_norm": 2.1159908771514893, "learning_rate": 3.7064492216456635e-06, "loss": 1.0393, "step": 6630 }, { "epoch": 0.9823703703703703, "grad_norm": 1.7776083946228027, "learning_rate": 3.6767976278724982e-06, "loss": 1.1439, "step": 6631 }, { "epoch": 0.9825185185185186, "grad_norm": 1.437274694442749, "learning_rate": 3.647146034099333e-06, "loss": 1.2951, "step": 6632 }, { "epoch": 0.9826666666666667, "grad_norm": 2.729619264602661, "learning_rate": 3.6174944403261677e-06, "loss": 0.834, "step": 6633 }, { "epoch": 0.9828148148148148, "grad_norm": 1.2838209867477417, "learning_rate": 3.5878428465530024e-06, "loss": 0.9613, "step": 6634 }, { "epoch": 0.9829629629629629, "grad_norm": 1.4120662212371826, "learning_rate": 3.558191252779837e-06, "loss": 1.2446, "step": 6635 }, { "epoch": 0.9831111111111112, "grad_norm": 1.354019045829773, "learning_rate": 3.528539659006672e-06, "loss": 0.9115, "step": 6636 }, { "epoch": 0.9832592592592593, "grad_norm": 1.6061333417892456, "learning_rate": 3.498888065233506e-06, "loss": 0.784, "step": 6637 }, { "epoch": 0.9834074074074074, "grad_norm": 1.5644516944885254, "learning_rate": 3.469236471460341e-06, "loss": 1.0192, "step": 6638 }, { "epoch": 0.9835555555555555, "grad_norm": 1.7661339044570923, "learning_rate": 3.4395848776871755e-06, "loss": 1.0909, "step": 6639 }, { "epoch": 0.9837037037037037, "grad_norm": 1.819481372833252, "learning_rate": 3.4099332839140103e-06, "loss": 1.147, "step": 6640 }, { "epoch": 0.9838518518518519, 
"grad_norm": 2.0585787296295166, "learning_rate": 3.380281690140845e-06, "loss": 0.8059, "step": 6641 }, { "epoch": 0.984, "grad_norm": 1.4306613206863403, "learning_rate": 3.35063009636768e-06, "loss": 0.9699, "step": 6642 }, { "epoch": 0.9841481481481481, "grad_norm": 1.5439941883087158, "learning_rate": 3.320978502594515e-06, "loss": 1.0914, "step": 6643 }, { "epoch": 0.9842962962962963, "grad_norm": 1.494676947593689, "learning_rate": 3.2913269088213496e-06, "loss": 0.9988, "step": 6644 }, { "epoch": 0.9844444444444445, "grad_norm": 1.6927095651626587, "learning_rate": 3.2616753150481843e-06, "loss": 1.0446, "step": 6645 }, { "epoch": 0.9845925925925926, "grad_norm": 1.642819881439209, "learning_rate": 3.232023721275019e-06, "loss": 0.929, "step": 6646 }, { "epoch": 0.9847407407407407, "grad_norm": 1.5157071352005005, "learning_rate": 3.2023721275018537e-06, "loss": 0.9346, "step": 6647 }, { "epoch": 0.9848888888888889, "grad_norm": 1.68966805934906, "learning_rate": 3.172720533728688e-06, "loss": 1.0874, "step": 6648 }, { "epoch": 0.985037037037037, "grad_norm": 1.5840626955032349, "learning_rate": 3.1430689399555227e-06, "loss": 0.7515, "step": 6649 }, { "epoch": 0.9851851851851852, "grad_norm": 2.2057268619537354, "learning_rate": 3.1134173461823575e-06, "loss": 1.1616, "step": 6650 }, { "epoch": 0.9853333333333333, "grad_norm": 1.8586816787719727, "learning_rate": 3.083765752409192e-06, "loss": 0.9756, "step": 6651 }, { "epoch": 0.9854814814814815, "grad_norm": 3.2284040451049805, "learning_rate": 3.054114158636027e-06, "loss": 1.0309, "step": 6652 }, { "epoch": 0.9856296296296296, "grad_norm": 1.3896958827972412, "learning_rate": 3.0244625648628616e-06, "loss": 0.9007, "step": 6653 }, { "epoch": 0.9857777777777778, "grad_norm": 2.136592388153076, "learning_rate": 2.9948109710896963e-06, "loss": 1.14, "step": 6654 }, { "epoch": 0.9859259259259259, "grad_norm": 2.3692097663879395, "learning_rate": 2.9651593773165306e-06, "loss": 1.0577, "step": 6655 }, { 
"epoch": 0.9860740740740741, "grad_norm": 3.259517192840576, "learning_rate": 2.9355077835433653e-06, "loss": 1.1438, "step": 6656 }, { "epoch": 0.9862222222222222, "grad_norm": 1.794457197189331, "learning_rate": 2.9058561897702005e-06, "loss": 0.947, "step": 6657 }, { "epoch": 0.9863703703703703, "grad_norm": 1.563719630241394, "learning_rate": 2.876204595997035e-06, "loss": 0.8938, "step": 6658 }, { "epoch": 0.9865185185185186, "grad_norm": 1.9052386283874512, "learning_rate": 2.84655300222387e-06, "loss": 1.0833, "step": 6659 }, { "epoch": 0.9866666666666667, "grad_norm": 1.1962292194366455, "learning_rate": 2.8169014084507042e-06, "loss": 1.1096, "step": 6660 }, { "epoch": 0.9868148148148148, "grad_norm": 2.1645631790161133, "learning_rate": 2.787249814677539e-06, "loss": 0.9058, "step": 6661 }, { "epoch": 0.9869629629629629, "grad_norm": 1.8755302429199219, "learning_rate": 2.7575982209043737e-06, "loss": 1.1832, "step": 6662 }, { "epoch": 0.9871111111111112, "grad_norm": 7.888797760009766, "learning_rate": 2.7279466271312084e-06, "loss": 0.9308, "step": 6663 }, { "epoch": 0.9872592592592593, "grad_norm": 1.6029328107833862, "learning_rate": 2.698295033358043e-06, "loss": 1.0354, "step": 6664 }, { "epoch": 0.9874074074074074, "grad_norm": 1.4510563611984253, "learning_rate": 2.668643439584878e-06, "loss": 1.0924, "step": 6665 }, { "epoch": 0.9875555555555555, "grad_norm": 2.7548041343688965, "learning_rate": 2.6389918458117125e-06, "loss": 1.4003, "step": 6666 }, { "epoch": 0.9877037037037038, "grad_norm": 1.3703808784484863, "learning_rate": 2.609340252038547e-06, "loss": 1.0794, "step": 6667 }, { "epoch": 0.9878518518518519, "grad_norm": 1.6302189826965332, "learning_rate": 2.579688658265382e-06, "loss": 0.9665, "step": 6668 }, { "epoch": 0.988, "grad_norm": 1.29432213306427, "learning_rate": 2.5500370644922167e-06, "loss": 0.8653, "step": 6669 }, { "epoch": 0.9881481481481481, "grad_norm": 1.2217215299606323, "learning_rate": 2.5203854707190514e-06, 
"loss": 0.8904, "step": 6670 }, { "epoch": 0.9882962962962963, "grad_norm": 2.8260674476623535, "learning_rate": 2.490733876945886e-06, "loss": 1.0907, "step": 6671 }, { "epoch": 0.9884444444444445, "grad_norm": 2.121968984603882, "learning_rate": 2.461082283172721e-06, "loss": 0.9767, "step": 6672 }, { "epoch": 0.9885925925925926, "grad_norm": 2.9519248008728027, "learning_rate": 2.431430689399555e-06, "loss": 0.9438, "step": 6673 }, { "epoch": 0.9887407407407407, "grad_norm": 2.851862907409668, "learning_rate": 2.40177909562639e-06, "loss": 0.872, "step": 6674 }, { "epoch": 0.9888888888888889, "grad_norm": 1.1649541854858398, "learning_rate": 2.3721275018532246e-06, "loss": 0.8041, "step": 6675 }, { "epoch": 0.989037037037037, "grad_norm": 1.4583702087402344, "learning_rate": 2.3424759080800593e-06, "loss": 1.0451, "step": 6676 }, { "epoch": 0.9891851851851852, "grad_norm": 2.0155556201934814, "learning_rate": 2.312824314306894e-06, "loss": 0.9207, "step": 6677 }, { "epoch": 0.9893333333333333, "grad_norm": 1.766683578491211, "learning_rate": 2.2831727205337287e-06, "loss": 0.9521, "step": 6678 }, { "epoch": 0.9894814814814815, "grad_norm": 1.982771635055542, "learning_rate": 2.2535211267605635e-06, "loss": 0.9774, "step": 6679 }, { "epoch": 0.9896296296296296, "grad_norm": 1.9274686574935913, "learning_rate": 2.223869532987398e-06, "loss": 1.0067, "step": 6680 }, { "epoch": 0.9897777777777778, "grad_norm": 1.6137382984161377, "learning_rate": 2.194217939214233e-06, "loss": 0.977, "step": 6681 }, { "epoch": 0.9899259259259259, "grad_norm": 1.5147463083267212, "learning_rate": 2.1645663454410676e-06, "loss": 0.87, "step": 6682 }, { "epoch": 0.9900740740740741, "grad_norm": 3.287322759628296, "learning_rate": 2.1349147516679023e-06, "loss": 0.803, "step": 6683 }, { "epoch": 0.9902222222222222, "grad_norm": 1.6876987218856812, "learning_rate": 2.105263157894737e-06, "loss": 1.152, "step": 6684 }, { "epoch": 0.9903703703703703, "grad_norm": 1.5785335302352905, 
"learning_rate": 2.0756115641215714e-06, "loss": 0.8979, "step": 6685 }, { "epoch": 0.9905185185185185, "grad_norm": 1.839627742767334, "learning_rate": 2.045959970348406e-06, "loss": 0.8454, "step": 6686 }, { "epoch": 0.9906666666666667, "grad_norm": 2.166228771209717, "learning_rate": 2.016308376575241e-06, "loss": 1.0755, "step": 6687 }, { "epoch": 0.9908148148148148, "grad_norm": 1.7218915224075317, "learning_rate": 1.986656782802076e-06, "loss": 1.122, "step": 6688 }, { "epoch": 0.9909629629629629, "grad_norm": 2.0172159671783447, "learning_rate": 1.9570051890289107e-06, "loss": 1.0773, "step": 6689 }, { "epoch": 0.9911111111111112, "grad_norm": 2.1524360179901123, "learning_rate": 1.9273535952557454e-06, "loss": 0.8959, "step": 6690 }, { "epoch": 0.9912592592592593, "grad_norm": 1.6936663389205933, "learning_rate": 1.8977020014825799e-06, "loss": 1.1442, "step": 6691 }, { "epoch": 0.9914074074074074, "grad_norm": 3.5624890327453613, "learning_rate": 1.8680504077094146e-06, "loss": 0.7474, "step": 6692 }, { "epoch": 0.9915555555555555, "grad_norm": 2.378261089324951, "learning_rate": 1.8383988139362491e-06, "loss": 0.8932, "step": 6693 }, { "epoch": 0.9917037037037038, "grad_norm": 1.7613540887832642, "learning_rate": 1.8087472201630838e-06, "loss": 0.6714, "step": 6694 }, { "epoch": 0.9918518518518519, "grad_norm": 1.6235287189483643, "learning_rate": 1.7790956263899185e-06, "loss": 0.9776, "step": 6695 }, { "epoch": 0.992, "grad_norm": 1.8661762475967407, "learning_rate": 1.749444032616753e-06, "loss": 1.0605, "step": 6696 }, { "epoch": 0.9921481481481481, "grad_norm": 2.032745599746704, "learning_rate": 1.7197924388435878e-06, "loss": 0.8629, "step": 6697 }, { "epoch": 0.9922962962962963, "grad_norm": 1.286075234413147, "learning_rate": 1.6901408450704225e-06, "loss": 0.9903, "step": 6698 }, { "epoch": 0.9924444444444445, "grad_norm": 1.348991870880127, "learning_rate": 1.6604892512972574e-06, "loss": 0.9778, "step": 6699 }, { "epoch": 0.9925925925925926, 
"grad_norm": 1.556408405303955, "learning_rate": 1.6308376575240921e-06, "loss": 0.9447, "step": 6700 }, { "epoch": 0.9927407407407407, "grad_norm": 1.4265048503875732, "learning_rate": 1.6011860637509269e-06, "loss": 1.0695, "step": 6701 }, { "epoch": 0.9928888888888889, "grad_norm": 1.8670324087142944, "learning_rate": 1.5715344699777614e-06, "loss": 0.9521, "step": 6702 }, { "epoch": 0.993037037037037, "grad_norm": 1.4540780782699585, "learning_rate": 1.541882876204596e-06, "loss": 0.8364, "step": 6703 }, { "epoch": 0.9931851851851852, "grad_norm": 1.2747162580490112, "learning_rate": 1.5122312824314308e-06, "loss": 0.8357, "step": 6704 }, { "epoch": 0.9933333333333333, "grad_norm": 1.5093507766723633, "learning_rate": 1.4825796886582653e-06, "loss": 1.0274, "step": 6705 }, { "epoch": 0.9934814814814815, "grad_norm": 1.8258951902389526, "learning_rate": 1.4529280948851002e-06, "loss": 1.1714, "step": 6706 }, { "epoch": 0.9936296296296296, "grad_norm": 1.4696855545043945, "learning_rate": 1.423276501111935e-06, "loss": 1.0053, "step": 6707 }, { "epoch": 0.9937777777777778, "grad_norm": 1.2955344915390015, "learning_rate": 1.3936249073387695e-06, "loss": 0.8922, "step": 6708 }, { "epoch": 0.9939259259259259, "grad_norm": 2.6339271068573, "learning_rate": 1.3639733135656042e-06, "loss": 0.7903, "step": 6709 }, { "epoch": 0.9940740740740741, "grad_norm": 1.6920212507247925, "learning_rate": 1.334321719792439e-06, "loss": 1.0618, "step": 6710 }, { "epoch": 0.9942222222222222, "grad_norm": 1.0905203819274902, "learning_rate": 1.3046701260192734e-06, "loss": 1.0253, "step": 6711 }, { "epoch": 0.9943703703703703, "grad_norm": 1.664962887763977, "learning_rate": 1.2750185322461083e-06, "loss": 1.1039, "step": 6712 }, { "epoch": 0.9945185185185185, "grad_norm": 1.530849814414978, "learning_rate": 1.245366938472943e-06, "loss": 0.983, "step": 6713 }, { "epoch": 0.9946666666666667, "grad_norm": 1.5666024684906006, "learning_rate": 1.2157153446997776e-06, "loss": 1.078, 
"step": 6714 }, { "epoch": 0.9948148148148148, "grad_norm": 1.3971589803695679, "learning_rate": 1.1860637509266123e-06, "loss": 0.9032, "step": 6715 }, { "epoch": 0.9949629629629629, "grad_norm": 2.2599244117736816, "learning_rate": 1.156412157153447e-06, "loss": 1.2584, "step": 6716 }, { "epoch": 0.9951111111111111, "grad_norm": 1.9005279541015625, "learning_rate": 1.1267605633802817e-06, "loss": 1.1237, "step": 6717 }, { "epoch": 0.9952592592592593, "grad_norm": 1.5285006761550903, "learning_rate": 1.0971089696071165e-06, "loss": 1.0576, "step": 6718 }, { "epoch": 0.9954074074074074, "grad_norm": 1.7799162864685059, "learning_rate": 1.0674573758339512e-06, "loss": 1.0083, "step": 6719 }, { "epoch": 0.9955555555555555, "grad_norm": 1.1048457622528076, "learning_rate": 1.0378057820607857e-06, "loss": 0.9721, "step": 6720 }, { "epoch": 0.9957037037037038, "grad_norm": 2.5665197372436523, "learning_rate": 1.0081541882876204e-06, "loss": 1.2598, "step": 6721 }, { "epoch": 0.9958518518518519, "grad_norm": 1.661734700202942, "learning_rate": 9.785025945144553e-07, "loss": 1.0126, "step": 6722 }, { "epoch": 0.996, "grad_norm": 1.7528083324432373, "learning_rate": 9.488510007412899e-07, "loss": 0.9573, "step": 6723 }, { "epoch": 0.9961481481481481, "grad_norm": 1.5361084938049316, "learning_rate": 9.191994069681246e-07, "loss": 0.9741, "step": 6724 }, { "epoch": 0.9962962962962963, "grad_norm": 1.998482584953308, "learning_rate": 8.895478131949593e-07, "loss": 0.9462, "step": 6725 }, { "epoch": 0.9964444444444445, "grad_norm": 2.735476493835449, "learning_rate": 8.598962194217939e-07, "loss": 0.9576, "step": 6726 }, { "epoch": 0.9965925925925926, "grad_norm": 2.3294827938079834, "learning_rate": 8.302446256486287e-07, "loss": 0.9671, "step": 6727 }, { "epoch": 0.9967407407407407, "grad_norm": 2.1672589778900146, "learning_rate": 8.005930318754634e-07, "loss": 1.0485, "step": 6728 }, { "epoch": 0.9968888888888889, "grad_norm": 1.595106840133667, "learning_rate": 
7.70941438102298e-07, "loss": 1.1496, "step": 6729 }, { "epoch": 0.997037037037037, "grad_norm": 2.001743793487549, "learning_rate": 7.412898443291327e-07, "loss": 1.1263, "step": 6730 }, { "epoch": 0.9971851851851852, "grad_norm": 2.020989418029785, "learning_rate": 7.116382505559675e-07, "loss": 0.9639, "step": 6731 }, { "epoch": 0.9973333333333333, "grad_norm": 2.517296552658081, "learning_rate": 6.819866567828021e-07, "loss": 0.9608, "step": 6732 }, { "epoch": 0.9974814814814815, "grad_norm": 1.6524677276611328, "learning_rate": 6.523350630096367e-07, "loss": 1.1922, "step": 6733 }, { "epoch": 0.9976296296296296, "grad_norm": 1.6622039079666138, "learning_rate": 6.226834692364715e-07, "loss": 0.9946, "step": 6734 }, { "epoch": 0.9977777777777778, "grad_norm": 1.4304887056350708, "learning_rate": 5.930318754633061e-07, "loss": 1.0696, "step": 6735 }, { "epoch": 0.9979259259259259, "grad_norm": 1.105725884437561, "learning_rate": 5.633802816901409e-07, "loss": 0.9014, "step": 6736 }, { "epoch": 0.9980740740740741, "grad_norm": 1.4582500457763672, "learning_rate": 5.337286879169756e-07, "loss": 1.0234, "step": 6737 }, { "epoch": 0.9982222222222222, "grad_norm": 2.0556626319885254, "learning_rate": 5.040770941438102e-07, "loss": 0.9039, "step": 6738 }, { "epoch": 0.9983703703703704, "grad_norm": 2.3461997509002686, "learning_rate": 4.7442550037064497e-07, "loss": 1.0611, "step": 6739 }, { "epoch": 0.9985185185185185, "grad_norm": 1.3852459192276, "learning_rate": 4.4477390659747964e-07, "loss": 1.1235, "step": 6740 }, { "epoch": 0.9986666666666667, "grad_norm": 1.3906238079071045, "learning_rate": 4.1512231282431436e-07, "loss": 0.9771, "step": 6741 }, { "epoch": 0.9988148148148148, "grad_norm": 1.7508108615875244, "learning_rate": 3.85470719051149e-07, "loss": 0.974, "step": 6742 }, { "epoch": 0.9989629629629629, "grad_norm": 2.0732643604278564, "learning_rate": 3.5581912527798374e-07, "loss": 1.0118, "step": 6743 }, { "epoch": 0.9991111111111111, "grad_norm": 
1.8446046113967896, "learning_rate": 3.2616753150481835e-07, "loss": 1.1361, "step": 6744 }, { "epoch": 0.9992592592592593, "grad_norm": 1.2646979093551636, "learning_rate": 2.9651593773165307e-07, "loss": 1.0286, "step": 6745 }, { "epoch": 0.9994074074074074, "grad_norm": 1.2417453527450562, "learning_rate": 2.668643439584878e-07, "loss": 1.1493, "step": 6746 }, { "epoch": 0.9995555555555555, "grad_norm": 1.6472728252410889, "learning_rate": 2.3721275018532249e-07, "loss": 0.9548, "step": 6747 }, { "epoch": 0.9997037037037036, "grad_norm": 2.7328872680664062, "learning_rate": 2.0756115641215718e-07, "loss": 1.014, "step": 6748 }, { "epoch": 0.9998518518518519, "grad_norm": 1.9749536514282227, "learning_rate": 1.7790956263899187e-07, "loss": 0.7927, "step": 6749 }, { "epoch": 1.0, "grad_norm": 2.27304744720459, "learning_rate": 1.4825796886582654e-07, "loss": 1.1388, "step": 6750 } ], "logging_steps": 1, "max_steps": 6750, "num_input_tokens_seen": 0, "num_train_epochs": 1, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.089575766795232e+17, "train_batch_size": 2, "trial_name": null, "trial_params": null }