{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 500, "global_step": 1638, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.003663003663003663, "grad_norm": 2.875383138656616, "learning_rate": 2.0000000000000002e-07, "loss": 2.6876986026763916, "step": 2 }, { "epoch": 0.007326007326007326, "grad_norm": 0.7562136650085449, "learning_rate": 6.000000000000001e-07, "loss": 1.6656783819198608, "step": 4 }, { "epoch": 0.01098901098901099, "grad_norm": 0.39378538727760315, "learning_rate": 1.0000000000000002e-06, "loss": 1.8813486099243164, "step": 6 }, { "epoch": 0.014652014652014652, "grad_norm": 0.1678856462240219, "learning_rate": 1.4000000000000001e-06, "loss": 2.073314666748047, "step": 8 }, { "epoch": 0.018315018315018316, "grad_norm": 0.16630569100379944, "learning_rate": 1.8000000000000001e-06, "loss": 2.2603981494903564, "step": 10 }, { "epoch": 0.02197802197802198, "grad_norm": 1.2313497066497803, "learning_rate": 2.2e-06, "loss": 2.0936238765716553, "step": 12 }, { "epoch": 0.02564102564102564, "grad_norm": 0.6229072213172913, "learning_rate": 2.6e-06, "loss": 1.786569595336914, "step": 14 }, { "epoch": 0.029304029304029304, "grad_norm": 0.21894435584545135, "learning_rate": 3e-06, "loss": 1.9302886724472046, "step": 16 }, { "epoch": 0.03296703296703297, "grad_norm": 0.8146782517433167, "learning_rate": 3.4000000000000005e-06, "loss": 1.9158211946487427, "step": 18 }, { "epoch": 0.03663003663003663, "grad_norm": 0.139973446726799, "learning_rate": 3.8000000000000005e-06, "loss": 1.801032543182373, "step": 20 }, { "epoch": 0.040293040293040296, "grad_norm": 0.9421126842498779, "learning_rate": 4.2000000000000004e-06, "loss": 1.4370536804199219, "step": 22 }, { "epoch": 0.04395604395604396, "grad_norm": 0.2687402665615082, "learning_rate": 4.600000000000001e-06, "loss": 1.680647611618042, "step": 24 }, { "epoch": 0.047619047619047616, "grad_norm": 0.15750955045223236, "learning_rate": 5e-06, "loss": 1.6444307565689087, "step": 26 }, { "epoch": 0.05128205128205128, "grad_norm": 0.3872029483318329, "learning_rate": 5.400000000000001e-06, "loss": 1.9668141603469849, "step": 28 }, { "epoch": 0.054945054945054944, "grad_norm": 0.7753072381019592, "learning_rate": 5.8e-06, "loss": 1.345158576965332, "step": 30 }, { "epoch": 0.05860805860805861, "grad_norm": 4.0286335945129395, "learning_rate": 6.200000000000001e-06, "loss": 1.2935595512390137, "step": 32 }, { "epoch": 0.06227106227106227, "grad_norm": 0.11804597079753876, "learning_rate": 6.600000000000001e-06, "loss": 1.310804843902588, "step": 34 }, { "epoch": 0.06593406593406594, "grad_norm": 0.3004632592201233, "learning_rate": 7e-06, "loss": 1.6871081590652466, "step": 36 }, { "epoch": 0.0695970695970696, "grad_norm": 0.3273477554321289, "learning_rate": 7.4e-06, "loss": 1.4714224338531494, "step": 38 }, { "epoch": 0.07326007326007326, "grad_norm": 0.38304704427719116, "learning_rate": 7.800000000000002e-06, "loss": 1.5327234268188477, "step": 40 }, { "epoch": 0.07692307692307693, "grad_norm": 0.17035049200057983, "learning_rate": 8.2e-06, "loss": 1.2890535593032837, "step": 42 }, { "epoch": 0.08058608058608059, "grad_norm": 0.18172013759613037, "learning_rate": 8.6e-06, "loss": 1.5969315767288208, "step": 44 }, { "epoch": 0.08424908424908426, "grad_norm": 0.1807372272014618, "learning_rate": 9e-06, "loss": 1.6807602643966675, "step": 46 }, { "epoch": 0.08791208791208792, "grad_norm": 0.2631019353866577, "learning_rate": 9.4e-06, "loss": 1.1396199464797974, "step": 48 }, { "epoch": 0.09157509157509157, "grad_norm": 0.10772737115621567, "learning_rate": 9.800000000000001e-06, "loss": 1.576991319656372, "step": 50 }, { "epoch": 0.09523809523809523, "grad_norm": 0.1471075862646103, "learning_rate": 9.999991193950434e-06, "loss": 0.8256194591522217, "step": 52 }, { "epoch": 0.0989010989010989, "grad_norm": 2.466968059539795, "learning_rate": 9.999920745760685e-06, "loss": 1.1205874681472778, "step": 54 }, { "epoch": 0.10256410256410256, "grad_norm": 0.4292626678943634, "learning_rate": 9.99977985048407e-06, "loss": 1.397847294807434, "step": 56 }, { "epoch": 0.10622710622710622, "grad_norm": 0.11925622820854187, "learning_rate": 9.999568510326332e-06, "loss": 1.2620929479599, "step": 58 }, { "epoch": 0.10989010989010989, "grad_norm": 0.08886076509952545, "learning_rate": 9.999286728596034e-06, "loss": 1.502614140510559, "step": 60 }, { "epoch": 0.11355311355311355, "grad_norm": 0.31627047061920166, "learning_rate": 9.998934509704524e-06, "loss": 1.5195817947387695, "step": 62 }, { "epoch": 0.11721611721611722, "grad_norm": 0.12464763224124908, "learning_rate": 9.998511859165853e-06, "loss": 1.4700745344161987, "step": 64 }, { "epoch": 0.12087912087912088, "grad_norm": 0.2327689528465271, "learning_rate": 9.998018783596694e-06, "loss": 1.4286034107208252, "step": 66 }, { "epoch": 0.12454212454212454, "grad_norm": 0.14609983563423157, "learning_rate": 9.997455290716233e-06, "loss": 1.3966401815414429, "step": 68 }, { "epoch": 0.1282051282051282, "grad_norm": 0.4661511182785034, "learning_rate": 9.996821389346058e-06, "loss": 1.2351371049880981, "step": 70 }, { "epoch": 0.13186813186813187, "grad_norm": 0.15036970376968384, "learning_rate": 9.99611708941001e-06, "loss": 1.5122816562652588, "step": 72 }, { "epoch": 0.13553113553113552, "grad_norm": 0.1798364669084549, "learning_rate": 9.995342401934034e-06, "loss": 1.6327002048492432, "step": 74 }, { "epoch": 0.1391941391941392, "grad_norm": 0.10958249866962433, "learning_rate": 9.994497339046004e-06, "loss": 1.124930739402771, "step": 76 }, { "epoch": 0.14285714285714285, "grad_norm": 0.09284580498933792, "learning_rate": 9.993581913975538e-06, "loss": 1.516735553741455, "step": 78 }, { "epoch": 0.14652014652014653, "grad_norm": 0.3077796399593353, "learning_rate": 9.99259614105378e-06, "loss": 1.620766043663025, "step": 80 }, { "epoch": 0.15018315018315018, "grad_norm": 0.1259474903345108, "learning_rate": 9.99154003571319e-06, "loss": 1.2517590522766113, "step": 82 }, { "epoch": 0.15384615384615385, "grad_norm": 0.10728312283754349, "learning_rate": 9.990413614487288e-06, "loss": 1.4343254566192627, "step": 84 }, { "epoch": 0.1575091575091575, "grad_norm": 0.3186304569244385, "learning_rate": 9.989216895010406e-06, "loss": 1.5559083223342896, "step": 86 }, { "epoch": 0.16117216117216118, "grad_norm": 0.47098618745803833, "learning_rate": 9.987949896017412e-06, "loss": 0.7234257459640503, "step": 88 }, { "epoch": 0.16483516483516483, "grad_norm": 0.07171591371297836, "learning_rate": 9.986612637343402e-06, "loss": 0.920280396938324, "step": 90 }, { "epoch": 0.1684981684981685, "grad_norm": 0.15488727390766144, "learning_rate": 9.985205139923408e-06, "loss": 1.374828577041626, "step": 92 }, { "epoch": 0.17216117216117216, "grad_norm": 0.17309579253196716, "learning_rate": 9.983727425792066e-06, "loss": 1.40683114528656, "step": 94 }, { "epoch": 0.17582417582417584, "grad_norm": 0.47913244366645813, "learning_rate": 9.982179518083255e-06, "loss": 1.296237826347351, "step": 96 }, { "epoch": 0.1794871794871795, "grad_norm": 0.2404627501964569, "learning_rate": 9.980561441029761e-06, "loss": 1.3930805921554565, "step": 98 }, { "epoch": 0.18315018315018314, "grad_norm": 0.273111492395401, "learning_rate": 9.978873219962874e-06, "loss": 1.218533992767334, "step": 100 }, { "epoch": 0.18681318681318682, "grad_norm": 0.524663507938385, "learning_rate": 9.977114881312008e-06, "loss": 1.3098607063293457, "step": 102 }, { "epoch": 0.19047619047619047, "grad_norm": 0.15053874254226685, "learning_rate": 9.975286452604275e-06, "loss": 1.0048173666000366, "step": 104 }, { "epoch": 0.19413919413919414, "grad_norm": 0.3083300292491913, "learning_rate": 9.973387962464066e-06, "loss": 1.1155184507369995, "step": 106 }, { "epoch": 0.1978021978021978, "grad_norm": 0.2675297260284424, "learning_rate": 9.971419440612591e-06, "loss": 1.3713957071304321, "step": 108 }, { "epoch": 0.20146520146520147, "grad_norm": 0.12364854663610458, "learning_rate": 9.969380917867421e-06, "loss": 1.2191200256347656, "step": 110 }, { "epoch": 0.20512820512820512, "grad_norm": 0.09928097575902939, "learning_rate": 9.967272426142007e-06, "loss": 1.275339961051941, "step": 112 }, { "epoch": 0.2087912087912088, "grad_norm": 0.24085743725299835, "learning_rate": 9.965093998445174e-06, "loss": 0.6748014092445374, "step": 114 }, { "epoch": 0.21245421245421245, "grad_norm": 0.3158385157585144, "learning_rate": 9.962845668880606e-06, "loss": 1.343500018119812, "step": 116 }, { "epoch": 0.21611721611721613, "grad_norm": 0.18101683259010315, "learning_rate": 9.96052747264632e-06, "loss": 1.3496334552764893, "step": 118 }, { "epoch": 0.21978021978021978, "grad_norm": 0.22877711057662964, "learning_rate": 9.9581394460341e-06, "loss": 1.399277925491333, "step": 120 }, { "epoch": 0.22344322344322345, "grad_norm": 0.1900375783443451, "learning_rate": 9.955681626428944e-06, "loss": 1.351351261138916, "step": 122 }, { "epoch": 0.2271062271062271, "grad_norm": 0.2790425419807434, "learning_rate": 9.95315405230847e-06, "loss": 1.0575695037841797, "step": 124 }, { "epoch": 0.23076923076923078, "grad_norm": 0.1452687829732895, "learning_rate": 9.950556763242316e-06, "loss": 0.9282295107841492, "step": 126 }, { "epoch": 0.23443223443223443, "grad_norm": 0.4646700322628021, "learning_rate": 9.947889799891517e-06, "loss": 1.2313032150268555, "step": 128 }, { "epoch": 0.23809523809523808, "grad_norm": 0.2979331612586975, "learning_rate": 9.94515320400788e-06, "loss": 1.3399178981781006, "step": 130 }, { "epoch": 0.24175824175824176, "grad_norm": 0.13976508378982544, "learning_rate": 9.942347018433312e-06, "loss": 1.439223289489746, "step": 132 }, { "epoch": 0.2454212454212454, "grad_norm": 0.18163283169269562, "learning_rate": 9.939471287099167e-06, "loss": 1.3410747051239014, "step": 134 }, { "epoch": 0.2490842490842491, "grad_norm": 0.62637859582901, "learning_rate": 9.936526055025547e-06, "loss": 1.3174734115600586, "step": 136 }, { "epoch": 0.25274725274725274, "grad_norm": 0.33270207047462463, "learning_rate": 9.933511368320602e-06, "loss": 1.1196776628494263, "step": 138 }, { "epoch": 0.2564102564102564, "grad_norm": 0.10809160023927689, "learning_rate": 9.930427274179808e-06, "loss": 1.3199713230133057, "step": 140 }, { "epoch": 0.2600732600732601, "grad_norm": 0.31561923027038574, "learning_rate": 9.927273820885223e-06, "loss": 1.3330121040344238, "step": 142 }, { "epoch": 0.26373626373626374, "grad_norm": 0.15577837824821472, "learning_rate": 9.924051057804742e-06, "loss": 1.2973798513412476, "step": 144 }, { "epoch": 0.2673992673992674, "grad_norm": 0.2457117736339569, "learning_rate": 9.920759035391308e-06, "loss": 1.454565405845642, "step": 146 }, { "epoch": 0.27106227106227104, "grad_norm": 0.8564723134040833, "learning_rate": 9.917397805182145e-06, "loss": 1.354149580001831, "step": 148 }, { "epoch": 0.27472527472527475, "grad_norm": 0.11519961059093475, "learning_rate": 9.913967419797924e-06, "loss": 0.9727800488471985, "step": 150 }, { "epoch": 0.2783882783882784, "grad_norm": 0.253650039434433, "learning_rate": 9.910467932941962e-06, "loss": 0.6865445375442505, "step": 152 }, { "epoch": 0.28205128205128205, "grad_norm": 0.2849763333797455, "learning_rate": 9.90689939939937e-06, "loss": 1.126017451286316, "step": 154 }, { "epoch": 0.2857142857142857, "grad_norm": 0.17028306424617767, "learning_rate": 9.903261875036192e-06, "loss": 1.3710747957229614, "step": 156 }, { "epoch": 0.2893772893772894, "grad_norm": 0.13653086125850677, "learning_rate": 9.899555416798546e-06, "loss": 1.3111331462860107, "step": 158 }, { "epoch": 0.29304029304029305, "grad_norm": 0.5475143790245056, "learning_rate": 9.895780082711717e-06, "loss": 0.9597386717796326, "step": 160 }, { "epoch": 0.2967032967032967, "grad_norm": 0.3660220503807068, "learning_rate": 9.891935931879252e-06, "loss": 1.5575504302978516, "step": 162 }, { "epoch": 0.30036630036630035, "grad_norm": 0.12948466837406158, "learning_rate": 9.888023024482041e-06, "loss": 1.0819988250732422, "step": 164 }, { "epoch": 0.304029304029304, "grad_norm": 0.5461969375610352, "learning_rate": 9.884041421777369e-06, "loss": 1.4256272315979004, "step": 166 }, { "epoch": 0.3076923076923077, "grad_norm": 0.16308943927288055, "learning_rate": 9.879991186097959e-06, "loss": 1.1550545692443848, "step": 168 }, { "epoch": 0.31135531135531136, "grad_norm": 0.23025067150592804, "learning_rate": 9.875872380850992e-06, "loss": 1.2838108539581299, "step": 170 }, { "epoch": 0.315018315018315, "grad_norm": 1.0842785835266113, "learning_rate": 9.871685070517124e-06, "loss": 1.027992606163025, "step": 172 }, { "epoch": 0.31868131868131866, "grad_norm": 0.8840537071228027, "learning_rate": 9.86742932064947e-06, "loss": 0.8895283341407776, "step": 174 }, { "epoch": 0.32234432234432236, "grad_norm": 0.4256756901741028, "learning_rate": 9.863105197872574e-06, "loss": 1.4210491180419922, "step": 176 }, { "epoch": 0.326007326007326, "grad_norm": 0.32473987340927124, "learning_rate": 9.858712769881375e-06, "loss": 0.940653920173645, "step": 178 }, { "epoch": 0.32967032967032966, "grad_norm": 0.1946435272693634, "learning_rate": 9.854252105440142e-06, "loss": 1.523209810256958, "step": 180 }, { "epoch": 0.3333333333333333, "grad_norm": 0.12392517179250717, "learning_rate": 9.849723274381395e-06, "loss": 0.9991880059242249, "step": 182 }, { "epoch": 0.336996336996337, "grad_norm": 0.18956027925014496, "learning_rate": 9.845126347604818e-06, "loss": 1.2698228359222412, "step": 184 }, { "epoch": 0.34065934065934067, "grad_norm": 0.31590884923934937, "learning_rate": 9.840461397076147e-06, "loss": 1.3860504627227783, "step": 186 }, { "epoch": 0.3443223443223443, "grad_norm": 0.11410943418741226, "learning_rate": 9.835728495826036e-06, "loss": 1.1887812614440918, "step": 188 }, { "epoch": 0.34798534798534797, "grad_norm": 0.29438552260398865, "learning_rate": 9.830927717948929e-06, "loss": 1.294023036956787, "step": 190 }, { "epoch": 0.3516483516483517, "grad_norm": 1.1163926124572754, "learning_rate": 9.826059138601883e-06, "loss": 1.124396800994873, "step": 192 }, { "epoch": 0.3553113553113553, "grad_norm": 0.09197133034467697, "learning_rate": 9.821122834003407e-06, "loss": 1.27751624584198, "step": 194 }, { "epoch": 0.358974358974359, "grad_norm": 0.23845773935317993, "learning_rate": 9.816118881432255e-06, "loss": 1.2824617624282837, "step": 196 }, { "epoch": 0.3626373626373626, "grad_norm": 0.16290828585624695, "learning_rate": 9.811047359226224e-06, "loss": 0.8826183080673218, "step": 198 }, { "epoch": 0.3663003663003663, "grad_norm": 0.24791596829891205, "learning_rate": 9.805908346780929e-06, "loss": 1.044391393661499, "step": 200 }, { "epoch": 0.36996336996337, "grad_norm": 0.2740170955657959, "learning_rate": 9.80070192454855e-06, "loss": 1.4561749696731567, "step": 202 }, { "epoch": 0.37362637362637363, "grad_norm": 0.33053258061408997, "learning_rate": 9.795428174036591e-06, "loss": 1.2278764247894287, "step": 204 }, { "epoch": 0.3772893772893773, "grad_norm": 0.7583060264587402, "learning_rate": 9.790087177806584e-06, "loss": 0.7968496084213257, "step": 206 }, { "epoch": 0.38095238095238093, "grad_norm": 0.2668805718421936, "learning_rate": 9.784679019472809e-06, "loss": 1.1589165925979614, "step": 208 }, { "epoch": 0.38461538461538464, "grad_norm": 0.21432484686374664, "learning_rate": 9.779203783700972e-06, "loss": 1.4328304529190063, "step": 210 }, { "epoch": 0.3882783882783883, "grad_norm": 0.191499263048172, "learning_rate": 9.773661556206903e-06, "loss": 1.0945113897323608, "step": 212 }, { "epoch": 0.39194139194139194, "grad_norm": 0.15214745700359344, "learning_rate": 9.768052423755192e-06, "loss": 1.1581294536590576, "step": 214 }, { "epoch": 0.3956043956043956, "grad_norm": 0.23848576843738556, "learning_rate": 9.762376474157839e-06, "loss": 1.2475342750549316, "step": 216 }, { "epoch": 0.3992673992673993, "grad_norm": 0.2269514501094818, "learning_rate": 9.756633796272876e-06, "loss": 1.2841179370880127, "step": 218 }, { "epoch": 0.40293040293040294, "grad_norm": 0.07938987016677856, "learning_rate": 9.750824480002982e-06, "loss": 0.623121976852417, "step": 220 }, { "epoch": 0.4065934065934066, "grad_norm": 0.3509514331817627, "learning_rate": 9.744948616294074e-06, "loss": 1.364533543586731, "step": 222 }, { "epoch": 0.41025641025641024, "grad_norm": 0.20469792187213898, "learning_rate": 9.739006297133878e-06, "loss": 1.0975794792175293, "step": 224 }, { "epoch": 0.4139194139194139, "grad_norm": 0.2600097358226776, "learning_rate": 9.732997615550495e-06, "loss": 1.2632966041564941, "step": 226 }, { "epoch": 0.4175824175824176, "grad_norm": 0.15840594470500946, "learning_rate": 9.726922665610935e-06, "loss": 1.3373838663101196, "step": 228 }, { "epoch": 0.42124542124542125, "grad_norm": 0.43822696805000305, "learning_rate": 9.720781542419662e-06, "loss": 1.2531630992889404, "step": 230 }, { "epoch": 0.4249084249084249, "grad_norm": 0.5942100286483765, "learning_rate": 9.714574342117086e-06, "loss": 1.0207842588424683, "step": 232 }, { "epoch": 0.42857142857142855, "grad_norm": 0.23664861917495728, "learning_rate": 9.70830116187807e-06, "loss": 1.5113677978515625, "step": 234 }, { "epoch": 0.43223443223443225, "grad_norm": 0.3284321427345276, "learning_rate": 9.701962099910407e-06, "loss": 1.0360337495803833, "step": 236 }, { "epoch": 0.4358974358974359, "grad_norm": 0.2513348460197449, "learning_rate": 9.695557255453273e-06, "loss": 1.0973368883132935, "step": 238 }, { "epoch": 0.43956043956043955, "grad_norm": 0.45316001772880554, "learning_rate": 9.68908672877569e-06, "loss": 0.9152914881706238, "step": 240 }, { "epoch": 0.4432234432234432, "grad_norm": 0.2768547236919403, "learning_rate": 9.682550621174942e-06, "loss": 0.8826823830604553, "step": 242 }, { "epoch": 0.4468864468864469, "grad_norm": 0.14853699505329132, "learning_rate": 9.675949034974992e-06, "loss": 0.5798932313919067, "step": 244 }, { "epoch": 0.45054945054945056, "grad_norm": 0.1571403294801712, "learning_rate": 9.669282073524892e-06, "loss": 1.2800544500350952, "step": 246 }, { "epoch": 0.4542124542124542, "grad_norm": 0.20789006352424622, "learning_rate": 9.662549841197148e-06, "loss": 0.893764853477478, "step": 248 }, { "epoch": 0.45787545787545786, "grad_norm": 0.7506678104400635, "learning_rate": 9.655752443386092e-06, "loss": 1.2865655422210693, "step": 250 }, { "epoch": 0.46153846153846156, "grad_norm": 0.39902183413505554, "learning_rate": 9.64888998650624e-06, "loss": 1.1993688344955444, "step": 252 }, { "epoch": 0.4652014652014652, "grad_norm": 0.3465142846107483, "learning_rate": 9.641962577990614e-06, "loss": 0.9851580262184143, "step": 254 }, { "epoch": 0.46886446886446886, "grad_norm": 0.18256494402885437, "learning_rate": 9.634970326289071e-06, "loss": 1.2847747802734375, "step": 256 }, { "epoch": 0.4725274725274725, "grad_norm": 0.24586841464042664, "learning_rate": 9.627913340866597e-06, "loss": 1.3066174983978271, "step": 258 }, { "epoch": 0.47619047619047616, "grad_norm": 0.11027955263853073, "learning_rate": 9.620791732201595e-06, "loss": 0.8039655685424805, "step": 260 }, { "epoch": 0.47985347985347987, "grad_norm": 0.15749269723892212, "learning_rate": 9.613605611784158e-06, "loss": 1.1634037494659424, "step": 262 }, { "epoch": 0.4835164835164835, "grad_norm": 0.23077067732810974, "learning_rate": 9.606355092114327e-06, "loss": 1.2528202533721924, "step": 264 }, { "epoch": 0.48717948717948717, "grad_norm": 0.18674089014530182, "learning_rate": 9.599040286700317e-06, "loss": 1.5212275981903076, "step": 266 }, { "epoch": 0.4908424908424908, "grad_norm": 0.4802699089050293, "learning_rate": 9.591661310056753e-06, "loss": 0.8288567662239075, "step": 268 }, { "epoch": 0.4945054945054945, "grad_norm": 0.1448894888162613, "learning_rate": 9.58421827770287e-06, "loss": 1.2230876684188843, "step": 270 }, { "epoch": 0.4981684981684982, "grad_norm": 0.19190412759780884, "learning_rate": 9.57671130616071e-06, "loss": 0.9024039506912231, "step": 272 }, { "epoch": 0.5018315018315018, "grad_norm": 0.3073454797267914, "learning_rate": 9.569140512953296e-06, "loss": 1.2714391946792603, "step": 274 }, { "epoch": 0.5054945054945055, "grad_norm": 0.3199959993362427, "learning_rate": 9.561506016602782e-06, "loss": 0.8202919363975525, "step": 276 }, { "epoch": 0.5091575091575091, "grad_norm": 0.09401345998048782, "learning_rate": 9.553807936628617e-06, "loss": 0.8935064673423767, "step": 278 }, { "epoch": 0.5128205128205128, "grad_norm": 0.21345993876457214, "learning_rate": 9.546046393545655e-06, "loss": 1.2741483449935913, "step": 280 }, { "epoch": 0.5164835164835165, "grad_norm": 0.23345427215099335, "learning_rate": 9.538221508862284e-06, "loss": 1.2695109844207764, "step": 282 }, { "epoch": 0.5201465201465202, "grad_norm": 0.16931022703647614, "learning_rate": 9.530333405078512e-06, "loss": 1.274514079093933, "step": 284 }, { "epoch": 0.5238095238095238, "grad_norm": 0.33658501505851746, "learning_rate": 9.522382205684053e-06, "loss": 1.0144422054290771, "step": 286 }, { "epoch": 0.5274725274725275, "grad_norm": 0.21759743988513947, "learning_rate": 9.514368035156398e-06, "loss": 1.2731945514678955, "step": 288 }, { "epoch": 0.5311355311355311, "grad_norm": 0.17717669904232025, "learning_rate": 9.506291018958857e-06, "loss": 1.2374247312545776, "step": 290 }, { "epoch": 0.5347985347985348, "grad_norm": 0.337706983089447, "learning_rate": 9.498151283538608e-06, "loss": 0.7559359669685364, "step": 292 }, { "epoch": 0.5384615384615384, "grad_norm": 0.035663675516843796, "learning_rate": 9.489948956324706e-06, "loss": 0.9581714868545532, "step": 294 }, { "epoch": 0.5421245421245421, "grad_norm": 0.12138810753822327, "learning_rate": 9.481684165726086e-06, "loss": 1.0345128774642944, "step": 296 }, { "epoch": 0.5457875457875457, "grad_norm": 0.39733827114105225, "learning_rate": 9.473357041129572e-06, "loss": 1.3242045640945435, "step": 298 }, { "epoch": 0.5494505494505495, "grad_norm": 0.16901174187660217, "learning_rate": 9.464967712897828e-06, "loss": 1.2276860475540161, "step": 300 }, { "epoch": 0.5531135531135531, "grad_norm": 0.5484493374824524, "learning_rate": 9.456516312367328e-06, "loss": 1.2076282501220703, "step": 302 }, { "epoch": 0.5567765567765568, "grad_norm": 0.17032906413078308, "learning_rate": 9.448002971846307e-06, "loss": 0.9942311644554138, "step": 304 }, { "epoch": 0.5604395604395604, "grad_norm": 0.24507595598697662, "learning_rate": 9.439427824612673e-06, "loss": 0.9752069115638733, "step": 306 }, { "epoch": 0.5641025641025641, "grad_norm": 0.40566012263298035, "learning_rate": 9.430791004911934e-06, "loss": 1.4564454555511475, "step": 308 }, { "epoch": 0.5677655677655677, "grad_norm": 0.1568066030740738, "learning_rate": 9.42209264795509e-06, "loss": 1.0061030387878418, "step": 310 }, { "epoch": 0.5714285714285714, "grad_norm": 0.16984346508979797, "learning_rate": 9.41333288991652e-06, "loss": 1.2216694355010986, "step": 312 }, { "epoch": 0.575091575091575, "grad_norm": 0.09158849716186523, "learning_rate": 9.404511867931847e-06, "loss": 1.1522339582443237, "step": 314 }, { "epoch": 0.5787545787545788, "grad_norm": 0.16296543180942535, "learning_rate": 9.39562972009579e-06, "loss": 1.293960452079773, "step": 316 }, { "epoch": 0.5824175824175825, "grad_norm": 0.24195973575115204, "learning_rate": 9.386686585460011e-06, "loss": 1.1431677341461182, "step": 318 }, { "epoch": 0.5860805860805861, "grad_norm": 0.1092909500002861, "learning_rate": 9.377682604030925e-06, "loss": 1.3567752838134766, "step": 320 }, { "epoch": 0.5897435897435898, "grad_norm": 0.1672687828540802, "learning_rate": 9.368617916767517e-06, "loss": 1.5480321645736694, "step": 322 }, { "epoch": 0.5934065934065934, "grad_norm": 0.18804782629013062, "learning_rate": 9.359492665579136e-06, "loss": 1.2884105443954468, "step": 324 }, { "epoch": 0.5970695970695971, "grad_norm": 0.2078697383403778, "learning_rate": 9.350306993323265e-06, "loss": 1.3802863359451294, "step": 326 }, { "epoch": 0.6007326007326007, "grad_norm": 0.16467250883579254, "learning_rate": 9.34106104380329e-06, "loss": 1.2509921789169312, "step": 328 }, { "epoch": 0.6043956043956044, "grad_norm": 0.46313583850860596, "learning_rate": 9.331754961766257e-06, "loss": 1.140839695930481, "step": 330 }, { "epoch": 0.608058608058608, "grad_norm": 0.14376887679100037, "learning_rate": 9.322388892900587e-06, "loss": 1.201643943786621, "step": 332 }, { "epoch": 0.6117216117216118, "grad_norm": 0.1362253874540329, "learning_rate": 9.312962983833815e-06, "loss": 1.3028783798217773, "step": 334 }, { "epoch": 0.6153846153846154, "grad_norm": 3.4290378093719482, "learning_rate": 9.303477382130278e-06, "loss": 0.973407506942749, "step": 336 }, { "epoch": 0.6190476190476191, "grad_norm": 0.16140861809253693, "learning_rate": 9.293932236288816e-06, "loss": 1.2559469938278198, "step": 338 }, { "epoch": 0.6227106227106227, "grad_norm": 0.1613743007183075, "learning_rate": 9.284327695740441e-06, "loss": 1.256553292274475, "step": 340 }, { "epoch": 0.6263736263736264, "grad_norm": 0.34570202231407166, "learning_rate": 9.274663910846004e-06, "loss": 0.5801024436950684, "step": 342 }, { "epoch": 0.63003663003663, "grad_norm": 0.28319358825683594, "learning_rate": 9.264941032893836e-06, "loss": 1.4648103713989258, "step": 344 }, { "epoch": 0.6336996336996337, "grad_norm": 16.52604866027832, "learning_rate": 9.255159214097374e-06, "loss": 0.6978890895843506, "step": 346 }, { "epoch": 0.6373626373626373, "grad_norm": 0.19958341121673584, "learning_rate": 9.245318607592795e-06, "loss": 1.1675150394439697, "step": 348 }, { "epoch": 0.6410256410256411, "grad_norm": 0.24133825302124023, "learning_rate": 9.235419367436602e-06, "loss": 0.8993176221847534, "step": 350 }, { "epoch": 0.6446886446886447, "grad_norm": 0.20524722337722778, "learning_rate": 9.225461648603223e-06, "loss": 0.9288710951805115, "step": 352 }, { "epoch": 0.6483516483516484, "grad_norm": 0.391886830329895, "learning_rate": 9.215445606982573e-06, "loss": 0.9668469429016113, "step": 354 }, { "epoch": 0.652014652014652, "grad_norm": 0.2540344297885895, "learning_rate": 9.205371399377628e-06, "loss": 1.1877306699752808, "step": 356 }, { "epoch": 0.6556776556776557, "grad_norm": 0.21765393018722534, "learning_rate": 9.195239183501961e-06, "loss": 1.1672714948654175, "step": 358 }, { "epoch": 0.6593406593406593, "grad_norm": 0.19967345893383026, "learning_rate": 9.185049117977276e-06, "loss": 0.7011613845825195, "step": 360 }, { "epoch": 0.663003663003663, "grad_norm": 0.7372376322746277, "learning_rate": 9.17480136233092e-06, "loss": 0.9566145539283752, "step": 362 }, { "epoch": 0.6666666666666666, "grad_norm": 0.20093770325183868, "learning_rate": 9.164496076993395e-06, "loss": 0.946535587310791, "step": 364 }, { "epoch": 0.6703296703296703, "grad_norm": 0.2989659607410431, "learning_rate": 9.154133423295836e-06, "loss": 1.203826904296875, "step": 366 }, { "epoch": 0.673992673992674, "grad_norm": 0.25106337666511536, "learning_rate": 9.143713563467495e-06, "loss": 1.0666961669921875, "step": 368 }, { "epoch": 0.6776556776556777, "grad_norm": 0.11923722177743912, "learning_rate": 9.133236660633192e-06, "loss": 1.097327709197998, "step": 370 }, { "epoch": 0.6813186813186813, "grad_norm": 0.3943967819213867, "learning_rate": 9.12270287881077e-06, "loss": 1.2011562585830688, "step": 372 }, { "epoch": 0.684981684981685, "grad_norm": 0.1692187637090683, "learning_rate": 9.112112382908516e-06, "loss": 1.2239218950271606, "step": 374 }, { "epoch": 0.6886446886446886, "grad_norm": 0.10792715102434158, "learning_rate": 9.101465338722596e-06, "loss": 0.9010005593299866, "step": 376 }, { "epoch": 0.6923076923076923, "grad_norm": 0.1825140416622162, "learning_rate": 9.090761912934441e-06, "loss": 0.8389140367507935, "step": 378 }, { "epoch": 0.6959706959706959, "grad_norm": 0.28178316354751587, "learning_rate": 9.080002273108155e-06, "loss": 1.0628230571746826, "step": 380 }, { "epoch": 0.6996336996336996, "grad_norm": 0.11631765961647034, "learning_rate": 9.069186587687872e-06, "loss": 0.9880151152610779, "step": 382 }, { "epoch": 0.7032967032967034, "grad_norm": 0.1353641152381897, "learning_rate": 9.058315025995142e-06, "loss": 1.2020447254180908, "step": 384 }, { "epoch": 0.706959706959707, "grad_norm": 0.6966851949691772, "learning_rate": 9.047387758226261e-06, "loss": 1.1148114204406738, "step": 386 }, { "epoch": 0.7106227106227107, "grad_norm": 0.08536599576473236, "learning_rate": 9.036404955449615e-06, "loss": 0.8987938165664673, "step": 388 }, { "epoch": 0.7142857142857143, "grad_norm": 0.164885014295578, "learning_rate": 9.025366789603002e-06, "loss": 1.0990866422653198, "step": 390 }, { "epoch": 0.717948717948718, "grad_norm": 0.1607430875301361, "learning_rate": 9.014273433490938e-06, "loss": 1.1975574493408203, "step": 392 }, { "epoch": 0.7216117216117216, "grad_norm": 0.3020445704460144, "learning_rate": 9.003125060781951e-06, "loss": 1.1362345218658447, "step": 394 }, { "epoch": 0.7252747252747253, "grad_norm": 0.23285327851772308, "learning_rate": 8.99192184600587e-06, "loss": 1.264463186264038, "step": 396 }, { "epoch": 0.7289377289377289, "grad_norm": 0.1471405327320099, "learning_rate": 8.98066396455108e-06, "loss": 0.9352213740348816, "step": 398 }, { "epoch": 0.7326007326007326, "grad_norm": 0.39496904611587524, "learning_rate": 8.969351592661787e-06, "loss": 0.8601157665252686, "step": 400 }, { "epoch": 0.7362637362637363, "grad_norm": 0.10789740085601807, "learning_rate": 8.957984907435254e-06, "loss": 1.2675104141235352, "step": 402 }, { "epoch": 0.73992673992674, "grad_norm": 0.13899658620357513, "learning_rate": 8.946564086819025e-06, "loss": 0.8569284081459045, "step": 404 }, { "epoch": 0.7435897435897436, "grad_norm": 0.2528247833251953, "learning_rate": 8.935089309608152e-06, "loss": 1.0413234233856201, "step": 406 }, { "epoch": 0.7472527472527473, "grad_norm": 0.14703120291233063, "learning_rate": 8.92356075544238e-06, "loss": 1.0387818813323975, "step": 408 }, { "epoch": 0.7509157509157509, "grad_norm": 0.39029836654663086, "learning_rate": 8.911978604803346e-06, "loss": 0.8937767744064331, "step": 410 }, { "epoch": 0.7545787545787546, "grad_norm": 0.23668618500232697, "learning_rate": 8.900343039011745e-06, "loss": 1.1923093795776367, "step": 412 }, { "epoch": 0.7582417582417582, "grad_norm": 0.4645112454891205, "learning_rate": 8.888654240224503e-06, "loss": 1.0234112739562988, "step": 414 }, { "epoch": 0.7619047619047619, "grad_norm": 0.35793423652648926, "learning_rate": 8.876912391431913e-06, "loss": 1.2955764532089233, "step": 416 }, { "epoch": 0.7655677655677655, "grad_norm": 0.09080661088228226, "learning_rate": 8.86511767645478e-06, "loss": 0.6778948903083801, "step": 418 }, { "epoch": 0.7692307692307693, "grad_norm": 0.5035378932952881, "learning_rate": 8.853270279941533e-06, "loss": 1.2608743906021118, "step": 420 }, { "epoch": 0.7728937728937729, "grad_norm": 0.48378103971481323, "learning_rate": 8.841370387365344e-06, "loss": 1.015937328338623, "step": 422 }, { "epoch": 0.7765567765567766, "grad_norm": 0.18112313747406006, "learning_rate": 8.829418185021221e-06, "loss": 0.5042012929916382, "step": 424 }, { "epoch": 0.7802197802197802, "grad_norm": 0.2163010537624359, "learning_rate": 8.817413860023089e-06, "loss": 0.8268504738807678, "step": 426 }, { "epoch": 0.7838827838827839, "grad_norm": 0.2586314082145691, "learning_rate": 8.805357600300863e-06, "loss": 1.0975161790847778, "step": 428 }, { "epoch": 0.7875457875457875, "grad_norm": 0.32366475462913513, "learning_rate": 8.793249594597508e-06, "loss": 1.2304267883300781, "step": 430 }, { "epoch": 0.7912087912087912, "grad_norm": 0.18010596930980682, "learning_rate": 8.781090032466079e-06, "loss": 1.3180345296859741, "step": 432 }, { "epoch": 0.7948717948717948, "grad_norm": 0.15381111204624176, "learning_rate": 8.768879104266758e-06, "loss": 0.894809901714325, "step": 434 }, { "epoch": 0.7985347985347986, "grad_norm": 0.3722904920578003, "learning_rate": 8.756617001163869e-06, "loss": 0.9750258326530457, "step": 436 }, { "epoch": 0.8021978021978022, "grad_norm": 0.1223578080534935, "learning_rate": 8.744303915122895e-06, "loss": 0.8995143175125122, "step": 438 }, { "epoch": 0.8058608058608059, "grad_norm": 0.17111073434352875, "learning_rate": 8.73194003890746e-06, "loss": 1.3294800519943237, "step": 440 }, { "epoch": 0.8095238095238095, "grad_norm": 0.8301756978034973, "learning_rate": 8.719525566076322e-06, "loss": 1.2234307527542114, "step": 442 }, { "epoch": 0.8131868131868132, "grad_norm": 0.26485782861709595, "learning_rate": 8.707060690980334e-06, "loss": 1.2229658365249634, "step": 444 }, { "epoch": 0.8168498168498168, "grad_norm": 0.4809357225894928, "learning_rate": 8.69454560875941e-06, "loss": 1.2412751913070679, "step": 446 }, { "epoch": 0.8205128205128205, "grad_norm": 0.19155052304267883, "learning_rate": 8.681980515339464e-06, "loss": 1.2396912574768066, "step": 448 }, { "epoch": 0.8241758241758241, "grad_norm": 0.11245301365852356, "learning_rate": 8.669365607429344e-06, "loss": 1.467288851737976, "step": 450 }, { "epoch": 0.8278388278388278, "grad_norm": 0.203065425157547, "learning_rate": 8.656701082517752e-06, "loss": 1.008663296699524, "step": 452 }, { "epoch": 0.8315018315018315, "grad_norm": 0.0740152969956398, "learning_rate": 8.643987138870156e-06, "loss": 0.5013046860694885, "step": 454 }, { "epoch": 0.8351648351648352, "grad_norm": 0.06140409782528877, "learning_rate": 8.631223975525683e-06, "loss": 0.9132590293884277, "step": 456 }, { "epoch": 0.8388278388278388, "grad_norm": 0.7447589635848999, "learning_rate": 8.618411792293997e-06, "loss": 0.8399595618247986, "step": 458 }, { "epoch": 0.8424908424908425, "grad_norm": 0.43025752902030945, "learning_rate": 8.605550789752191e-06, "loss": 1.0485363006591797, "step": 460 }, { "epoch": 0.8461538461538461, "grad_norm": 0.1829681098461151, "learning_rate": 8.592641169241622e-06, "loss": 1.2453057765960693, "step": 462 }, { "epoch": 0.8498168498168498, "grad_norm": 0.24352803826332092, "learning_rate": 8.579683132864769e-06, "loss": 1.193666696548462, "step": 464 }, { "epoch": 0.8534798534798534, "grad_norm": 0.08289831876754761, "learning_rate": 8.56667688348208e-06, "loss": 1.2128486633300781, "step": 466 }, { "epoch": 0.8571428571428571, "grad_norm": 0.13429510593414307, "learning_rate": 8.553622624708778e-06, "loss": 0.8921034932136536, "step": 468 }, { "epoch": 0.8608058608058609, "grad_norm": 0.2065141648054123, "learning_rate": 8.540520560911688e-06, "loss": 0.9356565475463867, "step": 470 }, { "epoch": 0.8644688644688645, "grad_norm": 0.8355104923248291, "learning_rate": 8.527370897206024e-06, "loss": 1.1900638341903687, "step": 472 }, { "epoch": 0.8681318681318682, "grad_norm": 0.7807270884513855, "learning_rate": 8.514173839452194e-06, "loss": 0.9948893189430237, "step": 474 }, { "epoch": 0.8717948717948718, "grad_norm": 0.13567706942558289, "learning_rate": 8.50092959425256e-06, "loss": 1.1165426969528198, "step": 476 }, { "epoch": 0.8754578754578755, "grad_norm": 0.2913936376571655, "learning_rate": 8.487638368948221e-06, "loss": 1.0576797723770142, "step": 478 }, { "epoch": 0.8791208791208791, "grad_norm": 0.502364993095398, "learning_rate": 8.47430037161575e-06, "loss": 1.0835438966751099, "step": 480 }, { "epoch": 0.8827838827838828, "grad_norm": 0.7570469379425049, "learning_rate": 8.460915811063952e-06, "loss": 1.204832673072815, "step": 482 }, { "epoch": 0.8864468864468864, "grad_norm": 0.20631951093673706, "learning_rate": 8.447484896830581e-06, "loss": 1.2826550006866455, "step": 484 }, { "epoch": 0.8901098901098901, "grad_norm": 0.18968936800956726, "learning_rate": 8.43400783917907e-06, "loss": 0.9170786142349243, "step": 486 }, { "epoch": 0.8937728937728938, "grad_norm": 0.357719749212265, "learning_rate": 8.420484849095233e-06, "loss": 1.1806507110595703, "step": 488 }, { "epoch": 0.8974358974358975, "grad_norm": 0.10344009846448898, "learning_rate": 8.406916138283971e-06, "loss": 1.1227405071258545, "step": 490 }, { "epoch": 0.9010989010989011, "grad_norm": 0.18624161183834076, "learning_rate": 8.393301919165947e-06, "loss": 1.067802906036377, "step": 492 }, { "epoch": 0.9047619047619048, "grad_norm": 0.23537158966064453, "learning_rate": 8.379642404874261e-06, "loss": 0.5906503796577454, "step": 494 }, { "epoch": 0.9084249084249084, "grad_norm": 0.28865331411361694, "learning_rate": 8.365937809251124e-06, "loss": 1.2992898225784302, "step": 496 }, { "epoch": 0.9120879120879121, "grad_norm": 0.25028523802757263, "learning_rate": 8.352188346844501e-06, "loss": 1.1510648727416992, "step": 498 }, { "epoch": 0.9157509157509157, "grad_norm": 0.16311851143836975, "learning_rate": 8.338394232904753e-06, "loss": 0.8221940398216248, "step": 500 }, { "epoch": 0.9194139194139194, "grad_norm": 0.040686819702386856, "learning_rate": 8.324555683381276e-06, "loss": 0.8739909529685974, "step": 502 }, { "epoch": 0.9230769230769231, "grad_norm": 0.22888082265853882, "learning_rate": 8.3106729149191e-06, "loss": 0.954814076423645, "step": 504 }, { "epoch": 0.9267399267399268, "grad_norm": 0.1041514128446579, "learning_rate": 8.296746144855525e-06, "loss": 0.8583929538726807, "step": 506 }, { "epoch": 0.9304029304029304, "grad_norm": 0.1827676147222519, "learning_rate": 8.282775591216691e-06, "loss": 1.1817222833633423, "step": 508 }, { "epoch": 0.9340659340659341, "grad_norm": 0.21790483593940735, "learning_rate": 8.268761472714193e-06, "loss": 1.2396169900894165, "step": 510 }, { "epoch": 0.9377289377289377, "grad_norm": 0.16166400909423828, "learning_rate": 8.254704008741629e-06, "loss": 1.1866990327835083, "step": 512 }, { "epoch": 0.9413919413919414, "grad_norm": 0.1379297822713852, "learning_rate": 8.240603419371181e-06, "loss": 0.9622292518615723, "step": 514 }, { "epoch": 0.945054945054945, "grad_norm": 0.12670904397964478, "learning_rate": 8.22645992535017e-06, "loss": 1.1956757307052612, "step": 516 }, { "epoch": 0.9487179487179487, "grad_norm": 0.09928123652935028, "learning_rate": 8.2122737480976e-06, "loss": 1.2315433025360107, "step": 518 }, { "epoch": 0.9523809523809523, "grad_norm": 0.3714980185031891, "learning_rate": 8.19804510970068e-06, "loss": 1.2325382232666016, "step": 520 }, { "epoch": 0.9560439560439561, "grad_norm": 0.1642584353685379, "learning_rate": 8.183774232911362e-06, "loss": 0.8959419131278992, "step": 522 }, { "epoch": 0.9597069597069597, "grad_norm": 0.30108144879341125, "learning_rate": 8.169461341142848e-06, "loss": 1.0453133583068848, "step": 524 }, { "epoch": 0.9633699633699634, "grad_norm": 0.2879306972026825, "learning_rate": 8.155106658466094e-06, "loss": 0.9845118522644043, "step": 526 }, { "epoch": 0.967032967032967, "grad_norm": 0.32583609223365784, "learning_rate": 8.140710409606289e-06, "loss": 0.657010018825531, "step": 528 }, { "epoch": 0.9706959706959707, "grad_norm": 0.19444482028484344, "learning_rate": 8.126272819939364e-06, "loss": 0.9151591062545776, "step": 530 }, { "epoch": 0.9743589743589743, "grad_norm": 1.4497026205062866, "learning_rate": 8.111794115488437e-06, "loss": 1.3719483613967896, "step": 532 }, { "epoch": 0.978021978021978, "grad_norm": 0.6636638641357422, "learning_rate": 8.097274522920291e-06, "loss": 0.6158185601234436, "step": 534 }, { "epoch": 0.9816849816849816, "grad_norm": 0.23120753467082977, "learning_rate": 8.082714269541814e-06, "loss": 0.8738659620285034, "step": 536 }, { "epoch": 0.9853479853479854, "grad_norm": 0.21588478982448578, "learning_rate": 8.068113583296456e-06, "loss": 0.8082484602928162, "step": 538 }, { "epoch": 0.989010989010989, "grad_norm": 0.37794405221939087, "learning_rate": 8.053472692760643e-06, "loss": 1.1449978351593018, "step": 540 }, { "epoch": 0.9926739926739927, "grad_norm": 0.21002766489982605, "learning_rate": 8.038791827140208e-06, "loss": 1.1861510276794434, "step": 542 }, { "epoch": 0.9963369963369964, "grad_norm": 0.2881741523742676, "learning_rate": 8.0240712162668e-06, "loss": 1.2074682712554932, "step": 544 }, { "epoch": 1.0, "grad_norm": 0.14409518241882324, "learning_rate": 8.009311090594297e-06, "loss": 1.3737009763717651, "step": 546 }, { "epoch": 1.0036630036630036, "grad_norm": 0.3364971876144409, "learning_rate": 7.994511681195175e-06, "loss": 1.010398268699646, "step": 548 }, { "epoch": 1.0073260073260073, "grad_norm": 0.20106230676174164, "learning_rate": 7.97967321975691e-06, "loss": 1.2091031074523926, "step": 550 }, { "epoch": 1.010989010989011, "grad_norm": 0.3865948021411896, "learning_rate": 7.964795938578347e-06, "loss": 0.7033045887947083, "step": 552 }, { "epoch": 1.0146520146520146, "grad_norm": 0.1542910784482956, "learning_rate": 7.949880070566058e-06, "loss": 1.161207914352417, "step": 554 }, { "epoch": 1.0183150183150182, "grad_norm": 0.17765195667743683, "learning_rate": 7.9349258492307e-06, "loss": 1.050184726715088, "step": 556 }, { "epoch": 1.021978021978022, "grad_norm": 0.13882611691951752, "learning_rate": 7.91993350868336e-06, "loss": 1.2430739402770996, "step": 558 }, { "epoch": 1.0256410256410255, "grad_norm": 0.16586051881313324, "learning_rate": 7.904903283631884e-06, "loss": 0.9440419673919678, "step": 560 }, { "epoch": 1.0293040293040292, "grad_norm": 0.19672122597694397, "learning_rate": 7.88983540937721e-06, "loss": 1.1903315782546997, "step": 562 }, { "epoch": 1.032967032967033, "grad_norm": 0.23967498540878296, "learning_rate": 7.87473012180968e-06, "loss": 1.1820085048675537, "step": 564 }, { "epoch": 1.0366300366300367, "grad_norm": 0.17814430594444275, "learning_rate": 7.859587657405353e-06, "loss": 1.196739912033081, "step": 566 }, { "epoch": 1.0402930402930404, "grad_norm": 0.1445707082748413, "learning_rate": 7.84440825322229e-06, "loss": 1.084715723991394, "step": 568 }, { "epoch": 1.043956043956044, "grad_norm": 0.18223969638347626, "learning_rate": 7.829192146896854e-06, "loss": 1.1364811658859253, "step": 570 }, { "epoch": 1.0476190476190477, "grad_norm": 0.1871526837348938, "learning_rate": 7.813939576639993e-06, "loss": 1.2730778455734253, "step": 572 }, { "epoch": 1.0512820512820513, "grad_norm": 0.48518720269203186, "learning_rate": 7.798650781233495e-06, "loss": 1.1072925329208374, "step": 574 }, { "epoch": 1.054945054945055, "grad_norm": 0.2393021285533905, "learning_rate": 7.783326000026266e-06, "loss": 1.2872074842453003, "step": 576 }, { "epoch": 1.0586080586080586, "grad_norm": 0.17235060036182404, "learning_rate": 7.767965472930575e-06, "loss": 1.2461888790130615, "step": 578 }, { "epoch": 1.0622710622710623, "grad_norm": 0.501559853553772, "learning_rate": 7.752569440418297e-06, "loss": 1.2427866458892822, "step": 580 }, { "epoch": 1.065934065934066, "grad_norm": 0.2508564889431, "learning_rate": 7.737138143517153e-06, "loss": 1.0125867128372192, "step": 582 }, { "epoch": 1.0695970695970696, "grad_norm": 0.34929409623146057, "learning_rate": 7.721671823806934e-06, "loss": 1.1760741472244263, "step": 584 }, { "epoch": 1.0732600732600732, "grad_norm": 0.11314375698566437, "learning_rate": 7.70617072341572e-06, "loss": 1.127938151359558, "step": 586 }, { "epoch": 1.0769230769230769, "grad_norm": 0.1307147890329361, "learning_rate": 7.690635085016087e-06, "loss": 0.8474472165107727, "step": 588 }, { "epoch": 1.0805860805860805, "grad_norm": 0.19595587253570557, "learning_rate": 7.675065151821313e-06, "loss": 1.2290217876434326, "step": 590 }, { "epoch": 1.0842490842490842, "grad_norm": 0.05816657096147537, "learning_rate": 7.659461167581564e-06, "loss": 1.065525770187378, "step": 592 }, { "epoch": 1.0879120879120878, "grad_norm": 0.1757960468530655, "learning_rate": 7.643823376580087e-06, "loss": 1.0110828876495361, "step": 594 }, { "epoch": 1.0915750915750915, "grad_norm": 0.4253121614456177, "learning_rate": 7.628152023629369e-06, "loss": 1.0302798748016357, "step": 596 }, { "epoch": 1.0952380952380953, "grad_norm": 0.17801064252853394, "learning_rate": 7.61244735406733e-06, "loss": 1.1227004528045654, "step": 598 }, { "epoch": 1.098901098901099, "grad_norm": 0.24273711442947388, "learning_rate": 7.596709613753457e-06, "loss": 1.1816527843475342, "step": 600 }, { "epoch": 1.1025641025641026, "grad_norm": 1.4160579442977905, "learning_rate": 7.5809390490649685e-06, "loss": 0.8195367455482483, "step": 602 }, { "epoch": 1.1062271062271063, "grad_norm": 0.25498881936073303, "learning_rate": 7.565135906892954e-06, "loss": 0.9860736727714539, "step": 604 }, { "epoch": 1.10989010989011, "grad_norm": 0.2312251329421997, "learning_rate": 7.549300434638515e-06, "loss": 0.9298585057258606, "step": 606 }, { "epoch": 1.1135531135531136, "grad_norm": 0.40971049666404724, "learning_rate": 7.533432880208879e-06, "loss": 1.3407394886016846, "step": 608 }, { "epoch": 1.1172161172161172, "grad_norm": 0.26371797919273376, "learning_rate": 7.517533492013527e-06, "loss": 0.7484307289123535, "step": 610 }, { "epoch": 1.120879120879121, "grad_norm": 0.18549509346485138, "learning_rate": 7.501602518960308e-06, "loss": 1.2191801071166992, "step": 612 }, { "epoch": 1.1245421245421245, "grad_norm": 0.2595532238483429, "learning_rate": 7.485640210451535e-06, "loss": 1.0103733539581299, "step": 614 }, { "epoch": 1.1282051282051282, "grad_norm": 0.1361607164144516, "learning_rate": 7.469646816380085e-06, "loss": 1.1822372674942017, "step": 616 }, { "epoch": 1.1318681318681318, "grad_norm": 0.36064237356185913, "learning_rate": 7.453622587125479e-06, "loss": 0.42253974080085754, "step": 618 }, { "epoch": 1.1355311355311355, "grad_norm": 0.25480979681015015, "learning_rate": 7.437567773549976e-06, "loss": 1.1068378686904907, "step": 620 }, { "epoch": 1.1391941391941391, "grad_norm": 0.4690706133842468, "learning_rate": 7.421482626994635e-06, "loss": 0.7852658629417419, "step": 622 }, { "epoch": 1.1428571428571428, "grad_norm": 0.2576858401298523, "learning_rate": 7.405367399275384e-06, "loss": 1.1447125673294067, "step": 624 }, { "epoch": 1.1465201465201464, "grad_norm": 0.1483878493309021, "learning_rate": 7.389222342679073e-06, "loss": 1.4121488332748413, "step": 626 }, { "epoch": 1.15018315018315, "grad_norm": 0.4962548017501831, "learning_rate": 7.373047709959537e-06, "loss": 0.4587477743625641, "step": 628 }, { "epoch": 1.1538461538461537, "grad_norm": 0.05175771191716194, "learning_rate": 7.356843754333626e-06, "loss": 1.1379830837249756, "step": 630 }, { "epoch": 1.1575091575091574, "grad_norm": 0.8406626582145691, "learning_rate": 7.340610729477242e-06, "loss": 1.1201821565628052, "step": 632 }, { "epoch": 1.1611721611721613, "grad_norm": 0.25670814514160156, "learning_rate": 7.324348889521377e-06, "loss": 0.8717086315155029, "step": 634 }, { "epoch": 1.164835164835165, "grad_norm": 0.15090753138065338, "learning_rate": 7.308058489048125e-06, "loss": 1.0203039646148682, "step": 636 }, { "epoch": 1.1684981684981686, "grad_norm": 0.34527626633644104, "learning_rate": 7.291739783086701e-06, "loss": 1.2124344110488892, "step": 638 }, { "epoch": 1.1721611721611722, "grad_norm": 0.05906078591942787, "learning_rate": 7.275393027109451e-06, "loss": 0.761792778968811, "step": 640 }, { "epoch": 1.1758241758241759, "grad_norm": 0.22481679916381836, "learning_rate": 7.259018477027842e-06, "loss": 1.1472866535186768, "step": 642 }, { "epoch": 1.1794871794871795, "grad_norm": 0.1917589157819748, "learning_rate": 7.242616389188472e-06, "loss": 1.1815375089645386, "step": 644 }, { "epoch": 1.1831501831501832, "grad_norm": 0.1334036886692047, "learning_rate": 7.226187020369039e-06, "loss": 0.7848197817802429, "step": 646 }, { "epoch": 1.1868131868131868, "grad_norm": 0.6945109367370605, "learning_rate": 7.209730627774333e-06, "loss": 0.93724524974823, "step": 648 }, { "epoch": 1.1904761904761905, "grad_norm": 0.22903983294963837, "learning_rate": 7.193247469032209e-06, "loss": 1.1586498022079468, "step": 650 }, { "epoch": 1.1941391941391941, "grad_norm": 0.1996290683746338, "learning_rate": 7.1767378021895464e-06, "loss": 0.6816765666007996, "step": 652 }, { "epoch": 1.1978021978021978, "grad_norm": 0.3227924108505249, "learning_rate": 7.160201885708219e-06, "loss": 1.3321443796157837, "step": 654 }, { "epoch": 1.2014652014652014, "grad_norm": 0.1688106805086136, "learning_rate": 7.143639978461038e-06, "loss": 0.8470932841300964, "step": 656 }, { "epoch": 1.205128205128205, "grad_norm": 1.263344645500183, "learning_rate": 7.127052339727708e-06, "loss": 0.9178895950317383, "step": 658 }, { "epoch": 1.2087912087912087, "grad_norm": 0.17517027258872986, "learning_rate": 7.110439229190762e-06, "loss": 1.1735132932662964, "step": 660 }, { "epoch": 1.2124542124542124, "grad_norm": 0.36022061109542847, "learning_rate": 7.093800906931505e-06, "loss": 1.0736725330352783, "step": 662 }, { "epoch": 1.2161172161172162, "grad_norm": 0.14516492187976837, "learning_rate": 7.077137633425928e-06, "loss": 0.9138533473014832, "step": 664 }, { "epoch": 1.2197802197802199, "grad_norm": 0.18955856561660767, "learning_rate": 7.060449669540646e-06, "loss": 0.8576375842094421, "step": 666 }, { "epoch": 1.2234432234432235, "grad_norm": 1.3548957109451294, "learning_rate": 7.043737276528799e-06, "loss": 0.9260948896408081, "step": 668 }, { "epoch": 1.2271062271062272, "grad_norm": 0.4962684214115143, "learning_rate": 7.027000716025975e-06, "loss": 0.710183322429657, "step": 670 }, { "epoch": 1.2307692307692308, "grad_norm": 0.1915455311536789, "learning_rate": 7.010240250046109e-06, "loss": 1.2020713090896606, "step": 672 }, { "epoch": 1.2344322344322345, "grad_norm": 0.2438817024230957, "learning_rate": 6.9934561409773724e-06, "loss": 1.1766732931137085, "step": 674 }, { "epoch": 1.2380952380952381, "grad_norm": 0.1694273203611374, "learning_rate": 6.976648651578087e-06, "loss": 1.1996291875839233, "step": 676 }, { "epoch": 1.2417582417582418, "grad_norm": 0.16501298546791077, "learning_rate": 6.959818044972585e-06, "loss": 0.7851068377494812, "step": 678 }, { "epoch": 1.2454212454212454, "grad_norm": 0.13900168240070343, "learning_rate": 6.942964584647109e-06, "loss": 0.8421606421470642, "step": 680 }, { "epoch": 1.249084249084249, "grad_norm": 0.9977712035179138, "learning_rate": 6.926088534445682e-06, "loss": 1.2277159690856934, "step": 682 }, { "epoch": 1.2527472527472527, "grad_norm": 0.13499097526073456, "learning_rate": 6.909190158565973e-06, "loss": 1.1799771785736084, "step": 684 }, { "epoch": 1.2564102564102564, "grad_norm": 0.11800684034824371, "learning_rate": 6.892269721555161e-06, "loss": 0.8362367153167725, "step": 686 }, { "epoch": 1.26007326007326, "grad_norm": 0.23727478086948395, "learning_rate": 6.875327488305805e-06, "loss": 1.1368072032928467, "step": 688 }, { "epoch": 1.2637362637362637, "grad_norm": 0.6073961853981018, "learning_rate": 6.858363724051678e-06, "loss": 1.1791174411773682, "step": 690 }, { "epoch": 1.2673992673992673, "grad_norm": 0.23184753954410553, "learning_rate": 6.841378694363631e-06, "loss": 1.2035536766052246, "step": 692 }, { "epoch": 1.271062271062271, "grad_norm": 0.17120184004306793, "learning_rate": 6.824372665145424e-06, "loss": 0.9986141324043274, "step": 694 }, { "epoch": 1.2747252747252746, "grad_norm": 0.279720276594162, "learning_rate": 6.80734590262958e-06, "loss": 1.0483033657073975, "step": 696 }, { "epoch": 1.2783882783882783, "grad_norm": 0.16889701783657074, "learning_rate": 6.79029867337319e-06, "loss": 0.9515765905380249, "step": 698 }, { "epoch": 1.282051282051282, "grad_norm": 0.06983523815870285, "learning_rate": 6.773231244253766e-06, "loss": 0.6616621017456055, "step": 700 }, { "epoch": 1.2857142857142856, "grad_norm": 0.26745325326919556, "learning_rate": 6.756143882465051e-06, "loss": 0.552936851978302, "step": 702 }, { "epoch": 1.2893772893772895, "grad_norm": 1.2814058065414429, "learning_rate": 6.739036855512835e-06, "loss": 1.3208832740783691, "step": 704 }, { "epoch": 1.293040293040293, "grad_norm": 0.24780850112438202, "learning_rate": 6.721910431210771e-06, "loss": 1.0253862142562866, "step": 706 }, { "epoch": 1.2967032967032968, "grad_norm": 0.07103469967842102, "learning_rate": 6.704764877676181e-06, "loss": 0.9762220978736877, "step": 708 }, { "epoch": 1.3003663003663004, "grad_norm": 0.1995328813791275, "learning_rate": 6.687600463325859e-06, "loss": 0.5912091732025146, "step": 710 }, { "epoch": 1.304029304029304, "grad_norm": 0.3986748456954956, "learning_rate": 6.670417456871871e-06, "loss": 0.9025965929031372, "step": 712 }, { "epoch": 1.3076923076923077, "grad_norm": 0.12167935073375702, "learning_rate": 6.653216127317338e-06, "loss": 1.2370021343231201, "step": 714 }, { "epoch": 1.3113553113553114, "grad_norm": 1.2277730703353882, "learning_rate": 6.635996743952242e-06, "loss": 1.1707024574279785, "step": 716 }, { "epoch": 1.315018315018315, "grad_norm": 0.1129893958568573, "learning_rate": 6.618759576349196e-06, "loss": 0.9717994928359985, "step": 718 }, { "epoch": 1.3186813186813187, "grad_norm": 0.11017405986785889, "learning_rate": 6.601504894359227e-06, "loss": 0.943675696849823, "step": 720 }, { "epoch": 1.3223443223443223, "grad_norm": 0.1213424950838089, "learning_rate": 6.584232968107557e-06, "loss": 0.9619688987731934, "step": 722 }, { "epoch": 1.326007326007326, "grad_norm": 0.46485310792922974, "learning_rate": 6.566944067989366e-06, "loss": 1.2262362241744995, "step": 724 }, { "epoch": 1.3296703296703296, "grad_norm": 0.16346322000026703, "learning_rate": 6.549638464665566e-06, "loss": 1.1035256385803223, "step": 726 }, { "epoch": 1.3333333333333333, "grad_norm": 0.8575840592384338, "learning_rate": 6.532316429058562e-06, "loss": 1.1614726781845093, "step": 728 }, { "epoch": 1.3369963369963371, "grad_norm": 0.2909935712814331, "learning_rate": 6.514978232348003e-06, "loss": 0.765929639339447, "step": 730 }, { "epoch": 1.3406593406593408, "grad_norm": 0.27958768606185913, "learning_rate": 6.497624145966549e-06, "loss": 0.8523128032684326, "step": 732 }, { "epoch": 1.3443223443223444, "grad_norm": 0.05668744817376137, "learning_rate": 6.480254441595615e-06, "loss": 0.635466456413269, "step": 734 }, { "epoch": 1.347985347985348, "grad_norm": 0.243035688996315, "learning_rate": 6.462869391161116e-06, "loss": 1.1623685359954834, "step": 736 }, { "epoch": 1.3516483516483517, "grad_norm": 0.18739135563373566, "learning_rate": 6.445469266829214e-06, "loss": 0.7601761817932129, "step": 738 }, { "epoch": 1.3553113553113554, "grad_norm": 0.18719348311424255, "learning_rate": 6.428054341002058e-06, "loss": 1.0253567695617676, "step": 740 }, { "epoch": 1.358974358974359, "grad_norm": 0.44151896238327026, "learning_rate": 6.41062488631351e-06, "loss": 0.6381222605705261, "step": 742 }, { "epoch": 1.3626373626373627, "grad_norm": 0.15360459685325623, "learning_rate": 6.393181175624893e-06, "loss": 1.2534339427947998, "step": 744 }, { "epoch": 1.3663003663003663, "grad_norm": 0.1375494748353958, "learning_rate": 6.375723482020702e-06, "loss": 1.0938211679458618, "step": 746 }, { "epoch": 1.36996336996337, "grad_norm": 0.2471369057893753, "learning_rate": 6.3582520788043465e-06, "loss": 1.0957386493682861, "step": 748 }, { "epoch": 1.3736263736263736, "grad_norm": 0.4160969853401184, "learning_rate": 6.340767239493851e-06, "loss": 0.8028813600540161, "step": 750 }, { "epoch": 1.3772893772893773, "grad_norm": 0.1555003523826599, "learning_rate": 6.323269237817595e-06, "loss": 1.1485873460769653, "step": 752 }, { "epoch": 1.380952380952381, "grad_norm": 0.15514326095581055, "learning_rate": 6.3057583477100114e-06, "loss": 0.9761220812797546, "step": 754 }, { "epoch": 1.3846153846153846, "grad_norm": 0.2655651867389679, "learning_rate": 6.288234843307304e-06, "loss": 1.3567599058151245, "step": 756 }, { "epoch": 1.3882783882783882, "grad_norm": 0.22227928042411804, "learning_rate": 6.270698998943158e-06, "loss": 1.2285981178283691, "step": 758 }, { "epoch": 1.3919413919413919, "grad_norm": 0.08404932916164398, "learning_rate": 6.253151089144443e-06, "loss": 1.1724284887313843, "step": 760 }, { "epoch": 1.3956043956043955, "grad_norm": 0.270246684551239, "learning_rate": 6.235591388626916e-06, "loss": 1.1640665531158447, "step": 762 }, { "epoch": 1.3992673992673992, "grad_norm": 0.22832772135734558, "learning_rate": 6.218020172290912e-06, "loss": 0.7001198530197144, "step": 764 }, { "epoch": 1.4029304029304028, "grad_norm": 0.12530791759490967, "learning_rate": 6.2004377152170595e-06, "loss": 0.9129507541656494, "step": 766 }, { "epoch": 1.4065934065934065, "grad_norm": 1.337783932685852, "learning_rate": 6.182844292661955e-06, "loss": 0.947498619556427, "step": 768 }, { "epoch": 1.4102564102564101, "grad_norm": 0.20773783326148987, "learning_rate": 6.165240180053864e-06, "loss": 1.2057294845581055, "step": 770 }, { "epoch": 1.4139194139194138, "grad_norm": 0.21221360564231873, "learning_rate": 6.147625652988409e-06, "loss": 1.2334250211715698, "step": 772 }, { "epoch": 1.4175824175824177, "grad_norm": 0.2252884805202484, "learning_rate": 6.130000987224252e-06, "loss": 1.0532145500183105, "step": 774 }, { "epoch": 1.4212454212454213, "grad_norm": 0.1621122807264328, "learning_rate": 6.11236645867877e-06, "loss": 1.1797196865081787, "step": 776 }, { "epoch": 1.424908424908425, "grad_norm": 0.4637870192527771, "learning_rate": 6.09472234342376e-06, "loss": 0.8449276685714722, "step": 778 }, { "epoch": 1.4285714285714286, "grad_norm": 0.16811135411262512, "learning_rate": 6.077068917681085e-06, "loss": 1.383507490158081, "step": 780 }, { "epoch": 1.4322344322344323, "grad_norm": 0.5571224093437195, "learning_rate": 6.059406457818372e-06, "loss": 1.2903873920440674, "step": 782 }, { "epoch": 1.435897435897436, "grad_norm": 0.19635361433029175, "learning_rate": 6.0417352403446815e-06, "loss": 1.178612232208252, "step": 784 }, { "epoch": 1.4395604395604396, "grad_norm": 0.5237776637077332, "learning_rate": 6.024055541906171e-06, "loss": 1.0071418285369873, "step": 786 }, { "epoch": 1.4432234432234432, "grad_norm": 0.22411468625068665, "learning_rate": 6.006367639281773e-06, "loss": 1.157625436782837, "step": 788 }, { "epoch": 1.4468864468864469, "grad_norm": 0.17982420325279236, "learning_rate": 5.988671809378851e-06, "loss": 0.7583225965499878, "step": 790 }, { "epoch": 1.4505494505494505, "grad_norm": 0.16371974349021912, "learning_rate": 5.970968329228884e-06, "loss": 0.9400377869606018, "step": 792 }, { "epoch": 1.4542124542124542, "grad_norm": 0.27952075004577637, "learning_rate": 5.953257475983104e-06, "loss": 0.818259060382843, "step": 794 }, { "epoch": 1.4578754578754578, "grad_norm": 0.10578649491071701, "learning_rate": 5.935539526908178e-06, "loss": 1.1710015535354614, "step": 796 }, { "epoch": 1.4615384615384617, "grad_norm": 0.18775279819965363, "learning_rate": 5.917814759381857e-06, "loss": 0.811826765537262, "step": 798 }, { "epoch": 1.4652014652014653, "grad_norm": 0.6075829267501831, "learning_rate": 5.900083450888636e-06, "loss": 1.0531878471374512, "step": 800 }, { "epoch": 1.468864468864469, "grad_norm": 0.47543665766716003, "learning_rate": 5.882345879015412e-06, "loss": 1.5944072008132935, "step": 802 }, { "epoch": 1.4725274725274726, "grad_norm": 0.19731402397155762, "learning_rate": 5.864602321447133e-06, "loss": 1.0520607233047485, "step": 804 }, { "epoch": 1.4761904761904763, "grad_norm": 0.7726924419403076, "learning_rate": 5.846853055962456e-06, "loss": 1.2063556909561157, "step": 806 }, { "epoch": 1.47985347985348, "grad_norm": 0.15811191499233246, "learning_rate": 5.829098360429397e-06, "loss": 1.1925911903381348, "step": 808 }, { "epoch": 1.4835164835164836, "grad_norm": 0.4315653145313263, "learning_rate": 5.811338512800983e-06, "loss": 1.0306977033615112, "step": 810 }, { "epoch": 1.4871794871794872, "grad_norm": 0.16353972256183624, "learning_rate": 5.793573791110888e-06, "loss": 0.8834646940231323, "step": 812 }, { "epoch": 1.4908424908424909, "grad_norm": 0.13097569346427917, "learning_rate": 5.775804473469104e-06, "loss": 1.2225075960159302, "step": 814 }, { "epoch": 1.4945054945054945, "grad_norm": 0.16530992090702057, "learning_rate": 5.758030838057562e-06, "loss": 1.1620936393737793, "step": 816 }, { "epoch": 1.4981684981684982, "grad_norm": 0.06754113733768463, "learning_rate": 5.7402531631257975e-06, "loss": 0.43427881598472595, "step": 818 }, { "epoch": 1.5018315018315018, "grad_norm": 0.3140871226787567, "learning_rate": 5.722471726986577e-06, "loss": 1.1740379333496094, "step": 820 }, { "epoch": 1.5054945054945055, "grad_norm": 1.0577014684677124, "learning_rate": 5.7046868080115554e-06, "loss": 0.9447726011276245, "step": 822 }, { "epoch": 1.5091575091575091, "grad_norm": 0.18314354121685028, "learning_rate": 5.686898684626909e-06, "loss": 0.8410064578056335, "step": 824 }, { "epoch": 1.5128205128205128, "grad_norm": 0.41597187519073486, "learning_rate": 5.6691076353089836e-06, "loss": 1.0812100172042847, "step": 826 }, { "epoch": 1.5164835164835164, "grad_norm": 0.6020085215568542, "learning_rate": 5.651313938579925e-06, "loss": 1.0242727994918823, "step": 828 }, { "epoch": 1.52014652014652, "grad_norm": 0.23912610113620758, "learning_rate": 5.633517873003329e-06, "loss": 1.0295336246490479, "step": 830 }, { "epoch": 1.5238095238095237, "grad_norm": 0.18377065658569336, "learning_rate": 5.615719717179877e-06, "loss": 1.2712222337722778, "step": 832 }, { "epoch": 1.5274725274725274, "grad_norm": 0.41018202900886536, "learning_rate": 5.59791974974297e-06, "loss": 0.825635552406311, "step": 834 }, { "epoch": 1.531135531135531, "grad_norm": 0.17661726474761963, "learning_rate": 5.580118249354371e-06, "loss": 1.1780991554260254, "step": 836 }, { "epoch": 1.5347985347985347, "grad_norm": 0.2789783477783203, "learning_rate": 5.562315494699845e-06, "loss": 1.231903314590454, "step": 838 }, { "epoch": 1.5384615384615383, "grad_norm": 0.38799595832824707, "learning_rate": 5.544511764484788e-06, "loss": 1.0772476196289062, "step": 840 }, { "epoch": 1.542124542124542, "grad_norm": 0.12171674519777298, "learning_rate": 5.526707337429871e-06, "loss": 0.896016001701355, "step": 842 }, { "epoch": 1.5457875457875456, "grad_norm": 2.428612232208252, "learning_rate": 5.508902492266676e-06, "loss": 1.067119836807251, "step": 844 }, { "epoch": 1.5494505494505495, "grad_norm": 0.12386277318000793, "learning_rate": 5.491097507733326e-06, "loss": 1.2031583786010742, "step": 846 }, { "epoch": 1.5531135531135531, "grad_norm": 0.6815460920333862, "learning_rate": 5.473292662570131e-06, "loss": 0.5977136492729187, "step": 848 }, { "epoch": 1.5567765567765568, "grad_norm": 0.2659470736980438, "learning_rate": 5.455488235515214e-06, "loss": 1.0548949241638184, "step": 850 }, { "epoch": 1.5604395604395604, "grad_norm": 0.14139074087142944, "learning_rate": 5.4376845053001585e-06, "loss": 1.2025469541549683, "step": 852 }, { "epoch": 1.564102564102564, "grad_norm": 0.3239622414112091, "learning_rate": 5.41988175064563e-06, "loss": 0.8870663642883301, "step": 854 }, { "epoch": 1.5677655677655677, "grad_norm": 0.1715698391199112, "learning_rate": 5.402080250257031e-06, "loss": 0.8264884948730469, "step": 856 }, { "epoch": 1.5714285714285714, "grad_norm": 0.3756884038448334, "learning_rate": 5.384280282820126e-06, "loss": 0.974433422088623, "step": 858 }, { "epoch": 1.575091575091575, "grad_norm": 0.21274326741695404, "learning_rate": 5.3664821269966714e-06, "loss": 0.7866367101669312, "step": 860 }, { "epoch": 1.578754578754579, "grad_norm": 0.16765311360359192, "learning_rate": 5.348686061420078e-06, "loss": 1.1772246360778809, "step": 862 }, { "epoch": 1.5824175824175826, "grad_norm": 0.31508204340934753, "learning_rate": 5.330892364691018e-06, "loss": 0.9374992251396179, "step": 864 }, { "epoch": 1.5860805860805862, "grad_norm": 0.19520021975040436, "learning_rate": 5.3131013153730916e-06, "loss": 1.2052057981491089, "step": 866 }, { "epoch": 1.5897435897435899, "grad_norm": 0.13115696609020233, "learning_rate": 5.295313191988447e-06, "loss": 1.2084887027740479, "step": 868 }, { "epoch": 1.5934065934065935, "grad_norm": 0.7018041014671326, "learning_rate": 5.277528273013425e-06, "loss": 0.609009325504303, "step": 870 }, { "epoch": 1.5970695970695972, "grad_norm": 0.16342367231845856, "learning_rate": 5.259746836874203e-06, "loss": 1.2070071697235107, "step": 872 }, { "epoch": 1.6007326007326008, "grad_norm": 0.14944781363010406, "learning_rate": 5.2419691619424396e-06, "loss": 1.091475486755371, "step": 874 }, { "epoch": 1.6043956043956045, "grad_norm": 0.3255991041660309, "learning_rate": 5.224195526530897e-06, "loss": 0.8270645141601562, "step": 876 }, { "epoch": 1.6080586080586081, "grad_norm": 0.3492541015148163, "learning_rate": 5.206426208889113e-06, "loss": 1.054788589477539, "step": 878 }, { "epoch": 1.6117216117216118, "grad_norm": 0.1763620376586914, "learning_rate": 5.18866148719902e-06, "loss": 0.9270405769348145, "step": 880 }, { "epoch": 1.6153846153846154, "grad_norm": 0.37243205308914185, "learning_rate": 5.170901639570605e-06, "loss": 1.163893222808838, "step": 882 }, { "epoch": 1.619047619047619, "grad_norm": 0.20206376910209656, "learning_rate": 5.153146944037545e-06, "loss": 0.978087842464447, "step": 884 }, { "epoch": 1.6227106227106227, "grad_norm": 0.1183437779545784, "learning_rate": 5.135397678552869e-06, "loss": 1.1707783937454224, "step": 886 }, { "epoch": 1.6263736263736264, "grad_norm": 0.19354073703289032, "learning_rate": 5.11765412098459e-06, "loss": 0.8640797138214111, "step": 888 }, { "epoch": 1.63003663003663, "grad_norm": 0.13133522868156433, "learning_rate": 5.099916549111365e-06, "loss": 1.2608891725540161, "step": 890 }, { "epoch": 1.6336996336996337, "grad_norm": 0.12352308630943298, "learning_rate": 5.082185240618146e-06, "loss": 0.9242100715637207, "step": 892 }, { "epoch": 1.6373626373626373, "grad_norm": 0.20042523741722107, "learning_rate": 5.064460473091823e-06, "loss": 1.2130396366119385, "step": 894 }, { "epoch": 1.641025641025641, "grad_norm": 0.5343178510665894, "learning_rate": 5.046742524016899e-06, "loss": 0.7868685722351074, "step": 896 }, { "epoch": 1.6446886446886446, "grad_norm": 0.22285579144954681, "learning_rate": 5.029031670771119e-06, "loss": 1.1629694700241089, "step": 898 }, { "epoch": 1.6483516483516483, "grad_norm": 0.05358535423874855, "learning_rate": 5.0113281906211485e-06, "loss": 0.8543750643730164, "step": 900 }, { "epoch": 1.652014652014652, "grad_norm": 0.51787930727005, "learning_rate": 4.99363236071823e-06, "loss": 0.9102002382278442, "step": 902 }, { "epoch": 1.6556776556776556, "grad_norm": 0.15900075435638428, "learning_rate": 4.975944458093831e-06, "loss": 1.042647361755371, "step": 904 }, { "epoch": 1.6593406593406592, "grad_norm": 0.7878313064575195, "learning_rate": 4.958264759655319e-06, "loss": 1.166403889656067, "step": 906 }, { "epoch": 1.6630036630036629, "grad_norm": 0.25159403681755066, "learning_rate": 4.940593542181629e-06, "loss": 0.7760780453681946, "step": 908 }, { "epoch": 1.6666666666666665, "grad_norm": 0.43353399634361267, "learning_rate": 4.922931082318917e-06, "loss": 1.2228432893753052, "step": 910 }, { "epoch": 1.6703296703296702, "grad_norm": 0.13905330002307892, "learning_rate": 4.905277656576243e-06, "loss": 1.241356611251831, "step": 912 }, { "epoch": 1.673992673992674, "grad_norm": 0.14692410826683044, "learning_rate": 4.8876335413212305e-06, "loss": 1.1272119283676147, "step": 914 }, { "epoch": 1.6776556776556777, "grad_norm": 0.2749308943748474, "learning_rate": 4.86999901277575e-06, "loss": 1.240549921989441, "step": 916 }, { "epoch": 1.6813186813186813, "grad_norm": 0.2792946994304657, "learning_rate": 4.852374347011591e-06, "loss": 0.8525235056877136, "step": 918 }, { "epoch": 1.684981684981685, "grad_norm": 0.10312946885824203, "learning_rate": 4.834759819946137e-06, "loss": 1.2491165399551392, "step": 920 }, { "epoch": 1.6886446886446886, "grad_norm": 0.18224607408046722, "learning_rate": 4.817155707338048e-06, "loss": 1.0514307022094727, "step": 922 }, { "epoch": 1.6923076923076923, "grad_norm": 0.1929437667131424, "learning_rate": 4.799562284782944e-06, "loss": 0.8574016690254211, "step": 924 }, { "epoch": 1.695970695970696, "grad_norm": 3.134303092956543, "learning_rate": 4.78197982770909e-06, "loss": 1.052782654762268, "step": 926 }, { "epoch": 1.6996336996336996, "grad_norm": 0.31613659858703613, "learning_rate": 4.7644086113730855e-06, "loss": 0.8344395756721497, "step": 928 }, { "epoch": 1.7032967032967035, "grad_norm": 0.03118388168513775, "learning_rate": 4.746848910855558e-06, "loss": 0.9363417625427246, "step": 930 }, { "epoch": 1.7069597069597071, "grad_norm": 0.22982323169708252, "learning_rate": 4.729301001056842e-06, "loss": 0.959007978439331, "step": 932 }, { "epoch": 1.7106227106227108, "grad_norm": 0.20650729537010193, "learning_rate": 4.711765156692697e-06, "loss": 1.4183884859085083, "step": 934 }, { "epoch": 1.7142857142857144, "grad_norm": 0.29698464274406433, "learning_rate": 4.694241652289992e-06, "loss": 1.221863865852356, "step": 936 }, { "epoch": 1.717948717948718, "grad_norm": 0.23697128891944885, "learning_rate": 4.676730762182407e-06, "loss": 0.47039785981178284, "step": 938 }, { "epoch": 1.7216117216117217, "grad_norm": 0.16988730430603027, "learning_rate": 4.659232760506149e-06, "loss": 0.6852482557296753, "step": 940 }, { "epoch": 1.7252747252747254, "grad_norm": 0.28183671832084656, "learning_rate": 4.641747921195657e-06, "loss": 1.152092456817627, "step": 942 }, { "epoch": 1.728937728937729, "grad_norm": 0.614911675453186, "learning_rate": 4.624276517979298e-06, "loss": 0.9434917569160461, "step": 944 }, { "epoch": 1.7326007326007327, "grad_norm": 0.19769613444805145, "learning_rate": 4.606818824375109e-06, "loss": 1.1814640760421753, "step": 946 }, { "epoch": 1.7362637362637363, "grad_norm": 0.2297975867986679, "learning_rate": 4.589375113686492e-06, "loss": 1.1364959478378296, "step": 948 }, { "epoch": 1.73992673992674, "grad_norm": 0.14440670609474182, "learning_rate": 4.571945658997944e-06, "loss": 1.0138092041015625, "step": 950 }, { "epoch": 1.7435897435897436, "grad_norm": 0.0975588783621788, "learning_rate": 4.554530733170788e-06, "loss": 0.9809228181838989, "step": 952 }, { "epoch": 1.7472527472527473, "grad_norm": 0.14117704331874847, "learning_rate": 4.5371306088388856e-06, "loss": 1.239669680595398, "step": 954 }, { "epoch": 1.750915750915751, "grad_norm": 0.228751540184021, "learning_rate": 4.519745558404387e-06, "loss": 0.8224953413009644, "step": 956 }, { "epoch": 1.7545787545787546, "grad_norm": 0.7675461173057556, "learning_rate": 4.502375854033453e-06, "loss": 1.2402071952819824, "step": 958 }, { "epoch": 1.7582417582417582, "grad_norm": 0.11008962988853455, "learning_rate": 4.4850217676519995e-06, "loss": 0.5849726796150208, "step": 960 }, { "epoch": 1.7619047619047619, "grad_norm": 0.3784470558166504, "learning_rate": 4.46768357094144e-06, "loss": 1.023523211479187, "step": 962 }, { "epoch": 1.7655677655677655, "grad_norm": 0.16590407490730286, "learning_rate": 4.4503615353344346e-06, "loss": 1.091076135635376, "step": 964 }, { "epoch": 1.7692307692307692, "grad_norm": 0.16390341520309448, "learning_rate": 4.433055932010635e-06, "loss": 1.2073513269424438, "step": 966 }, { "epoch": 1.7728937728937728, "grad_norm": 1.0820486545562744, "learning_rate": 4.4157670318924454e-06, "loss": 0.5969150066375732, "step": 968 }, { "epoch": 1.7765567765567765, "grad_norm": 0.16822820901870728, "learning_rate": 4.398495105640774e-06, "loss": 0.8644286394119263, "step": 970 }, { "epoch": 1.7802197802197801, "grad_norm": 0.2925519645214081, "learning_rate": 4.381240423650805e-06, "loss": 0.9442048072814941, "step": 972 }, { "epoch": 1.7838827838827838, "grad_norm": 0.4801477789878845, "learning_rate": 4.364003256047758e-06, "loss": 1.279288649559021, "step": 974 }, { "epoch": 1.7875457875457874, "grad_norm": 0.1608477234840393, "learning_rate": 4.346783872682662e-06, "loss": 1.2263715267181396, "step": 976 }, { "epoch": 1.791208791208791, "grad_norm": 0.13645397126674652, "learning_rate": 4.329582543128131e-06, "loss": 0.9317041635513306, "step": 978 }, { "epoch": 1.7948717948717947, "grad_norm": 0.258089154958725, "learning_rate": 4.312399536674141e-06, "loss": 0.9728096723556519, "step": 980 }, { "epoch": 1.7985347985347986, "grad_norm": 2.324678421020508, "learning_rate": 4.295235122323822e-06, "loss": 1.1650446653366089, "step": 982 }, { "epoch": 1.8021978021978022, "grad_norm": 0.2941892147064209, "learning_rate": 4.278089568789231e-06, "loss": 1.1338319778442383, "step": 984 }, { "epoch": 1.8058608058608059, "grad_norm": 0.08391306549310684, "learning_rate": 4.260963144487168e-06, "loss": 0.6776608824729919, "step": 986 }, { "epoch": 1.8095238095238095, "grad_norm": 0.4403177797794342, "learning_rate": 4.2438561175349505e-06, "loss": 0.8319576382637024, "step": 988 }, { "epoch": 1.8131868131868132, "grad_norm": 0.34780237078666687, "learning_rate": 4.2267687557462345e-06, "loss": 0.7826079726219177, "step": 990 }, { "epoch": 1.8168498168498168, "grad_norm": 0.11686074733734131, "learning_rate": 4.209701326626812e-06, "loss": 0.795200526714325, "step": 992 }, { "epoch": 1.8205128205128205, "grad_norm": 0.12649790942668915, "learning_rate": 4.192654097370423e-06, "loss": 0.8667728900909424, "step": 994 }, { "epoch": 1.8241758241758241, "grad_norm": 0.5255754590034485, "learning_rate": 4.175627334854575e-06, "loss": 1.1568585634231567, "step": 996 }, { "epoch": 1.8278388278388278, "grad_norm": 0.10009193420410156, "learning_rate": 4.1586213056363724e-06, "loss": 0.8747377991676331, "step": 998 }, { "epoch": 1.8315018315018317, "grad_norm": 0.17984943091869354, "learning_rate": 4.141636275948324e-06, "loss": 1.1325833797454834, "step": 1000 }, { "epoch": 1.8351648351648353, "grad_norm": 0.17336614429950714, "learning_rate": 4.1246725116941964e-06, "loss": 1.166914463043213, "step": 1002 }, { "epoch": 1.838827838827839, "grad_norm": 0.20862817764282227, "learning_rate": 4.10773027844484e-06, "loss": 1.1623833179473877, "step": 1004 }, { "epoch": 1.8424908424908426, "grad_norm": 0.15748779475688934, "learning_rate": 4.090809841434029e-06, "loss": 1.164290428161621, "step": 1006 }, { "epoch": 1.8461538461538463, "grad_norm": 0.31245389580726624, "learning_rate": 4.073911465554319e-06, "loss": 0.8208089470863342, "step": 1008 }, { "epoch": 1.84981684981685, "grad_norm": 0.18492500483989716, "learning_rate": 4.057035415352892e-06, "loss": 1.1237512826919556, "step": 1010 }, { "epoch": 1.8534798534798536, "grad_norm": 0.14317144453525543, "learning_rate": 4.0401819550274165e-06, "loss": 0.7784026861190796, "step": 1012 }, { "epoch": 1.8571428571428572, "grad_norm": 0.1404157131910324, "learning_rate": 4.023351348421915e-06, "loss": 1.176824688911438, "step": 1014 }, { "epoch": 1.8608058608058609, "grad_norm": 0.14713755249977112, "learning_rate": 4.006543859022628e-06, "loss": 1.1646744012832642, "step": 1016 }, { "epoch": 1.8644688644688645, "grad_norm": 0.19819317758083344, "learning_rate": 3.989759749953893e-06, "loss": 1.256286382675171, "step": 1018 }, { "epoch": 1.8681318681318682, "grad_norm": 0.034005679190158844, "learning_rate": 3.972999283974026e-06, "loss": 0.9847078323364258, "step": 1020 }, { "epoch": 1.8717948717948718, "grad_norm": 0.17176663875579834, "learning_rate": 3.956262723471203e-06, "loss": 1.1373211145401, "step": 1022 }, { "epoch": 1.8754578754578755, "grad_norm": 0.3484453558921814, "learning_rate": 3.9395503304593565e-06, "loss": 0.5924882292747498, "step": 1024 }, { "epoch": 1.879120879120879, "grad_norm": 0.16725610196590424, "learning_rate": 3.922862366574074e-06, "loss": 1.1780312061309814, "step": 1026 }, { "epoch": 1.8827838827838828, "grad_norm": 0.1592869609594345, "learning_rate": 3.906199093068497e-06, "loss": 0.9455581903457642, "step": 1028 }, { "epoch": 1.8864468864468864, "grad_norm": 0.4676535129547119, "learning_rate": 3.889560770809239e-06, "loss": 1.1824193000793457, "step": 1030 }, { "epoch": 1.89010989010989, "grad_norm": 0.13471902906894684, "learning_rate": 3.872947660272295e-06, "loss": 0.9769763350486755, "step": 1032 }, { "epoch": 1.8937728937728937, "grad_norm": 0.24125701189041138, "learning_rate": 3.856360021538964e-06, "loss": 0.8109256029129028, "step": 1034 }, { "epoch": 1.8974358974358974, "grad_norm": 0.27469104528427124, "learning_rate": 3.8397981142917815e-06, "loss": 1.2156492471694946, "step": 1036 }, { "epoch": 1.901098901098901, "grad_norm": 0.16270951926708221, "learning_rate": 3.823262197810454e-06, "loss": 1.183699369430542, "step": 1038 }, { "epoch": 1.9047619047619047, "grad_norm": 0.37082114815711975, "learning_rate": 3.806752530967792e-06, "loss": 1.2584105730056763, "step": 1040 }, { "epoch": 1.9084249084249083, "grad_norm": 0.9490067362785339, "learning_rate": 3.790269372225668e-06, "loss": 0.6401211023330688, "step": 1042 }, { "epoch": 1.912087912087912, "grad_norm": 0.15817776322364807, "learning_rate": 3.773812979630964e-06, "loss": 0.9084805250167847, "step": 1044 }, { "epoch": 1.9157509157509156, "grad_norm": 0.1630954146385193, "learning_rate": 3.7573836108115303e-06, "loss": 1.1366910934448242, "step": 1046 }, { "epoch": 1.9194139194139193, "grad_norm": 0.3042643964290619, "learning_rate": 3.740981522972159e-06, "loss": 0.514860987663269, "step": 1048 }, { "epoch": 1.9230769230769231, "grad_norm": 0.25363633036613464, "learning_rate": 3.724606972890551e-06, "loss": 0.9003884792327881, "step": 1050 }, { "epoch": 1.9267399267399268, "grad_norm": 0.22043615579605103, "learning_rate": 3.7082602169132995e-06, "loss": 0.8399287462234497, "step": 1052 }, { "epoch": 1.9304029304029304, "grad_norm": 0.1377377063035965, "learning_rate": 3.6919415109518776e-06, "loss": 1.1453593969345093, "step": 1054 }, { "epoch": 1.934065934065934, "grad_norm": 0.22288501262664795, "learning_rate": 3.6756511104786254e-06, "loss": 0.770913302898407, "step": 1056 }, { "epoch": 1.9377289377289377, "grad_norm": 0.16052567958831787, "learning_rate": 3.6593892705227586e-06, "loss": 1.003678321838379, "step": 1058 }, { "epoch": 1.9413919413919414, "grad_norm": 0.3310210704803467, "learning_rate": 3.643156245666377e-06, "loss": 1.1094727516174316, "step": 1060 }, { "epoch": 1.945054945054945, "grad_norm": 0.18297162652015686, "learning_rate": 3.626952290040463e-06, "loss": 0.8664683103561401, "step": 1062 }, { "epoch": 1.9487179487179487, "grad_norm": 0.1476193070411682, "learning_rate": 3.6107776573209263e-06, "loss": 0.8188486099243164, "step": 1064 }, { "epoch": 1.9523809523809523, "grad_norm": 0.20653630793094635, "learning_rate": 3.59463260072462e-06, "loss": 1.1757713556289673, "step": 1066 }, { "epoch": 1.9560439560439562, "grad_norm": 0.12281164526939392, "learning_rate": 3.5785173730053667e-06, "loss": 1.2063580751419067, "step": 1068 }, { "epoch": 1.9597069597069599, "grad_norm": 0.07409633696079254, "learning_rate": 3.5624322264500246e-06, "loss": 0.7450681328773499, "step": 1070 }, { "epoch": 1.9633699633699635, "grad_norm": 0.14261014759540558, "learning_rate": 3.5463774128745232e-06, "loss": 0.881243884563446, "step": 1072 }, { "epoch": 1.9670329670329672, "grad_norm": 0.12503007054328918, "learning_rate": 3.530353183619918e-06, "loss": 1.161426067352295, "step": 1074 }, { "epoch": 1.9706959706959708, "grad_norm": 0.11768339574337006, "learning_rate": 3.514359789548466e-06, "loss": 1.1456844806671143, "step": 1076 }, { "epoch": 1.9743589743589745, "grad_norm": 0.23258298635482788, "learning_rate": 3.4983974810396927e-06, "loss": 1.0247056484222412, "step": 1078 }, { "epoch": 1.978021978021978, "grad_norm": 0.28044548630714417, "learning_rate": 3.4824665079864735e-06, "loss": 1.1190541982650757, "step": 1080 }, { "epoch": 1.9816849816849818, "grad_norm": 0.12241950631141663, "learning_rate": 3.466567119791123e-06, "loss": 1.126396656036377, "step": 1082 }, { "epoch": 1.9853479853479854, "grad_norm": 0.05192271247506142, "learning_rate": 3.4506995653614873e-06, "loss": 0.7499899864196777, "step": 1084 }, { "epoch": 1.989010989010989, "grad_norm": 0.5542802214622498, "learning_rate": 3.4348640931070463e-06, "loss": 0.981029748916626, "step": 1086 }, { "epoch": 1.9926739926739927, "grad_norm": 0.24883772432804108, "learning_rate": 3.4190609509350338e-06, "loss": 1.0121923685073853, "step": 1088 }, { "epoch": 1.9963369963369964, "grad_norm": 0.1047605574131012, "learning_rate": 3.403290386246544e-06, "loss": 0.9460771679878235, "step": 1090 }, { "epoch": 2.0, "grad_norm": 0.14438582956790924, "learning_rate": 3.3875526459326714e-06, "loss": 1.0866570472717285, "step": 1092 }, { "epoch": 2.0036630036630036, "grad_norm": 0.13476252555847168, "learning_rate": 3.3718479763706324e-06, "loss": 1.140030860900879, "step": 1094 }, { "epoch": 2.0073260073260073, "grad_norm": 0.39099064469337463, "learning_rate": 3.356176623419915e-06, "loss": 1.1750749349594116, "step": 1096 }, { "epoch": 2.010989010989011, "grad_norm": 0.15205055475234985, "learning_rate": 3.340538832418436e-06, "loss": 1.1374648809432983, "step": 1098 }, { "epoch": 2.0146520146520146, "grad_norm": 0.4141819179058075, "learning_rate": 3.3249348481786904e-06, "loss": 1.2270292043685913, "step": 1100 }, { "epoch": 2.0183150183150182, "grad_norm": 0.2034660130739212, "learning_rate": 3.3093649149839148e-06, "loss": 0.8838691711425781, "step": 1102 }, { "epoch": 2.021978021978022, "grad_norm": 0.12057554721832275, "learning_rate": 3.2938292765842817e-06, "loss": 1.1789038181304932, "step": 1104 }, { "epoch": 2.0256410256410255, "grad_norm": 0.2732870578765869, "learning_rate": 3.2783281761930673e-06, "loss": 0.8000632524490356, "step": 1106 }, { "epoch": 2.029304029304029, "grad_norm": 0.10481557995080948, "learning_rate": 3.262861856482849e-06, "loss": 1.2116031646728516, "step": 1108 }, { "epoch": 2.032967032967033, "grad_norm": 0.19622857868671417, "learning_rate": 3.247430559581706e-06, "loss": 0.9533130526542664, "step": 1110 }, { "epoch": 2.0366300366300365, "grad_norm": 0.15817232429981232, "learning_rate": 3.2320345270694263e-06, "loss": 0.6461672186851501, "step": 1112 }, { "epoch": 2.04029304029304, "grad_norm": 0.30624663829803467, "learning_rate": 3.216673999973734e-06, "loss": 0.893692672252655, "step": 1114 }, { "epoch": 2.043956043956044, "grad_norm": 0.20368322730064392, "learning_rate": 3.201349218766506e-06, "loss": 1.2045972347259521, "step": 1116 }, { "epoch": 2.0476190476190474, "grad_norm": 0.3576587736606598, "learning_rate": 3.186060423360009e-06, "loss": 1.1595624685287476, "step": 1118 }, { "epoch": 2.051282051282051, "grad_norm": 0.15144126117229462, "learning_rate": 3.170807853103146e-06, "loss": 0.8582723736763, "step": 1120 }, { "epoch": 2.0549450549450547, "grad_norm": 0.3102099895477295, "learning_rate": 3.155591746777713e-06, "loss": 1.3617991209030151, "step": 1122 }, { "epoch": 2.0586080586080584, "grad_norm": 0.13681809604167938, "learning_rate": 3.140412342594648e-06, "loss": 1.1718530654907227, "step": 1124 }, { "epoch": 2.062271062271062, "grad_norm": 0.14834214746952057, "learning_rate": 3.12526987819032e-06, "loss": 0.8169500827789307, "step": 1126 }, { "epoch": 2.065934065934066, "grad_norm": 0.3267018795013428, "learning_rate": 3.1101645906227924e-06, "loss": 1.1410131454467773, "step": 1128 }, { "epoch": 2.06959706959707, "grad_norm": 0.2117215096950531, "learning_rate": 3.0950967163681177e-06, "loss": 1.1394081115722656, "step": 1130 }, { "epoch": 2.0732600732600734, "grad_norm": 0.18249589204788208, "learning_rate": 3.08006649131664e-06, "loss": 1.1663340330123901, "step": 1132 }, { "epoch": 2.076923076923077, "grad_norm": 0.15602019429206848, "learning_rate": 3.0650741507693004e-06, "loss": 1.1466034650802612, "step": 1134 }, { "epoch": 2.0805860805860807, "grad_norm": 0.3648541271686554, "learning_rate": 3.0501199294339435e-06, "loss": 0.8573122620582581, "step": 1136 }, { "epoch": 2.0842490842490844, "grad_norm": 0.1585971862077713, "learning_rate": 3.0352040614216555e-06, "loss": 1.1506117582321167, "step": 1138 }, { "epoch": 2.087912087912088, "grad_norm": 0.17453494668006897, "learning_rate": 3.0203267802430915e-06, "loss": 1.0754824876785278, "step": 1140 }, { "epoch": 2.0915750915750917, "grad_norm": 0.1620694249868393, "learning_rate": 3.0054883188048266e-06, "loss": 1.1398316621780396, "step": 1142 }, { "epoch": 2.0952380952380953, "grad_norm": 0.06643152236938477, "learning_rate": 2.9906889094057062e-06, "loss": 0.4219062924385071, "step": 1144 }, { "epoch": 2.098901098901099, "grad_norm": 0.7500718235969543, "learning_rate": 2.9759287837332007e-06, "loss": 0.9941345453262329, "step": 1146 }, { "epoch": 2.1025641025641026, "grad_norm": 0.17045439779758453, "learning_rate": 2.961208172859794e-06, "loss": 0.84036785364151, "step": 1148 }, { "epoch": 2.1062271062271063, "grad_norm": 0.2622012197971344, "learning_rate": 2.946527307239359e-06, "loss": 0.8539763689041138, "step": 1150 }, { "epoch": 2.10989010989011, "grad_norm": 0.42088064551353455, "learning_rate": 2.9318864167035452e-06, "loss": 0.985520601272583, "step": 1152 }, { "epoch": 2.1135531135531136, "grad_norm": 0.3410366475582123, "learning_rate": 2.9172857304581857e-06, "loss": 0.900378942489624, "step": 1154 }, { "epoch": 2.1172161172161172, "grad_norm": 0.3229033052921295, "learning_rate": 2.902725477079711e-06, "loss": 1.1304961442947388, "step": 1156 }, { "epoch": 2.120879120879121, "grad_norm": 0.4168906807899475, "learning_rate": 2.8882058845115633e-06, "loss": 1.0916647911071777, "step": 1158 }, { "epoch": 2.1245421245421245, "grad_norm": 0.34625813364982605, "learning_rate": 2.873727180060637e-06, "loss": 0.909528374671936, "step": 1160 }, { "epoch": 2.128205128205128, "grad_norm": 0.2843841016292572, "learning_rate": 2.8592895903937124e-06, "loss": 0.8306626677513123, "step": 1162 }, { "epoch": 2.131868131868132, "grad_norm": 0.23222553730010986, "learning_rate": 2.8448933415339085e-06, "loss": 0.9491928815841675, "step": 1164 }, { "epoch": 2.1355311355311355, "grad_norm": 0.1373523473739624, "learning_rate": 2.8305386588571517e-06, "loss": 0.45827817916870117, "step": 1166 }, { "epoch": 2.139194139194139, "grad_norm": 0.1498524397611618, "learning_rate": 2.816225767088638e-06, "loss": 0.4394649267196655, "step": 1168 }, { "epoch": 2.142857142857143, "grad_norm": 0.203684002161026, "learning_rate": 2.801954890299322e-06, "loss": 0.9728699922561646, "step": 1170 }, { "epoch": 2.1465201465201464, "grad_norm": 0.4509437382221222, "learning_rate": 2.7877262519024027e-06, "loss": 1.19068443775177, "step": 1172 }, { "epoch": 2.15018315018315, "grad_norm": 0.37610286474227905, "learning_rate": 2.7735400746498302e-06, "loss": 1.306997299194336, "step": 1174 }, { "epoch": 2.1538461538461537, "grad_norm": 1.0810068845748901, "learning_rate": 2.7593965806288204e-06, "loss": 0.8269945979118347, "step": 1176 }, { "epoch": 2.1575091575091574, "grad_norm": 0.11437740176916122, "learning_rate": 2.7452959912583744e-06, "loss": 1.174338698387146, "step": 1178 }, { "epoch": 2.161172161172161, "grad_norm": 0.1378648579120636, "learning_rate": 2.7312385272858087e-06, "loss": 1.1485635042190552, "step": 1180 }, { "epoch": 2.1648351648351647, "grad_norm": 0.1592278927564621, "learning_rate": 2.7172244087833077e-06, "loss": 1.208397388458252, "step": 1182 }, { "epoch": 2.1684981684981683, "grad_norm": 0.1117565929889679, "learning_rate": 2.7032538551444776e-06, "loss": 1.175192952156067, "step": 1184 }, { "epoch": 2.172161172161172, "grad_norm": 0.12953658401966095, "learning_rate": 2.6893270850809024e-06, "loss": 0.3722214698791504, "step": 1186 }, { "epoch": 2.1758241758241756, "grad_norm": 0.27550461888313293, "learning_rate": 2.6754443166187267e-06, "loss": 1.2698341608047485, "step": 1188 }, { "epoch": 2.1794871794871793, "grad_norm": 0.3155595660209656, "learning_rate": 2.661605767095248e-06, "loss": 0.8203377723693848, "step": 1190 }, { "epoch": 2.183150183150183, "grad_norm": 0.17399396002292633, "learning_rate": 2.6478116531554997e-06, "loss": 1.01655912399292, "step": 1192 }, { "epoch": 2.186813186813187, "grad_norm": 0.1317910999059677, "learning_rate": 2.6340621907488777e-06, "loss": 0.8621305823326111, "step": 1194 }, { "epoch": 2.1904761904761907, "grad_norm": 0.2792171835899353, "learning_rate": 2.620357595125742e-06, "loss": 0.9206136465072632, "step": 1196 }, { "epoch": 2.1941391941391943, "grad_norm": 0.15142017602920532, "learning_rate": 2.6066980808340553e-06, "loss": 1.1463533639907837, "step": 1198 }, { "epoch": 2.197802197802198, "grad_norm": 0.2487816959619522, "learning_rate": 2.5930838617160304e-06, "loss": 0.8177496790885925, "step": 1200 }, { "epoch": 2.2014652014652016, "grad_norm": 0.176479309797287, "learning_rate": 2.579515150904767e-06, "loss": 1.2105001211166382, "step": 1202 }, { "epoch": 2.2051282051282053, "grad_norm": 0.2774566113948822, "learning_rate": 2.5659921608209325e-06, "loss": 1.165309190750122, "step": 1204 }, { "epoch": 2.208791208791209, "grad_norm": 0.18029530346393585, "learning_rate": 2.5525151031694214e-06, "loss": 0.5955395102500916, "step": 1206 }, { "epoch": 2.2124542124542126, "grad_norm": 0.44882774353027344, "learning_rate": 2.5390841889360483e-06, "loss": 0.7616056203842163, "step": 1208 }, { "epoch": 2.2161172161172162, "grad_norm": 1.0745935440063477, "learning_rate": 2.525699628384249e-06, "loss": 0.6935135722160339, "step": 1210 }, { "epoch": 2.21978021978022, "grad_norm": 0.17599527537822723, "learning_rate": 2.5123616310517797e-06, "loss": 1.1335649490356445, "step": 1212 }, { "epoch": 2.2234432234432235, "grad_norm": 0.38573309779167175, "learning_rate": 2.4990704057474405e-06, "loss": 0.8549797534942627, "step": 1214 }, { "epoch": 2.227106227106227, "grad_norm": 0.5317236185073853, "learning_rate": 2.485826160547807e-06, "loss": 0.9798864126205444, "step": 1216 }, { "epoch": 2.230769230769231, "grad_norm": 0.12072915583848953, "learning_rate": 2.4726291027939775e-06, "loss": 1.137038230895996, "step": 1218 }, { "epoch": 2.2344322344322345, "grad_norm": 0.13340038061141968, "learning_rate": 2.459479439088314e-06, "loss": 1.1505991220474243, "step": 1220 }, { "epoch": 2.238095238095238, "grad_norm": 0.19366510212421417, "learning_rate": 2.4463773752912232e-06, "loss": 1.1624219417572021, "step": 1222 }, { "epoch": 2.241758241758242, "grad_norm": 0.2833138108253479, "learning_rate": 2.4333231165179226e-06, "loss": 0.5617607831954956, "step": 1224 }, { "epoch": 2.2454212454212454, "grad_norm": 0.14608268439769745, "learning_rate": 2.420316867135232e-06, "loss": 1.1109657287597656, "step": 1226 }, { "epoch": 2.249084249084249, "grad_norm": 2.9962241649627686, "learning_rate": 2.407358830758381e-06, "loss": 0.6706120371818542, "step": 1228 }, { "epoch": 2.2527472527472527, "grad_norm": 0.044207386672496796, "learning_rate": 2.394449210247811e-06, "loss": 0.6224187016487122, "step": 1230 }, { "epoch": 2.2564102564102564, "grad_norm": 0.20471802353858948, "learning_rate": 2.381588207706003e-06, "loss": 0.6815849542617798, "step": 1232 }, { "epoch": 2.26007326007326, "grad_norm": 0.3602707087993622, "learning_rate": 2.3687760244743198e-06, "loss": 1.157220482826233, "step": 1234 }, { "epoch": 2.2637362637362637, "grad_norm": 0.8389260172843933, "learning_rate": 2.356012861129845e-06, "loss": 0.7905306220054626, "step": 1236 }, { "epoch": 2.2673992673992673, "grad_norm": 0.12152452766895294, "learning_rate": 2.3432989174822496e-06, "loss": 0.998111367225647, "step": 1238 }, { "epoch": 2.271062271062271, "grad_norm": 0.15299645066261292, "learning_rate": 2.330634392570658e-06, "loss": 0.9482631683349609, "step": 1240 }, { "epoch": 2.2747252747252746, "grad_norm": 0.22156605124473572, "learning_rate": 2.3180194846605367e-06, "loss": 0.9491860866546631, "step": 1242 }, { "epoch": 2.2783882783882783, "grad_norm": 0.1533634215593338, "learning_rate": 2.3054543912405896e-06, "loss": 1.1562466621398926, "step": 1244 }, { "epoch": 2.282051282051282, "grad_norm": 0.12872643768787384, "learning_rate": 2.2929393090196663e-06, "loss": 0.7593182921409607, "step": 1246 }, { "epoch": 2.2857142857142856, "grad_norm": 0.25250881910324097, "learning_rate": 2.2804744339236796e-06, "loss": 0.7431901097297668, "step": 1248 }, { "epoch": 2.2893772893772892, "grad_norm": 0.1763988882303238, "learning_rate": 2.268059961092541e-06, "loss": 1.127759575843811, "step": 1250 }, { "epoch": 2.293040293040293, "grad_norm": 0.2666459381580353, "learning_rate": 2.255696084877107e-06, "loss": 0.8839851021766663, "step": 1252 }, { "epoch": 2.2967032967032965, "grad_norm": 0.18553560972213745, "learning_rate": 2.2433829988361316e-06, "loss": 1.2005871534347534, "step": 1254 }, { "epoch": 2.3003663003663, "grad_norm": 0.20974372327327728, "learning_rate": 2.231120895733245e-06, "loss": 1.2160831689834595, "step": 1256 }, { "epoch": 2.304029304029304, "grad_norm": 0.27016669511795044, "learning_rate": 2.2189099675339233e-06, "loss": 0.8103601336479187, "step": 1258 }, { "epoch": 2.3076923076923075, "grad_norm": 0.2763507664203644, "learning_rate": 2.206750405402493e-06, "loss": 1.232648491859436, "step": 1260 }, { "epoch": 2.311355311355311, "grad_norm": 0.20276162028312683, "learning_rate": 2.194642399699138e-06, "loss": 1.0822112560272217, "step": 1262 }, { "epoch": 2.315018315018315, "grad_norm": 0.1820443570613861, "learning_rate": 2.1825861399769126e-06, "loss": 0.9380193948745728, "step": 1264 }, { "epoch": 2.3186813186813184, "grad_norm": 0.20645156502723694, "learning_rate": 2.17058181497878e-06, "loss": 0.8565780520439148, "step": 1266 }, { "epoch": 2.3223443223443225, "grad_norm": 0.30140256881713867, "learning_rate": 2.1586296126346566e-06, "loss": 0.8535648584365845, "step": 1268 }, { "epoch": 2.326007326007326, "grad_norm": 0.27577510476112366, "learning_rate": 2.1467297200584677e-06, "loss": 1.2173646688461304, "step": 1270 }, { "epoch": 2.32967032967033, "grad_norm": 0.1859835982322693, "learning_rate": 2.134882323545221e-06, "loss": 1.0475445985794067, "step": 1272 }, { "epoch": 2.3333333333333335, "grad_norm": 0.5028762817382812, "learning_rate": 2.123087608568088e-06, "loss": 0.7030253410339355, "step": 1274 }, { "epoch": 2.336996336996337, "grad_norm": 0.17414085566997528, "learning_rate": 2.1113457597754977e-06, "loss": 1.058994174003601, "step": 1276 }, { "epoch": 2.340659340659341, "grad_norm": 0.195421501994133, "learning_rate": 2.0996569609882555e-06, "loss": 0.8695497512817383, "step": 1278 }, { "epoch": 2.3443223443223444, "grad_norm": 0.1678563356399536, "learning_rate": 2.0880213951966564e-06, "loss": 0.7928240299224854, "step": 1280 }, { "epoch": 2.347985347985348, "grad_norm": 0.15970492362976074, "learning_rate": 2.076439244557622e-06, "loss": 0.6427817344665527, "step": 1282 }, { "epoch": 2.3516483516483517, "grad_norm": 0.15121600031852722, "learning_rate": 2.064910690391849e-06, "loss": 1.1278434991836548, "step": 1284 }, { "epoch": 2.3553113553113554, "grad_norm": 2.746044397354126, "learning_rate": 2.053435913180976e-06, "loss": 0.6882444024085999, "step": 1286 }, { "epoch": 2.358974358974359, "grad_norm": 0.14493419229984283, "learning_rate": 2.0420150925647476e-06, "loss": 0.9737670421600342, "step": 1288 }, { "epoch": 2.3626373626373627, "grad_norm": 0.1830594837665558, "learning_rate": 2.0306484073382144e-06, "loss": 0.9390268325805664, "step": 1290 }, { "epoch": 2.3663003663003663, "grad_norm": 0.17552392184734344, "learning_rate": 2.019336035448922e-06, "loss": 0.8130999207496643, "step": 1292 }, { "epoch": 2.36996336996337, "grad_norm": 0.4816751182079315, "learning_rate": 2.008078153994131e-06, "loss": 0.9279530644416809, "step": 1294 }, { "epoch": 2.3736263736263736, "grad_norm": 0.2578529119491577, "learning_rate": 1.99687493921805e-06, "loss": 1.3056340217590332, "step": 1296 }, { "epoch": 2.3772893772893773, "grad_norm": 0.24960176646709442, "learning_rate": 1.9857265665090637e-06, "loss": 1.138514757156372, "step": 1298 }, { "epoch": 2.380952380952381, "grad_norm": 0.20973335206508636, "learning_rate": 1.9746332103969994e-06, "loss": 1.196106195449829, "step": 1300 }, { "epoch": 2.3846153846153846, "grad_norm": 0.4483489990234375, "learning_rate": 1.9635950445503867e-06, "loss": 0.952997624874115, "step": 1302 }, { "epoch": 2.3882783882783882, "grad_norm": 0.9477534890174866, "learning_rate": 1.9526122417737396e-06, "loss": 0.5085421204566956, "step": 1304 }, { "epoch": 2.391941391941392, "grad_norm": 0.17980064451694489, "learning_rate": 1.941684974004857e-06, "loss": 0.9798279404640198, "step": 1306 }, { "epoch": 2.3956043956043955, "grad_norm": 0.21208752691745758, "learning_rate": 1.930813412312129e-06, "loss": 1.1446267366409302, "step": 1308 }, { "epoch": 2.399267399267399, "grad_norm": 0.14319059252738953, "learning_rate": 1.919997726891847e-06, "loss": 0.5433471202850342, "step": 1310 }, { "epoch": 2.402930402930403, "grad_norm": 0.25561878085136414, "learning_rate": 1.909238087065559e-06, "loss": 1.1503570079803467, "step": 1312 }, { "epoch": 2.4065934065934065, "grad_norm": 0.13398070633411407, "learning_rate": 1.8985346612774058e-06, "loss": 0.8720892667770386, "step": 1314 }, { "epoch": 2.41025641025641, "grad_norm": 0.1498894989490509, "learning_rate": 1.8878876170914862e-06, "loss": 1.14559006690979, "step": 1316 }, { "epoch": 2.413919413919414, "grad_norm": 0.3363962769508362, "learning_rate": 1.877297121189233e-06, "loss": 0.8333287239074707, "step": 1318 }, { "epoch": 2.4175824175824174, "grad_norm": 0.15025848150253296, "learning_rate": 1.8667633393668097e-06, "loss": 0.8138965368270874, "step": 1320 }, { "epoch": 2.421245421245421, "grad_norm": 0.21664276719093323, "learning_rate": 1.856286436532506e-06, "loss": 0.689363420009613, "step": 1322 }, { "epoch": 2.4249084249084247, "grad_norm": 1.3246759176254272, "learning_rate": 1.845866576704165e-06, "loss": 0.7871432900428772, "step": 1324 }, { "epoch": 2.4285714285714284, "grad_norm": 0.38431447744369507, "learning_rate": 1.8355039230066068e-06, "loss": 0.7976049184799194, "step": 1326 }, { "epoch": 2.4322344322344325, "grad_norm": 0.07272528856992722, "learning_rate": 1.8251986376690806e-06, "loss": 0.734397292137146, "step": 1328 }, { "epoch": 2.435897435897436, "grad_norm": 0.23010677099227905, "learning_rate": 1.8149508820227258e-06, "loss": 0.8264967799186707, "step": 1330 }, { "epoch": 2.4395604395604398, "grad_norm": 0.2325713038444519, "learning_rate": 1.8047608164980393e-06, "loss": 1.1099257469177246, "step": 1332 }, { "epoch": 2.4432234432234434, "grad_norm": 0.2141243815422058, "learning_rate": 1.7946286006223728e-06, "loss": 0.7992602586746216, "step": 1334 }, { "epoch": 2.446886446886447, "grad_norm": 0.3261476755142212, "learning_rate": 1.7845543930174288e-06, "loss": 0.7330154776573181, "step": 1336 }, { "epoch": 2.4505494505494507, "grad_norm": 0.19834889471530914, "learning_rate": 1.7745383513967784e-06, "loss": 1.0567998886108398, "step": 1338 }, { "epoch": 2.4542124542124544, "grad_norm": 0.13338837027549744, "learning_rate": 1.7645806325633975e-06, "loss": 0.9307959675788879, "step": 1340 }, { "epoch": 2.457875457875458, "grad_norm": 0.0941123366355896, "learning_rate": 1.7546813924072064e-06, "loss": 0.7225639820098877, "step": 1342 }, { "epoch": 2.4615384615384617, "grad_norm": 0.11015522480010986, "learning_rate": 1.7448407859026267e-06, "loss": 0.8351444602012634, "step": 1344 }, { "epoch": 2.4652014652014653, "grad_norm": 0.16074956953525543, "learning_rate": 1.7350589671061657e-06, "loss": 1.1353893280029297, "step": 1346 }, { "epoch": 2.468864468864469, "grad_norm": 0.21541282534599304, "learning_rate": 1.7253360891539963e-06, "loss": 1.1350133419036865, "step": 1348 }, { "epoch": 2.4725274725274726, "grad_norm": 0.18318095803260803, "learning_rate": 1.7156723042595602e-06, "loss": 0.7882329821586609, "step": 1350 }, { "epoch": 2.4761904761904763, "grad_norm": 0.20827817916870117, "learning_rate": 1.7060677637111863e-06, "loss": 0.9048058390617371, "step": 1352 }, { "epoch": 2.47985347985348, "grad_norm": 0.3399142622947693, "learning_rate": 1.6965226178697237e-06, "loss": 0.988274335861206, "step": 1354 }, { "epoch": 2.4835164835164836, "grad_norm": 0.03409822657704353, "learning_rate": 1.6870370161661852e-06, "loss": 0.9388930201530457, "step": 1356 }, { "epoch": 2.4871794871794872, "grad_norm": 0.11549941450357437, "learning_rate": 1.6776111070994129e-06, "loss": 1.1141780614852905, "step": 1358 }, { "epoch": 2.490842490842491, "grad_norm": 0.21529677510261536, "learning_rate": 1.6682450382337445e-06, "loss": 0.9177558422088623, "step": 1360 }, { "epoch": 2.4945054945054945, "grad_norm": 0.21112927794456482, "learning_rate": 1.65893895619671e-06, "loss": 0.8482896685600281, "step": 1362 }, { "epoch": 2.498168498168498, "grad_norm": 0.3684331476688385, "learning_rate": 1.6496930066767381e-06, "loss": 0.8899385333061218, "step": 1364 }, { "epoch": 2.501831501831502, "grad_norm": 0.5180490016937256, "learning_rate": 1.6405073344208652e-06, "loss": 1.1375821828842163, "step": 1366 }, { "epoch": 2.5054945054945055, "grad_norm": 0.14490839838981628, "learning_rate": 1.6313820832324833e-06, "loss": 0.8489875793457031, "step": 1368 }, { "epoch": 2.509157509157509, "grad_norm": 0.26114216446876526, "learning_rate": 1.6223173959690766e-06, "loss": 1.0175533294677734, "step": 1370 }, { "epoch": 2.5128205128205128, "grad_norm": 0.07394483685493469, "learning_rate": 1.6133134145399895e-06, "loss": 0.679277777671814, "step": 1372 }, { "epoch": 2.5164835164835164, "grad_norm": 0.22844818234443665, "learning_rate": 1.6043702799042097e-06, "loss": 0.8118609189987183, "step": 1374 }, { "epoch": 2.52014652014652, "grad_norm": 0.946811854839325, "learning_rate": 1.5954881320681541e-06, "loss": 0.9923216700553894, "step": 1376 }, { "epoch": 2.5238095238095237, "grad_norm": 0.46443161368370056, "learning_rate": 1.586667110083481e-06, "loss": 0.8106738924980164, "step": 1378 }, { "epoch": 2.5274725274725274, "grad_norm": 0.1713973730802536, "learning_rate": 1.5779073520449115e-06, "loss": 0.9600465893745422, "step": 1380 }, { "epoch": 2.531135531135531, "grad_norm": 0.023757750168442726, "learning_rate": 1.5692089950880671e-06, "loss": 0.9061873555183411, "step": 1382 }, { "epoch": 2.5347985347985347, "grad_norm": 0.13470512628555298, "learning_rate": 1.5605721753873273e-06, "loss": 0.8136062622070312, "step": 1384 }, { "epoch": 2.5384615384615383, "grad_norm": 0.6172438859939575, "learning_rate": 1.5519970281536947e-06, "loss": 1.1290100812911987, "step": 1386 }, { "epoch": 2.542124542124542, "grad_norm": 0.229129359126091, "learning_rate": 1.5434836876326723e-06, "loss": 0.7960153222084045, "step": 1388 }, { "epoch": 2.5457875457875456, "grad_norm": 0.23978465795516968, "learning_rate": 1.5350322871021738e-06, "loss": 0.8506826162338257, "step": 1390 }, { "epoch": 2.5494505494505493, "grad_norm": 0.4824867844581604, "learning_rate": 1.5266429588704294e-06, "loss": 1.025938868522644, "step": 1392 }, { "epoch": 2.553113553113553, "grad_norm": 0.18570579588413239, "learning_rate": 1.518315834273915e-06, "loss": 0.7308077216148376, "step": 1394 }, { "epoch": 2.5567765567765566, "grad_norm": 0.11341089010238647, "learning_rate": 1.510051043675297e-06, "loss": 0.37588629126548767, "step": 1396 }, { "epoch": 2.5604395604395602, "grad_norm": 0.19933566451072693, "learning_rate": 1.5018487164613931e-06, "loss": 1.1432240009307861, "step": 1398 }, { "epoch": 2.564102564102564, "grad_norm": 0.2133670151233673, "learning_rate": 1.4937089810411428e-06, "loss": 1.141809105873108, "step": 1400 }, { "epoch": 2.5677655677655675, "grad_norm": 0.3012371361255646, "learning_rate": 1.4856319648436034e-06, "loss": 0.9912227988243103, "step": 1402 }, { "epoch": 2.571428571428571, "grad_norm": 0.16756081581115723, "learning_rate": 1.4776177943159484e-06, "loss": 1.1359539031982422, "step": 1404 }, { "epoch": 2.575091575091575, "grad_norm": 0.9049347043037415, "learning_rate": 1.4696665949214889e-06, "loss": 0.5541988611221313, "step": 1406 }, { "epoch": 2.578754578754579, "grad_norm": 0.1514206826686859, "learning_rate": 1.4617784911377158e-06, "loss": 1.2034826278686523, "step": 1408 }, { "epoch": 2.5824175824175826, "grad_norm": 0.5452237129211426, "learning_rate": 1.4539536064543453e-06, "loss": 0.9588869214057922, "step": 1410 }, { "epoch": 2.586080586080586, "grad_norm": 0.43010222911834717, "learning_rate": 1.446192063371385e-06, "loss": 0.90684974193573, "step": 1412 }, { "epoch": 2.58974358974359, "grad_norm": 0.2458840310573578, "learning_rate": 1.4384939833972197e-06, "loss": 1.0172938108444214, "step": 1414 }, { "epoch": 2.5934065934065935, "grad_norm": 0.1467057466506958, "learning_rate": 1.4308594870467056e-06, "loss": 1.1102759838104248, "step": 1416 }, { "epoch": 2.597069597069597, "grad_norm": 0.6239453554153442, "learning_rate": 1.4232886938392893e-06, "loss": 0.8101827502250671, "step": 1418 }, { "epoch": 2.600732600732601, "grad_norm": 0.18740800023078918, "learning_rate": 1.4157817222971312e-06, "loss": 1.1065106391906738, "step": 1420 }, { "epoch": 2.6043956043956045, "grad_norm": 0.5177209377288818, "learning_rate": 1.4083386899432489e-06, "loss": 1.074950933456421, "step": 1422 }, { "epoch": 2.608058608058608, "grad_norm": 0.18076905608177185, "learning_rate": 1.4009597132996842e-06, "loss": 1.2177599668502808, "step": 1424 }, { "epoch": 2.6117216117216118, "grad_norm": 0.7345294952392578, "learning_rate": 1.393644907885674e-06, "loss": 1.3366779088974, "step": 1426 }, { "epoch": 2.6153846153846154, "grad_norm": 0.14318975806236267, "learning_rate": 1.3863943882158417e-06, "loss": 1.1753196716308594, "step": 1428 }, { "epoch": 2.619047619047619, "grad_norm": 0.1478182077407837, "learning_rate": 1.379208267798406e-06, "loss": 1.1063532829284668, "step": 1430 }, { "epoch": 2.6227106227106227, "grad_norm": 0.274620920419693, "learning_rate": 1.3720866591334045e-06, "loss": 1.0099287033081055, "step": 1432 }, { "epoch": 2.6263736263736264, "grad_norm": 0.365405797958374, "learning_rate": 1.3650296737109292e-06, "loss": 1.0578190088272095, "step": 1434 }, { "epoch": 2.63003663003663, "grad_norm": 0.21154209971427917, "learning_rate": 1.3580374220093868e-06, "loss": 1.346867322921753, "step": 1436 }, { "epoch": 2.6336996336996337, "grad_norm": 0.19875630736351013, "learning_rate": 1.3511100134937625e-06, "loss": 1.1731492280960083, "step": 1438 }, { "epoch": 2.6373626373626373, "grad_norm": 0.3672538101673126, "learning_rate": 1.3442475566139093e-06, "loss": 1.13294517993927, "step": 1440 }, { "epoch": 2.641025641025641, "grad_norm": 0.3166159689426422, "learning_rate": 1.3374501588028546e-06, "loss": 1.1464821100234985, "step": 1442 }, { "epoch": 2.6446886446886446, "grad_norm": 0.28593555092811584, "learning_rate": 1.3307179264751082e-06, "loss": 1.1436622142791748, "step": 1444 }, { "epoch": 2.6483516483516483, "grad_norm": 0.3862296938896179, "learning_rate": 1.3240509650250083e-06, "loss": 0.8166991472244263, "step": 1446 }, { "epoch": 2.652014652014652, "grad_norm": 0.17282630503177643, "learning_rate": 1.3174493788250605e-06, "loss": 0.8451816439628601, "step": 1448 }, { "epoch": 2.6556776556776556, "grad_norm": 0.22388476133346558, "learning_rate": 1.3109132712243117e-06, "loss": 1.1225379705429077, "step": 1450 }, { "epoch": 2.659340659340659, "grad_norm": 0.15126027166843414, "learning_rate": 1.3044427445467276e-06, "loss": 0.7850918769836426, "step": 1452 }, { "epoch": 2.663003663003663, "grad_norm": 0.05748463794589043, "learning_rate": 1.2980379000895946e-06, "loss": 0.7346314191818237, "step": 1454 }, { "epoch": 2.6666666666666665, "grad_norm": 0.4359929859638214, "learning_rate": 1.2916988381219303e-06, "loss": 1.1165975332260132, "step": 1456 }, { "epoch": 2.67032967032967, "grad_norm": 0.20032697916030884, "learning_rate": 1.2854256578829148e-06, "loss": 0.7857989072799683, "step": 1458 }, { "epoch": 2.6739926739926743, "grad_norm": 0.06527489423751831, "learning_rate": 1.2792184575803392e-06, "loss": 0.8251097798347473, "step": 1460 }, { "epoch": 2.677655677655678, "grad_norm": 0.28165748715400696, "learning_rate": 1.2730773343890662e-06, "loss": 0.8670933842658997, "step": 1462 }, { "epoch": 2.6813186813186816, "grad_norm": 0.3216964602470398, "learning_rate": 1.2670023844495071e-06, "loss": 1.1086490154266357, "step": 1464 }, { "epoch": 2.684981684981685, "grad_norm": 0.1745329648256302, "learning_rate": 1.2609937028661226e-06, "loss": 0.908940315246582, "step": 1466 }, { "epoch": 2.688644688644689, "grad_norm": 0.12318509072065353, "learning_rate": 1.2550513837059261e-06, "loss": 1.0815136432647705, "step": 1468 }, { "epoch": 2.6923076923076925, "grad_norm": 0.37340617179870605, "learning_rate": 1.2491755199970188e-06, "loss": 0.6923399567604065, "step": 1470 }, { "epoch": 2.695970695970696, "grad_norm": 0.20242176949977875, "learning_rate": 1.2433662037271263e-06, "loss": 0.8187569379806519, "step": 1472 }, { "epoch": 2.6996336996337, "grad_norm": 0.6501942873001099, "learning_rate": 1.2376235258421628e-06, "loss": 0.4654901623725891, "step": 1474 }, { "epoch": 2.7032967032967035, "grad_norm": 0.21495883166790009, "learning_rate": 1.2319475762448084e-06, "loss": 1.17780339717865, "step": 1476 }, { "epoch": 2.706959706959707, "grad_norm": 0.34030434489250183, "learning_rate": 1.2263384437930969e-06, "loss": 0.7136227488517761, "step": 1478 }, { "epoch": 2.7106227106227108, "grad_norm": 0.8260899782180786, "learning_rate": 1.2207962162990287e-06, "loss": 1.1193125247955322, "step": 1480 }, { "epoch": 2.7142857142857144, "grad_norm": 0.217088520526886, "learning_rate": 1.2153209805271943e-06, "loss": 1.132580280303955, "step": 1482 }, { "epoch": 2.717948717948718, "grad_norm": 0.6372915506362915, "learning_rate": 1.2099128221934164e-06, "loss": 1.0393377542495728, "step": 1484 }, { "epoch": 2.7216117216117217, "grad_norm": 0.14800269901752472, "learning_rate": 1.2045718259634083e-06, "loss": 1.1727163791656494, "step": 1486 }, { "epoch": 2.7252747252747254, "grad_norm": 0.1804278939962387, "learning_rate": 1.1992980754514497e-06, "loss": 1.1531107425689697, "step": 1488 }, { "epoch": 2.728937728937729, "grad_norm": 0.4734005331993103, "learning_rate": 1.1940916532190739e-06, "loss": 0.5333794951438904, "step": 1490 }, { "epoch": 2.7326007326007327, "grad_norm": 0.24780096113681793, "learning_rate": 1.1889526407737776e-06, "loss": 1.1573615074157715, "step": 1492 }, { "epoch": 2.7362637362637363, "grad_norm": 0.2443196028470993, "learning_rate": 1.1838811185677466e-06, "loss": 0.6827471256256104, "step": 1494 }, { "epoch": 2.73992673992674, "grad_norm": 0.5809857249259949, "learning_rate": 1.1788771659965935e-06, "loss": 1.2393468618392944, "step": 1496 }, { "epoch": 2.7435897435897436, "grad_norm": 0.4661528170108795, "learning_rate": 1.173940861398117e-06, "loss": 1.1121079921722412, "step": 1498 }, { "epoch": 2.7472527472527473, "grad_norm": 0.27154994010925293, "learning_rate": 1.1690722820510723e-06, "loss": 0.7914168834686279, "step": 1500 }, { "epoch": 2.750915750915751, "grad_norm": 0.18499144911766052, "learning_rate": 1.164271504173964e-06, "loss": 1.0800108909606934, "step": 1502 }, { "epoch": 2.7545787545787546, "grad_norm": 0.37535104155540466, "learning_rate": 1.159538602923855e-06, "loss": 1.1396592855453491, "step": 1504 }, { "epoch": 2.758241758241758, "grad_norm": 0.31983864307403564, "learning_rate": 1.1548736523951822e-06, "loss": 1.1717373132705688, "step": 1506 }, { "epoch": 2.761904761904762, "grad_norm": 0.04418055713176727, "learning_rate": 1.1502767256186053e-06, "loss": 0.9536030292510986, "step": 1508 }, { "epoch": 2.7655677655677655, "grad_norm": 0.13261856138706207, "learning_rate": 1.1457478945598591e-06, "loss": 1.0200964212417603, "step": 1510 }, { "epoch": 2.769230769230769, "grad_norm": 0.29484888911247253, "learning_rate": 1.1412872301186253e-06, "loss": 0.9747733473777771, "step": 1512 }, { "epoch": 2.772893772893773, "grad_norm": 0.1159660741686821, "learning_rate": 1.1368948021274269e-06, "loss": 1.116559624671936, "step": 1514 }, { "epoch": 2.7765567765567765, "grad_norm": 0.3250535726547241, "learning_rate": 1.1325706793505317e-06, "loss": 1.064975380897522, "step": 1516 }, { "epoch": 2.78021978021978, "grad_norm": 0.18949034810066223, "learning_rate": 1.1283149294828773e-06, "loss": 1.0048205852508545, "step": 1518 }, { "epoch": 2.7838827838827838, "grad_norm": 0.39678439497947693, "learning_rate": 1.1241276191490097e-06, "loss": 0.5427751541137695, "step": 1520 }, { "epoch": 2.7875457875457874, "grad_norm": 1.3462748527526855, "learning_rate": 1.120008813902044e-06, "loss": 0.7995284199714661, "step": 1522 }, { "epoch": 2.791208791208791, "grad_norm": 0.15732638537883759, "learning_rate": 1.1159585782226325e-06, "loss": 0.8446041345596313, "step": 1524 }, { "epoch": 2.7948717948717947, "grad_norm": 0.2313084453344345, "learning_rate": 1.1119769755179595e-06, "loss": 1.1773189306259155, "step": 1526 }, { "epoch": 2.7985347985347984, "grad_norm": 0.14309756457805634, "learning_rate": 1.1080640681207485e-06, "loss": 1.1459267139434814, "step": 1528 }, { "epoch": 2.802197802197802, "grad_norm": 0.1798963099718094, "learning_rate": 1.104219917288284e-06, "loss": 1.1224641799926758, "step": 1530 }, { "epoch": 2.8058608058608057, "grad_norm": 0.24613995850086212, "learning_rate": 1.100444583201454e-06, "loss": 1.016000509262085, "step": 1532 }, { "epoch": 2.8095238095238093, "grad_norm": 0.178895965218544, "learning_rate": 1.0967381249638085e-06, "loss": 0.7900265455245972, "step": 1534 }, { "epoch": 2.813186813186813, "grad_norm": 0.2273297756910324, "learning_rate": 1.0931006006006324e-06, "loss": 1.347412109375, "step": 1536 }, { "epoch": 2.8168498168498166, "grad_norm": 0.21277707815170288, "learning_rate": 1.089532067058039e-06, "loss": 0.9508707523345947, "step": 1538 }, { "epoch": 2.8205128205128203, "grad_norm": 0.19118960201740265, "learning_rate": 1.0860325802020772e-06, "loss": 0.8098848462104797, "step": 1540 }, { "epoch": 2.824175824175824, "grad_norm": 0.14161139726638794, "learning_rate": 1.0826021948178566e-06, "loss": 0.9036679863929749, "step": 1542 }, { "epoch": 2.8278388278388276, "grad_norm": 0.1456916779279709, "learning_rate": 1.0792409646086922e-06, "loss": 1.1096038818359375, "step": 1544 }, { "epoch": 2.8315018315018317, "grad_norm": 0.5517901182174683, "learning_rate": 1.0759489421952602e-06, "loss": 1.1584891080856323, "step": 1546 }, { "epoch": 2.8351648351648353, "grad_norm": 0.11882911622524261, "learning_rate": 1.0727261791147784e-06, "loss": 1.2254421710968018, "step": 1548 }, { "epoch": 2.838827838827839, "grad_norm": 0.23024114966392517, "learning_rate": 1.0695727258201938e-06, "loss": 0.8998859524726868, "step": 1550 }, { "epoch": 2.8424908424908426, "grad_norm": 0.07301481068134308, "learning_rate": 1.0664886316793988e-06, "loss": 0.6015828847885132, "step": 1552 }, { "epoch": 2.8461538461538463, "grad_norm": 0.12812356650829315, "learning_rate": 1.0634739449744534e-06, "loss": 1.156007170677185, "step": 1554 }, { "epoch": 2.84981684981685, "grad_norm": 0.1785007119178772, "learning_rate": 1.0605287129008337e-06, "loss": 1.0002185106277466, "step": 1556 }, { "epoch": 2.8534798534798536, "grad_norm": 0.5185611844062805, "learning_rate": 1.0576529815666892e-06, "loss": 1.142732858657837, "step": 1558 }, { "epoch": 2.857142857142857, "grad_norm": 0.2965824007987976, "learning_rate": 1.0548467959921217e-06, "loss": 0.6404973864555359, "step": 1560 }, { "epoch": 2.860805860805861, "grad_norm": 0.1833876669406891, "learning_rate": 1.0521102001084835e-06, "loss": 0.7605476975440979, "step": 1562 }, { "epoch": 2.8644688644688645, "grad_norm": 0.5239128470420837, "learning_rate": 1.0494432367576862e-06, "loss": 0.9357516169548035, "step": 1564 }, { "epoch": 2.868131868131868, "grad_norm": 0.3669067621231079, "learning_rate": 1.0468459476915317e-06, "loss": 0.7723519206047058, "step": 1566 }, { "epoch": 2.871794871794872, "grad_norm": 0.40440791845321655, "learning_rate": 1.044318373571057e-06, "loss": 0.735063374042511, "step": 1568 }, { "epoch": 2.8754578754578755, "grad_norm": 0.09417320787906647, "learning_rate": 1.0418605539659014e-06, "loss": 0.7597877979278564, "step": 1570 }, { "epoch": 2.879120879120879, "grad_norm": 0.40769991278648376, "learning_rate": 1.0394725273536817e-06, "loss": 0.8062982559204102, "step": 1572 }, { "epoch": 2.8827838827838828, "grad_norm": 0.2923339307308197, "learning_rate": 1.0371543311193944e-06, "loss": 1.0577229261398315, "step": 1574 }, { "epoch": 2.8864468864468864, "grad_norm": 0.3852575719356537, "learning_rate": 1.034906001554827e-06, "loss": 1.0765886306762695, "step": 1576 }, { "epoch": 2.89010989010989, "grad_norm": 0.12469828873872757, "learning_rate": 1.0327275738579934e-06, "loss": 0.9185457229614258, "step": 1578 }, { "epoch": 2.8937728937728937, "grad_norm": 0.3026789724826813, "learning_rate": 1.0306190821325792e-06, "loss": 1.2011407613754272, "step": 1580 }, { "epoch": 2.8974358974358974, "grad_norm": 0.2271515429019928, "learning_rate": 1.0285805593874105e-06, "loss": 0.8856844305992126, "step": 1582 }, { "epoch": 2.901098901098901, "grad_norm": 0.11647852510213852, "learning_rate": 1.026612037535935e-06, "loss": 0.8170561790466309, "step": 1584 }, { "epoch": 2.9047619047619047, "grad_norm": 0.12595300376415253, "learning_rate": 1.0247135473957253e-06, "loss": 0.8300210237503052, "step": 1586 }, { "epoch": 2.9084249084249083, "grad_norm": 0.21114195883274078, "learning_rate": 1.0228851186879932e-06, "loss": 1.1618390083312988, "step": 1588 }, { "epoch": 2.912087912087912, "grad_norm": 0.2155926376581192, "learning_rate": 1.0211267800371263e-06, "loss": 1.2564477920532227, "step": 1590 }, { "epoch": 2.9157509157509156, "grad_norm": 0.40559911727905273, "learning_rate": 1.01943855897024e-06, "loss": 0.628135085105896, "step": 1592 }, { "epoch": 2.9194139194139193, "grad_norm": 0.22793929278850555, "learning_rate": 1.0178204819167451e-06, "loss": 1.1933345794677734, "step": 1594 }, { "epoch": 2.9230769230769234, "grad_norm": 0.30155590176582336, "learning_rate": 1.0162725742079355e-06, "loss": 0.804075300693512, "step": 1596 }, { "epoch": 2.926739926739927, "grad_norm": 0.4184918999671936, "learning_rate": 1.0147948600765919e-06, "loss": 1.19660484790802, "step": 1598 }, { "epoch": 2.9304029304029307, "grad_norm": 0.1114964559674263, "learning_rate": 1.0133873626565994e-06, "loss": 0.8411705493927002, "step": 1600 }, { "epoch": 2.9340659340659343, "grad_norm": 0.19453909993171692, "learning_rate": 1.0120501039825902e-06, "loss": 1.1576671600341797, "step": 1602 }, { "epoch": 2.937728937728938, "grad_norm": 0.06379074603319168, "learning_rate": 1.0107831049895937e-06, "loss": 0.9160769581794739, "step": 1604 }, { "epoch": 2.9413919413919416, "grad_norm": 0.19216328859329224, "learning_rate": 1.009586385512713e-06, "loss": 1.1501901149749756, "step": 1606 }, { "epoch": 2.9450549450549453, "grad_norm": 1.06039297580719, "learning_rate": 1.0084599642868117e-06, "loss": 0.7293557524681091, "step": 1608 }, { "epoch": 2.948717948717949, "grad_norm": 0.30301105976104736, "learning_rate": 1.0074038589462206e-06, "loss": 1.062568187713623, "step": 1610 }, { "epoch": 2.9523809523809526, "grad_norm": 0.22648635506629944, "learning_rate": 1.0064180860244631e-06, "loss": 0.9736372828483582, "step": 1612 }, { "epoch": 2.956043956043956, "grad_norm": 0.6904452443122864, "learning_rate": 1.0055026609539963e-06, "loss": 0.7423111796379089, "step": 1614 }, { "epoch": 2.95970695970696, "grad_norm": 0.1181085854768753, "learning_rate": 1.004657598065967e-06, "loss": 0.8746036291122437, "step": 1616 }, { "epoch": 2.9633699633699635, "grad_norm": 0.25594648718833923, "learning_rate": 1.0038829105899911e-06, "loss": 1.4269702434539795, "step": 1618 }, { "epoch": 2.967032967032967, "grad_norm": 0.6465041041374207, "learning_rate": 1.0031786106539428e-06, "loss": 1.2628575563430786, "step": 1620 }, { "epoch": 2.970695970695971, "grad_norm": 0.17348702251911163, "learning_rate": 1.0025447092837677e-06, "loss": 0.964820921421051, "step": 1622 }, { "epoch": 2.9743589743589745, "grad_norm": 3.296407699584961, "learning_rate": 1.0019812164033077e-06, "loss": 0.7985995411872864, "step": 1624 }, { "epoch": 2.978021978021978, "grad_norm": 0.11664870381355286, "learning_rate": 1.0014881408341481e-06, "loss": 0.9173464775085449, "step": 1626 }, { "epoch": 2.9816849816849818, "grad_norm": 0.10260229557752609, "learning_rate": 1.0010654902954773e-06, "loss": 0.9848383069038391, "step": 1628 }, { "epoch": 2.9853479853479854, "grad_norm": 0.200631782412529, "learning_rate": 1.0007132714039676e-06, "loss": 1.4417872428894043, "step": 1630 }, { "epoch": 2.989010989010989, "grad_norm": 0.32539039850234985, "learning_rate": 1.0004314896736694e-06, "loss": 1.0627717971801758, "step": 1632 }, { "epoch": 2.9926739926739927, "grad_norm": 0.17502747476100922, "learning_rate": 1.0002201495159287e-06, "loss": 0.8705639839172363, "step": 1634 }, { "epoch": 2.9963369963369964, "grad_norm": 0.19851884245872498, "learning_rate": 1.0000792542393144e-06, "loss": 1.3285937309265137, "step": 1636 }, { "epoch": 3.0, "grad_norm": 0.10878675431013107, "learning_rate": 1.0000088060495672e-06, "loss": 1.0932306051254272, "step": 1638 }, { "epoch": 3.0, "step": 1638, "total_flos": 8.4482141520606e+18, "train_loss": 1.0599846049178943, "train_runtime": 55254.3839, "train_samples_per_second": 0.711, "train_steps_per_second": 0.03 } ], "logging_steps": 2, "max_steps": 1638, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 99999, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": false, "should_training_stop": false }, "attributes": {} } }, "total_flos": 8.4482141520606e+18, "train_batch_size": 1, "trial_name": null, "trial_params": null }