{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.244705882352941, "eval_steps": 500, "global_step": 240, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.009411764705882352, "grad_norm": 0.8085687756538391, "learning_rate": 0.0, "loss": 0.4256, "step": 1 }, { "epoch": 0.018823529411764704, "grad_norm": 1.3872617483139038, "learning_rate": 4e-05, "loss": 0.7559, "step": 2 }, { "epoch": 0.02823529411764706, "grad_norm": 1.3308924436569214, "learning_rate": 8e-05, "loss": 0.5674, "step": 3 }, { "epoch": 0.03764705882352941, "grad_norm": 0.9933507442474365, "learning_rate": 0.00012, "loss": 0.5053, "step": 4 }, { "epoch": 0.047058823529411764, "grad_norm": 0.5066264271736145, "learning_rate": 0.00016, "loss": 0.4117, "step": 5 }, { "epoch": 0.05647058823529412, "grad_norm": 0.3941880464553833, "learning_rate": 0.0002, "loss": 0.3627, "step": 6 }, { "epoch": 0.06588235294117648, "grad_norm": 0.3703334629535675, "learning_rate": 0.0001991489361702128, "loss": 0.2119, "step": 7 }, { "epoch": 0.07529411764705882, "grad_norm": 0.25386443734169006, "learning_rate": 0.00019829787234042554, "loss": 0.26, "step": 8 }, { "epoch": 0.08470588235294117, "grad_norm": 0.18589483201503754, "learning_rate": 0.00019744680851063832, "loss": 0.2611, "step": 9 }, { "epoch": 0.09411764705882353, "grad_norm": 0.3556481897830963, "learning_rate": 0.00019659574468085107, "loss": 0.3863, "step": 10 }, { "epoch": 0.10352941176470588, "grad_norm": 0.18439431488513947, "learning_rate": 0.00019574468085106384, "loss": 0.2365, "step": 11 }, { "epoch": 0.11294117647058824, "grad_norm": 0.17448313534259796, "learning_rate": 0.0001948936170212766, "loss": 0.2748, "step": 12 }, { "epoch": 0.1223529411764706, "grad_norm": 0.23789159953594208, "learning_rate": 0.00019404255319148937, "loss": 0.2422, "step": 13 }, { "epoch": 0.13176470588235295, "grad_norm": 0.19852899014949799, "learning_rate": 0.00019319148936170212, "loss": 0.2522, "step": 14 }, { "epoch": 0.1411764705882353, "grad_norm": 0.4207107424736023, "learning_rate": 0.0001923404255319149, "loss": 0.2644, "step": 15 }, { "epoch": 0.15058823529411763, "grad_norm": 0.30403465032577515, "learning_rate": 0.00019148936170212768, "loss": 0.2582, "step": 16 }, { "epoch": 0.16, "grad_norm": 0.21143439412117004, "learning_rate": 0.00019063829787234045, "loss": 0.1965, "step": 17 }, { "epoch": 0.16941176470588235, "grad_norm": 0.3473486304283142, "learning_rate": 0.0001897872340425532, "loss": 0.2762, "step": 18 }, { "epoch": 0.17882352941176471, "grad_norm": 0.20471632480621338, "learning_rate": 0.00018893617021276598, "loss": 0.2573, "step": 19 }, { "epoch": 0.18823529411764706, "grad_norm": 0.20691461861133575, "learning_rate": 0.00018808510638297873, "loss": 0.2012, "step": 20 }, { "epoch": 0.1976470588235294, "grad_norm": 0.2261233776807785, "learning_rate": 0.0001872340425531915, "loss": 0.2161, "step": 21 }, { "epoch": 0.20705882352941177, "grad_norm": 0.2860106825828552, "learning_rate": 0.00018638297872340426, "loss": 0.2597, "step": 22 }, { "epoch": 0.2164705882352941, "grad_norm": 0.2977927327156067, "learning_rate": 0.00018553191489361704, "loss": 0.2822, "step": 23 }, { "epoch": 0.22588235294117648, "grad_norm": 0.323324054479599, "learning_rate": 0.0001846808510638298, "loss": 0.1893, "step": 24 }, { "epoch": 0.23529411764705882, "grad_norm": 0.28017860651016235, "learning_rate": 0.00018382978723404257, "loss": 0.1978, "step": 25 }, { "epoch": 0.2447058823529412, "grad_norm": 0.19440393149852753, "learning_rate": 0.00018297872340425532, "loss": 0.1935, "step": 26 }, { "epoch": 0.2541176470588235, "grad_norm": 0.3916899859905243, "learning_rate": 0.0001821276595744681, "loss": 0.2655, "step": 27 }, { "epoch": 0.2635294117647059, "grad_norm": 0.27680760622024536, "learning_rate": 0.00018127659574468084, "loss": 0.2443, "step": 28 }, { "epoch": 0.27294117647058824, "grad_norm": 0.2137499898672104, "learning_rate": 0.00018042553191489362, "loss": 0.2145, "step": 29 }, { "epoch": 0.2823529411764706, "grad_norm": 0.2534061670303345, "learning_rate": 0.00017957446808510637, "loss": 0.2295, "step": 30 }, { "epoch": 0.2917647058823529, "grad_norm": 0.22554630041122437, "learning_rate": 0.00017872340425531915, "loss": 0.2054, "step": 31 }, { "epoch": 0.30117647058823527, "grad_norm": 0.33873406052589417, "learning_rate": 0.0001778723404255319, "loss": 0.2665, "step": 32 }, { "epoch": 0.31058823529411766, "grad_norm": 0.2693963348865509, "learning_rate": 0.00017702127659574468, "loss": 0.2128, "step": 33 }, { "epoch": 0.32, "grad_norm": 0.2127694934606552, "learning_rate": 0.00017617021276595746, "loss": 0.1437, "step": 34 }, { "epoch": 0.32941176470588235, "grad_norm": 0.3235378861427307, "learning_rate": 0.00017531914893617023, "loss": 0.1666, "step": 35 }, { "epoch": 0.3388235294117647, "grad_norm": 0.3119862973690033, "learning_rate": 0.00017446808510638298, "loss": 0.2539, "step": 36 }, { "epoch": 0.34823529411764703, "grad_norm": 0.3158186376094818, "learning_rate": 0.00017361702127659576, "loss": 0.1993, "step": 37 }, { "epoch": 0.35764705882352943, "grad_norm": 0.23814643919467926, "learning_rate": 0.0001727659574468085, "loss": 0.166, "step": 38 }, { "epoch": 0.36705882352941177, "grad_norm": 0.18762339651584625, "learning_rate": 0.0001719148936170213, "loss": 0.1638, "step": 39 }, { "epoch": 0.3764705882352941, "grad_norm": 0.21114632487297058, "learning_rate": 0.00017106382978723404, "loss": 0.2818, "step": 40 }, { "epoch": 0.38588235294117645, "grad_norm": 0.30701348185539246, "learning_rate": 0.00017021276595744682, "loss": 0.1896, "step": 41 }, { "epoch": 0.3952941176470588, "grad_norm": 0.37595638632774353, "learning_rate": 0.0001693617021276596, "loss": 0.2778, "step": 42 }, { "epoch": 0.4047058823529412, "grad_norm": 0.19554150104522705, "learning_rate": 0.00016851063829787235, "loss": 0.2649, "step": 43 }, { "epoch": 0.41411764705882353, "grad_norm": 0.3296668827533722, "learning_rate": 0.00016765957446808512, "loss": 0.2704, "step": 44 }, { "epoch": 0.4235294117647059, "grad_norm": 0.515943169593811, "learning_rate": 0.00016680851063829787, "loss": 0.3078, "step": 45 }, { "epoch": 0.4329411764705882, "grad_norm": 0.24951788783073425, "learning_rate": 0.00016595744680851065, "loss": 0.2295, "step": 46 }, { "epoch": 0.4423529411764706, "grad_norm": 0.27803778648376465, "learning_rate": 0.0001651063829787234, "loss": 0.1961, "step": 47 }, { "epoch": 0.45176470588235296, "grad_norm": 0.1960020810365677, "learning_rate": 0.00016425531914893618, "loss": 0.2334, "step": 48 }, { "epoch": 0.4611764705882353, "grad_norm": 0.23366108536720276, "learning_rate": 0.00016340425531914893, "loss": 0.157, "step": 49 }, { "epoch": 0.47058823529411764, "grad_norm": 0.22643855214118958, "learning_rate": 0.0001625531914893617, "loss": 0.1648, "step": 50 }, { "epoch": 0.48, "grad_norm": 0.203178271651268, "learning_rate": 0.00016170212765957446, "loss": 0.1274, "step": 51 }, { "epoch": 0.4894117647058824, "grad_norm": 0.33611050248146057, "learning_rate": 0.00016085106382978726, "loss": 0.1737, "step": 52 }, { "epoch": 0.4988235294117647, "grad_norm": 0.4413496255874634, "learning_rate": 0.00016, "loss": 0.2414, "step": 53 }, { "epoch": 0.508235294117647, "grad_norm": 0.24674558639526367, "learning_rate": 0.0001591489361702128, "loss": 0.228, "step": 54 }, { "epoch": 0.5176470588235295, "grad_norm": 0.21306754648685455, "learning_rate": 0.00015829787234042554, "loss": 0.1727, "step": 55 }, { "epoch": 0.5270588235294118, "grad_norm": 0.22712849080562592, "learning_rate": 0.00015744680851063832, "loss": 0.1612, "step": 56 }, { "epoch": 0.5364705882352941, "grad_norm": 0.23271703720092773, "learning_rate": 0.00015659574468085107, "loss": 0.2325, "step": 57 }, { "epoch": 0.5458823529411765, "grad_norm": 0.2999236285686493, "learning_rate": 0.00015574468085106385, "loss": 0.1876, "step": 58 }, { "epoch": 0.5552941176470588, "grad_norm": 0.21795004606246948, "learning_rate": 0.0001548936170212766, "loss": 0.2159, "step": 59 }, { "epoch": 0.5647058823529412, "grad_norm": 0.3243270516395569, "learning_rate": 0.00015404255319148937, "loss": 0.2184, "step": 60 }, { "epoch": 0.5741176470588235, "grad_norm": 0.3677728474140167, "learning_rate": 0.00015319148936170213, "loss": 0.202, "step": 61 }, { "epoch": 0.5835294117647059, "grad_norm": 0.3357590138912201, "learning_rate": 0.0001523404255319149, "loss": 0.1905, "step": 62 }, { "epoch": 0.5929411764705882, "grad_norm": 0.33323776721954346, "learning_rate": 0.00015148936170212765, "loss": 0.2903, "step": 63 }, { "epoch": 0.6023529411764705, "grad_norm": 0.2734071612358093, "learning_rate": 0.00015063829787234043, "loss": 0.1627, "step": 64 }, { "epoch": 0.611764705882353, "grad_norm": 0.29738208651542664, "learning_rate": 0.00014978723404255318, "loss": 0.1801, "step": 65 }, { "epoch": 0.6211764705882353, "grad_norm": 0.22212739288806915, "learning_rate": 0.00014893617021276596, "loss": 0.2177, "step": 66 }, { "epoch": 0.6305882352941177, "grad_norm": 0.18247352540493011, "learning_rate": 0.0001480851063829787, "loss": 0.2103, "step": 67 }, { "epoch": 0.64, "grad_norm": 0.26197579503059387, "learning_rate": 0.0001472340425531915, "loss": 0.2514, "step": 68 }, { "epoch": 0.6494117647058824, "grad_norm": 0.21243281662464142, "learning_rate": 0.00014638297872340426, "loss": 0.1667, "step": 69 }, { "epoch": 0.6588235294117647, "grad_norm": 0.38252201676368713, "learning_rate": 0.00014553191489361704, "loss": 0.181, "step": 70 }, { "epoch": 0.668235294117647, "grad_norm": 0.28905001282691956, "learning_rate": 0.0001446808510638298, "loss": 0.2095, "step": 71 }, { "epoch": 0.6776470588235294, "grad_norm": 0.20469601452350616, "learning_rate": 0.00014382978723404257, "loss": 0.1561, "step": 72 }, { "epoch": 0.6870588235294117, "grad_norm": 0.25762686133384705, "learning_rate": 0.00014297872340425532, "loss": 0.1768, "step": 73 }, { "epoch": 0.6964705882352941, "grad_norm": 0.24704404175281525, "learning_rate": 0.0001421276595744681, "loss": 0.2484, "step": 74 }, { "epoch": 0.7058823529411765, "grad_norm": 0.275851845741272, "learning_rate": 0.00014127659574468085, "loss": 0.183, "step": 75 }, { "epoch": 0.7152941176470589, "grad_norm": 0.2425452470779419, "learning_rate": 0.00014042553191489363, "loss": 0.2125, "step": 76 }, { "epoch": 0.7247058823529412, "grad_norm": 0.25931957364082336, "learning_rate": 0.0001395744680851064, "loss": 0.1208, "step": 77 }, { "epoch": 0.7341176470588235, "grad_norm": 0.2956542372703552, "learning_rate": 0.00013872340425531915, "loss": 0.1712, "step": 78 }, { "epoch": 0.7435294117647059, "grad_norm": 0.25559189915657043, "learning_rate": 0.00013787234042553193, "loss": 0.1584, "step": 79 }, { "epoch": 0.7529411764705882, "grad_norm": 0.3442539870738983, "learning_rate": 0.00013702127659574468, "loss": 0.1303, "step": 80 }, { "epoch": 0.7623529411764706, "grad_norm": 0.23330341279506683, "learning_rate": 0.00013617021276595746, "loss": 0.1523, "step": 81 }, { "epoch": 0.7717647058823529, "grad_norm": 0.28044548630714417, "learning_rate": 0.0001353191489361702, "loss": 0.1137, "step": 82 }, { "epoch": 0.7811764705882352, "grad_norm": 0.32764652371406555, "learning_rate": 0.000134468085106383, "loss": 0.1864, "step": 83 }, { "epoch": 0.7905882352941176, "grad_norm": 0.25059211254119873, "learning_rate": 0.00013361702127659574, "loss": 0.1634, "step": 84 }, { "epoch": 0.8, "grad_norm": 0.21415087580680847, "learning_rate": 0.00013276595744680852, "loss": 0.1389, "step": 85 }, { "epoch": 0.8094117647058824, "grad_norm": 0.24330438673496246, "learning_rate": 0.00013191489361702127, "loss": 0.2107, "step": 86 }, { "epoch": 0.8188235294117647, "grad_norm": 0.3163723647594452, "learning_rate": 0.00013106382978723404, "loss": 0.1567, "step": 87 }, { "epoch": 0.8282352941176471, "grad_norm": 0.31534239649772644, "learning_rate": 0.00013021276595744682, "loss": 0.2164, "step": 88 }, { "epoch": 0.8376470588235294, "grad_norm": 0.32444867491722107, "learning_rate": 0.0001293617021276596, "loss": 0.2682, "step": 89 }, { "epoch": 0.8470588235294118, "grad_norm": 0.21235518157482147, "learning_rate": 0.00012851063829787235, "loss": 0.1137, "step": 90 }, { "epoch": 0.8564705882352941, "grad_norm": 0.29725661873817444, "learning_rate": 0.00012765957446808513, "loss": 0.2246, "step": 91 }, { "epoch": 0.8658823529411764, "grad_norm": 0.21191883087158203, "learning_rate": 0.00012680851063829788, "loss": 0.2591, "step": 92 }, { "epoch": 0.8752941176470588, "grad_norm": 0.25724413990974426, "learning_rate": 0.00012595744680851065, "loss": 0.3289, "step": 93 }, { "epoch": 0.8847058823529412, "grad_norm": 0.33288782835006714, "learning_rate": 0.0001251063829787234, "loss": 0.1999, "step": 94 }, { "epoch": 0.8941176470588236, "grad_norm": 0.26389646530151367, "learning_rate": 0.00012425531914893618, "loss": 0.1281, "step": 95 }, { "epoch": 0.9035294117647059, "grad_norm": 0.21038174629211426, "learning_rate": 0.00012340425531914893, "loss": 0.1759, "step": 96 }, { "epoch": 0.9129411764705883, "grad_norm": 0.15450036525726318, "learning_rate": 0.0001225531914893617, "loss": 0.1188, "step": 97 }, { "epoch": 0.9223529411764706, "grad_norm": 0.14022140204906464, "learning_rate": 0.00012170212765957448, "loss": 0.089, "step": 98 }, { "epoch": 0.9317647058823529, "grad_norm": 0.2061687856912613, "learning_rate": 0.00012085106382978724, "loss": 0.1858, "step": 99 }, { "epoch": 0.9411764705882353, "grad_norm": 0.2657790184020996, "learning_rate": 0.00012, "loss": 0.1508, "step": 100 }, { "epoch": 0.9505882352941176, "grad_norm": 0.23362405598163605, "learning_rate": 0.00011914893617021277, "loss": 0.1581, "step": 101 }, { "epoch": 0.96, "grad_norm": 0.40954795479774475, "learning_rate": 0.00011829787234042553, "loss": 0.2477, "step": 102 }, { "epoch": 0.9694117647058823, "grad_norm": 0.22806493937969208, "learning_rate": 0.0001174468085106383, "loss": 0.1394, "step": 103 }, { "epoch": 0.9788235294117648, "grad_norm": 0.23955920338630676, "learning_rate": 0.00011659574468085106, "loss": 0.1882, "step": 104 }, { "epoch": 0.9882352941176471, "grad_norm": 0.24184423685073853, "learning_rate": 0.00011574468085106382, "loss": 0.228, "step": 105 }, { "epoch": 0.9976470588235294, "grad_norm": 0.30989891290664673, "learning_rate": 0.00011489361702127661, "loss": 0.1631, "step": 106 }, { "epoch": 1.0, "grad_norm": 0.3729061186313629, "learning_rate": 0.00011404255319148938, "loss": 0.1422, "step": 107 }, { "epoch": 1.0094117647058825, "grad_norm": 0.1809885948896408, "learning_rate": 0.00011319148936170214, "loss": 0.1159, "step": 108 }, { "epoch": 1.0188235294117647, "grad_norm": 0.16268357634544373, "learning_rate": 0.0001123404255319149, "loss": 0.1033, "step": 109 }, { "epoch": 1.0282352941176471, "grad_norm": 0.159242644906044, "learning_rate": 0.00011148936170212767, "loss": 0.1209, "step": 110 }, { "epoch": 1.0376470588235294, "grad_norm": 0.33948951959609985, "learning_rate": 0.00011063829787234043, "loss": 0.1485, "step": 111 }, { "epoch": 1.0470588235294118, "grad_norm": 0.17068567872047424, "learning_rate": 0.0001097872340425532, "loss": 0.1092, "step": 112 }, { "epoch": 1.056470588235294, "grad_norm": 0.16619639098644257, "learning_rate": 0.00010893617021276596, "loss": 0.1005, "step": 113 }, { "epoch": 1.0658823529411765, "grad_norm": 0.20606489479541779, "learning_rate": 0.00010808510638297873, "loss": 0.14, "step": 114 }, { "epoch": 1.0752941176470587, "grad_norm": 0.28147006034851074, "learning_rate": 0.00010723404255319149, "loss": 0.1613, "step": 115 }, { "epoch": 1.0847058823529412, "grad_norm": 0.19388940930366516, "learning_rate": 0.00010638297872340425, "loss": 0.1308, "step": 116 }, { "epoch": 1.0941176470588236, "grad_norm": 0.21097290515899658, "learning_rate": 0.00010553191489361702, "loss": 0.1588, "step": 117 }, { "epoch": 1.1035294117647059, "grad_norm": 0.1876417100429535, "learning_rate": 0.00010468085106382978, "loss": 0.1163, "step": 118 }, { "epoch": 1.1129411764705883, "grad_norm": 0.17917850613594055, "learning_rate": 0.00010382978723404255, "loss": 0.0855, "step": 119 }, { "epoch": 1.1223529411764706, "grad_norm": 0.20767854154109955, "learning_rate": 0.00010297872340425532, "loss": 0.0891, "step": 120 }, { "epoch": 1.131764705882353, "grad_norm": 0.21883057057857513, "learning_rate": 0.00010212765957446809, "loss": 0.1052, "step": 121 }, { "epoch": 1.1411764705882352, "grad_norm": 0.32774657011032104, "learning_rate": 0.00010127659574468085, "loss": 0.1866, "step": 122 }, { "epoch": 1.1505882352941177, "grad_norm": 0.32191288471221924, "learning_rate": 0.00010042553191489362, "loss": 0.1817, "step": 123 }, { "epoch": 1.16, "grad_norm": 0.2663422226905823, "learning_rate": 9.95744680851064e-05, "loss": 0.2135, "step": 124 }, { "epoch": 1.1694117647058824, "grad_norm": 0.19601747393608093, "learning_rate": 9.872340425531916e-05, "loss": 0.1152, "step": 125 }, { "epoch": 1.1788235294117646, "grad_norm": 0.22476732730865479, "learning_rate": 9.787234042553192e-05, "loss": 0.1368, "step": 126 }, { "epoch": 1.188235294117647, "grad_norm": 0.2908172607421875, "learning_rate": 9.702127659574469e-05, "loss": 0.1594, "step": 127 }, { "epoch": 1.1976470588235295, "grad_norm": 0.19038249552249908, "learning_rate": 9.617021276595745e-05, "loss": 0.12, "step": 128 }, { "epoch": 1.2070588235294117, "grad_norm": 0.239775151014328, "learning_rate": 9.531914893617023e-05, "loss": 0.1203, "step": 129 }, { "epoch": 1.2164705882352942, "grad_norm": 0.3028945028781891, "learning_rate": 9.446808510638299e-05, "loss": 0.1797, "step": 130 }, { "epoch": 1.2258823529411764, "grad_norm": 0.2879508435726166, "learning_rate": 9.361702127659576e-05, "loss": 0.1679, "step": 131 }, { "epoch": 1.2352941176470589, "grad_norm": 0.2691670358181, "learning_rate": 9.276595744680852e-05, "loss": 0.121, "step": 132 }, { "epoch": 1.244705882352941, "grad_norm": 0.18453848361968994, "learning_rate": 9.191489361702128e-05, "loss": 0.147, "step": 133 }, { "epoch": 1.2541176470588236, "grad_norm": 0.21089564263820648, "learning_rate": 9.106382978723405e-05, "loss": 0.1073, "step": 134 }, { "epoch": 1.263529411764706, "grad_norm": 0.26895055174827576, "learning_rate": 9.021276595744681e-05, "loss": 0.1971, "step": 135 }, { "epoch": 1.2729411764705882, "grad_norm": 0.26298171281814575, "learning_rate": 8.936170212765958e-05, "loss": 0.112, "step": 136 }, { "epoch": 1.2823529411764705, "grad_norm": 0.20527559518814087, "learning_rate": 8.851063829787234e-05, "loss": 0.1321, "step": 137 }, { "epoch": 1.291764705882353, "grad_norm": 0.2358022779226303, "learning_rate": 8.765957446808512e-05, "loss": 0.1196, "step": 138 }, { "epoch": 1.3011764705882354, "grad_norm": 0.19852350652217865, "learning_rate": 8.680851063829788e-05, "loss": 0.1367, "step": 139 }, { "epoch": 1.3105882352941176, "grad_norm": 0.19292528927326202, "learning_rate": 8.595744680851064e-05, "loss": 0.063, "step": 140 }, { "epoch": 1.32, "grad_norm": 0.227496936917305, "learning_rate": 8.510638297872341e-05, "loss": 0.1328, "step": 141 }, { "epoch": 1.3294117647058823, "grad_norm": 0.22281454503536224, "learning_rate": 8.425531914893617e-05, "loss": 0.0913, "step": 142 }, { "epoch": 1.3388235294117647, "grad_norm": 0.24629417061805725, "learning_rate": 8.340425531914894e-05, "loss": 0.1564, "step": 143 }, { "epoch": 1.348235294117647, "grad_norm": 0.3085138499736786, "learning_rate": 8.25531914893617e-05, "loss": 0.1128, "step": 144 }, { "epoch": 1.3576470588235294, "grad_norm": 0.31958431005477905, "learning_rate": 8.170212765957446e-05, "loss": 0.1167, "step": 145 }, { "epoch": 1.3670588235294119, "grad_norm": 0.3235880732536316, "learning_rate": 8.085106382978723e-05, "loss": 0.1334, "step": 146 }, { "epoch": 1.3764705882352941, "grad_norm": 0.2108842432498932, "learning_rate": 8e-05, "loss": 0.1304, "step": 147 }, { "epoch": 1.3858823529411763, "grad_norm": 0.24360014498233795, "learning_rate": 7.914893617021277e-05, "loss": 0.0802, "step": 148 }, { "epoch": 1.3952941176470588, "grad_norm": 0.3124058246612549, "learning_rate": 7.829787234042553e-05, "loss": 0.177, "step": 149 }, { "epoch": 1.4047058823529412, "grad_norm": 0.1749386191368103, "learning_rate": 7.74468085106383e-05, "loss": 0.0688, "step": 150 }, { "epoch": 1.4141176470588235, "grad_norm": 0.20923930406570435, "learning_rate": 7.659574468085106e-05, "loss": 0.1211, "step": 151 }, { "epoch": 1.423529411764706, "grad_norm": 0.23498158156871796, "learning_rate": 7.574468085106383e-05, "loss": 0.1758, "step": 152 }, { "epoch": 1.4329411764705882, "grad_norm": 0.2076374739408493, "learning_rate": 7.489361702127659e-05, "loss": 0.1108, "step": 153 }, { "epoch": 1.4423529411764706, "grad_norm": 0.32030659914016724, "learning_rate": 7.404255319148935e-05, "loss": 0.2205, "step": 154 }, { "epoch": 1.4517647058823528, "grad_norm": 0.30138102173805237, "learning_rate": 7.319148936170213e-05, "loss": 0.1242, "step": 155 }, { "epoch": 1.4611764705882353, "grad_norm": 0.20646269619464874, "learning_rate": 7.23404255319149e-05, "loss": 0.1307, "step": 156 }, { "epoch": 1.4705882352941178, "grad_norm": 0.3253270387649536, "learning_rate": 7.148936170212766e-05, "loss": 0.1834, "step": 157 }, { "epoch": 1.48, "grad_norm": 0.382090300321579, "learning_rate": 7.063829787234042e-05, "loss": 0.1134, "step": 158 }, { "epoch": 1.4894117647058824, "grad_norm": 0.1788143664598465, "learning_rate": 6.97872340425532e-05, "loss": 0.1126, "step": 159 }, { "epoch": 1.4988235294117647, "grad_norm": 0.23251184821128845, "learning_rate": 6.893617021276597e-05, "loss": 0.0852, "step": 160 }, { "epoch": 1.5082352941176471, "grad_norm": 0.2091732621192932, "learning_rate": 6.808510638297873e-05, "loss": 0.1348, "step": 161 }, { "epoch": 1.5176470588235293, "grad_norm": 0.22493578493595123, "learning_rate": 6.72340425531915e-05, "loss": 0.0944, "step": 162 }, { "epoch": 1.5270588235294118, "grad_norm": 0.36818596720695496, "learning_rate": 6.638297872340426e-05, "loss": 0.1519, "step": 163 }, { "epoch": 1.5364705882352943, "grad_norm": 0.18922051787376404, "learning_rate": 6.553191489361702e-05, "loss": 0.0994, "step": 164 }, { "epoch": 1.5458823529411765, "grad_norm": 0.2148643583059311, "learning_rate": 6.46808510638298e-05, "loss": 0.0937, "step": 165 }, { "epoch": 1.5552941176470587, "grad_norm": 0.29744479060173035, "learning_rate": 6.382978723404256e-05, "loss": 0.176, "step": 166 }, { "epoch": 1.5647058823529412, "grad_norm": 0.24048790335655212, "learning_rate": 6.297872340425533e-05, "loss": 0.0922, "step": 167 }, { "epoch": 1.5741176470588236, "grad_norm": 0.2661268711090088, "learning_rate": 6.212765957446809e-05, "loss": 0.1847, "step": 168 }, { "epoch": 1.5835294117647059, "grad_norm": 0.2629673182964325, "learning_rate": 6.127659574468086e-05, "loss": 0.1319, "step": 169 }, { "epoch": 1.592941176470588, "grad_norm": 0.2114630937576294, "learning_rate": 6.042553191489362e-05, "loss": 0.1031, "step": 170 }, { "epoch": 1.6023529411764705, "grad_norm": 0.2681063115596771, "learning_rate": 5.9574468085106384e-05, "loss": 0.1409, "step": 171 }, { "epoch": 1.611764705882353, "grad_norm": 0.2776590585708618, "learning_rate": 5.872340425531915e-05, "loss": 0.1455, "step": 172 }, { "epoch": 1.6211764705882352, "grad_norm": 0.28415653109550476, "learning_rate": 5.787234042553191e-05, "loss": 0.1753, "step": 173 }, { "epoch": 1.6305882352941177, "grad_norm": 0.3313392698764801, "learning_rate": 5.702127659574469e-05, "loss": 0.115, "step": 174 }, { "epoch": 1.6400000000000001, "grad_norm": 0.202665776014328, "learning_rate": 5.617021276595745e-05, "loss": 0.1024, "step": 175 }, { "epoch": 1.6494117647058824, "grad_norm": 0.24025441706180573, "learning_rate": 5.531914893617022e-05, "loss": 0.1127, "step": 176 }, { "epoch": 1.6588235294117646, "grad_norm": 0.24172081053256989, "learning_rate": 5.446808510638298e-05, "loss": 0.1548, "step": 177 }, { "epoch": 1.668235294117647, "grad_norm": 0.22560444474220276, "learning_rate": 5.3617021276595745e-05, "loss": 0.1373, "step": 178 }, { "epoch": 1.6776470588235295, "grad_norm": 0.2639141082763672, "learning_rate": 5.276595744680851e-05, "loss": 0.1611, "step": 179 }, { "epoch": 1.6870588235294117, "grad_norm": 0.231312096118927, "learning_rate": 5.191489361702127e-05, "loss": 0.1136, "step": 180 }, { "epoch": 1.696470588235294, "grad_norm": 0.2878687381744385, "learning_rate": 5.1063829787234044e-05, "loss": 0.0947, "step": 181 }, { "epoch": 1.7058823529411766, "grad_norm": 0.20883168280124664, "learning_rate": 5.021276595744681e-05, "loss": 0.1508, "step": 182 }, { "epoch": 1.7152941176470589, "grad_norm": 0.1777513027191162, "learning_rate": 4.936170212765958e-05, "loss": 0.0995, "step": 183 }, { "epoch": 1.724705882352941, "grad_norm": 0.2738886773586273, "learning_rate": 4.851063829787234e-05, "loss": 0.1215, "step": 184 }, { "epoch": 1.7341176470588235, "grad_norm": 0.3017210066318512, "learning_rate": 4.7659574468085114e-05, "loss": 0.1051, "step": 185 }, { "epoch": 1.743529411764706, "grad_norm": 0.2284962385892868, "learning_rate": 4.680851063829788e-05, "loss": 0.1366, "step": 186 }, { "epoch": 1.7529411764705882, "grad_norm": 0.25317302346229553, "learning_rate": 4.595744680851064e-05, "loss": 0.1723, "step": 187 }, { "epoch": 1.7623529411764705, "grad_norm": 0.3226059377193451, "learning_rate": 4.5106382978723406e-05, "loss": 0.1851, "step": 188 }, { "epoch": 1.771764705882353, "grad_norm": 0.23642048239707947, "learning_rate": 4.425531914893617e-05, "loss": 0.0906, "step": 189 }, { "epoch": 1.7811764705882354, "grad_norm": 0.41941365599632263, "learning_rate": 4.340425531914894e-05, "loss": 0.1894, "step": 190 }, { "epoch": 1.7905882352941176, "grad_norm": 0.25548532605171204, "learning_rate": 4.2553191489361704e-05, "loss": 0.133, "step": 191 }, { "epoch": 1.8, "grad_norm": 0.41250723600387573, "learning_rate": 4.170212765957447e-05, "loss": 0.2158, "step": 192 }, { "epoch": 1.8094117647058825, "grad_norm": 0.2293664664030075, "learning_rate": 4.085106382978723e-05, "loss": 0.1193, "step": 193 }, { "epoch": 1.8188235294117647, "grad_norm": 0.2659620940685272, "learning_rate": 4e-05, "loss": 0.1104, "step": 194 }, { "epoch": 1.828235294117647, "grad_norm": 0.25475817918777466, "learning_rate": 3.914893617021277e-05, "loss": 0.1283, "step": 195 }, { "epoch": 1.8376470588235294, "grad_norm": 0.3606954514980316, "learning_rate": 3.829787234042553e-05, "loss": 0.2067, "step": 196 }, { "epoch": 1.8470588235294119, "grad_norm": 0.25670263171195984, "learning_rate": 3.7446808510638295e-05, "loss": 0.1435, "step": 197 }, { "epoch": 1.856470588235294, "grad_norm": 0.21306878328323364, "learning_rate": 3.6595744680851066e-05, "loss": 0.1203, "step": 198 }, { "epoch": 1.8658823529411763, "grad_norm": 0.2932317554950714, "learning_rate": 3.574468085106383e-05, "loss": 0.1449, "step": 199 }, { "epoch": 1.8752941176470588, "grad_norm": 0.27417030930519104, "learning_rate": 3.48936170212766e-05, "loss": 0.183, "step": 200 }, { "epoch": 1.8847058823529412, "grad_norm": 0.21384155750274658, "learning_rate": 3.4042553191489365e-05, "loss": 0.0958, "step": 201 }, { "epoch": 1.8941176470588235, "grad_norm": 0.4246179163455963, "learning_rate": 3.319148936170213e-05, "loss": 0.0902, "step": 202 }, { "epoch": 1.903529411764706, "grad_norm": 0.24406206607818604, "learning_rate": 3.23404255319149e-05, "loss": 0.1508, "step": 203 }, { "epoch": 1.9129411764705884, "grad_norm": 0.29728034138679504, "learning_rate": 3.1489361702127664e-05, "loss": 0.0963, "step": 204 }, { "epoch": 1.9223529411764706, "grad_norm": 0.2817091643810272, "learning_rate": 3.063829787234043e-05, "loss": 0.1004, "step": 205 }, { "epoch": 1.9317647058823528, "grad_norm": 0.24360136687755585, "learning_rate": 2.9787234042553192e-05, "loss": 0.116, "step": 206 }, { "epoch": 1.9411764705882353, "grad_norm": 0.37449270486831665, "learning_rate": 2.8936170212765956e-05, "loss": 0.116, "step": 207 }, { "epoch": 1.9505882352941177, "grad_norm": 0.28712770342826843, "learning_rate": 2.8085106382978727e-05, "loss": 0.1471, "step": 208 }, { "epoch": 1.96, "grad_norm": 0.3738057613372803, "learning_rate": 2.723404255319149e-05, "loss": 0.1659, "step": 209 }, { "epoch": 1.9694117647058822, "grad_norm": 0.23495592176914215, "learning_rate": 2.6382978723404255e-05, "loss": 0.1511, "step": 210 }, { "epoch": 1.9788235294117649, "grad_norm": 0.2240479439496994, "learning_rate": 2.5531914893617022e-05, "loss": 0.1029, "step": 211 }, { "epoch": 1.988235294117647, "grad_norm": 0.3040638267993927, "learning_rate": 2.468085106382979e-05, "loss": 0.1475, "step": 212 }, { "epoch": 1.9976470588235293, "grad_norm": 0.31952324509620667, "learning_rate": 2.3829787234042557e-05, "loss": 0.2251, "step": 213 }, { "epoch": 2.0, "grad_norm": 0.8529971837997437, "learning_rate": 2.297872340425532e-05, "loss": 0.1501, "step": 214 }, { "epoch": 2.0094117647058822, "grad_norm": 0.15144610404968262, "learning_rate": 2.2127659574468085e-05, "loss": 0.0475, "step": 215 }, { "epoch": 2.018823529411765, "grad_norm": 0.22670157253742218, "learning_rate": 2.1276595744680852e-05, "loss": 0.1331, "step": 216 }, { "epoch": 2.028235294117647, "grad_norm": 0.1298380345106125, "learning_rate": 2.0425531914893616e-05, "loss": 0.0565, "step": 217 }, { "epoch": 2.0376470588235294, "grad_norm": 0.20339643955230713, "learning_rate": 1.9574468085106384e-05, "loss": 0.0792, "step": 218 }, { "epoch": 2.0470588235294116, "grad_norm": 0.1728561669588089, "learning_rate": 1.8723404255319148e-05, "loss": 0.073, "step": 219 }, { "epoch": 2.0564705882352943, "grad_norm": 0.18114320933818817, "learning_rate": 1.7872340425531915e-05, "loss": 0.0842, "step": 220 }, { "epoch": 2.0658823529411765, "grad_norm": 0.22063469886779785, "learning_rate": 1.7021276595744682e-05, "loss": 0.149, "step": 221 }, { "epoch": 2.0752941176470587, "grad_norm": 0.21402296423912048, "learning_rate": 1.617021276595745e-05, "loss": 0.0904, "step": 222 }, { "epoch": 2.084705882352941, "grad_norm": 0.1926405131816864, "learning_rate": 1.5319148936170214e-05, "loss": 0.1171, "step": 223 }, { "epoch": 2.0941176470588236, "grad_norm": 0.19383728504180908, "learning_rate": 1.4468085106382978e-05, "loss": 0.1062, "step": 224 }, { "epoch": 2.103529411764706, "grad_norm": 0.19772332906723022, "learning_rate": 1.3617021276595745e-05, "loss": 0.0943, "step": 225 }, { "epoch": 2.112941176470588, "grad_norm": 0.1649761199951172, "learning_rate": 1.2765957446808511e-05, "loss": 0.0856, "step": 226 }, { "epoch": 2.1223529411764708, "grad_norm": 0.2451670914888382, "learning_rate": 1.1914893617021278e-05, "loss": 0.0941, "step": 227 }, { "epoch": 2.131764705882353, "grad_norm": 0.16791269183158875, "learning_rate": 1.1063829787234042e-05, "loss": 0.0835, "step": 228 }, { "epoch": 2.1411764705882352, "grad_norm": 0.23293572664260864, "learning_rate": 1.0212765957446808e-05, "loss": 0.1359, "step": 229 }, { "epoch": 2.1505882352941175, "grad_norm": 0.21684999763965607, "learning_rate": 9.361702127659574e-06, "loss": 0.0793, "step": 230 }, { "epoch": 2.16, "grad_norm": 0.2551932632923126, "learning_rate": 8.510638297872341e-06, "loss": 0.1588, "step": 231 }, { "epoch": 2.1694117647058824, "grad_norm": 0.2618826925754547, "learning_rate": 7.659574468085107e-06, "loss": 0.1019, "step": 232 }, { "epoch": 2.1788235294117646, "grad_norm": 0.17673631012439728, "learning_rate": 6.808510638297873e-06, "loss": 0.0576, "step": 233 }, { "epoch": 2.1882352941176473, "grad_norm": 0.18946610391139984, "learning_rate": 5.957446808510639e-06, "loss": 0.0929, "step": 234 }, { "epoch": 2.1976470588235295, "grad_norm": 0.23730279505252838, "learning_rate": 5.106382978723404e-06, "loss": 0.1291, "step": 235 }, { "epoch": 2.2070588235294117, "grad_norm": 0.23422257602214813, "learning_rate": 4.255319148936171e-06, "loss": 0.1169, "step": 236 }, { "epoch": 2.216470588235294, "grad_norm": 0.2329464703798294, "learning_rate": 3.4042553191489363e-06, "loss": 0.1428, "step": 237 }, { "epoch": 2.2258823529411766, "grad_norm": 0.22677987813949585, "learning_rate": 2.553191489361702e-06, "loss": 0.1091, "step": 238 }, { "epoch": 2.235294117647059, "grad_norm": 0.18695490062236786, "learning_rate": 1.7021276595744682e-06, "loss": 0.0791, "step": 239 }, { "epoch": 2.244705882352941, "grad_norm": 0.13504858314990997, "learning_rate": 8.510638297872341e-07, "loss": 0.0638, "step": 240 } ], "logging_steps": 1, "max_steps": 240, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 10, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 3.82456266624e+16, "train_batch_size": 2, "trial_name": null, "trial_params": null }