{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.0, "eval_steps": 400, "global_step": 375, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.016, "grad_norm": 0.5721463561058044, "learning_rate": 0.000997326203208556, "loss": 22.6723, "step": 2 }, { "epoch": 0.032, "grad_norm": 1.5221903324127197, "learning_rate": 0.0009919786096256684, "loss": 22.7502, "step": 4 }, { "epoch": 0.048, "grad_norm": 3.269012212753296, "learning_rate": 0.0009866310160427808, "loss": 22.2706, "step": 6 }, { "epoch": 0.064, "grad_norm": 4.567020416259766, "learning_rate": 0.0009812834224598931, "loss": 21.3625, "step": 8 }, { "epoch": 0.08, "grad_norm": 7.019204139709473, "learning_rate": 0.0009759358288770054, "loss": 20.7279, "step": 10 }, { "epoch": 0.096, "grad_norm": 8.498096466064453, "learning_rate": 0.0009705882352941176, "loss": 20.8221, "step": 12 }, { "epoch": 0.112, "grad_norm": 7.8151397705078125, "learning_rate": 0.00096524064171123, "loss": 20.4136, "step": 14 }, { "epoch": 0.128, "grad_norm": 8.028499603271484, "learning_rate": 0.0009598930481283422, "loss": 20.2719, "step": 16 }, { "epoch": 0.144, "grad_norm": 8.516434669494629, "learning_rate": 0.0009545454545454546, "loss": 20.1681, "step": 18 }, { "epoch": 0.16, "grad_norm": 8.52490520477295, "learning_rate": 0.0009491978609625669, "loss": 19.8895, "step": 20 }, { "epoch": 0.176, "grad_norm": 6.709629058837891, "learning_rate": 0.0009438502673796791, "loss": 19.93, "step": 22 }, { "epoch": 0.192, "grad_norm": 6.038687705993652, "learning_rate": 0.0009385026737967914, "loss": 19.6312, "step": 24 }, { "epoch": 0.208, "grad_norm": 5.785665512084961, "learning_rate": 0.0009331550802139037, "loss": 19.7683, "step": 26 }, { "epoch": 0.224, "grad_norm": 5.79067850112915, "learning_rate": 0.0009278074866310161, "loss": 19.6965, "step": 28 }, { "epoch": 0.24, "grad_norm": 5.166928291320801, "learning_rate": 0.0009224598930481284, "loss": 19.4005, "step": 30 }, { "epoch": 0.256, "grad_norm": 4.578023433685303, "learning_rate": 0.0009171122994652407, "loss": 19.3963, "step": 32 }, { "epoch": 0.272, "grad_norm": 4.7540693283081055, "learning_rate": 0.0009117647058823529, "loss": 19.4129, "step": 34 }, { "epoch": 0.288, "grad_norm": 5.394408226013184, "learning_rate": 0.0009064171122994653, "loss": 19.5821, "step": 36 }, { "epoch": 0.304, "grad_norm": 4.4902753829956055, "learning_rate": 0.0009010695187165776, "loss": 19.6562, "step": 38 }, { "epoch": 0.32, "grad_norm": 5.49019193649292, "learning_rate": 0.0008957219251336899, "loss": 19.3588, "step": 40 }, { "epoch": 0.336, "grad_norm": 4.184142589569092, "learning_rate": 0.0008903743315508022, "loss": 18.9032, "step": 42 }, { "epoch": 0.352, "grad_norm": 3.98618483543396, "learning_rate": 0.0008850267379679144, "loss": 19.1882, "step": 44 }, { "epoch": 0.368, "grad_norm": 4.851687908172607, "learning_rate": 0.0008796791443850267, "loss": 19.4565, "step": 46 }, { "epoch": 0.384, "grad_norm": 4.108444690704346, "learning_rate": 0.0008743315508021391, "loss": 19.6149, "step": 48 }, { "epoch": 0.4, "grad_norm": 3.7055838108062744, "learning_rate": 0.0008689839572192514, "loss": 18.9573, "step": 50 }, { "epoch": 0.416, "grad_norm": 4.930137634277344, "learning_rate": 0.0008636363636363636, "loss": 19.4389, "step": 52 }, { "epoch": 0.432, "grad_norm": 3.910098075866699, "learning_rate": 0.000858288770053476, "loss": 19.1465, "step": 54 }, { "epoch": 0.448, "grad_norm": 4.0127716064453125, "learning_rate": 0.0008529411764705882, "loss": 19.5038, "step": 56 }, { "epoch": 0.464, "grad_norm": 4.495028018951416, "learning_rate": 0.0008475935828877005, "loss": 19.3252, "step": 58 }, { "epoch": 0.48, "grad_norm": 3.7703821659088135, "learning_rate": 0.0008422459893048129, "loss": 19.0238, "step": 60 }, { "epoch": 0.496, "grad_norm": 3.6335291862487793, "learning_rate": 0.0008368983957219252, "loss": 19.1296, "step": 62 }, { "epoch": 0.512, "grad_norm": 3.819183588027954, "learning_rate": 0.0008315508021390374, "loss": 18.4946, "step": 64 }, { "epoch": 0.528, "grad_norm": 3.3171255588531494, "learning_rate": 0.0008262032085561497, "loss": 18.8054, "step": 66 }, { "epoch": 0.544, "grad_norm": 4.316566467285156, "learning_rate": 0.000820855614973262, "loss": 19.162, "step": 68 }, { "epoch": 0.56, "grad_norm": 3.39648175239563, "learning_rate": 0.0008155080213903744, "loss": 18.5671, "step": 70 }, { "epoch": 0.576, "grad_norm": 3.7200136184692383, "learning_rate": 0.0008101604278074867, "loss": 18.9179, "step": 72 }, { "epoch": 0.592, "grad_norm": 3.6730430126190186, "learning_rate": 0.0008048128342245989, "loss": 18.7162, "step": 74 }, { "epoch": 0.608, "grad_norm": 3.5580945014953613, "learning_rate": 0.0007994652406417113, "loss": 19.0574, "step": 76 }, { "epoch": 0.624, "grad_norm": 3.4793589115142822, "learning_rate": 0.0007941176470588235, "loss": 18.8649, "step": 78 }, { "epoch": 0.64, "grad_norm": 4.074679374694824, "learning_rate": 0.0007887700534759359, "loss": 18.5553, "step": 80 }, { "epoch": 0.656, "grad_norm": 3.315810441970825, "learning_rate": 0.0007834224598930482, "loss": 18.2136, "step": 82 }, { "epoch": 0.672, "grad_norm": 4.288172721862793, "learning_rate": 0.0007780748663101605, "loss": 18.6089, "step": 84 }, { "epoch": 0.688, "grad_norm": 3.5749149322509766, "learning_rate": 0.0007727272727272727, "loss": 18.8697, "step": 86 }, { "epoch": 0.704, "grad_norm": 3.608825206756592, "learning_rate": 0.000767379679144385, "loss": 18.4129, "step": 88 }, { "epoch": 0.72, "grad_norm": 3.5199592113494873, "learning_rate": 0.0007620320855614974, "loss": 18.1619, "step": 90 }, { "epoch": 0.736, "grad_norm": 3.5022549629211426, "learning_rate": 0.0007566844919786096, "loss": 18.7368, "step": 92 }, { "epoch": 0.752, "grad_norm": 3.6002230644226074, "learning_rate": 0.000751336898395722, "loss": 18.7792, "step": 94 }, { "epoch": 0.768, "grad_norm": 4.682362079620361, "learning_rate": 0.0007459893048128342, "loss": 18.5495, "step": 96 }, { "epoch": 0.784, "grad_norm": 3.6108767986297607, "learning_rate": 0.0007406417112299465, "loss": 18.7077, "step": 98 }, { "epoch": 0.8, "grad_norm": 3.4719815254211426, "learning_rate": 0.0007352941176470589, "loss": 18.3262, "step": 100 }, { "epoch": 0.816, "grad_norm": 4.4115986824035645, "learning_rate": 0.0007299465240641712, "loss": 18.3416, "step": 102 }, { "epoch": 0.832, "grad_norm": 3.324169158935547, "learning_rate": 0.0007245989304812834, "loss": 18.7297, "step": 104 }, { "epoch": 0.848, "grad_norm": 3.4287421703338623, "learning_rate": 0.0007192513368983958, "loss": 18.4499, "step": 106 }, { "epoch": 0.864, "grad_norm": 3.9451239109039307, "learning_rate": 0.000713903743315508, "loss": 18.2669, "step": 108 }, { "epoch": 0.88, "grad_norm": 3.5031988620758057, "learning_rate": 0.0007085561497326202, "loss": 18.8895, "step": 110 }, { "epoch": 0.896, "grad_norm": 3.5174903869628906, "learning_rate": 0.0007032085561497327, "loss": 18.2961, "step": 112 }, { "epoch": 0.912, "grad_norm": 4.080729961395264, "learning_rate": 0.0006978609625668449, "loss": 18.5613, "step": 114 }, { "epoch": 0.928, "grad_norm": 3.7523930072784424, "learning_rate": 0.0006925133689839572, "loss": 18.5538, "step": 116 }, { "epoch": 0.944, "grad_norm": 3.066669225692749, "learning_rate": 0.0006871657754010695, "loss": 18.6904, "step": 118 }, { "epoch": 0.96, "grad_norm": 4.274256706237793, "learning_rate": 0.0006818181818181818, "loss": 18.6147, "step": 120 }, { "epoch": 0.976, "grad_norm": 3.690139055252075, "learning_rate": 0.0006764705882352942, "loss": 18.1693, "step": 122 }, { "epoch": 0.992, "grad_norm": 3.6681807041168213, "learning_rate": 0.0006711229946524065, "loss": 18.2498, "step": 124 }, { "epoch": 1.008, "grad_norm": 3.5203354358673096, "learning_rate": 0.0006657754010695187, "loss": 18.4522, "step": 126 }, { "epoch": 1.024, "grad_norm": 4.650991439819336, "learning_rate": 0.000660427807486631, "loss": 18.2839, "step": 128 }, { "epoch": 1.04, "grad_norm": 3.7944228649139404, "learning_rate": 0.0006550802139037433, "loss": 18.051, "step": 130 }, { "epoch": 1.056, "grad_norm": 3.2437500953674316, "learning_rate": 0.0006497326203208556, "loss": 18.1842, "step": 132 }, { "epoch": 1.072, "grad_norm": 3.2863543033599854, "learning_rate": 0.000644385026737968, "loss": 18.2304, "step": 134 }, { "epoch": 1.088, "grad_norm": 3.553260326385498, "learning_rate": 0.0006390374331550802, "loss": 18.1385, "step": 136 }, { "epoch": 1.104, "grad_norm": 3.4277195930480957, "learning_rate": 0.0006336898395721925, "loss": 18.1337, "step": 138 }, { "epoch": 1.12, "grad_norm": 3.974073886871338, "learning_rate": 0.0006283422459893048, "loss": 18.0326, "step": 140 }, { "epoch": 1.1360000000000001, "grad_norm": 3.3450510501861572, "learning_rate": 0.0006229946524064172, "loss": 18.2695, "step": 142 }, { "epoch": 1.152, "grad_norm": 3.2181997299194336, "learning_rate": 0.0006176470588235294, "loss": 18.0315, "step": 144 }, { "epoch": 1.168, "grad_norm": 3.8346364498138428, "learning_rate": 0.0006122994652406418, "loss": 18.4272, "step": 146 }, { "epoch": 1.184, "grad_norm": 3.2085418701171875, "learning_rate": 0.000606951871657754, "loss": 18.1768, "step": 148 }, { "epoch": 1.2, "grad_norm": 3.462108850479126, "learning_rate": 0.0006016042780748662, "loss": 18.1731, "step": 150 }, { "epoch": 1.216, "grad_norm": 3.444965362548828, "learning_rate": 0.0005962566844919787, "loss": 18.3599, "step": 152 }, { "epoch": 1.232, "grad_norm": 3.3701171875, "learning_rate": 0.0005909090909090909, "loss": 18.1495, "step": 154 }, { "epoch": 1.248, "grad_norm": 3.5145843029022217, "learning_rate": 0.0005855614973262032, "loss": 18.0835, "step": 156 }, { "epoch": 1.264, "grad_norm": 3.4785313606262207, "learning_rate": 0.0005802139037433155, "loss": 17.8138, "step": 158 }, { "epoch": 1.28, "grad_norm": 3.9735538959503174, "learning_rate": 0.0005748663101604278, "loss": 18.0071, "step": 160 }, { "epoch": 1.296, "grad_norm": 3.650447368621826, "learning_rate": 0.00056951871657754, "loss": 18.0124, "step": 162 }, { "epoch": 1.312, "grad_norm": 3.6459813117980957, "learning_rate": 0.0005641711229946525, "loss": 18.0059, "step": 164 }, { "epoch": 1.328, "grad_norm": 3.2154831886291504, "learning_rate": 0.0005588235294117647, "loss": 17.9694, "step": 166 }, { "epoch": 1.3439999999999999, "grad_norm": 3.367403507232666, "learning_rate": 0.0005534759358288771, "loss": 17.6557, "step": 168 }, { "epoch": 1.3599999999999999, "grad_norm": 3.9948298931121826, "learning_rate": 0.0005481283422459893, "loss": 18.1942, "step": 170 }, { "epoch": 1.376, "grad_norm": 3.3495073318481445, "learning_rate": 0.0005427807486631015, "loss": 18.2016, "step": 172 }, { "epoch": 1.392, "grad_norm": 3.373162269592285, "learning_rate": 0.000537433155080214, "loss": 18.0422, "step": 174 }, { "epoch": 1.408, "grad_norm": 4.063633441925049, "learning_rate": 0.0005320855614973262, "loss": 18.0809, "step": 176 }, { "epoch": 1.424, "grad_norm": 3.4912514686584473, "learning_rate": 0.0005267379679144385, "loss": 18.0674, "step": 178 }, { "epoch": 1.44, "grad_norm": 3.5900015830993652, "learning_rate": 0.0005213903743315508, "loss": 17.9285, "step": 180 }, { "epoch": 1.456, "grad_norm": 4.066802024841309, "learning_rate": 0.0005160427807486631, "loss": 18.1551, "step": 182 }, { "epoch": 1.472, "grad_norm": 3.9782357215881348, "learning_rate": 0.0005106951871657754, "loss": 18.0509, "step": 184 }, { "epoch": 1.488, "grad_norm": 3.314682960510254, "learning_rate": 0.0005053475935828878, "loss": 17.7608, "step": 186 }, { "epoch": 1.504, "grad_norm": 3.3548595905303955, "learning_rate": 0.0005, "loss": 17.8103, "step": 188 }, { "epoch": 1.52, "grad_norm": 3.3475797176361084, "learning_rate": 0.0004946524064171123, "loss": 17.9465, "step": 190 }, { "epoch": 1.536, "grad_norm": 3.4256432056427, "learning_rate": 0.0004893048128342246, "loss": 17.6619, "step": 192 }, { "epoch": 1.552, "grad_norm": 3.390056848526001, "learning_rate": 0.0004839572192513369, "loss": 17.9681, "step": 194 }, { "epoch": 1.568, "grad_norm": 3.4441208839416504, "learning_rate": 0.00047860962566844924, "loss": 17.9407, "step": 196 }, { "epoch": 1.584, "grad_norm": 3.2374165058135986, "learning_rate": 0.0004732620320855615, "loss": 17.7235, "step": 198 }, { "epoch": 1.6, "grad_norm": 3.5628514289855957, "learning_rate": 0.0004679144385026738, "loss": 18.1743, "step": 200 }, { "epoch": 1.616, "grad_norm": 3.41139554977417, "learning_rate": 0.00046256684491978613, "loss": 17.8456, "step": 202 }, { "epoch": 1.6320000000000001, "grad_norm": 3.423110008239746, "learning_rate": 0.0004572192513368984, "loss": 17.6656, "step": 204 }, { "epoch": 1.6480000000000001, "grad_norm": 3.3344337940216064, "learning_rate": 0.00045187165775401067, "loss": 17.962, "step": 206 }, { "epoch": 1.6640000000000001, "grad_norm": 3.5036981105804443, "learning_rate": 0.000446524064171123, "loss": 18.0875, "step": 208 }, { "epoch": 1.6800000000000002, "grad_norm": 3.4953839778900146, "learning_rate": 0.0004411764705882353, "loss": 17.3435, "step": 210 }, { "epoch": 1.696, "grad_norm": 3.6864068508148193, "learning_rate": 0.0004358288770053476, "loss": 17.9087, "step": 212 }, { "epoch": 1.712, "grad_norm": 3.4755449295043945, "learning_rate": 0.0004304812834224599, "loss": 17.5076, "step": 214 }, { "epoch": 1.728, "grad_norm": 3.8116891384124756, "learning_rate": 0.0004251336898395722, "loss": 17.9272, "step": 216 }, { "epoch": 1.744, "grad_norm": 3.18284010887146, "learning_rate": 0.0004197860962566845, "loss": 17.7148, "step": 218 }, { "epoch": 1.76, "grad_norm": 3.2884979248046875, "learning_rate": 0.0004144385026737968, "loss": 17.8813, "step": 220 }, { "epoch": 1.776, "grad_norm": 3.3735768795013428, "learning_rate": 0.00040909090909090913, "loss": 18.0372, "step": 222 }, { "epoch": 1.792, "grad_norm": 3.2611794471740723, "learning_rate": 0.00040374331550802143, "loss": 17.3771, "step": 224 }, { "epoch": 1.808, "grad_norm": 3.3338570594787598, "learning_rate": 0.00039839572192513367, "loss": 18.4657, "step": 226 }, { "epoch": 1.8239999999999998, "grad_norm": 3.405127763748169, "learning_rate": 0.000393048128342246, "loss": 17.9076, "step": 228 }, { "epoch": 1.8399999999999999, "grad_norm": 3.561793565750122, "learning_rate": 0.0003877005347593583, "loss": 17.8996, "step": 230 }, { "epoch": 1.8559999999999999, "grad_norm": 3.5615479946136475, "learning_rate": 0.00038235294117647055, "loss": 17.6746, "step": 232 }, { "epoch": 1.8719999999999999, "grad_norm": 3.4306275844573975, "learning_rate": 0.0003770053475935829, "loss": 17.7182, "step": 234 }, { "epoch": 1.888, "grad_norm": 3.5057003498077393, "learning_rate": 0.0003716577540106952, "loss": 17.8058, "step": 236 }, { "epoch": 1.904, "grad_norm": 3.3117101192474365, "learning_rate": 0.0003663101604278075, "loss": 17.8643, "step": 238 }, { "epoch": 1.92, "grad_norm": 3.6897945404052734, "learning_rate": 0.0003609625668449198, "loss": 17.8266, "step": 240 }, { "epoch": 1.936, "grad_norm": 3.7577505111694336, "learning_rate": 0.0003556149732620321, "loss": 18.6381, "step": 242 }, { "epoch": 1.952, "grad_norm": 3.2401480674743652, "learning_rate": 0.0003502673796791444, "loss": 17.6933, "step": 244 }, { "epoch": 1.968, "grad_norm": 3.6619515419006348, "learning_rate": 0.0003449197860962567, "loss": 18.0547, "step": 246 }, { "epoch": 1.984, "grad_norm": 3.8387668132781982, "learning_rate": 0.000339572192513369, "loss": 17.7932, "step": 248 }, { "epoch": 2.0, "grad_norm": 3.390653371810913, "learning_rate": 0.0003342245989304813, "loss": 17.2655, "step": 250 }, { "epoch": 2.016, "grad_norm": 3.40058970451355, "learning_rate": 0.00032887700534759356, "loss": 17.703, "step": 252 }, { "epoch": 2.032, "grad_norm": 3.568702220916748, "learning_rate": 0.0003235294117647059, "loss": 17.2042, "step": 254 }, { "epoch": 2.048, "grad_norm": 3.529431104660034, "learning_rate": 0.0003181818181818182, "loss": 17.5732, "step": 256 }, { "epoch": 2.064, "grad_norm": 3.3919003009796143, "learning_rate": 0.00031283422459893044, "loss": 17.6191, "step": 258 }, { "epoch": 2.08, "grad_norm": 3.878042459487915, "learning_rate": 0.0003074866310160428, "loss": 17.4911, "step": 260 }, { "epoch": 2.096, "grad_norm": 3.772318124771118, "learning_rate": 0.0003021390374331551, "loss": 17.7258, "step": 262 }, { "epoch": 2.112, "grad_norm": 3.4453060626983643, "learning_rate": 0.0002967914438502674, "loss": 17.4906, "step": 264 }, { "epoch": 2.128, "grad_norm": 3.4957454204559326, "learning_rate": 0.0002914438502673797, "loss": 17.5716, "step": 266 }, { "epoch": 2.144, "grad_norm": 3.530831813812256, "learning_rate": 0.000286096256684492, "loss": 17.4089, "step": 268 }, { "epoch": 2.16, "grad_norm": 3.7524755001068115, "learning_rate": 0.0002807486631016043, "loss": 17.7712, "step": 270 }, { "epoch": 2.176, "grad_norm": 3.297961711883545, "learning_rate": 0.00027540106951871656, "loss": 17.4408, "step": 272 }, { "epoch": 2.192, "grad_norm": 3.3661088943481445, "learning_rate": 0.0002700534759358289, "loss": 17.6753, "step": 274 }, { "epoch": 2.208, "grad_norm": 3.646210193634033, "learning_rate": 0.0002647058823529412, "loss": 17.7821, "step": 276 }, { "epoch": 2.224, "grad_norm": 3.475140333175659, "learning_rate": 0.00025935828877005345, "loss": 17.6129, "step": 278 }, { "epoch": 2.24, "grad_norm": 3.4734578132629395, "learning_rate": 0.0002540106951871658, "loss": 17.6856, "step": 280 }, { "epoch": 2.2560000000000002, "grad_norm": 3.491572380065918, "learning_rate": 0.0002486631016042781, "loss": 17.6071, "step": 282 }, { "epoch": 2.2720000000000002, "grad_norm": 3.4102542400360107, "learning_rate": 0.0002433155080213904, "loss": 17.352, "step": 284 }, { "epoch": 2.288, "grad_norm": 3.393477439880371, "learning_rate": 0.00023796791443850268, "loss": 17.2612, "step": 286 }, { "epoch": 2.304, "grad_norm": 3.112462282180786, "learning_rate": 0.000232620320855615, "loss": 17.3272, "step": 288 }, { "epoch": 2.32, "grad_norm": 3.3398191928863525, "learning_rate": 0.00022727272727272727, "loss": 17.5815, "step": 290 }, { "epoch": 2.336, "grad_norm": 3.5039889812469482, "learning_rate": 0.00022192513368983957, "loss": 17.7557, "step": 292 }, { "epoch": 2.352, "grad_norm": 3.532892942428589, "learning_rate": 0.0002165775401069519, "loss": 18.0523, "step": 294 }, { "epoch": 2.368, "grad_norm": 3.2969062328338623, "learning_rate": 0.00021122994652406418, "loss": 17.7496, "step": 296 }, { "epoch": 2.384, "grad_norm": 3.262855291366577, "learning_rate": 0.00020588235294117645, "loss": 17.793, "step": 298 }, { "epoch": 2.4, "grad_norm": 3.459914445877075, "learning_rate": 0.00020053475935828877, "loss": 17.9245, "step": 300 }, { "epoch": 2.416, "grad_norm": 3.6749696731567383, "learning_rate": 0.00019518716577540107, "loss": 17.7125, "step": 302 }, { "epoch": 2.432, "grad_norm": 3.266754150390625, "learning_rate": 0.0001898395721925134, "loss": 17.5905, "step": 304 }, { "epoch": 2.448, "grad_norm": 3.1848971843719482, "learning_rate": 0.00018449197860962566, "loss": 17.523, "step": 306 }, { "epoch": 2.464, "grad_norm": 3.2962844371795654, "learning_rate": 0.00017914438502673795, "loss": 17.5297, "step": 308 }, { "epoch": 2.48, "grad_norm": 3.4688000679016113, "learning_rate": 0.00017379679144385028, "loss": 17.6315, "step": 310 }, { "epoch": 2.496, "grad_norm": 3.4146833419799805, "learning_rate": 0.00016844919786096257, "loss": 17.5776, "step": 312 }, { "epoch": 2.512, "grad_norm": 3.3122944831848145, "learning_rate": 0.0001631016042780749, "loss": 17.7264, "step": 314 }, { "epoch": 2.528, "grad_norm": 3.2939462661743164, "learning_rate": 0.00015775401069518716, "loss": 17.48, "step": 316 }, { "epoch": 2.544, "grad_norm": 3.8504631519317627, "learning_rate": 0.00015240641711229946, "loss": 17.3854, "step": 318 }, { "epoch": 2.56, "grad_norm": 4.062356948852539, "learning_rate": 0.00014705882352941178, "loss": 17.6811, "step": 320 }, { "epoch": 2.576, "grad_norm": 3.741989850997925, "learning_rate": 0.00014171122994652407, "loss": 17.4078, "step": 322 }, { "epoch": 2.592, "grad_norm": 3.7287967205047607, "learning_rate": 0.00013636363636363637, "loss": 17.3517, "step": 324 }, { "epoch": 2.608, "grad_norm": 3.6224465370178223, "learning_rate": 0.00013101604278074866, "loss": 17.254, "step": 326 }, { "epoch": 2.624, "grad_norm": 3.5674147605895996, "learning_rate": 0.00012566844919786096, "loss": 17.869, "step": 328 }, { "epoch": 2.64, "grad_norm": 3.722736358642578, "learning_rate": 0.00012032085561497325, "loss": 17.7399, "step": 330 }, { "epoch": 2.656, "grad_norm": 3.6463096141815186, "learning_rate": 0.00011497326203208556, "loss": 17.5016, "step": 332 }, { "epoch": 2.672, "grad_norm": 3.5358524322509766, "learning_rate": 0.00010962566844919786, "loss": 17.0355, "step": 334 }, { "epoch": 2.6879999999999997, "grad_norm": 3.5321309566497803, "learning_rate": 0.00010427807486631017, "loss": 17.5089, "step": 336 }, { "epoch": 2.7039999999999997, "grad_norm": 3.4019291400909424, "learning_rate": 9.893048128342247e-05, "loss": 17.3768, "step": 338 }, { "epoch": 2.7199999999999998, "grad_norm": 3.4486570358276367, "learning_rate": 9.358288770053476e-05, "loss": 17.488, "step": 340 }, { "epoch": 2.7359999999999998, "grad_norm": 3.7740256786346436, "learning_rate": 8.823529411764706e-05, "loss": 17.5768, "step": 342 }, { "epoch": 2.752, "grad_norm": 3.5659339427948, "learning_rate": 8.288770053475936e-05, "loss": 17.6865, "step": 344 }, { "epoch": 2.768, "grad_norm": 3.3678972721099854, "learning_rate": 7.754010695187167e-05, "loss": 17.4687, "step": 346 }, { "epoch": 2.784, "grad_norm": 3.585134506225586, "learning_rate": 7.219251336898395e-05, "loss": 17.536, "step": 348 }, { "epoch": 2.8, "grad_norm": 3.6471846103668213, "learning_rate": 6.684491978609626e-05, "loss": 17.6269, "step": 350 }, { "epoch": 2.816, "grad_norm": 3.533790111541748, "learning_rate": 6.149732620320857e-05, "loss": 17.5771, "step": 352 }, { "epoch": 2.832, "grad_norm": 3.7971367835998535, "learning_rate": 5.614973262032086e-05, "loss": 17.874, "step": 354 }, { "epoch": 2.848, "grad_norm": 3.391874074935913, "learning_rate": 5.080213903743316e-05, "loss": 17.2528, "step": 356 }, { "epoch": 2.864, "grad_norm": 3.069033145904541, "learning_rate": 4.545454545454546e-05, "loss": 17.6175, "step": 358 }, { "epoch": 2.88, "grad_norm": 3.780275821685791, "learning_rate": 4.0106951871657754e-05, "loss": 17.2663, "step": 360 }, { "epoch": 2.896, "grad_norm": 3.3377978801727295, "learning_rate": 3.4759358288770055e-05, "loss": 17.3711, "step": 362 }, { "epoch": 2.912, "grad_norm": 3.356203317642212, "learning_rate": 2.9411764705882354e-05, "loss": 17.6077, "step": 364 }, { "epoch": 2.928, "grad_norm": 3.302241563796997, "learning_rate": 2.4064171122994652e-05, "loss": 17.4777, "step": 366 }, { "epoch": 2.944, "grad_norm": 3.73811411857605, "learning_rate": 1.871657754010695e-05, "loss": 17.3149, "step": 368 }, { "epoch": 2.96, "grad_norm": 3.392902135848999, "learning_rate": 1.336898395721925e-05, "loss": 17.8118, "step": 370 }, { "epoch": 2.976, "grad_norm": 3.8080010414123535, "learning_rate": 8.021390374331552e-06, "loss": 17.1875, "step": 372 }, { "epoch": 2.992, "grad_norm": 3.5202646255493164, "learning_rate": 2.67379679144385e-06, "loss": 17.7556, "step": 374 }, { "epoch": 3.0, "step": 375, "total_flos": 2.6461914289864704e+17, "train_loss": 18.264725362141927, "train_runtime": 1944.3243, "train_samples_per_second": 24.687, "train_steps_per_second": 0.193 }, { "epoch": 3.0, "eval_loss": 2.2290163040161133, "eval_runtime": 83.3238, "eval_samples_per_second": 24.003, "eval_steps_per_second": 3.0, "step": 375 }, { "epoch": 3.0, "eval_loss": 2.226619243621826, "eval_runtime": 83.9815, "eval_samples_per_second": 23.815, "eval_steps_per_second": 2.977, "step": 375 } ], "logging_steps": 2, "max_steps": 375, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 2.6461914289864704e+17, "train_batch_size": 16, "trial_name": null, "trial_params": null }