{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 5.0, "eval_steps": 30, "global_step": 400, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.025078369905956112, "grad_norm": 0.43725547194480896, "learning_rate": 8.000000000000001e-06, "loss": 1.9437, "step": 2 }, { "epoch": 0.050156739811912224, "grad_norm": 0.6708689332008362, "learning_rate": 2.4e-05, "loss": 1.849, "step": 4 }, { "epoch": 0.07523510971786834, "grad_norm": 0.3925197422504425, "learning_rate": 4e-05, "loss": 1.8725, "step": 6 }, { "epoch": 0.10031347962382445, "grad_norm": 0.4107086956501007, "learning_rate": 5.6000000000000006e-05, "loss": 1.7837, "step": 8 }, { "epoch": 0.12539184952978055, "grad_norm": 0.5341728329658508, "learning_rate": 7.2e-05, "loss": 1.8033, "step": 10 }, { "epoch": 0.15047021943573669, "grad_norm": 0.43938425183296204, "learning_rate": 8.800000000000001e-05, "loss": 1.5958, "step": 12 }, { "epoch": 0.1755485893416928, "grad_norm": 0.3650668263435364, "learning_rate": 0.00010400000000000001, "loss": 1.5566, "step": 14 }, { "epoch": 0.2006269592476489, "grad_norm": 0.3861461877822876, "learning_rate": 0.00012, "loss": 1.3631, "step": 16 }, { "epoch": 0.22570532915360503, "grad_norm": 0.3685659170150757, "learning_rate": 0.00013600000000000003, "loss": 1.3227, "step": 18 }, { "epoch": 0.2507836990595611, "grad_norm": 0.4720342457294464, "learning_rate": 0.000152, "loss": 1.1027, "step": 20 }, { "epoch": 0.27586206896551724, "grad_norm": 0.40199315547943115, "learning_rate": 0.000168, "loss": 1.018, "step": 22 }, { "epoch": 0.30094043887147337, "grad_norm": 0.22768579423427582, "learning_rate": 0.00018400000000000003, "loss": 0.9077, "step": 24 }, { "epoch": 0.32601880877742945, "grad_norm": 0.2388496696949005, "learning_rate": 0.0002, "loss": 0.7587, "step": 26 }, { "epoch": 0.3510971786833856, "grad_norm": 0.19000130891799927, "learning_rate": 0.0001998993710691824, "loss": 0.7766, "step": 28 }, { "epoch": 0.3761755485893417, "grad_norm": 0.1965627372264862, "learning_rate": 0.0001997987421383648, "loss": 0.7575, "step": 30 }, { "epoch": 0.4012539184952978, "grad_norm": 0.18852603435516357, "learning_rate": 0.00019969811320754718, "loss": 0.8292, "step": 32 }, { "epoch": 0.4263322884012539, "grad_norm": 0.16236507892608643, "learning_rate": 0.00019959748427672956, "loss": 0.7272, "step": 34 }, { "epoch": 0.45141065830721006, "grad_norm": 0.14128465950489044, "learning_rate": 0.00019949685534591195, "loss": 0.751, "step": 36 }, { "epoch": 0.47648902821316613, "grad_norm": 0.14623811841011047, "learning_rate": 0.00019939622641509434, "loss": 0.6661, "step": 38 }, { "epoch": 0.5015673981191222, "grad_norm": 0.14945009350776672, "learning_rate": 0.00019929559748427673, "loss": 0.7435, "step": 40 }, { "epoch": 0.5266457680250783, "grad_norm": 0.1485632061958313, "learning_rate": 0.00019919496855345915, "loss": 0.6466, "step": 42 }, { "epoch": 0.5517241379310345, "grad_norm": 0.14336936175823212, "learning_rate": 0.00019909433962264153, "loss": 0.7, "step": 44 }, { "epoch": 0.5768025078369906, "grad_norm": 0.1316949725151062, "learning_rate": 0.0001989937106918239, "loss": 0.6194, "step": 46 }, { "epoch": 0.6018808777429467, "grad_norm": 0.14485935866832733, "learning_rate": 0.00019889308176100629, "loss": 0.8044, "step": 48 }, { "epoch": 0.6269592476489029, "grad_norm": 0.12860289216041565, "learning_rate": 0.00019879245283018867, "loss": 0.5773, "step": 50 }, { "epoch": 0.6520376175548589, "grad_norm": 0.16889798641204834, "learning_rate": 0.0001986918238993711, "loss": 0.7281, "step": 52 }, { "epoch": 0.677115987460815, "grad_norm": 0.14104238152503967, "learning_rate": 0.00019859119496855348, "loss": 0.6654, "step": 54 }, { "epoch": 0.7021943573667712, "grad_norm": 0.1515515297651291, "learning_rate": 0.00019849056603773587, "loss": 0.6833, "step": 56 }, { "epoch": 0.7272727272727273, "grad_norm": 0.16043280065059662, "learning_rate": 0.00019838993710691826, "loss": 0.6599, "step": 58 }, { "epoch": 0.7523510971786834, "grad_norm": 0.15895870327949524, "learning_rate": 0.00019828930817610062, "loss": 0.686, "step": 60 }, { "epoch": 0.7774294670846394, "grad_norm": 0.3486965298652649, "learning_rate": 0.00019818867924528303, "loss": 0.6835, "step": 62 }, { "epoch": 0.8025078369905956, "grad_norm": 0.1274975836277008, "learning_rate": 0.00019808805031446542, "loss": 0.5425, "step": 64 }, { "epoch": 0.8275862068965517, "grad_norm": 0.1669531911611557, "learning_rate": 0.0001979874213836478, "loss": 0.766, "step": 66 }, { "epoch": 0.8526645768025078, "grad_norm": 0.14856377243995667, "learning_rate": 0.0001978867924528302, "loss": 0.5993, "step": 68 }, { "epoch": 0.877742946708464, "grad_norm": 0.13947483897209167, "learning_rate": 0.0001977861635220126, "loss": 0.7071, "step": 70 }, { "epoch": 0.9028213166144201, "grad_norm": 0.15276968479156494, "learning_rate": 0.00019768553459119498, "loss": 0.687, "step": 72 }, { "epoch": 0.9278996865203761, "grad_norm": 0.14595958590507507, "learning_rate": 0.00019758490566037737, "loss": 0.6495, "step": 74 }, { "epoch": 0.9529780564263323, "grad_norm": 0.17588546872138977, "learning_rate": 0.00019748427672955975, "loss": 0.6584, "step": 76 }, { "epoch": 0.9780564263322884, "grad_norm": 0.15688568353652954, "learning_rate": 0.00019738364779874214, "loss": 0.6769, "step": 78 }, { "epoch": 1.0, "grad_norm": 0.1879902333021164, "learning_rate": 0.00019728301886792453, "loss": 0.5924, "step": 80 }, { "epoch": 1.025078369905956, "grad_norm": 0.15292422473430634, "learning_rate": 0.00019718238993710695, "loss": 0.5727, "step": 82 }, { "epoch": 1.0501567398119123, "grad_norm": 0.14542542397975922, "learning_rate": 0.00019708176100628934, "loss": 0.5528, "step": 84 }, { "epoch": 1.0752351097178683, "grad_norm": 0.15912258625030518, "learning_rate": 0.0001969811320754717, "loss": 0.5404, "step": 86 }, { "epoch": 1.1003134796238245, "grad_norm": 0.16443438827991486, "learning_rate": 0.00019688050314465409, "loss": 0.544, "step": 88 }, { "epoch": 1.1253918495297806, "grad_norm": 0.18315915763378143, "learning_rate": 0.00019677987421383647, "loss": 0.5768, "step": 90 }, { "epoch": 1.1504702194357366, "grad_norm": 0.16878078877925873, "learning_rate": 0.0001966792452830189, "loss": 0.6918, "step": 92 }, { "epoch": 1.1755485893416928, "grad_norm": 0.1652018129825592, "learning_rate": 0.00019657861635220128, "loss": 0.5903, "step": 94 }, { "epoch": 1.2006269592476488, "grad_norm": 0.181439608335495, "learning_rate": 0.00019647798742138367, "loss": 0.5917, "step": 96 }, { "epoch": 1.225705329153605, "grad_norm": 0.15887363255023956, "learning_rate": 0.00019637735849056606, "loss": 0.5631, "step": 98 }, { "epoch": 1.250783699059561, "grad_norm": 0.16309796273708344, "learning_rate": 0.00019627672955974842, "loss": 0.6206, "step": 100 }, { "epoch": 1.2758620689655173, "grad_norm": 0.19174307584762573, "learning_rate": 0.00019617610062893083, "loss": 0.5375, "step": 102 }, { "epoch": 1.3009404388714734, "grad_norm": 0.2240614891052246, "learning_rate": 0.00019607547169811322, "loss": 0.6407, "step": 104 }, { "epoch": 1.3260188087774294, "grad_norm": 0.1673874855041504, "learning_rate": 0.0001959748427672956, "loss": 0.631, "step": 106 }, { "epoch": 1.3510971786833856, "grad_norm": 0.16143380105495453, "learning_rate": 0.000195874213836478, "loss": 0.4985, "step": 108 }, { "epoch": 1.3761755485893417, "grad_norm": 0.18511593341827393, "learning_rate": 0.0001957735849056604, "loss": 0.5844, "step": 110 }, { "epoch": 1.4012539184952977, "grad_norm": 0.15226313471794128, "learning_rate": 0.00019567295597484278, "loss": 0.5237, "step": 112 }, { "epoch": 1.426332288401254, "grad_norm": 0.16536672413349152, "learning_rate": 0.00019557232704402517, "loss": 0.4945, "step": 114 }, { "epoch": 1.4514106583072102, "grad_norm": 0.17802752554416656, "learning_rate": 0.00019547169811320755, "loss": 0.632, "step": 116 }, { "epoch": 1.4764890282131662, "grad_norm": 0.18615730106830597, "learning_rate": 0.00019537106918238994, "loss": 0.6778, "step": 118 }, { "epoch": 1.5015673981191222, "grad_norm": 0.16418549418449402, "learning_rate": 0.00019527044025157233, "loss": 0.6141, "step": 120 }, { "epoch": 1.5266457680250785, "grad_norm": 0.16458265483379364, "learning_rate": 0.00019516981132075475, "loss": 0.5316, "step": 122 }, { "epoch": 1.5517241379310345, "grad_norm": 0.15842436254024506, "learning_rate": 0.0001950691823899371, "loss": 0.4761, "step": 124 }, { "epoch": 1.5768025078369905, "grad_norm": 0.15368856489658356, "learning_rate": 0.0001949685534591195, "loss": 0.471, "step": 126 }, { "epoch": 1.6018808777429467, "grad_norm": 0.16104081273078918, "learning_rate": 0.0001948679245283019, "loss": 0.572, "step": 128 }, { "epoch": 1.626959247648903, "grad_norm": 0.1696012020111084, "learning_rate": 0.00019476729559748428, "loss": 0.5293, "step": 130 }, { "epoch": 1.6520376175548588, "grad_norm": 0.16601622104644775, "learning_rate": 0.0001946666666666667, "loss": 0.6569, "step": 132 }, { "epoch": 1.677115987460815, "grad_norm": 0.15106241405010223, "learning_rate": 0.00019456603773584908, "loss": 0.5687, "step": 134 }, { "epoch": 1.7021943573667713, "grad_norm": 0.18189087510108948, "learning_rate": 0.00019446540880503147, "loss": 0.4965, "step": 136 }, { "epoch": 1.7272727272727273, "grad_norm": 0.15463034808635712, "learning_rate": 0.00019436477987421383, "loss": 0.6398, "step": 138 }, { "epoch": 1.7523510971786833, "grad_norm": 0.17576032876968384, "learning_rate": 0.00019426415094339622, "loss": 0.6122, "step": 140 }, { "epoch": 1.7774294670846396, "grad_norm": 0.14750592410564423, "learning_rate": 0.00019416352201257863, "loss": 0.5475, "step": 142 }, { "epoch": 1.8025078369905956, "grad_norm": 0.15765072405338287, "learning_rate": 0.00019406289308176102, "loss": 0.443, "step": 144 }, { "epoch": 1.8275862068965516, "grad_norm": 0.16872242093086243, "learning_rate": 0.0001939622641509434, "loss": 0.5061, "step": 146 }, { "epoch": 1.8526645768025078, "grad_norm": 0.16207149624824524, "learning_rate": 0.0001938616352201258, "loss": 0.572, "step": 148 }, { "epoch": 1.877742946708464, "grad_norm": 0.16720207035541534, "learning_rate": 0.0001937610062893082, "loss": 0.5329, "step": 150 }, { "epoch": 1.90282131661442, "grad_norm": 0.1653318852186203, "learning_rate": 0.00019366037735849058, "loss": 0.5361, "step": 152 }, { "epoch": 1.9278996865203761, "grad_norm": 0.1918332576751709, "learning_rate": 0.00019355974842767297, "loss": 0.6219, "step": 154 }, { "epoch": 1.9529780564263324, "grad_norm": 0.1535947322845459, "learning_rate": 0.00019345911949685536, "loss": 0.586, "step": 156 }, { "epoch": 1.9780564263322884, "grad_norm": 0.16222791373729706, "learning_rate": 0.00019335849056603774, "loss": 0.5228, "step": 158 }, { "epoch": 2.0, "grad_norm": 0.24010032415390015, "learning_rate": 0.00019325786163522013, "loss": 0.5218, "step": 160 }, { "epoch": 2.0250783699059562, "grad_norm": 0.15844006836414337, "learning_rate": 0.00019315723270440255, "loss": 0.5111, "step": 162 }, { "epoch": 2.050156739811912, "grad_norm": 0.1755230873823166, "learning_rate": 0.0001930566037735849, "loss": 0.6063, "step": 164 }, { "epoch": 2.0752351097178683, "grad_norm": 0.2025759369134903, "learning_rate": 0.0001929559748427673, "loss": 0.5672, "step": 166 }, { "epoch": 2.1003134796238245, "grad_norm": 0.2058378905057907, "learning_rate": 0.0001928553459119497, "loss": 0.4629, "step": 168 }, { "epoch": 2.1253918495297803, "grad_norm": 0.1765381544828415, "learning_rate": 0.00019275471698113208, "loss": 0.5236, "step": 170 }, { "epoch": 2.1504702194357366, "grad_norm": 0.20618358254432678, "learning_rate": 0.0001926540880503145, "loss": 0.5029, "step": 172 }, { "epoch": 2.175548589341693, "grad_norm": 0.1737968772649765, "learning_rate": 0.00019255345911949688, "loss": 0.3964, "step": 174 }, { "epoch": 2.200626959247649, "grad_norm": 0.20385882258415222, "learning_rate": 0.00019245283018867927, "loss": 0.5188, "step": 176 }, { "epoch": 2.225705329153605, "grad_norm": 0.2051456868648529, "learning_rate": 0.00019235220125786163, "loss": 0.4548, "step": 178 }, { "epoch": 2.250783699059561, "grad_norm": 0.18826241791248322, "learning_rate": 0.00019225157232704402, "loss": 0.4515, "step": 180 }, { "epoch": 2.2758620689655173, "grad_norm": 0.18653476238250732, "learning_rate": 0.00019215094339622644, "loss": 0.5373, "step": 182 }, { "epoch": 2.300940438871473, "grad_norm": 0.179554283618927, "learning_rate": 0.00019205031446540882, "loss": 0.49, "step": 184 }, { "epoch": 2.3260188087774294, "grad_norm": 0.18949083983898163, "learning_rate": 0.0001919496855345912, "loss": 0.4795, "step": 186 }, { "epoch": 2.3510971786833856, "grad_norm": 0.21681569516658783, "learning_rate": 0.0001918490566037736, "loss": 0.4826, "step": 188 }, { "epoch": 2.376175548589342, "grad_norm": 0.20997639000415802, "learning_rate": 0.000191748427672956, "loss": 0.3699, "step": 190 }, { "epoch": 2.4012539184952977, "grad_norm": 0.3043127954006195, "learning_rate": 0.00019164779874213838, "loss": 0.5264, "step": 192 }, { "epoch": 2.426332288401254, "grad_norm": 0.19533301889896393, "learning_rate": 0.00019154716981132077, "loss": 0.4243, "step": 194 }, { "epoch": 2.45141065830721, "grad_norm": 0.20891591906547546, "learning_rate": 0.00019144654088050316, "loss": 0.4748, "step": 196 }, { "epoch": 2.476489028213166, "grad_norm": 0.1940625011920929, "learning_rate": 0.00019134591194968554, "loss": 0.456, "step": 198 }, { "epoch": 2.501567398119122, "grad_norm": 0.2169208973646164, "learning_rate": 0.00019124528301886793, "loss": 0.577, "step": 200 }, { "epoch": 2.5266457680250785, "grad_norm": 0.21462920308113098, "learning_rate": 0.00019114465408805032, "loss": 0.3583, "step": 202 }, { "epoch": 2.5517241379310347, "grad_norm": 0.22243842482566833, "learning_rate": 0.0001910440251572327, "loss": 0.5473, "step": 204 }, { "epoch": 2.5768025078369905, "grad_norm": 0.20357415080070496, "learning_rate": 0.0001909433962264151, "loss": 0.5596, "step": 206 }, { "epoch": 2.6018808777429467, "grad_norm": 0.21374137699604034, "learning_rate": 0.0001908427672955975, "loss": 0.6041, "step": 208 }, { "epoch": 2.626959247648903, "grad_norm": 0.22612103819847107, "learning_rate": 0.00019074213836477988, "loss": 0.4825, "step": 210 }, { "epoch": 2.652037617554859, "grad_norm": 0.182185098528862, "learning_rate": 0.0001906415094339623, "loss": 0.5961, "step": 212 }, { "epoch": 2.677115987460815, "grad_norm": 0.21316243708133698, "learning_rate": 0.00019054088050314468, "loss": 0.4892, "step": 214 }, { "epoch": 2.7021943573667713, "grad_norm": 0.20594292879104614, "learning_rate": 0.00019044025157232704, "loss": 0.5294, "step": 216 }, { "epoch": 2.7272727272727275, "grad_norm": 0.18579436838626862, "learning_rate": 0.00019033962264150943, "loss": 0.4867, "step": 218 }, { "epoch": 2.7523510971786833, "grad_norm": 0.20978513360023499, "learning_rate": 0.00019023899371069182, "loss": 0.5459, "step": 220 }, { "epoch": 2.7774294670846396, "grad_norm": 0.20766879618167877, "learning_rate": 0.00019013836477987424, "loss": 0.4467, "step": 222 }, { "epoch": 2.8025078369905954, "grad_norm": 0.2247876673936844, "learning_rate": 0.00019003773584905662, "loss": 0.4955, "step": 224 }, { "epoch": 2.8275862068965516, "grad_norm": 0.20031589269638062, "learning_rate": 0.00018993710691823901, "loss": 0.4274, "step": 226 }, { "epoch": 2.852664576802508, "grad_norm": 0.22423385083675385, "learning_rate": 0.0001898364779874214, "loss": 0.4741, "step": 228 }, { "epoch": 2.877742946708464, "grad_norm": 0.1920011192560196, "learning_rate": 0.00018973584905660376, "loss": 0.4802, "step": 230 }, { "epoch": 2.9028213166144203, "grad_norm": 0.1996566653251648, "learning_rate": 0.00018963522012578615, "loss": 0.5204, "step": 232 }, { "epoch": 2.927899686520376, "grad_norm": 0.18659324944019318, "learning_rate": 0.00018953459119496857, "loss": 0.563, "step": 234 }, { "epoch": 2.9529780564263324, "grad_norm": 0.20645543932914734, "learning_rate": 0.00018943396226415096, "loss": 0.5968, "step": 236 }, { "epoch": 2.978056426332288, "grad_norm": 0.20103755593299866, "learning_rate": 0.00018933333333333335, "loss": 0.4683, "step": 238 }, { "epoch": 3.0, "grad_norm": 0.26790672540664673, "learning_rate": 0.00018923270440251573, "loss": 0.4214, "step": 240 }, { "epoch": 3.0250783699059562, "grad_norm": 0.24234965443611145, "learning_rate": 0.00018913207547169812, "loss": 0.4778, "step": 242 }, { "epoch": 3.050156739811912, "grad_norm": 0.2758055031299591, "learning_rate": 0.0001890314465408805, "loss": 0.4029, "step": 244 }, { "epoch": 3.0752351097178683, "grad_norm": 0.2382444590330124, "learning_rate": 0.0001889308176100629, "loss": 0.3801, "step": 246 }, { "epoch": 3.1003134796238245, "grad_norm": 0.24490897357463837, "learning_rate": 0.0001888301886792453, "loss": 0.3975, "step": 248 }, { "epoch": 3.1253918495297803, "grad_norm": 0.26239147782325745, "learning_rate": 0.00018872955974842768, "loss": 0.4038, "step": 250 }, { "epoch": 3.1504702194357366, "grad_norm": 0.24605032801628113, "learning_rate": 0.00018862893081761007, "loss": 0.4364, "step": 252 }, { "epoch": 3.175548589341693, "grad_norm": 0.28764280676841736, "learning_rate": 0.00018852830188679248, "loss": 0.3577, "step": 254 }, { "epoch": 3.200626959247649, "grad_norm": 0.23804618418216705, "learning_rate": 0.00018842767295597484, "loss": 0.4155, "step": 256 }, { "epoch": 3.225705329153605, "grad_norm": 0.25497862696647644, "learning_rate": 0.00018832704402515723, "loss": 0.4654, "step": 258 }, { "epoch": 3.250783699059561, "grad_norm": 0.23537839949131012, "learning_rate": 0.00018822641509433962, "loss": 0.4144, "step": 260 }, { "epoch": 3.2758620689655173, "grad_norm": 0.268036812543869, "learning_rate": 0.000188125786163522, "loss": 0.4546, "step": 262 }, { "epoch": 3.300940438871473, "grad_norm": 0.25395911931991577, "learning_rate": 0.00018802515723270443, "loss": 0.4604, "step": 264 }, { "epoch": 3.3260188087774294, "grad_norm": 0.3395281732082367, "learning_rate": 0.00018792452830188681, "loss": 0.5816, "step": 266 }, { "epoch": 3.3510971786833856, "grad_norm": 0.258900910615921, "learning_rate": 0.0001878238993710692, "loss": 0.4415, "step": 268 }, { "epoch": 3.376175548589342, "grad_norm": 0.24031828343868256, "learning_rate": 0.00018772327044025156, "loss": 0.469, "step": 270 }, { "epoch": 3.4012539184952977, "grad_norm": 0.26624906063079834, "learning_rate": 0.00018762264150943395, "loss": 0.4304, "step": 272 }, { "epoch": 3.426332288401254, "grad_norm": 0.2869020998477936, "learning_rate": 0.00018752201257861637, "loss": 0.4623, "step": 274 }, { "epoch": 3.45141065830721, "grad_norm": 0.2383798062801361, "learning_rate": 0.00018742138364779876, "loss": 0.3973, "step": 276 }, { "epoch": 3.476489028213166, "grad_norm": 0.25947991013526917, "learning_rate": 0.00018732075471698115, "loss": 0.4468, "step": 278 }, { "epoch": 3.501567398119122, "grad_norm": 0.21950559318065643, "learning_rate": 0.00018722012578616354, "loss": 0.3432, "step": 280 }, { "epoch": 3.5266457680250785, "grad_norm": 0.26003995537757874, "learning_rate": 0.00018711949685534592, "loss": 0.4664, "step": 282 }, { "epoch": 3.5517241379310347, "grad_norm": 0.2847505807876587, "learning_rate": 0.0001870188679245283, "loss": 0.4583, "step": 284 }, { "epoch": 3.5768025078369905, "grad_norm": 0.2824760973453522, "learning_rate": 0.0001869182389937107, "loss": 0.4735, "step": 286 }, { "epoch": 3.6018808777429467, "grad_norm": 0.268838107585907, "learning_rate": 0.0001868176100628931, "loss": 0.4071, "step": 288 }, { "epoch": 3.626959247648903, "grad_norm": 0.24519529938697815, "learning_rate": 0.00018671698113207548, "loss": 0.4178, "step": 290 }, { "epoch": 3.652037617554859, "grad_norm": 0.24740180373191833, "learning_rate": 0.00018661635220125787, "loss": 0.4716, "step": 292 }, { "epoch": 3.677115987460815, "grad_norm": 0.22623687982559204, "learning_rate": 0.00018651572327044026, "loss": 0.3645, "step": 294 }, { "epoch": 3.7021943573667713, "grad_norm": 0.2554280459880829, "learning_rate": 0.00018641509433962264, "loss": 0.4044, "step": 296 }, { "epoch": 3.7272727272727275, "grad_norm": 0.2251761108636856, "learning_rate": 0.00018631446540880503, "loss": 0.3663, "step": 298 }, { "epoch": 3.7523510971786833, "grad_norm": 0.20053140819072723, "learning_rate": 0.00018621383647798742, "loss": 0.4342, "step": 300 }, { "epoch": 3.7774294670846396, "grad_norm": 0.2692326605319977, "learning_rate": 0.0001861132075471698, "loss": 0.4268, "step": 302 }, { "epoch": 3.8025078369905954, "grad_norm": 0.23218081891536713, "learning_rate": 0.00018601257861635223, "loss": 0.4848, "step": 304 }, { "epoch": 3.8275862068965516, "grad_norm": 0.2571001648902893, "learning_rate": 0.00018591194968553462, "loss": 0.5391, "step": 306 }, { "epoch": 3.852664576802508, "grad_norm": 0.20899826288223267, "learning_rate": 0.00018581132075471698, "loss": 0.4183, "step": 308 }, { "epoch": 3.877742946708464, "grad_norm": 0.24893143773078918, "learning_rate": 0.00018571069182389937, "loss": 0.4314, "step": 310 }, { "epoch": 3.9028213166144203, "grad_norm": 0.26598888635635376, "learning_rate": 0.00018561006289308175, "loss": 0.4182, "step": 312 }, { "epoch": 3.927899686520376, "grad_norm": 0.24121470749378204, "learning_rate": 0.00018550943396226417, "loss": 0.4327, "step": 314 }, { "epoch": 3.9529780564263324, "grad_norm": 0.2874317467212677, "learning_rate": 0.00018540880503144656, "loss": 0.4616, "step": 316 }, { "epoch": 3.978056426332288, "grad_norm": 0.22735589742660522, "learning_rate": 0.00018530817610062895, "loss": 0.376, "step": 318 }, { "epoch": 4.0, "grad_norm": 0.3424147069454193, "learning_rate": 0.00018520754716981134, "loss": 0.537, "step": 320 }, { "epoch": 4.025078369905956, "grad_norm": 0.27403339743614197, "learning_rate": 0.0001851069182389937, "loss": 0.3539, "step": 322 }, { "epoch": 4.0501567398119125, "grad_norm": 0.33905503153800964, "learning_rate": 0.0001850062893081761, "loss": 0.3367, "step": 324 }, { "epoch": 4.075235109717869, "grad_norm": 0.47853460907936096, "learning_rate": 0.0001849056603773585, "loss": 0.393, "step": 326 }, { "epoch": 4.100313479623824, "grad_norm": 0.3133102059364319, "learning_rate": 0.0001848050314465409, "loss": 0.3762, "step": 328 }, { "epoch": 4.12539184952978, "grad_norm": 0.22834369540214539, "learning_rate": 0.00018470440251572328, "loss": 0.3404, "step": 330 }, { "epoch": 4.150470219435737, "grad_norm": 0.2537166476249695, "learning_rate": 0.00018460377358490567, "loss": 0.3582, "step": 332 }, { "epoch": 4.175548589341693, "grad_norm": 0.35708144307136536, "learning_rate": 0.00018450314465408806, "loss": 0.369, "step": 334 }, { "epoch": 4.200626959247649, "grad_norm": 0.3224405348300934, "learning_rate": 0.00018440251572327045, "loss": 0.3458, "step": 336 }, { "epoch": 4.225705329153605, "grad_norm": 0.34621739387512207, "learning_rate": 0.00018430188679245283, "loss": 0.4176, "step": 338 }, { "epoch": 4.250783699059561, "grad_norm": 0.24818404018878937, "learning_rate": 0.00018420125786163522, "loss": 0.3205, "step": 340 }, { "epoch": 4.275862068965517, "grad_norm": 0.29599064588546753, "learning_rate": 0.0001841006289308176, "loss": 0.3696, "step": 342 }, { "epoch": 4.300940438871473, "grad_norm": 0.2980504333972931, "learning_rate": 0.00018400000000000003, "loss": 0.3256, "step": 344 }, { "epoch": 4.326018808777429, "grad_norm": 0.35454604029655457, "learning_rate": 0.0001838993710691824, "loss": 0.4957, "step": 346 }, { "epoch": 4.351097178683386, "grad_norm": 0.3497369885444641, "learning_rate": 0.00018379874213836478, "loss": 0.3941, "step": 348 }, { "epoch": 4.376175548589342, "grad_norm": 0.32460692524909973, "learning_rate": 0.00018369811320754717, "loss": 0.3615, "step": 350 }, { "epoch": 4.401253918495298, "grad_norm": 0.29358094930648804, "learning_rate": 0.00018359748427672955, "loss": 0.3749, "step": 352 }, { "epoch": 4.4263322884012535, "grad_norm": 0.2807920575141907, "learning_rate": 0.00018349685534591197, "loss": 0.3575, "step": 354 }, { "epoch": 4.45141065830721, "grad_norm": 0.2809455096721649, "learning_rate": 0.00018339622641509436, "loss": 0.3097, "step": 356 }, { "epoch": 4.476489028213166, "grad_norm": 0.3250884413719177, "learning_rate": 0.00018329559748427675, "loss": 0.4113, "step": 358 }, { "epoch": 4.501567398119122, "grad_norm": 0.29040804505348206, "learning_rate": 0.0001831949685534591, "loss": 0.4481, "step": 360 }, { "epoch": 4.5266457680250785, "grad_norm": 0.3208359479904175, "learning_rate": 0.0001830943396226415, "loss": 0.4438, "step": 362 }, { "epoch": 4.551724137931035, "grad_norm": 0.23080404102802277, "learning_rate": 0.00018299371069182391, "loss": 0.2099, "step": 364 }, { "epoch": 4.576802507836991, "grad_norm": 0.2984071373939514, "learning_rate": 0.0001828930817610063, "loss": 0.3339, "step": 366 }, { "epoch": 4.601880877742946, "grad_norm": 0.3299279808998108, "learning_rate": 0.0001827924528301887, "loss": 0.4208, "step": 368 }, { "epoch": 4.6269592476489025, "grad_norm": 0.3243483006954193, "learning_rate": 0.00018269182389937108, "loss": 0.4165, "step": 370 }, { "epoch": 4.652037617554859, "grad_norm": 0.29541853070259094, "learning_rate": 0.00018259119496855347, "loss": 0.3703, "step": 372 }, { "epoch": 4.677115987460815, "grad_norm": 0.3010431230068207, "learning_rate": 0.00018249056603773586, "loss": 0.4034, "step": 374 }, { "epoch": 4.702194357366771, "grad_norm": 0.2970607578754425, "learning_rate": 0.00018238993710691825, "loss": 0.3151, "step": 376 }, { "epoch": 4.7272727272727275, "grad_norm": 0.2794083058834076, "learning_rate": 0.00018228930817610063, "loss": 0.3605, "step": 378 }, { "epoch": 4.752351097178684, "grad_norm": 0.2949012219905853, "learning_rate": 0.00018218867924528302, "loss": 0.343, "step": 380 }, { "epoch": 4.777429467084639, "grad_norm": 0.28160709142684937, "learning_rate": 0.0001820880503144654, "loss": 0.4515, "step": 382 }, { "epoch": 4.802507836990595, "grad_norm": 0.296051561832428, "learning_rate": 0.00018198742138364783, "loss": 0.3908, "step": 384 }, { "epoch": 4.827586206896552, "grad_norm": 0.26115506887435913, "learning_rate": 0.0001818867924528302, "loss": 0.3312, "step": 386 }, { "epoch": 4.852664576802508, "grad_norm": 0.27632880210876465, "learning_rate": 0.00018178616352201258, "loss": 0.3179, "step": 388 }, { "epoch": 4.877742946708464, "grad_norm": 0.2973230481147766, "learning_rate": 0.00018168553459119497, "loss": 0.382, "step": 390 }, { "epoch": 4.90282131661442, "grad_norm": 0.2833520472049713, "learning_rate": 0.00018158490566037736, "loss": 0.3363, "step": 392 }, { "epoch": 4.927899686520377, "grad_norm": 0.30823326110839844, "learning_rate": 0.00018148427672955977, "loss": 0.3234, "step": 394 }, { "epoch": 4.952978056426332, "grad_norm": 0.2736763060092926, "learning_rate": 0.00018138364779874216, "loss": 0.415, "step": 396 }, { "epoch": 4.978056426332288, "grad_norm": 0.2832898199558258, "learning_rate": 0.00018128301886792455, "loss": 0.3755, "step": 398 }, { "epoch": 5.0, "grad_norm": 0.3281781077384949, "learning_rate": 0.0001811823899371069, "loss": 0.2756, "step": 400 } ], "logging_steps": 2, "max_steps": 4000, "num_input_tokens_seen": 0, "num_train_epochs": 50, "save_steps": 100, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.25570189464994e+18, "train_batch_size": 4, "trial_name": null, "trial_params": null }