| { | |
| "best_global_step": null, | |
| "best_metric": null, | |
| "best_model_checkpoint": null, | |
| "epoch": 0.4537205081669691, | |
| "eval_steps": 500, | |
| "global_step": 500, | |
| "is_hyper_param_search": false, | |
| "is_local_process_zero": true, | |
| "is_world_process_zero": true, | |
| "log_history": [ | |
| { | |
| "epoch": 0.004537205081669692, | |
| "grad_norm": 1.4140625, | |
| "learning_rate": 1.7632653061224493e-05, | |
| "loss": 2.0708, | |
| "step": 5 | |
| }, | |
| { | |
| "epoch": 0.009074410163339383, | |
| "grad_norm": 1.1796875, | |
| "learning_rate": 3.96734693877551e-05, | |
| "loss": 1.9893, | |
| "step": 10 | |
| }, | |
| { | |
| "epoch": 0.013611615245009074, | |
| "grad_norm": 0.85546875, | |
| "learning_rate": 6.171428571428573e-05, | |
| "loss": 1.955, | |
| "step": 15 | |
| }, | |
| { | |
| "epoch": 0.018148820326678767, | |
| "grad_norm": 1.0, | |
| "learning_rate": 8.375510204081634e-05, | |
| "loss": 1.9002, | |
| "step": 20 | |
| }, | |
| { | |
| "epoch": 0.022686025408348458, | |
| "grad_norm": 0.78515625, | |
| "learning_rate": 0.00010579591836734696, | |
| "loss": 1.8666, | |
| "step": 25 | |
| }, | |
| { | |
| "epoch": 0.02722323049001815, | |
| "grad_norm": 0.71875, | |
| "learning_rate": 0.00012783673469387758, | |
| "loss": 1.9462, | |
| "step": 30 | |
| }, | |
| { | |
| "epoch": 0.03176043557168784, | |
| "grad_norm": 0.734375, | |
| "learning_rate": 0.00014987755102040818, | |
| "loss": 1.9157, | |
| "step": 35 | |
| }, | |
| { | |
| "epoch": 0.036297640653357534, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.00015428528732777552, | |
| "loss": 1.9, | |
| "step": 40 | |
| }, | |
| { | |
| "epoch": 0.04083484573502722, | |
| "grad_norm": 0.7265625, | |
| "learning_rate": 0.00015428355282194904, | |
| "loss": 1.9086, | |
| "step": 45 | |
| }, | |
| { | |
| "epoch": 0.045372050816696916, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.00015428048412333237, | |
| "loss": 1.918, | |
| "step": 50 | |
| }, | |
| { | |
| "epoch": 0.0499092558983666, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.00015427608130269275, | |
| "loss": 1.8871, | |
| "step": 55 | |
| }, | |
| { | |
| "epoch": 0.0544464609800363, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.00015427034446156347, | |
| "loss": 1.879, | |
| "step": 60 | |
| }, | |
| { | |
| "epoch": 0.05898366606170599, | |
| "grad_norm": 0.6484375, | |
| "learning_rate": 0.00015426327373224179, | |
| "loss": 1.8881, | |
| "step": 65 | |
| }, | |
| { | |
| "epoch": 0.06352087114337568, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.00015425486927778568, | |
| "loss": 1.8637, | |
| "step": 70 | |
| }, | |
| { | |
| "epoch": 0.06805807622504537, | |
| "grad_norm": 0.66796875, | |
| "learning_rate": 0.0001542451312920101, | |
| "loss": 1.9391, | |
| "step": 75 | |
| }, | |
| { | |
| "epoch": 0.07259528130671507, | |
| "grad_norm": 0.65234375, | |
| "learning_rate": 0.0001542340599994826, | |
| "loss": 1.9262, | |
| "step": 80 | |
| }, | |
| { | |
| "epoch": 0.07713248638838476, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 0.0001542216556555181, | |
| "loss": 1.9177, | |
| "step": 85 | |
| }, | |
| { | |
| "epoch": 0.08166969147005444, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.00015420791854617294, | |
| "loss": 1.9026, | |
| "step": 90 | |
| }, | |
| { | |
| "epoch": 0.08620689655172414, | |
| "grad_norm": 0.69140625, | |
| "learning_rate": 0.0001541928489882384, | |
| "loss": 1.868, | |
| "step": 95 | |
| }, | |
| { | |
| "epoch": 0.09074410163339383, | |
| "grad_norm": 0.62890625, | |
| "learning_rate": 0.0001541764473292333, | |
| "loss": 1.8556, | |
| "step": 100 | |
| }, | |
| { | |
| "epoch": 0.09528130671506352, | |
| "grad_norm": 0.69921875, | |
| "learning_rate": 0.00015415871394739605, | |
| "loss": 1.8941, | |
| "step": 105 | |
| }, | |
| { | |
| "epoch": 0.0998185117967332, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.00015413964925167585, | |
| "loss": 1.8843, | |
| "step": 110 | |
| }, | |
| { | |
| "epoch": 0.10435571687840291, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 0.00015411925368172337, | |
| "loss": 1.8913, | |
| "step": 115 | |
| }, | |
| { | |
| "epoch": 0.1088929219600726, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.0001540975277078805, | |
| "loss": 1.8362, | |
| "step": 120 | |
| }, | |
| { | |
| "epoch": 0.11343012704174228, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 0.0001540744718311696, | |
| "loss": 1.851, | |
| "step": 125 | |
| }, | |
| { | |
| "epoch": 0.11796733212341198, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.00015405008658328184, | |
| "loss": 1.8704, | |
| "step": 130 | |
| }, | |
| { | |
| "epoch": 0.12250453720508167, | |
| "grad_norm": 0.640625, | |
| "learning_rate": 0.000154024372526565, | |
| "loss": 1.8304, | |
| "step": 135 | |
| }, | |
| { | |
| "epoch": 0.12704174228675136, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 0.0001539973302540106, | |
| "loss": 1.8121, | |
| "step": 140 | |
| }, | |
| { | |
| "epoch": 0.13157894736842105, | |
| "grad_norm": 0.66015625, | |
| "learning_rate": 0.00015396896038924, | |
| "loss": 1.861, | |
| "step": 145 | |
| }, | |
| { | |
| "epoch": 0.13611615245009073, | |
| "grad_norm": 0.96484375, | |
| "learning_rate": 0.0001539392635864902, | |
| "loss": 1.858, | |
| "step": 150 | |
| }, | |
| { | |
| "epoch": 0.14065335753176045, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 0.00015390824053059868, | |
| "loss": 1.8305, | |
| "step": 155 | |
| }, | |
| { | |
| "epoch": 0.14519056261343014, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 0.00015387589193698766, | |
| "loss": 1.8998, | |
| "step": 160 | |
| }, | |
| { | |
| "epoch": 0.14972776769509982, | |
| "grad_norm": 0.921875, | |
| "learning_rate": 0.00015384221855164752, | |
| "loss": 1.8725, | |
| "step": 165 | |
| }, | |
| { | |
| "epoch": 0.1542649727767695, | |
| "grad_norm": 0.59375, | |
| "learning_rate": 0.00015380722115111966, | |
| "loss": 1.8505, | |
| "step": 170 | |
| }, | |
| { | |
| "epoch": 0.1588021778584392, | |
| "grad_norm": 0.62109375, | |
| "learning_rate": 0.00015377090054247852, | |
| "loss": 1.8817, | |
| "step": 175 | |
| }, | |
| { | |
| "epoch": 0.16333938294010888, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 0.00015373325756331313, | |
| "loss": 1.8494, | |
| "step": 180 | |
| }, | |
| { | |
| "epoch": 0.16787658802177857, | |
| "grad_norm": 0.61328125, | |
| "learning_rate": 0.00015369429308170762, | |
| "loss": 1.9201, | |
| "step": 185 | |
| }, | |
| { | |
| "epoch": 0.1724137931034483, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 0.00015365400799622123, | |
| "loss": 1.8467, | |
| "step": 190 | |
| }, | |
| { | |
| "epoch": 0.17695099818511797, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.00015361240323586768, | |
| "loss": 1.8867, | |
| "step": 195 | |
| }, | |
| { | |
| "epoch": 0.18148820326678766, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.00015356947976009368, | |
| "loss": 1.8469, | |
| "step": 200 | |
| }, | |
| { | |
| "epoch": 0.18602540834845735, | |
| "grad_norm": 0.703125, | |
| "learning_rate": 0.00015352523855875685, | |
| "loss": 1.8678, | |
| "step": 205 | |
| }, | |
| { | |
| "epoch": 0.19056261343012704, | |
| "grad_norm": 0.6171875, | |
| "learning_rate": 0.00015347968065210273, | |
| "loss": 1.8783, | |
| "step": 210 | |
| }, | |
| { | |
| "epoch": 0.19509981851179672, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 0.00015343280709074155, | |
| "loss": 1.8663, | |
| "step": 215 | |
| }, | |
| { | |
| "epoch": 0.1996370235934664, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 0.0001533846189556237, | |
| "loss": 1.8891, | |
| "step": 220 | |
| }, | |
| { | |
| "epoch": 0.20417422867513613, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 0.000153335117358015, | |
| "loss": 1.87, | |
| "step": 225 | |
| }, | |
| { | |
| "epoch": 0.20871143375680581, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 0.00015328430343947104, | |
| "loss": 1.7773, | |
| "step": 230 | |
| }, | |
| { | |
| "epoch": 0.2132486388384755, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 0.00015323217837181068, | |
| "loss": 1.8276, | |
| "step": 235 | |
| }, | |
| { | |
| "epoch": 0.2177858439201452, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 0.00015317874335708936, | |
| "loss": 1.8301, | |
| "step": 240 | |
| }, | |
| { | |
| "epoch": 0.22232304900181488, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 0.00015312399962757098, | |
| "loss": 1.8491, | |
| "step": 245 | |
| }, | |
| { | |
| "epoch": 0.22686025408348456, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 0.0001530679484456999, | |
| "loss": 1.8976, | |
| "step": 250 | |
| }, | |
| { | |
| "epoch": 0.23139745916515425, | |
| "grad_norm": 0.57421875, | |
| "learning_rate": 0.00015301059110407148, | |
| "loss": 1.7919, | |
| "step": 255 | |
| }, | |
| { | |
| "epoch": 0.23593466424682397, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 0.00015295192892540244, | |
| "loss": 1.8364, | |
| "step": 260 | |
| }, | |
| { | |
| "epoch": 0.24047186932849365, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.00015289196326250036, | |
| "loss": 1.846, | |
| "step": 265 | |
| }, | |
| { | |
| "epoch": 0.24500907441016334, | |
| "grad_norm": 0.58203125, | |
| "learning_rate": 0.00015283069549823237, | |
| "loss": 1.8033, | |
| "step": 270 | |
| }, | |
| { | |
| "epoch": 0.24954627949183303, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 0.00015276812704549347, | |
| "loss": 1.8463, | |
| "step": 275 | |
| }, | |
| { | |
| "epoch": 0.2540834845735027, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 0.00015270425934717368, | |
| "loss": 1.7821, | |
| "step": 280 | |
| }, | |
| { | |
| "epoch": 0.25862068965517243, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 0.00015263909387612496, | |
| "loss": 1.7916, | |
| "step": 285 | |
| }, | |
| { | |
| "epoch": 0.2631578947368421, | |
| "grad_norm": 0.671875, | |
| "learning_rate": 0.0001525726321351272, | |
| "loss": 1.8298, | |
| "step": 290 | |
| }, | |
| { | |
| "epoch": 0.2676950998185118, | |
| "grad_norm": 0.5078125, | |
| "learning_rate": 0.00015250487565685353, | |
| "loss": 1.8417, | |
| "step": 295 | |
| }, | |
| { | |
| "epoch": 0.27223230490018147, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 0.000152435826003835, | |
| "loss": 1.8475, | |
| "step": 300 | |
| }, | |
| { | |
| "epoch": 0.2767695099818512, | |
| "grad_norm": 1.1328125, | |
| "learning_rate": 0.00015236548476842452, | |
| "loss": 1.7997, | |
| "step": 305 | |
| }, | |
| { | |
| "epoch": 0.2813067150635209, | |
| "grad_norm": 0.6640625, | |
| "learning_rate": 0.00015229385357276023, | |
| "loss": 1.8284, | |
| "step": 310 | |
| }, | |
| { | |
| "epoch": 0.28584392014519056, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 0.00015222093406872794, | |
| "loss": 1.8628, | |
| "step": 315 | |
| }, | |
| { | |
| "epoch": 0.29038112522686027, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 0.00015214672793792313, | |
| "loss": 1.8286, | |
| "step": 320 | |
| }, | |
| { | |
| "epoch": 0.29491833030852993, | |
| "grad_norm": 0.53125, | |
| "learning_rate": 0.00015207123689161224, | |
| "loss": 1.8207, | |
| "step": 325 | |
| }, | |
| { | |
| "epoch": 0.29945553539019965, | |
| "grad_norm": 0.74609375, | |
| "learning_rate": 0.00015199446267069304, | |
| "loss": 1.8005, | |
| "step": 330 | |
| }, | |
| { | |
| "epoch": 0.3039927404718693, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 0.0001519164070456546, | |
| "loss": 1.8659, | |
| "step": 335 | |
| }, | |
| { | |
| "epoch": 0.308529945553539, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 0.00015183707181653643, | |
| "loss": 1.8642, | |
| "step": 340 | |
| }, | |
| { | |
| "epoch": 0.31306715063520874, | |
| "grad_norm": 0.6015625, | |
| "learning_rate": 0.00015175645881288695, | |
| "loss": 1.7679, | |
| "step": 345 | |
| }, | |
| { | |
| "epoch": 0.3176043557168784, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.00015167456989372132, | |
| "loss": 1.8267, | |
| "step": 350 | |
| }, | |
| { | |
| "epoch": 0.3221415607985481, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 0.00015159140694747864, | |
| "loss": 1.826, | |
| "step": 355 | |
| }, | |
| { | |
| "epoch": 0.32667876588021777, | |
| "grad_norm": 0.6328125, | |
| "learning_rate": 0.0001515069718919782, | |
| "loss": 1.8233, | |
| "step": 360 | |
| }, | |
| { | |
| "epoch": 0.3312159709618875, | |
| "grad_norm": 0.51171875, | |
| "learning_rate": 0.0001514212666743755, | |
| "loss": 1.8402, | |
| "step": 365 | |
| }, | |
| { | |
| "epoch": 0.33575317604355714, | |
| "grad_norm": 0.52734375, | |
| "learning_rate": 0.00015133429327111715, | |
| "loss": 1.8067, | |
| "step": 370 | |
| }, | |
| { | |
| "epoch": 0.34029038112522686, | |
| "grad_norm": 0.5234375, | |
| "learning_rate": 0.00015124605368789542, | |
| "loss": 1.7867, | |
| "step": 375 | |
| }, | |
| { | |
| "epoch": 0.3448275862068966, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 0.00015115654995960187, | |
| "loss": 1.8151, | |
| "step": 380 | |
| }, | |
| { | |
| "epoch": 0.34936479128856623, | |
| "grad_norm": 0.58984375, | |
| "learning_rate": 0.00015106578415028057, | |
| "loss": 1.7844, | |
| "step": 385 | |
| }, | |
| { | |
| "epoch": 0.35390199637023595, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 0.00015097375835308036, | |
| "loss": 1.818, | |
| "step": 390 | |
| }, | |
| { | |
| "epoch": 0.3584392014519056, | |
| "grad_norm": 0.60546875, | |
| "learning_rate": 0.00015088047469020668, | |
| "loss": 1.8075, | |
| "step": 395 | |
| }, | |
| { | |
| "epoch": 0.3629764065335753, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 0.00015078593531287252, | |
| "loss": 1.8531, | |
| "step": 400 | |
| }, | |
| { | |
| "epoch": 0.367513611615245, | |
| "grad_norm": 0.73828125, | |
| "learning_rate": 0.00015069014240124903, | |
| "loss": 1.7792, | |
| "step": 405 | |
| }, | |
| { | |
| "epoch": 0.3720508166969147, | |
| "grad_norm": 0.546875, | |
| "learning_rate": 0.0001505930981644149, | |
| "loss": 1.8016, | |
| "step": 410 | |
| }, | |
| { | |
| "epoch": 0.3765880217785844, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 0.00015049480484030574, | |
| "loss": 1.7925, | |
| "step": 415 | |
| }, | |
| { | |
| "epoch": 0.3811252268602541, | |
| "grad_norm": 0.609375, | |
| "learning_rate": 0.0001503952646956623, | |
| "loss": 1.7902, | |
| "step": 420 | |
| }, | |
| { | |
| "epoch": 0.3856624319419238, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 0.00015029448002597828, | |
| "loss": 1.785, | |
| "step": 425 | |
| }, | |
| { | |
| "epoch": 0.39019963702359345, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 0.00015019245315544728, | |
| "loss": 1.8234, | |
| "step": 430 | |
| }, | |
| { | |
| "epoch": 0.39473684210526316, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 0.00015008918643690932, | |
| "loss": 1.7869, | |
| "step": 435 | |
| }, | |
| { | |
| "epoch": 0.3992740471869328, | |
| "grad_norm": 0.5703125, | |
| "learning_rate": 0.00014998468225179657, | |
| "loss": 1.8319, | |
| "step": 440 | |
| }, | |
| { | |
| "epoch": 0.40381125226860254, | |
| "grad_norm": 0.54296875, | |
| "learning_rate": 0.00014987894301007825, | |
| "loss": 1.7935, | |
| "step": 445 | |
| }, | |
| { | |
| "epoch": 0.40834845735027225, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 0.0001497719711502054, | |
| "loss": 1.7998, | |
| "step": 450 | |
| }, | |
| { | |
| "epoch": 0.4128856624319419, | |
| "grad_norm": 0.578125, | |
| "learning_rate": 0.0001496637691390543, | |
| "loss": 1.8234, | |
| "step": 455 | |
| }, | |
| { | |
| "epoch": 0.41742286751361163, | |
| "grad_norm": 1.40625, | |
| "learning_rate": 0.00014955433947186982, | |
| "loss": 1.7724, | |
| "step": 460 | |
| }, | |
| { | |
| "epoch": 0.4219600725952813, | |
| "grad_norm": 0.56640625, | |
| "learning_rate": 0.0001494436846722077, | |
| "loss": 1.7939, | |
| "step": 465 | |
| }, | |
| { | |
| "epoch": 0.426497277676951, | |
| "grad_norm": 0.55859375, | |
| "learning_rate": 0.00014933180729187652, | |
| "loss": 1.7921, | |
| "step": 470 | |
| }, | |
| { | |
| "epoch": 0.43103448275862066, | |
| "grad_norm": 0.5390625, | |
| "learning_rate": 0.0001492187099108787, | |
| "loss": 1.8269, | |
| "step": 475 | |
| }, | |
| { | |
| "epoch": 0.4355716878402904, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 0.00014910439513735114, | |
| "loss": 1.8065, | |
| "step": 480 | |
| }, | |
| { | |
| "epoch": 0.4401088929219601, | |
| "grad_norm": 0.59765625, | |
| "learning_rate": 0.00014898886560750502, | |
| "loss": 1.793, | |
| "step": 485 | |
| }, | |
| { | |
| "epoch": 0.44464609800362975, | |
| "grad_norm": 0.5859375, | |
| "learning_rate": 0.00014887212398556488, | |
| "loss": 1.8052, | |
| "step": 490 | |
| }, | |
| { | |
| "epoch": 0.44918330308529947, | |
| "grad_norm": 0.5625, | |
| "learning_rate": 0.00014875417296370742, | |
| "loss": 1.7471, | |
| "step": 495 | |
| }, | |
| { | |
| "epoch": 0.4537205081669691, | |
| "grad_norm": 0.55078125, | |
| "learning_rate": 0.00014863501526199925, | |
| "loss": 1.7734, | |
| "step": 500 | |
| }, | |
| { | |
| "epoch": 0.4537205081669691, | |
| "eval_loss": 1.7299695014953613, | |
| "eval_runtime": 4.0995, | |
| "eval_samples_per_second": 15.612, | |
| "eval_steps_per_second": 15.612, | |
| "step": 500 | |
| } | |
| ], | |
| "logging_steps": 5, | |
| "max_steps": 3306, | |
| "num_input_tokens_seen": 0, | |
| "num_train_epochs": 3, | |
| "save_steps": 500, | |
| "stateful_callbacks": { | |
| "TrainerControl": { | |
| "args": { | |
| "should_epoch_stop": false, | |
| "should_evaluate": false, | |
| "should_log": false, | |
| "should_save": true, | |
| "should_training_stop": false | |
| }, | |
| "attributes": {} | |
| } | |
| }, | |
| "total_flos": 1.097833201532928e+18, | |
| "train_batch_size": 48, | |
| "trial_name": null, | |
| "trial_params": null | |
| } | |