{
  "best_metric": null,
  "best_model_checkpoint": null,
  "epoch": 3.9665071770334928,
  "eval_steps": 500,
  "global_step": 416,
  "is_hyper_param_search": false,
  "is_local_process_zero": true,
  "is_world_process_zero": true,
  "log_history": [
    {
      "epoch": 0.04784688995215311,
      "grad_norm": 1.6178438569586449,
      "learning_rate": 5.9523809523809525e-06,
      "loss": 0.5431,
      "mean_token_accuracy": 0.8649710834026336,
      "step": 5
    },
    {
      "epoch": 0.09569377990430622,
      "grad_norm": 0.8399210434256071,
      "learning_rate": 1.1904761904761905e-05,
      "loss": 0.4584,
      "mean_token_accuracy": 0.8745080888271332,
      "step": 10
    },
    {
      "epoch": 0.14354066985645933,
      "grad_norm": 0.4157952966846664,
      "learning_rate": 1.785714285714286e-05,
      "loss": 0.4063,
      "mean_token_accuracy": 0.8852763116359711,
      "step": 15
    },
    {
      "epoch": 0.19138755980861244,
      "grad_norm": 0.2766662800285874,
      "learning_rate": 2.380952380952381e-05,
      "loss": 0.3877,
      "mean_token_accuracy": 0.8882795989513397,
      "step": 20
    },
    {
      "epoch": 0.23923444976076555,
      "grad_norm": 0.2679535519393889,
      "learning_rate": 2.9761904761904762e-05,
      "loss": 0.374,
      "mean_token_accuracy": 0.8910047888755799,
      "step": 25
    },
    {
      "epoch": 0.28708133971291866,
      "grad_norm": 0.27461170501370225,
      "learning_rate": 3.571428571428572e-05,
      "loss": 0.3623,
      "mean_token_accuracy": 0.8936996936798096,
      "step": 30
    },
    {
      "epoch": 0.3349282296650718,
      "grad_norm": 0.2621068877552152,
      "learning_rate": 4.166666666666667e-05,
      "loss": 0.3545,
      "mean_token_accuracy": 0.8954194068908692,
      "step": 35
    },
    {
      "epoch": 0.3827751196172249,
      "grad_norm": 0.2232430857126827,
      "learning_rate": 4.761904761904762e-05,
      "loss": 0.351,
      "mean_token_accuracy": 0.8956089854240418,
      "step": 40
    },
    {
      "epoch": 0.430622009569378,
      "grad_norm": 0.2469440843623975,
      "learning_rate": 4.99928562114719e-05,
      "loss": 0.3458,
      "mean_token_accuracy": 0.897181648015976,
      "step": 45
    },
    {
      "epoch": 0.4784688995215311,
      "grad_norm": 0.23865360558587098,
      "learning_rate": 4.9949216152802965e-05,
      "loss": 0.3449,
      "mean_token_accuracy": 0.8968794465065002,
      "step": 50
    },
    {
      "epoch": 0.5263157894736842,
      "grad_norm": 0.22245226377013297,
      "learning_rate": 4.9865981682910456e-05,
      "loss": 0.3402,
      "mean_token_accuracy": 0.8982591807842255,
      "step": 55
    },
    {
      "epoch": 0.5741626794258373,
      "grad_norm": 0.2161337900395713,
      "learning_rate": 4.97432996051307e-05,
      "loss": 0.3388,
      "mean_token_accuracy": 0.8982585489749908,
      "step": 60
    },
    {
      "epoch": 0.6220095693779905,
      "grad_norm": 0.20816980844934332,
      "learning_rate": 4.958138629782647e-05,
      "loss": 0.3343,
      "mean_token_accuracy": 0.8995780885219574,
      "step": 65
    },
    {
      "epoch": 0.6698564593301436,
      "grad_norm": 0.20952600298862226,
      "learning_rate": 4.938052733275354e-05,
      "loss": 0.3297,
      "mean_token_accuracy": 0.9005344986915589,
      "step": 70
    },
    {
      "epoch": 0.7177033492822966,
      "grad_norm": 0.22463823169406383,
      "learning_rate": 4.914107697138843e-05,
      "loss": 0.3254,
      "mean_token_accuracy": 0.901619678735733,
      "step": 75
    },
    {
      "epoch": 0.7655502392344498,
      "grad_norm": 0.20387680313285525,
      "learning_rate": 4.886345754010597e-05,
      "loss": 0.3228,
      "mean_token_accuracy": 0.9023577153682709,
      "step": 80
    },
    {
      "epoch": 0.8133971291866029,
      "grad_norm": 0.1967095271533209,
      "learning_rate": 4.854815868530855e-05,
      "loss": 0.3189,
      "mean_token_accuracy": 0.9035642802715301,
      "step": 85
    },
    {
      "epoch": 0.861244019138756,
      "grad_norm": 0.20177701754772745,
      "learning_rate": 4.819573650982088e-05,
      "loss": 0.3155,
      "mean_token_accuracy": 0.9040892541408538,
      "step": 90
    },
    {
      "epoch": 0.9090909090909091,
      "grad_norm": 0.21489007606420435,
      "learning_rate": 4.780681259207339e-05,
      "loss": 0.3148,
      "mean_token_accuracy": 0.9040708005428314,
      "step": 95
    },
    {
      "epoch": 0.9569377990430622,
      "grad_norm": 0.21379980317514838,
      "learning_rate": 4.738207288980417e-05,
      "loss": 0.3156,
      "mean_token_accuracy": 0.903698742389679,
      "step": 100
    },
    {
      "epoch": 1.0,
      "grad_norm": 0.2158756062319493,
      "learning_rate": 4.692226653021304e-05,
      "loss": 0.3082,
      "mean_token_accuracy": 0.9040403498543633,
      "step": 105
    },
    {
      "epoch": 1.0478468899521531,
      "grad_norm": 0.21457223097848338,
      "learning_rate": 4.6428204488701576e-05,
      "loss": 0.2509,
      "mean_token_accuracy": 0.9200618088245391,
      "step": 110
    },
    {
      "epoch": 1.0956937799043063,
      "grad_norm": 0.20058528076642096,
      "learning_rate": 4.5900758158529505e-05,
      "loss": 0.2506,
      "mean_token_accuracy": 0.9204949855804443,
      "step": 115
    },
    {
      "epoch": 1.1435406698564594,
      "grad_norm": 0.20876791389798524,
      "learning_rate": 4.534085781391011e-05,
      "loss": 0.2496,
      "mean_token_accuracy": 0.9207346796989441,
      "step": 120
    },
    {
      "epoch": 1.1913875598086126,
      "grad_norm": 0.20289299393352286,
      "learning_rate": 4.474949096925538e-05,
      "loss": 0.2506,
      "mean_token_accuracy": 0.9200530827045441,
      "step": 125
    },
    {
      "epoch": 1.2392344497607655,
      "grad_norm": 0.21305573701153363,
      "learning_rate": 4.4127700637464834e-05,
      "loss": 0.2507,
      "mean_token_accuracy": 0.9202218174934387,
      "step": 130
    },
    {
      "epoch": 1.2870813397129186,
      "grad_norm": 0.19168843820402073,
      "learning_rate": 4.347658349032977e-05,
      "loss": 0.2475,
      "mean_token_accuracy": 0.9211139142513275,
      "step": 135
    },
    {
      "epoch": 1.3349282296650717,
      "grad_norm": 0.23386818676449111,
      "learning_rate": 4.279728792429768e-05,
      "loss": 0.2493,
      "mean_token_accuracy": 0.9205289006233215,
      "step": 140
    },
    {
      "epoch": 1.3827751196172249,
      "grad_norm": 0.19829038322936948,
      "learning_rate": 4.209101203500809e-05,
      "loss": 0.2469,
      "mean_token_accuracy": 0.9209350109100342,
      "step": 145
    },
    {
      "epoch": 1.430622009569378,
      "grad_norm": 0.22373994346715842,
      "learning_rate": 4.135900150417243e-05,
      "loss": 0.2416,
      "mean_token_accuracy": 0.9225721061229706,
      "step": 150
    },
    {
      "epoch": 1.4784688995215312,
      "grad_norm": 0.20748037358682295,
      "learning_rate": 4.0602547402524813e-05,
      "loss": 0.2396,
      "mean_token_accuracy": 0.9232222616672516,
      "step": 155
    },
    {
      "epoch": 1.526315789473684,
      "grad_norm": 0.18363411909081742,
      "learning_rate": 3.982298391271858e-05,
      "loss": 0.2474,
      "mean_token_accuracy": 0.9210693001747131,
      "step": 160
    },
    {
      "epoch": 1.5741626794258372,
      "grad_norm": 0.18354584761777637,
      "learning_rate": 3.902168597618509e-05,
      "loss": 0.2426,
      "mean_token_accuracy": 0.9223567545413971,
      "step": 165
    },
    {
      "epoch": 1.6220095693779903,
      "grad_norm": 0.19344690588457217,
      "learning_rate": 3.82000668681049e-05,
      "loss": 0.2419,
      "mean_token_accuracy": 0.9226630091667175,
      "step": 170
    },
    {
      "epoch": 1.6698564593301435,
      "grad_norm": 0.19835339977982622,
      "learning_rate": 3.735957570476844e-05,
      "loss": 0.2364,
      "mean_token_accuracy": 0.9246738970279693,
      "step": 175
    },
    {
      "epoch": 1.7177033492822966,
      "grad_norm": 0.1893700609423051,
      "learning_rate": 3.65016948877226e-05,
      "loss": 0.237,
      "mean_token_accuracy": 0.9241552948951721,
      "step": 180
    },
    {
      "epoch": 1.7655502392344498,
      "grad_norm": 0.20092617650862296,
      "learning_rate": 3.562793748921095e-05,
      "loss": 0.2302,
      "mean_token_accuracy": 0.9259460866451263,
      "step": 185
    },
    {
      "epoch": 1.813397129186603,
      "grad_norm": 0.19482063162086233,
      "learning_rate": 3.473984458351913e-05,
      "loss": 0.2294,
      "mean_token_accuracy": 0.9257330477237702,
      "step": 190
    },
    {
      "epoch": 1.861244019138756,
      "grad_norm": 0.23594862888300963,
      "learning_rate": 3.383898252893217e-05,
      "loss": 0.2252,
      "mean_token_accuracy": 0.9272994875907898,
      "step": 195
    },
    {
      "epoch": 1.9090909090909092,
      "grad_norm": 0.2264429074763011,
      "learning_rate": 3.292694020509744e-05,
      "loss": 0.2285,
      "mean_token_accuracy": 0.9267280995845795,
      "step": 200
    },
    {
      "epoch": 1.9569377990430623,
      "grad_norm": 0.20243011319089546,
      "learning_rate": 3.200532621066612e-05,
      "loss": 0.2317,
      "mean_token_accuracy": 0.925784581899643,
      "step": 205
    },
    {
      "epoch": 2.0,
      "grad_norm": 0.2761279417111308,
      "learning_rate": 3.10757660261555e-05,
      "loss": 0.2187,
      "mean_token_accuracy": 0.9277391168806288,
      "step": 210
    },
    {
      "epoch": 2.047846889952153,
      "grad_norm": 0.21472765995198592,
      "learning_rate": 3.013989914703625e-05,
      "loss": 0.1588,
      "mean_token_accuracy": 0.9470981001853943,
      "step": 215
    },
    {
      "epoch": 2.0956937799043063,
      "grad_norm": 0.19241730654495223,
      "learning_rate": 2.919937619210103e-05,
      "loss": 0.1597,
      "mean_token_accuracy": 0.9465132236480713,
      "step": 220
    },
    {
      "epoch": 2.1435406698564594,
      "grad_norm": 0.2010987341848127,
      "learning_rate": 2.825585599221456e-05,
      "loss": 0.1517,
      "mean_token_accuracy": 0.9487400650978088,
      "step": 225
    },
    {
      "epoch": 2.1913875598086126,
      "grad_norm": 0.18857044810118412,
      "learning_rate": 2.7311002664579755e-05,
      "loss": 0.1522,
      "mean_token_accuracy": 0.9488288640975953,
      "step": 230
    },
    {
      "epoch": 2.2392344497607657,
      "grad_norm": 0.1973021475908052,
      "learning_rate": 2.6366482677680226e-05,
      "loss": 0.1539,
      "mean_token_accuracy": 0.9482394576072692,
      "step": 235
    },
    {
      "epoch": 2.287081339712919,
      "grad_norm": 0.1916451391340417,
      "learning_rate": 2.5423961912075712e-05,
      "loss": 0.1519,
      "mean_token_accuracy": 0.9490710437297821,
      "step": 240
    },
    {
      "epoch": 2.334928229665072,
      "grad_norm": 0.18310116347345545,
      "learning_rate": 2.448510272223445e-05,
      "loss": 0.1593,
      "mean_token_accuracy": 0.9469183087348938,
      "step": 245
    },
    {
      "epoch": 2.382775119617225,
      "grad_norm": 0.19992899486646826,
      "learning_rate": 2.3551561004584644e-05,
      "loss": 0.1527,
      "mean_token_accuracy": 0.9489526867866516,
      "step": 250
    },
    {
      "epoch": 2.430622009569378,
      "grad_norm": 0.20001274217428525,
      "learning_rate": 2.2624983276956214e-05,
      "loss": 0.1545,
      "mean_token_accuracy": 0.9480966806411744,
      "step": 255
    },
    {
      "epoch": 2.478468899521531,
      "grad_norm": 0.19971823143333786,
      "learning_rate": 2.17070037745638e-05,
      "loss": 0.1516,
      "mean_token_accuracy": 0.9491800308227539,
      "step": 260
    },
    {
      "epoch": 2.526315789473684,
      "grad_norm": 0.17534350087544737,
      "learning_rate": 2.079924156765312e-05,
      "loss": 0.15,
      "mean_token_accuracy": 0.9497306644916534,
      "step": 265
    },
    {
      "epoch": 2.574162679425837,
      "grad_norm": 0.17941495258495743,
      "learning_rate": 1.9903297705894207e-05,
      "loss": 0.1552,
      "mean_token_accuracy": 0.9483123421669006,
      "step": 270
    },
    {
      "epoch": 2.6220095693779903,
      "grad_norm": 0.1886616293140272,
      "learning_rate": 1.9020752394558096e-05,
      "loss": 0.1509,
      "mean_token_accuracy": 0.9495630502700806,
      "step": 275
    },
    {
      "epoch": 2.6698564593301435,
      "grad_norm": 0.19242543918662838,
      "learning_rate": 1.815316220745756e-05,
      "loss": 0.1546,
      "mean_token_accuracy": 0.9482314586639404,
      "step": 280
    },
    {
      "epoch": 2.7177033492822966,
      "grad_norm": 0.18201614867985003,
      "learning_rate": 1.73020573415673e-05,
      "loss": 0.1496,
      "mean_token_accuracy": 0.9498195767402648,
      "step": 285
    },
    {
      "epoch": 2.7655502392344498,
      "grad_norm": 0.20205387945258876,
      "learning_rate": 1.646893891816591e-05,
      "loss": 0.1495,
      "mean_token_accuracy": 0.9499354422092438,
      "step": 290
    },
    {
      "epoch": 2.813397129186603,
      "grad_norm": 0.17887628669217684,
      "learning_rate": 1.5655276335259493e-05,
      "loss": 0.1495,
      "mean_token_accuracy": 0.950058388710022,
      "step": 295
    },
    {
      "epoch": 2.861244019138756,
      "grad_norm": 0.18210621293658882,
      "learning_rate": 1.4862504675956803e-05,
      "loss": 0.1477,
      "mean_token_accuracy": 0.9507902562618256,
      "step": 300
    },
    {
      "epoch": 2.909090909090909,
      "grad_norm": 0.16610798667043808,
      "learning_rate": 1.4092022177366448e-05,
      "loss": 0.1472,
      "mean_token_accuracy": 0.9505416512489319,
      "step": 305
    },
    {
      "epoch": 2.9569377990430623,
      "grad_norm": 0.19093889547243018,
      "learning_rate": 1.334518776448086e-05,
      "loss": 0.1449,
      "mean_token_accuracy": 0.9513530313968659,
      "step": 310
    },
    {
      "epoch": 3.0,
      "grad_norm": 0.17832946925769,
      "learning_rate": 1.2623318653396027e-05,
      "loss": 0.1407,
      "mean_token_accuracy": 0.9511178533236185,
      "step": 315
    },
    {
      "epoch": 3.047846889952153,
      "grad_norm": 0.25680975157440067,
      "learning_rate": 1.192768802809487e-05,
      "loss": 0.101,
      "mean_token_accuracy": 0.9665960729122162,
      "step": 320
    },
    {
      "epoch": 3.0956937799043063,
      "grad_norm": 0.18777563649802764,
      "learning_rate": 1.1259522794891156e-05,
      "loss": 0.0961,
      "mean_token_accuracy": 0.9677205324172974,
      "step": 325
    },
    {
      "epoch": 3.1435406698564594,
      "grad_norm": 0.17478369339727004,
      "learning_rate": 1.0620001418495165e-05,
      "loss": 0.0985,
      "mean_token_accuracy": 0.966835618019104,
      "step": 330
    },
    {
      "epoch": 3.1913875598086126,
      "grad_norm": 0.17275966764155462,
      "learning_rate": 1.0010251843517089e-05,
      "loss": 0.0972,
      "mean_token_accuracy": 0.9672303974628449,
      "step": 335
    },
    {
      "epoch": 3.2392344497607657,
      "grad_norm": 0.16993093028595546,
      "learning_rate": 9.431349505074635e-06,
      "loss": 0.0956,
      "mean_token_accuracy": 0.9676419258117676,
      "step": 340
    },
    {
      "epoch": 3.287081339712919,
      "grad_norm": 0.16631908155701744,
      "learning_rate": 8.884315432013085e-06,
      "loss": 0.0947,
      "mean_token_accuracy": 0.9682831168174744,
      "step": 345
    },
    {
      "epoch": 3.334928229665072,
      "grad_norm": 0.16712721419639617,
      "learning_rate": 8.370114446083686e-06,
      "loss": 0.0963,
      "mean_token_accuracy": 0.9672574043273926,
      "step": 350
    },
    {
      "epoch": 3.382775119617225,
      "grad_norm": 0.16752495961320182,
      "learning_rate": 7.88965346025611e-06,
      "loss": 0.0932,
      "mean_token_accuracy": 0.9686044454574585,
      "step": 355
    },
    {
      "epoch": 3.430622009569378,
      "grad_norm": 0.16411359819333338,
      "learning_rate": 7.443779879166704e-06,
      "loss": 0.0936,
      "mean_token_accuracy": 0.968251782655716,
      "step": 360
    },
    {
      "epoch": 3.478468899521531,
      "grad_norm": 0.17214899232280093,
      "learning_rate": 7.033280104523337e-06,
      "loss": 0.0947,
      "mean_token_accuracy": 0.9681008815765381,
      "step": 365
    },
    {
      "epoch": 3.526315789473684,
      "grad_norm": 0.16393252030290434,
      "learning_rate": 6.658878148103265e-06,
      "loss": 0.0954,
      "mean_token_accuracy": 0.9676827132701874,
      "step": 370
    },
    {
      "epoch": 3.574162679425837,
      "grad_norm": 0.15820317829789238,
      "learning_rate": 6.3212343547899925e-06,
      "loss": 0.0941,
      "mean_token_accuracy": 0.9682557284832001,
      "step": 375
    },
    {
      "epoch": 3.6220095693779903,
      "grad_norm": 0.1691401819724291,
      "learning_rate": 6.020944237901609e-06,
      "loss": 0.0957,
      "mean_token_accuracy": 0.9677995264530181,
      "step": 380
    },
    {
      "epoch": 3.6698564593301435,
      "grad_norm": 0.15841220320611066,
      "learning_rate": 5.7585374288645935e-06,
      "loss": 0.095,
      "mean_token_accuracy": 0.9678770661354065,
      "step": 385
    },
    {
      "epoch": 3.7177033492822966,
      "grad_norm": 0.15989013852853978,
      "learning_rate": 5.534476743085694e-06,
      "loss": 0.0933,
      "mean_token_accuracy": 0.9685147047042847,
      "step": 390
    },
    {
      "epoch": 3.7655502392344498,
      "grad_norm": 0.1602782553069919,
      "learning_rate": 5.349157363669362e-06,
      "loss": 0.0926,
      "mean_token_accuracy": 0.9689649045467377,
      "step": 395
    },
    {
      "epoch": 3.813397129186603,
      "grad_norm": 0.16816859348589558,
      "learning_rate": 5.202906144420483e-06,
      "loss": 0.0924,
      "mean_token_accuracy": 0.9688647747039795,
      "step": 400
    },
    {
      "epoch": 3.861244019138756,
      "grad_norm": 0.16695686882956462,
      "learning_rate": 5.095981033361725e-06,
      "loss": 0.0936,
      "mean_token_accuracy": 0.9684687733650208,
      "step": 405
    },
    {
      "epoch": 3.909090909090909,
      "grad_norm": 0.17408482158108302,
      "learning_rate": 5.028570617782212e-06,
      "loss": 0.0924,
      "mean_token_accuracy": 0.9686804115772247,
      "step": 410
    },
    {
      "epoch": 3.9569377990430623,
      "grad_norm": 0.16266285247102608,
      "learning_rate": 5.000793791620026e-06,
      "loss": 0.0928,
      "mean_token_accuracy": 0.9688069880008697,
      "step": 415
    },
    {
      "epoch": 3.9665071770334928,
      "mean_token_accuracy": 0.9688756763935089,
      "step": 416,
      "total_flos": 213866215112704.0,
      "train_loss": 0.21202428278943095,
      "train_runtime": 2397.3093,
      "train_samples_per_second": 2.778,
      "train_steps_per_second": 0.174
    }
  ],
  "logging_steps": 5,
  "max_steps": 416,
  "num_input_tokens_seen": 0,
  "num_train_epochs": 4,
  "save_steps": 500,
  "stateful_callbacks": {
    "TrainerControl": {
      "args": {
        "should_epoch_stop": false,
        "should_evaluate": false,
        "should_log": false,
        "should_save": true,
        "should_training_stop": true
      },
      "attributes": {}
    }
  },
  "total_flos": 213866215112704.0,
  "train_batch_size": 1,
  "trial_name": null,
  "trial_params": null
}