{ "best_metric": null, "best_model_checkpoint": null, "epoch": 3.9665071770334928, "eval_steps": 500, "global_step": 416, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.04784688995215311, "grad_norm": 1.6178438569586449, "learning_rate": 5.9523809523809525e-06, "loss": 0.5431, "mean_token_accuracy": 0.8649710834026336, "step": 5 }, { "epoch": 0.09569377990430622, "grad_norm": 0.8399210434256071, "learning_rate": 1.1904761904761905e-05, "loss": 0.4584, "mean_token_accuracy": 0.8745080888271332, "step": 10 }, { "epoch": 0.14354066985645933, "grad_norm": 0.4157952966846664, "learning_rate": 1.785714285714286e-05, "loss": 0.4063, "mean_token_accuracy": 0.8852763116359711, "step": 15 }, { "epoch": 0.19138755980861244, "grad_norm": 0.2766662800285874, "learning_rate": 2.380952380952381e-05, "loss": 0.3877, "mean_token_accuracy": 0.8882795989513397, "step": 20 }, { "epoch": 0.23923444976076555, "grad_norm": 0.2679535519393889, "learning_rate": 2.9761904761904762e-05, "loss": 0.374, "mean_token_accuracy": 0.8910047888755799, "step": 25 }, { "epoch": 0.28708133971291866, "grad_norm": 0.27461170501370225, "learning_rate": 3.571428571428572e-05, "loss": 0.3623, "mean_token_accuracy": 0.8936996936798096, "step": 30 }, { "epoch": 0.3349282296650718, "grad_norm": 0.2621068877552152, "learning_rate": 4.166666666666667e-05, "loss": 0.3545, "mean_token_accuracy": 0.8954194068908692, "step": 35 }, { "epoch": 0.3827751196172249, "grad_norm": 0.2232430857126827, "learning_rate": 4.761904761904762e-05, "loss": 0.351, "mean_token_accuracy": 0.8956089854240418, "step": 40 }, { "epoch": 0.430622009569378, "grad_norm": 0.2469440843623975, "learning_rate": 4.99928562114719e-05, "loss": 0.3458, "mean_token_accuracy": 0.897181648015976, "step": 45 }, { "epoch": 0.4784688995215311, "grad_norm": 0.23865360558587098, "learning_rate": 4.9949216152802965e-05, "loss": 0.3449, "mean_token_accuracy": 0.8968794465065002, "step": 50 }, { "epoch": 0.5263157894736842, "grad_norm": 0.22245226377013297, "learning_rate": 4.9865981682910456e-05, "loss": 0.3402, "mean_token_accuracy": 0.8982591807842255, "step": 55 }, { "epoch": 0.5741626794258373, "grad_norm": 0.2161337900395713, "learning_rate": 4.97432996051307e-05, "loss": 0.3388, "mean_token_accuracy": 0.8982585489749908, "step": 60 }, { "epoch": 0.6220095693779905, "grad_norm": 0.20816980844934332, "learning_rate": 4.958138629782647e-05, "loss": 0.3343, "mean_token_accuracy": 0.8995780885219574, "step": 65 }, { "epoch": 0.6698564593301436, "grad_norm": 0.20952600298862226, "learning_rate": 4.938052733275354e-05, "loss": 0.3297, "mean_token_accuracy": 0.9005344986915589, "step": 70 }, { "epoch": 0.7177033492822966, "grad_norm": 0.22463823169406383, "learning_rate": 4.914107697138843e-05, "loss": 0.3254, "mean_token_accuracy": 0.901619678735733, "step": 75 }, { "epoch": 0.7655502392344498, "grad_norm": 0.20387680313285525, "learning_rate": 4.886345754010597e-05, "loss": 0.3228, "mean_token_accuracy": 0.9023577153682709, "step": 80 }, { "epoch": 0.8133971291866029, "grad_norm": 0.1967095271533209, "learning_rate": 4.854815868530855e-05, "loss": 0.3189, "mean_token_accuracy": 0.9035642802715301, "step": 85 }, { "epoch": 0.861244019138756, "grad_norm": 0.20177701754772745, "learning_rate": 4.819573650982088e-05, "loss": 0.3155, "mean_token_accuracy": 0.9040892541408538, "step": 90 }, { "epoch": 0.9090909090909091, "grad_norm": 0.21489007606420435, "learning_rate": 4.780681259207339e-05, "loss": 0.3148, "mean_token_accuracy": 0.9040708005428314, "step": 95 }, { "epoch": 0.9569377990430622, "grad_norm": 0.21379980317514838, "learning_rate": 4.738207288980417e-05, "loss": 0.3156, "mean_token_accuracy": 0.903698742389679, "step": 100 }, { "epoch": 1.0, "grad_norm": 0.2158756062319493, "learning_rate": 4.692226653021304e-05, "loss": 0.3082, "mean_token_accuracy": 0.9040403498543633, "step": 105 }, { "epoch": 1.0478468899521531, "grad_norm": 0.21457223097848338, "learning_rate": 4.6428204488701576e-05, "loss": 0.2509, "mean_token_accuracy": 0.9200618088245391, "step": 110 }, { "epoch": 1.0956937799043063, "grad_norm": 0.20058528076642096, "learning_rate": 4.5900758158529505e-05, "loss": 0.2506, "mean_token_accuracy": 0.9204949855804443, "step": 115 }, { "epoch": 1.1435406698564594, "grad_norm": 0.20876791389798524, "learning_rate": 4.534085781391011e-05, "loss": 0.2496, "mean_token_accuracy": 0.9207346796989441, "step": 120 }, { "epoch": 1.1913875598086126, "grad_norm": 0.20289299393352286, "learning_rate": 4.474949096925538e-05, "loss": 0.2506, "mean_token_accuracy": 0.9200530827045441, "step": 125 }, { "epoch": 1.2392344497607655, "grad_norm": 0.21305573701153363, "learning_rate": 4.4127700637464834e-05, "loss": 0.2507, "mean_token_accuracy": 0.9202218174934387, "step": 130 }, { "epoch": 1.2870813397129186, "grad_norm": 0.19168843820402073, "learning_rate": 4.347658349032977e-05, "loss": 0.2475, "mean_token_accuracy": 0.9211139142513275, "step": 135 }, { "epoch": 1.3349282296650717, "grad_norm": 0.23386818676449111, "learning_rate": 4.279728792429768e-05, "loss": 0.2493, "mean_token_accuracy": 0.9205289006233215, "step": 140 }, { "epoch": 1.3827751196172249, "grad_norm": 0.19829038322936948, "learning_rate": 4.209101203500809e-05, "loss": 0.2469, "mean_token_accuracy": 0.9209350109100342, "step": 145 }, { "epoch": 1.430622009569378, "grad_norm": 0.22373994346715842, "learning_rate": 4.135900150417243e-05, "loss": 0.2416, "mean_token_accuracy": 0.9225721061229706, "step": 150 }, { "epoch": 1.4784688995215312, "grad_norm": 0.20748037358682295, "learning_rate": 4.0602547402524813e-05, "loss": 0.2396, "mean_token_accuracy": 0.9232222616672516, "step": 155 }, { "epoch": 1.526315789473684, "grad_norm": 0.18363411909081742, "learning_rate": 3.982298391271858e-05, "loss": 0.2474, "mean_token_accuracy": 0.9210693001747131, "step": 160 }, { "epoch": 1.5741626794258372, "grad_norm": 0.18354584761777637, "learning_rate": 3.902168597618509e-05, "loss": 0.2426, "mean_token_accuracy": 0.9223567545413971, "step": 165 }, { "epoch": 1.6220095693779903, "grad_norm": 0.19344690588457217, "learning_rate": 3.82000668681049e-05, "loss": 0.2419, "mean_token_accuracy": 0.9226630091667175, "step": 170 }, { "epoch": 1.6698564593301435, "grad_norm": 0.19835339977982622, "learning_rate": 3.735957570476844e-05, "loss": 0.2364, "mean_token_accuracy": 0.9246738970279693, "step": 175 }, { "epoch": 1.7177033492822966, "grad_norm": 0.1893700609423051, "learning_rate": 3.65016948877226e-05, "loss": 0.237, "mean_token_accuracy": 0.9241552948951721, "step": 180 }, { "epoch": 1.7655502392344498, "grad_norm": 0.20092617650862296, "learning_rate": 3.562793748921095e-05, "loss": 0.2302, "mean_token_accuracy": 0.9259460866451263, "step": 185 }, { "epoch": 1.813397129186603, "grad_norm": 0.19482063162086233, "learning_rate": 3.473984458351913e-05, "loss": 0.2294, "mean_token_accuracy": 0.9257330477237702, "step": 190 }, { "epoch": 1.861244019138756, "grad_norm": 0.23594862888300963, "learning_rate": 3.383898252893217e-05, "loss": 0.2252, "mean_token_accuracy": 0.9272994875907898, "step": 195 }, { "epoch": 1.9090909090909092, "grad_norm": 0.2264429074763011, "learning_rate": 3.292694020509744e-05, "loss": 0.2285, "mean_token_accuracy": 0.9267280995845795, "step": 200 }, { "epoch": 1.9569377990430623, "grad_norm": 0.20243011319089546, "learning_rate": 3.200532621066612e-05, "loss": 0.2317, "mean_token_accuracy": 0.925784581899643, "step": 205 }, { "epoch": 2.0, "grad_norm": 0.2761279417111308, "learning_rate": 3.10757660261555e-05, "loss": 0.2187, "mean_token_accuracy": 0.9277391168806288, "step": 210 }, { "epoch": 2.047846889952153, "grad_norm": 0.21472765995198592, "learning_rate": 3.013989914703625e-05, "loss": 0.1588, "mean_token_accuracy": 0.9470981001853943, "step": 215 }, { "epoch": 2.0956937799043063, "grad_norm": 0.19241730654495223, "learning_rate": 2.919937619210103e-05, "loss": 0.1597, "mean_token_accuracy": 0.9465132236480713, "step": 220 }, { "epoch": 2.1435406698564594, "grad_norm": 0.2010987341848127, "learning_rate": 2.825585599221456e-05, "loss": 0.1517, "mean_token_accuracy": 0.9487400650978088, "step": 225 }, { "epoch": 2.1913875598086126, "grad_norm": 0.18857044810118412, "learning_rate": 2.7311002664579755e-05, "loss": 0.1522, "mean_token_accuracy": 0.9488288640975953, "step": 230 }, { "epoch": 2.2392344497607657, "grad_norm": 0.1973021475908052, "learning_rate": 2.6366482677680226e-05, "loss": 0.1539, "mean_token_accuracy": 0.9482394576072692, "step": 235 }, { "epoch": 2.287081339712919, "grad_norm": 0.1916451391340417, "learning_rate": 2.5423961912075712e-05, "loss": 0.1519, "mean_token_accuracy": 0.9490710437297821, "step": 240 }, { "epoch": 2.334928229665072, "grad_norm": 0.18310116347345545, "learning_rate": 2.448510272223445e-05, "loss": 0.1593, "mean_token_accuracy": 0.9469183087348938, "step": 245 }, { "epoch": 2.382775119617225, "grad_norm": 0.19992899486646826, "learning_rate": 2.3551561004584644e-05, "loss": 0.1527, "mean_token_accuracy": 0.9489526867866516, "step": 250 }, { "epoch": 2.430622009569378, "grad_norm": 0.20001274217428525, "learning_rate": 2.2624983276956214e-05, "loss": 0.1545, "mean_token_accuracy": 0.9480966806411744, "step": 255 }, { "epoch": 2.478468899521531, "grad_norm": 0.19971823143333786, "learning_rate": 2.17070037745638e-05, "loss": 0.1516, "mean_token_accuracy": 0.9491800308227539, "step": 260 }, { "epoch": 2.526315789473684, "grad_norm": 0.17534350087544737, "learning_rate": 2.079924156765312e-05, "loss": 0.15, "mean_token_accuracy": 0.9497306644916534, "step": 265 }, { "epoch": 2.574162679425837, "grad_norm": 0.17941495258495743, "learning_rate": 1.9903297705894207e-05, "loss": 0.1552, "mean_token_accuracy": 0.9483123421669006, "step": 270 }, { "epoch": 2.6220095693779903, "grad_norm": 0.1886616293140272, "learning_rate": 1.9020752394558096e-05, "loss": 0.1509, "mean_token_accuracy": 0.9495630502700806, "step": 275 }, { "epoch": 2.6698564593301435, "grad_norm": 0.19242543918662838, "learning_rate": 1.815316220745756e-05, "loss": 0.1546, "mean_token_accuracy": 0.9482314586639404, "step": 280 }, { "epoch": 2.7177033492822966, "grad_norm": 0.18201614867985003, "learning_rate": 1.73020573415673e-05, "loss": 0.1496, "mean_token_accuracy": 0.9498195767402648, "step": 285 }, { "epoch": 2.7655502392344498, "grad_norm": 0.20205387945258876, "learning_rate": 1.646893891816591e-05, "loss": 0.1495, "mean_token_accuracy": 0.9499354422092438, "step": 290 }, { "epoch": 2.813397129186603, "grad_norm": 0.17887628669217684, "learning_rate": 1.5655276335259493e-05, "loss": 0.1495, "mean_token_accuracy": 0.950058388710022, "step": 295 }, { "epoch": 2.861244019138756, "grad_norm": 0.18210621293658882, "learning_rate": 1.4862504675956803e-05, "loss": 0.1477, "mean_token_accuracy": 0.9507902562618256, "step": 300 }, { "epoch": 2.909090909090909, "grad_norm": 0.16610798667043808, "learning_rate": 1.4092022177366448e-05, "loss": 0.1472, "mean_token_accuracy": 0.9505416512489319, "step": 305 }, { "epoch": 2.9569377990430623, "grad_norm": 0.19093889547243018, "learning_rate": 1.334518776448086e-05, "loss": 0.1449, "mean_token_accuracy": 0.9513530313968659, "step": 310 }, { "epoch": 3.0, "grad_norm": 0.17832946925769, "learning_rate": 1.2623318653396027e-05, "loss": 0.1407, "mean_token_accuracy": 0.9511178533236185, "step": 315 }, { "epoch": 3.047846889952153, "grad_norm": 0.25680975157440067, "learning_rate": 1.192768802809487e-05, "loss": 0.101, "mean_token_accuracy": 0.9665960729122162, "step": 320 }, { "epoch": 3.0956937799043063, "grad_norm": 0.18777563649802764, "learning_rate": 1.1259522794891156e-05, "loss": 0.0961, "mean_token_accuracy": 0.9677205324172974, "step": 325 }, { "epoch": 3.1435406698564594, "grad_norm": 0.17478369339727004, "learning_rate": 1.0620001418495165e-05, "loss": 0.0985, "mean_token_accuracy": 0.966835618019104, "step": 330 }, { "epoch": 3.1913875598086126, "grad_norm": 0.17275966764155462, "learning_rate": 1.0010251843517089e-05, "loss": 0.0972, "mean_token_accuracy": 0.9672303974628449, "step": 335 }, { "epoch": 3.2392344497607657, "grad_norm": 0.16993093028595546, "learning_rate": 9.431349505074635e-06, "loss": 0.0956, "mean_token_accuracy": 0.9676419258117676, "step": 340 }, { "epoch": 3.287081339712919, "grad_norm": 0.16631908155701744, "learning_rate": 8.884315432013085e-06, "loss": 0.0947, "mean_token_accuracy": 0.9682831168174744, "step": 345 }, { "epoch": 3.334928229665072, "grad_norm": 0.16712721419639617, "learning_rate": 8.370114446083686e-06, "loss": 0.0963, "mean_token_accuracy": 0.9672574043273926, "step": 350 }, { "epoch": 3.382775119617225, "grad_norm": 0.16752495961320182, "learning_rate": 7.88965346025611e-06, "loss": 0.0932, "mean_token_accuracy": 0.9686044454574585, "step": 355 }, { "epoch": 3.430622009569378, "grad_norm": 0.16411359819333338, "learning_rate": 7.443779879166704e-06, "loss": 0.0936, "mean_token_accuracy": 0.968251782655716, "step": 360 }, { "epoch": 3.478468899521531, "grad_norm": 0.17214899232280093, "learning_rate": 7.033280104523337e-06, "loss": 0.0947, "mean_token_accuracy": 0.9681008815765381, "step": 365 }, { "epoch": 3.526315789473684, "grad_norm": 0.16393252030290434, "learning_rate": 6.658878148103265e-06, "loss": 0.0954, "mean_token_accuracy": 0.9676827132701874, "step": 370 }, { "epoch": 3.574162679425837, "grad_norm": 0.15820317829789238, "learning_rate": 6.3212343547899925e-06, "loss": 0.0941, "mean_token_accuracy": 0.9682557284832001, "step": 375 }, { "epoch": 3.6220095693779903, "grad_norm": 0.1691401819724291, "learning_rate": 6.020944237901609e-06, "loss": 0.0957, "mean_token_accuracy": 0.9677995264530181, "step": 380 }, { "epoch": 3.6698564593301435, "grad_norm": 0.15841220320611066, "learning_rate": 5.7585374288645935e-06, "loss": 0.095, "mean_token_accuracy": 0.9678770661354065, "step": 385 }, { "epoch": 3.7177033492822966, "grad_norm": 0.15989013852853978, "learning_rate": 5.534476743085694e-06, "loss": 0.0933, "mean_token_accuracy": 0.9685147047042847, "step": 390 }, { "epoch": 3.7655502392344498, "grad_norm": 0.1602782553069919, "learning_rate": 5.349157363669362e-06, "loss": 0.0926, "mean_token_accuracy": 0.9689649045467377, "step": 395 }, { "epoch": 3.813397129186603, "grad_norm": 0.16816859348589558, "learning_rate": 5.202906144420483e-06, "loss": 0.0924, "mean_token_accuracy": 0.9688647747039795, "step": 400 }, { "epoch": 3.861244019138756, "grad_norm": 0.16695686882956462, "learning_rate": 5.095981033361725e-06, "loss": 0.0936, "mean_token_accuracy": 0.9684687733650208, "step": 405 }, { "epoch": 3.909090909090909, "grad_norm": 0.17408482158108302, "learning_rate": 5.028570617782212e-06, "loss": 0.0924, "mean_token_accuracy": 0.9686804115772247, "step": 410 }, { "epoch": 3.9569377990430623, "grad_norm": 0.16266285247102608, "learning_rate": 5.000793791620026e-06, "loss": 0.0928, "mean_token_accuracy": 0.9688069880008697, "step": 415 }, { "epoch": 3.9665071770334928, "mean_token_accuracy": 0.9688756763935089, "step": 416, "total_flos": 213866215112704.0, "train_loss": 0.21202428278943095, "train_runtime": 2397.3093, "train_samples_per_second": 2.778, "train_steps_per_second": 0.174 } ], "logging_steps": 5, "max_steps": 416, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": true }, "attributes": {} } }, "total_flos": 213866215112704.0, "train_batch_size": 1, "trial_name": null, "trial_params": null }