{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.1203659123736158, "eval_steps": 500, "global_step": 500, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001203659123736158, "grad_norm": 4.625, "learning_rate": 2.065577232275662e-05, "loss": 1.9427, "step": 5 }, { "epoch": 0.002407318247472316, "grad_norm": 3.90625, "learning_rate": 4.647548772620239e-05, "loss": 1.8343, "step": 10 }, { "epoch": 0.0036109773712084737, "grad_norm": 3.09375, "learning_rate": 7.229520312964818e-05, "loss": 1.7165, "step": 15 }, { "epoch": 0.004814636494944632, "grad_norm": 3.09375, "learning_rate": 9.811491853309394e-05, "loss": 1.5909, "step": 20 }, { "epoch": 0.00601829561868079, "grad_norm": 3.6875, "learning_rate": 0.00012393463393653973, "loss": 1.5327, "step": 25 }, { "epoch": 0.007221954742416947, "grad_norm": 3.0625, "learning_rate": 0.0001497543493399855, "loss": 1.4456, "step": 30 }, { "epoch": 0.008425613866153106, "grad_norm": 2.65625, "learning_rate": 0.00017557406474343128, "loss": 1.3967, "step": 35 }, { "epoch": 0.009629272989889264, "grad_norm": 2.90625, "learning_rate": 0.0001807379731713583, "loss": 1.3353, "step": 40 }, { "epoch": 0.010832932113625422, "grad_norm": 3.421875, "learning_rate": 0.00018073783239457288, "loss": 1.3252, "step": 45 }, { "epoch": 0.01203659123736158, "grad_norm": 2.875, "learning_rate": 0.00018073758332819127, "loss": 1.2706, "step": 50 }, { "epoch": 0.013240250361097737, "grad_norm": 3.015625, "learning_rate": 0.00018073722597261146, "loss": 1.2637, "step": 55 }, { "epoch": 0.014443909484833895, "grad_norm": 2.71875, "learning_rate": 0.00018073676032840438, "loss": 1.218, "step": 60 }, { "epoch": 0.015647568608570053, "grad_norm": 2.734375, "learning_rate": 0.00018073618639631402, "loss": 1.2207, "step": 65 }, { "epoch": 0.016851227732306212, "grad_norm": 2.765625, "learning_rate": 0.00018073550417725735, "loss": 1.1901, "step": 70 }, { "epoch": 0.018054886856042368, "grad_norm": 2.4375, "learning_rate": 0.0001807347136723244, "loss": 1.1795, "step": 75 }, { "epoch": 0.019258545979778528, "grad_norm": 2.65625, "learning_rate": 0.00018073381488277823, "loss": 1.1604, "step": 80 }, { "epoch": 0.020462205103514684, "grad_norm": 2.4375, "learning_rate": 0.00018073280781005481, "loss": 1.1633, "step": 85 }, { "epoch": 0.021665864227250843, "grad_norm": 2.671875, "learning_rate": 0.00018073169245576325, "loss": 1.1536, "step": 90 }, { "epoch": 0.022869523350987, "grad_norm": 2.4375, "learning_rate": 0.00018073046882168553, "loss": 1.1296, "step": 95 }, { "epoch": 0.02407318247472316, "grad_norm": 2.515625, "learning_rate": 0.00018072913690977675, "loss": 1.1209, "step": 100 }, { "epoch": 0.025276841598459315, "grad_norm": 2.515625, "learning_rate": 0.00018072769672216498, "loss": 1.0913, "step": 105 }, { "epoch": 0.026480500722195474, "grad_norm": 2.484375, "learning_rate": 0.0001807261482611512, "loss": 1.1271, "step": 110 }, { "epoch": 0.027684159845931634, "grad_norm": 2.71875, "learning_rate": 0.00018072449152920953, "loss": 1.1263, "step": 115 }, { "epoch": 0.02888781896966779, "grad_norm": 2.75, "learning_rate": 0.00018072272652898695, "loss": 1.0988, "step": 120 }, { "epoch": 0.03009147809340395, "grad_norm": 2.296875, "learning_rate": 0.0001807208532633035, "loss": 1.0511, "step": 125 }, { "epoch": 0.031295137217140105, "grad_norm": 2.890625, "learning_rate": 0.0001807188717351522, "loss": 1.1231, "step": 130 }, { "epoch": 0.03249879634087626, "grad_norm": 2.546875, "learning_rate": 0.00018071678194769898, "loss": 1.054, "step": 135 }, { "epoch": 0.033702455464612424, "grad_norm": 2.515625, "learning_rate": 0.0001807145839042828, "loss": 1.0571, "step": 140 }, { "epoch": 0.03490611458834858, "grad_norm": 2.703125, "learning_rate": 0.0001807122776084156, "loss": 1.0463, "step": 145 }, { "epoch": 0.036109773712084736, "grad_norm": 2.296875, "learning_rate": 0.00018070986306378223, "loss": 1.0482, "step": 150 }, { "epoch": 0.03731343283582089, "grad_norm": 2.375, "learning_rate": 0.00018070734027424048, "loss": 1.0518, "step": 155 }, { "epoch": 0.038517091959557055, "grad_norm": 2.25, "learning_rate": 0.00018070470924382115, "loss": 1.0772, "step": 160 }, { "epoch": 0.03972075108329321, "grad_norm": 2.140625, "learning_rate": 0.00018070196997672797, "loss": 1.0396, "step": 165 }, { "epoch": 0.04092441020702937, "grad_norm": 2.484375, "learning_rate": 0.00018069912247733758, "loss": 1.0425, "step": 170 }, { "epoch": 0.04212806933076553, "grad_norm": 2.0625, "learning_rate": 0.00018069616675019952, "loss": 1.034, "step": 175 }, { "epoch": 0.043331728454501686, "grad_norm": 2.265625, "learning_rate": 0.00018069310280003633, "loss": 1.0382, "step": 180 }, { "epoch": 0.04453538757823784, "grad_norm": 1.984375, "learning_rate": 0.00018068993063174337, "loss": 0.982, "step": 185 }, { "epoch": 0.045739046701974, "grad_norm": 2.515625, "learning_rate": 0.00018068665025038899, "loss": 1.0471, "step": 190 }, { "epoch": 0.04694270582571016, "grad_norm": 2.421875, "learning_rate": 0.00018068326166121437, "loss": 1.002, "step": 195 }, { "epoch": 0.04814636494944632, "grad_norm": 2.390625, "learning_rate": 0.00018067976486963364, "loss": 0.9909, "step": 200 }, { "epoch": 0.04935002407318247, "grad_norm": 1.921875, "learning_rate": 0.00018067615988123374, "loss": 0.9895, "step": 205 }, { "epoch": 0.05055368319691863, "grad_norm": 1.921875, "learning_rate": 0.00018067244670177452, "loss": 0.9895, "step": 210 }, { "epoch": 0.05175734232065479, "grad_norm": 2.4375, "learning_rate": 0.00018066862533718873, "loss": 0.9932, "step": 215 }, { "epoch": 0.05296100144439095, "grad_norm": 2.0625, "learning_rate": 0.0001806646957935819, "loss": 0.9931, "step": 220 }, { "epoch": 0.054164660568127104, "grad_norm": 2.28125, "learning_rate": 0.00018066065807723243, "loss": 0.9888, "step": 225 }, { "epoch": 0.05536831969186327, "grad_norm": 2.21875, "learning_rate": 0.00018065651219459158, "loss": 0.9817, "step": 230 }, { "epoch": 0.05657197881559942, "grad_norm": 2.234375, "learning_rate": 0.00018065225815228335, "loss": 0.9572, "step": 235 }, { "epoch": 0.05777563793933558, "grad_norm": 2.09375, "learning_rate": 0.00018064789595710468, "loss": 0.9525, "step": 240 }, { "epoch": 0.058979297063071735, "grad_norm": 2.09375, "learning_rate": 0.00018064342561602522, "loss": 0.9726, "step": 245 }, { "epoch": 0.0601829561868079, "grad_norm": 2.21875, "learning_rate": 0.00018063884713618737, "loss": 0.9637, "step": 250 }, { "epoch": 0.061386615310544054, "grad_norm": 2.203125, "learning_rate": 0.00018063416052490648, "loss": 0.9902, "step": 255 }, { "epoch": 0.06259027443428021, "grad_norm": 2.1875, "learning_rate": 0.00018062936578967044, "loss": 0.9271, "step": 260 }, { "epoch": 0.06379393355801637, "grad_norm": 2.375, "learning_rate": 0.00018062446293814008, "loss": 0.9563, "step": 265 }, { "epoch": 0.06499759268175252, "grad_norm": 2.171875, "learning_rate": 0.0001806194519781489, "loss": 0.9442, "step": 270 }, { "epoch": 0.06620125180548869, "grad_norm": 2.078125, "learning_rate": 0.00018061433291770306, "loss": 0.945, "step": 275 }, { "epoch": 0.06740491092922485, "grad_norm": 2.328125, "learning_rate": 0.00018060910576498158, "loss": 0.9385, "step": 280 }, { "epoch": 0.068608570052961, "grad_norm": 2.296875, "learning_rate": 0.0001806037705283361, "loss": 0.9244, "step": 285 }, { "epoch": 0.06981222917669716, "grad_norm": 2.234375, "learning_rate": 0.0001805983272162909, "loss": 0.926, "step": 290 }, { "epoch": 0.07101588830043332, "grad_norm": 2.40625, "learning_rate": 0.00018059277583754304, "loss": 0.9232, "step": 295 }, { "epoch": 0.07221954742416947, "grad_norm": 2.09375, "learning_rate": 0.00018058711640096223, "loss": 0.8977, "step": 300 }, { "epoch": 0.07342320654790563, "grad_norm": 2.125, "learning_rate": 0.00018058134891559078, "loss": 0.9126, "step": 305 }, { "epoch": 0.07462686567164178, "grad_norm": 2.125, "learning_rate": 0.00018057547339064362, "loss": 0.9649, "step": 310 }, { "epoch": 0.07583052479537795, "grad_norm": 2.265625, "learning_rate": 0.00018056948983550834, "loss": 0.8945, "step": 315 }, { "epoch": 0.07703418391911411, "grad_norm": 2.203125, "learning_rate": 0.00018056339825974518, "loss": 0.9023, "step": 320 }, { "epoch": 0.07823784304285027, "grad_norm": 1.9375, "learning_rate": 0.00018055719867308685, "loss": 0.907, "step": 325 }, { "epoch": 0.07944150216658642, "grad_norm": 2.046875, "learning_rate": 0.00018055089108543872, "loss": 0.9306, "step": 330 }, { "epoch": 0.08064516129032258, "grad_norm": 2.359375, "learning_rate": 0.00018054447550687873, "loss": 0.9115, "step": 335 }, { "epoch": 0.08184882041405873, "grad_norm": 2.03125, "learning_rate": 0.00018053795194765732, "loss": 0.909, "step": 340 }, { "epoch": 0.08305247953779489, "grad_norm": 2.171875, "learning_rate": 0.00018053132041819745, "loss": 0.8878, "step": 345 }, { "epoch": 0.08425613866153106, "grad_norm": 1.90625, "learning_rate": 0.00018052458092909456, "loss": 0.9281, "step": 350 }, { "epoch": 0.08545979778526722, "grad_norm": 2.15625, "learning_rate": 0.00018051773349111671, "loss": 0.9012, "step": 355 }, { "epoch": 0.08666345690900337, "grad_norm": 1.7890625, "learning_rate": 0.00018051077811520431, "loss": 0.9071, "step": 360 }, { "epoch": 0.08786711603273953, "grad_norm": 2.0625, "learning_rate": 0.00018050371481247027, "loss": 0.9063, "step": 365 }, { "epoch": 0.08907077515647568, "grad_norm": 1.859375, "learning_rate": 0.00018049654359419994, "loss": 0.903, "step": 370 }, { "epoch": 0.09027443428021184, "grad_norm": 2.09375, "learning_rate": 0.00018048926447185106, "loss": 0.9166, "step": 375 }, { "epoch": 0.091478093403948, "grad_norm": 1.8359375, "learning_rate": 0.00018048187745705387, "loss": 0.8833, "step": 380 }, { "epoch": 0.09268175252768417, "grad_norm": 2.125, "learning_rate": 0.00018047438256161086, "loss": 0.8981, "step": 385 }, { "epoch": 0.09388541165142032, "grad_norm": 2.03125, "learning_rate": 0.00018046677979749698, "loss": 0.8898, "step": 390 }, { "epoch": 0.09508907077515648, "grad_norm": 1.9609375, "learning_rate": 0.00018045906917685947, "loss": 0.8837, "step": 395 }, { "epoch": 0.09629272989889263, "grad_norm": 2.015625, "learning_rate": 0.000180451250712018, "loss": 0.8803, "step": 400 }, { "epoch": 0.09749638902262879, "grad_norm": 1.8984375, "learning_rate": 0.00018044332441546437, "loss": 0.8851, "step": 405 }, { "epoch": 0.09870004814636495, "grad_norm": 1.9453125, "learning_rate": 0.00018043529029986285, "loss": 0.8997, "step": 410 }, { "epoch": 0.0999037072701011, "grad_norm": 1.9140625, "learning_rate": 0.00018042714837804985, "loss": 0.8991, "step": 415 }, { "epoch": 0.10110736639383726, "grad_norm": 2.03125, "learning_rate": 0.0001804188986630341, "loss": 0.8939, "step": 420 }, { "epoch": 0.10231102551757343, "grad_norm": 2.203125, "learning_rate": 0.00018041054116799653, "loss": 0.8914, "step": 425 }, { "epoch": 0.10351468464130958, "grad_norm": 2.1875, "learning_rate": 0.00018040207590629026, "loss": 0.8398, "step": 430 }, { "epoch": 0.10471834376504574, "grad_norm": 1.8203125, "learning_rate": 0.0001803935028914406, "loss": 0.8818, "step": 435 }, { "epoch": 0.1059220028887819, "grad_norm": 1.953125, "learning_rate": 0.00018038482213714508, "loss": 0.8518, "step": 440 }, { "epoch": 0.10712566201251805, "grad_norm": 2.09375, "learning_rate": 0.00018037603365727323, "loss": 0.8874, "step": 445 }, { "epoch": 0.10832932113625421, "grad_norm": 1.921875, "learning_rate": 0.00018036713746586689, "loss": 0.8923, "step": 450 }, { "epoch": 0.10953298025999036, "grad_norm": 1.7109375, "learning_rate": 0.00018035813357713984, "loss": 0.8945, "step": 455 }, { "epoch": 0.11073663938372653, "grad_norm": 1.71875, "learning_rate": 0.00018034902200547796, "loss": 0.857, "step": 460 }, { "epoch": 0.11194029850746269, "grad_norm": 2.015625, "learning_rate": 0.00018033980276543928, "loss": 0.8514, "step": 465 }, { "epoch": 0.11314395763119885, "grad_norm": 1.703125, "learning_rate": 0.0001803304758717537, "loss": 0.8674, "step": 470 }, { "epoch": 0.114347616754935, "grad_norm": 1.96875, "learning_rate": 0.00018032104133932326, "loss": 0.9035, "step": 475 }, { "epoch": 0.11555127587867116, "grad_norm": 1.7734375, "learning_rate": 0.00018031149918322191, "loss": 0.8484, "step": 480 }, { "epoch": 0.11675493500240731, "grad_norm": 1.9140625, "learning_rate": 0.0001803018494186956, "loss": 0.8609, "step": 485 }, { "epoch": 0.11795859412614347, "grad_norm": 1.875, "learning_rate": 0.0001802920920611621, "loss": 0.8008, "step": 490 }, { "epoch": 0.11916225324987964, "grad_norm": 1.7890625, "learning_rate": 0.00018028222712621126, "loss": 0.8406, "step": 495 }, { "epoch": 0.1203659123736158, "grad_norm": 2.03125, "learning_rate": 0.00018027225462960463, "loss": 0.8552, "step": 500 }, { "epoch": 0.1203659123736158, "eval_loss": 0.7584288716316223, "eval_runtime": 2.397, "eval_samples_per_second": 83.437, "eval_steps_per_second": 83.437, "step": 500 } ], "logging_steps": 5, "max_steps": 12462, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.2542017536e+16, "train_batch_size": 48, "trial_name": null, "trial_params": null }