| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.1203659123736158, |
| "eval_steps": 500, |
| "global_step": 500, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.001203659123736158, |
| "grad_norm": 4.625, |
| "learning_rate": 2.065577232275662e-05, |
| "loss": 1.9427, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.002407318247472316, |
| "grad_norm": 3.90625, |
| "learning_rate": 4.647548772620239e-05, |
| "loss": 1.8343, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0036109773712084737, |
| "grad_norm": 3.09375, |
| "learning_rate": 7.229520312964818e-05, |
| "loss": 1.7165, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.004814636494944632, |
| "grad_norm": 3.09375, |
| "learning_rate": 9.811491853309394e-05, |
| "loss": 1.5909, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00601829561868079, |
| "grad_norm": 3.6875, |
| "learning_rate": 0.00012393463393653973, |
| "loss": 1.5327, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.007221954742416947, |
| "grad_norm": 3.0625, |
| "learning_rate": 0.0001497543493399855, |
| "loss": 1.4456, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.008425613866153106, |
| "grad_norm": 2.65625, |
| "learning_rate": 0.00017557406474343128, |
| "loss": 1.3967, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.009629272989889264, |
| "grad_norm": 2.90625, |
| "learning_rate": 0.0001807379731713583, |
| "loss": 1.3353, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.010832932113625422, |
| "grad_norm": 3.421875, |
| "learning_rate": 0.00018073783239457288, |
| "loss": 1.3252, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.01203659123736158, |
| "grad_norm": 2.875, |
| "learning_rate": 0.00018073758332819127, |
| "loss": 1.2706, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.013240250361097737, |
| "grad_norm": 3.015625, |
| "learning_rate": 0.00018073722597261146, |
| "loss": 1.2637, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.014443909484833895, |
| "grad_norm": 2.71875, |
| "learning_rate": 0.00018073676032840438, |
| "loss": 1.218, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.015647568608570053, |
| "grad_norm": 2.734375, |
| "learning_rate": 0.00018073618639631402, |
| "loss": 1.2207, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.016851227732306212, |
| "grad_norm": 2.765625, |
| "learning_rate": 0.00018073550417725735, |
| "loss": 1.1901, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.018054886856042368, |
| "grad_norm": 2.4375, |
| "learning_rate": 0.0001807347136723244, |
| "loss": 1.1795, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.019258545979778528, |
| "grad_norm": 2.65625, |
| "learning_rate": 0.00018073381488277823, |
| "loss": 1.1604, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.020462205103514684, |
| "grad_norm": 2.4375, |
| "learning_rate": 0.00018073280781005481, |
| "loss": 1.1633, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.021665864227250843, |
| "grad_norm": 2.671875, |
| "learning_rate": 0.00018073169245576325, |
| "loss": 1.1536, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.022869523350987, |
| "grad_norm": 2.4375, |
| "learning_rate": 0.00018073046882168553, |
| "loss": 1.1296, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.02407318247472316, |
| "grad_norm": 2.515625, |
| "learning_rate": 0.00018072913690977675, |
| "loss": 1.1209, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.025276841598459315, |
| "grad_norm": 2.515625, |
| "learning_rate": 0.00018072769672216498, |
| "loss": 1.0913, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.026480500722195474, |
| "grad_norm": 2.484375, |
| "learning_rate": 0.0001807261482611512, |
| "loss": 1.1271, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.027684159845931634, |
| "grad_norm": 2.71875, |
| "learning_rate": 0.00018072449152920953, |
| "loss": 1.1263, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.02888781896966779, |
| "grad_norm": 2.75, |
| "learning_rate": 0.00018072272652898695, |
| "loss": 1.0988, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.03009147809340395, |
| "grad_norm": 2.296875, |
| "learning_rate": 0.0001807208532633035, |
| "loss": 1.0511, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.031295137217140105, |
| "grad_norm": 2.890625, |
| "learning_rate": 0.0001807188717351522, |
| "loss": 1.1231, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.03249879634087626, |
| "grad_norm": 2.546875, |
| "learning_rate": 0.00018071678194769898, |
| "loss": 1.054, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.033702455464612424, |
| "grad_norm": 2.515625, |
| "learning_rate": 0.0001807145839042828, |
| "loss": 1.0571, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.03490611458834858, |
| "grad_norm": 2.703125, |
| "learning_rate": 0.0001807122776084156, |
| "loss": 1.0463, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.036109773712084736, |
| "grad_norm": 2.296875, |
| "learning_rate": 0.00018070986306378223, |
| "loss": 1.0482, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.03731343283582089, |
| "grad_norm": 2.375, |
| "learning_rate": 0.00018070734027424048, |
| "loss": 1.0518, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.038517091959557055, |
| "grad_norm": 2.25, |
| "learning_rate": 0.00018070470924382115, |
| "loss": 1.0772, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.03972075108329321, |
| "grad_norm": 2.140625, |
| "learning_rate": 0.00018070196997672797, |
| "loss": 1.0396, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.04092441020702937, |
| "grad_norm": 2.484375, |
| "learning_rate": 0.00018069912247733758, |
| "loss": 1.0425, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.04212806933076553, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.00018069616675019952, |
| "loss": 1.034, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.043331728454501686, |
| "grad_norm": 2.265625, |
| "learning_rate": 0.00018069310280003633, |
| "loss": 1.0382, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.04453538757823784, |
| "grad_norm": 1.984375, |
| "learning_rate": 0.00018068993063174337, |
| "loss": 0.982, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.045739046701974, |
| "grad_norm": 2.515625, |
| "learning_rate": 0.00018068665025038899, |
| "loss": 1.0471, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.04694270582571016, |
| "grad_norm": 2.421875, |
| "learning_rate": 0.00018068326166121437, |
| "loss": 1.002, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.04814636494944632, |
| "grad_norm": 2.390625, |
| "learning_rate": 0.00018067976486963364, |
| "loss": 0.9909, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.04935002407318247, |
| "grad_norm": 1.921875, |
| "learning_rate": 0.00018067615988123374, |
| "loss": 0.9895, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.05055368319691863, |
| "grad_norm": 1.921875, |
| "learning_rate": 0.00018067244670177452, |
| "loss": 0.9895, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.05175734232065479, |
| "grad_norm": 2.4375, |
| "learning_rate": 0.00018066862533718873, |
| "loss": 0.9932, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.05296100144439095, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.0001806646957935819, |
| "loss": 0.9931, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.054164660568127104, |
| "grad_norm": 2.28125, |
| "learning_rate": 0.00018066065807723243, |
| "loss": 0.9888, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.05536831969186327, |
| "grad_norm": 2.21875, |
| "learning_rate": 0.00018065651219459158, |
| "loss": 0.9817, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.05657197881559942, |
| "grad_norm": 2.234375, |
| "learning_rate": 0.00018065225815228335, |
| "loss": 0.9572, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.05777563793933558, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00018064789595710468, |
| "loss": 0.9525, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.058979297063071735, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00018064342561602522, |
| "loss": 0.9726, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.0601829561868079, |
| "grad_norm": 2.21875, |
| "learning_rate": 0.00018063884713618737, |
| "loss": 0.9637, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.061386615310544054, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00018063416052490648, |
| "loss": 0.9902, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.06259027443428021, |
| "grad_norm": 2.1875, |
| "learning_rate": 0.00018062936578967044, |
| "loss": 0.9271, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.06379393355801637, |
| "grad_norm": 2.375, |
| "learning_rate": 0.00018062446293814008, |
| "loss": 0.9563, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.06499759268175252, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.0001806194519781489, |
| "loss": 0.9442, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.06620125180548869, |
| "grad_norm": 2.078125, |
| "learning_rate": 0.00018061433291770306, |
| "loss": 0.945, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.06740491092922485, |
| "grad_norm": 2.328125, |
| "learning_rate": 0.00018060910576498158, |
| "loss": 0.9385, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.068608570052961, |
| "grad_norm": 2.296875, |
| "learning_rate": 0.0001806037705283361, |
| "loss": 0.9244, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.06981222917669716, |
| "grad_norm": 2.234375, |
| "learning_rate": 0.0001805983272162909, |
| "loss": 0.926, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.07101588830043332, |
| "grad_norm": 2.40625, |
| "learning_rate": 0.00018059277583754304, |
| "loss": 0.9232, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.07221954742416947, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00018058711640096223, |
| "loss": 0.8977, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.07342320654790563, |
| "grad_norm": 2.125, |
| "learning_rate": 0.00018058134891559078, |
| "loss": 0.9126, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.07462686567164178, |
| "grad_norm": 2.125, |
| "learning_rate": 0.00018057547339064362, |
| "loss": 0.9649, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.07583052479537795, |
| "grad_norm": 2.265625, |
| "learning_rate": 0.00018056948983550834, |
| "loss": 0.8945, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.07703418391911411, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00018056339825974518, |
| "loss": 0.9023, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.07823784304285027, |
| "grad_norm": 1.9375, |
| "learning_rate": 0.00018055719867308685, |
| "loss": 0.907, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.07944150216658642, |
| "grad_norm": 2.046875, |
| "learning_rate": 0.00018055089108543872, |
| "loss": 0.9306, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.08064516129032258, |
| "grad_norm": 2.359375, |
| "learning_rate": 0.00018054447550687873, |
| "loss": 0.9115, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.08184882041405873, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.00018053795194765732, |
| "loss": 0.909, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.08305247953779489, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.00018053132041819745, |
| "loss": 0.8878, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.08425613866153106, |
| "grad_norm": 1.90625, |
| "learning_rate": 0.00018052458092909456, |
| "loss": 0.9281, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.08545979778526722, |
| "grad_norm": 2.15625, |
| "learning_rate": 0.00018051773349111671, |
| "loss": 0.9012, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.08666345690900337, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.00018051077811520431, |
| "loss": 0.9071, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.08786711603273953, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.00018050371481247027, |
| "loss": 0.9063, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.08907077515647568, |
| "grad_norm": 1.859375, |
| "learning_rate": 0.00018049654359419994, |
| "loss": 0.903, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.09027443428021184, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00018048926447185106, |
| "loss": 0.9166, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.091478093403948, |
| "grad_norm": 1.8359375, |
| "learning_rate": 0.00018048187745705387, |
| "loss": 0.8833, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.09268175252768417, |
| "grad_norm": 2.125, |
| "learning_rate": 0.00018047438256161086, |
| "loss": 0.8981, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.09388541165142032, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.00018046677979749698, |
| "loss": 0.8898, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.09508907077515648, |
| "grad_norm": 1.9609375, |
| "learning_rate": 0.00018045906917685947, |
| "loss": 0.8837, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.09629272989889263, |
| "grad_norm": 2.015625, |
| "learning_rate": 0.000180451250712018, |
| "loss": 0.8803, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.09749638902262879, |
| "grad_norm": 1.8984375, |
| "learning_rate": 0.00018044332441546437, |
| "loss": 0.8851, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.09870004814636495, |
| "grad_norm": 1.9453125, |
| "learning_rate": 0.00018043529029986285, |
| "loss": 0.8997, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.0999037072701011, |
| "grad_norm": 1.9140625, |
| "learning_rate": 0.00018042714837804985, |
| "loss": 0.8991, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.10110736639383726, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.0001804188986630341, |
| "loss": 0.8939, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.10231102551757343, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00018041054116799653, |
| "loss": 0.8914, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.10351468464130958, |
| "grad_norm": 2.1875, |
| "learning_rate": 0.00018040207590629026, |
| "loss": 0.8398, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.10471834376504574, |
| "grad_norm": 1.8203125, |
| "learning_rate": 0.0001803935028914406, |
| "loss": 0.8818, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.1059220028887819, |
| "grad_norm": 1.953125, |
| "learning_rate": 0.00018038482213714508, |
| "loss": 0.8518, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.10712566201251805, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00018037603365727323, |
| "loss": 0.8874, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.10832932113625421, |
| "grad_norm": 1.921875, |
| "learning_rate": 0.00018036713746586689, |
| "loss": 0.8923, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.10953298025999036, |
| "grad_norm": 1.7109375, |
| "learning_rate": 0.00018035813357713984, |
| "loss": 0.8945, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.11073663938372653, |
| "grad_norm": 1.71875, |
| "learning_rate": 0.00018034902200547796, |
| "loss": 0.857, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.11194029850746269, |
| "grad_norm": 2.015625, |
| "learning_rate": 0.00018033980276543928, |
| "loss": 0.8514, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.11314395763119885, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.0001803304758717537, |
| "loss": 0.8674, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.114347616754935, |
| "grad_norm": 1.96875, |
| "learning_rate": 0.00018032104133932326, |
| "loss": 0.9035, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.11555127587867116, |
| "grad_norm": 1.7734375, |
| "learning_rate": 0.00018031149918322191, |
| "loss": 0.8484, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.11675493500240731, |
| "grad_norm": 1.9140625, |
| "learning_rate": 0.0001803018494186956, |
| "loss": 0.8609, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.11795859412614347, |
| "grad_norm": 1.875, |
| "learning_rate": 0.0001802920920611621, |
| "loss": 0.8008, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.11916225324987964, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.00018028222712621126, |
| "loss": 0.8406, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.1203659123736158, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.00018027225462960463, |
| "loss": 0.8552, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.1203659123736158, |
| "eval_loss": 0.7584288716316223, |
| "eval_runtime": 2.397, |
| "eval_samples_per_second": 83.437, |
| "eval_steps_per_second": 83.437, |
| "step": 500 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 12462, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.2542017536e+16, |
| "train_batch_size": 48, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|