| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 2.999277804525758, |
| "eval_steps": 500, |
| "global_step": 12459, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.001203659123736158, |
| "grad_norm": 4.625, |
| "learning_rate": 2.065577232275662e-05, |
| "loss": 1.9428, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.002407318247472316, |
| "grad_norm": 3.890625, |
| "learning_rate": 4.647548772620239e-05, |
| "loss": 1.8343, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0036109773712084737, |
| "grad_norm": 3.09375, |
| "learning_rate": 7.229520312964818e-05, |
| "loss": 1.7166, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.004814636494944632, |
| "grad_norm": 3.078125, |
| "learning_rate": 9.811491853309394e-05, |
| "loss": 1.5911, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00601829561868079, |
| "grad_norm": 3.734375, |
| "learning_rate": 0.00012393463393653973, |
| "loss": 1.5328, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.007221954742416947, |
| "grad_norm": 3.09375, |
| "learning_rate": 0.0001497543493399855, |
| "loss": 1.4457, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.008425613866153106, |
| "grad_norm": 2.6875, |
| "learning_rate": 0.00017557406474343128, |
| "loss": 1.3968, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.009629272989889264, |
| "grad_norm": 2.921875, |
| "learning_rate": 0.0001807379731713583, |
| "loss": 1.3355, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.010832932113625422, |
| "grad_norm": 3.484375, |
| "learning_rate": 0.00018073783239457288, |
| "loss": 1.3258, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.01203659123736158, |
| "grad_norm": 2.828125, |
| "learning_rate": 0.00018073758332819127, |
| "loss": 1.2706, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.013240250361097737, |
| "grad_norm": 2.921875, |
| "learning_rate": 0.00018073722597261146, |
| "loss": 1.2635, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.014443909484833895, |
| "grad_norm": 2.8125, |
| "learning_rate": 0.00018073676032840438, |
| "loss": 1.2178, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.015647568608570053, |
| "grad_norm": 2.796875, |
| "learning_rate": 0.00018073618639631402, |
| "loss": 1.2227, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.016851227732306212, |
| "grad_norm": 2.8125, |
| "learning_rate": 0.00018073550417725735, |
| "loss": 1.1902, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.018054886856042368, |
| "grad_norm": 2.453125, |
| "learning_rate": 0.0001807347136723244, |
| "loss": 1.1803, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.019258545979778528, |
| "grad_norm": 2.609375, |
| "learning_rate": 0.00018073381488277823, |
| "loss": 1.1605, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.020462205103514684, |
| "grad_norm": 2.453125, |
| "learning_rate": 0.00018073280781005481, |
| "loss": 1.1619, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.021665864227250843, |
| "grad_norm": 2.625, |
| "learning_rate": 0.00018073169245576325, |
| "loss": 1.1545, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.022869523350987, |
| "grad_norm": 2.375, |
| "learning_rate": 0.00018073046882168553, |
| "loss": 1.13, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.02407318247472316, |
| "grad_norm": 2.421875, |
| "learning_rate": 0.00018072913690977675, |
| "loss": 1.121, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.025276841598459315, |
| "grad_norm": 2.5625, |
| "learning_rate": 0.00018072769672216498, |
| "loss": 1.0923, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.026480500722195474, |
| "grad_norm": 2.4375, |
| "learning_rate": 0.0001807261482611512, |
| "loss": 1.1263, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.027684159845931634, |
| "grad_norm": 2.84375, |
| "learning_rate": 0.00018072449152920953, |
| "loss": 1.1257, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.02888781896966779, |
| "grad_norm": 2.875, |
| "learning_rate": 0.00018072272652898695, |
| "loss": 1.0985, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.03009147809340395, |
| "grad_norm": 2.359375, |
| "learning_rate": 0.0001807208532633035, |
| "loss": 1.0515, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.031295137217140105, |
| "grad_norm": 2.8125, |
| "learning_rate": 0.0001807188717351522, |
| "loss": 1.1238, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.03249879634087626, |
| "grad_norm": 2.4375, |
| "learning_rate": 0.00018071678194769898, |
| "loss": 1.0551, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.033702455464612424, |
| "grad_norm": 2.40625, |
| "learning_rate": 0.0001807145839042828, |
| "loss": 1.0581, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.03490611458834858, |
| "grad_norm": 2.890625, |
| "learning_rate": 0.0001807122776084156, |
| "loss": 1.0491, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.036109773712084736, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00018070986306378223, |
| "loss": 1.0508, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.03731343283582089, |
| "grad_norm": 2.46875, |
| "learning_rate": 0.00018070734027424048, |
| "loss": 1.053, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.038517091959557055, |
| "grad_norm": 2.265625, |
| "learning_rate": 0.00018070470924382115, |
| "loss": 1.0778, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.03972075108329321, |
| "grad_norm": 2.1875, |
| "learning_rate": 0.00018070196997672797, |
| "loss": 1.0409, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.04092441020702937, |
| "grad_norm": 2.5, |
| "learning_rate": 0.00018069912247733758, |
| "loss": 1.0466, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.04212806933076553, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.00018069616675019952, |
| "loss": 1.0337, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.043331728454501686, |
| "grad_norm": 2.421875, |
| "learning_rate": 0.00018069310280003633, |
| "loss": 1.0368, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.04453538757823784, |
| "grad_norm": 1.9609375, |
| "learning_rate": 0.00018068993063174337, |
| "loss": 0.9803, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.045739046701974, |
| "grad_norm": 2.40625, |
| "learning_rate": 0.00018068665025038899, |
| "loss": 1.0449, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.04694270582571016, |
| "grad_norm": 2.25, |
| "learning_rate": 0.00018068326166121437, |
| "loss": 1.0, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.04814636494944632, |
| "grad_norm": 2.484375, |
| "learning_rate": 0.00018067976486963364, |
| "loss": 0.992, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.04935002407318247, |
| "grad_norm": 1.9609375, |
| "learning_rate": 0.00018067615988123374, |
| "loss": 0.9909, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.05055368319691863, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.00018067244670177452, |
| "loss": 0.9914, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.05175734232065479, |
| "grad_norm": 2.34375, |
| "learning_rate": 0.00018066862533718873, |
| "loss": 0.9937, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.05296100144439095, |
| "grad_norm": 2.140625, |
| "learning_rate": 0.0001806646957935819, |
| "loss": 0.9911, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.054164660568127104, |
| "grad_norm": 2.296875, |
| "learning_rate": 0.00018066065807723243, |
| "loss": 0.9876, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.05536831969186327, |
| "grad_norm": 2.34375, |
| "learning_rate": 0.00018065651219459158, |
| "loss": 0.9808, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.05657197881559942, |
| "grad_norm": 2.421875, |
| "learning_rate": 0.00018065225815228335, |
| "loss": 0.9586, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.05777563793933558, |
| "grad_norm": 2.125, |
| "learning_rate": 0.00018064789595710468, |
| "loss": 0.9513, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.058979297063071735, |
| "grad_norm": 2.109375, |
| "learning_rate": 0.00018064342561602522, |
| "loss": 0.9707, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.0601829561868079, |
| "grad_norm": 2.109375, |
| "learning_rate": 0.00018063884713618737, |
| "loss": 0.9642, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.061386615310544054, |
| "grad_norm": 2.15625, |
| "learning_rate": 0.00018063416052490648, |
| "loss": 0.9899, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.06259027443428021, |
| "grad_norm": 2.1875, |
| "learning_rate": 0.00018062936578967044, |
| "loss": 0.9258, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.06379393355801637, |
| "grad_norm": 2.296875, |
| "learning_rate": 0.00018062446293814008, |
| "loss": 0.9583, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.06499759268175252, |
| "grad_norm": 2.125, |
| "learning_rate": 0.0001806194519781489, |
| "loss": 0.9433, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.06620125180548869, |
| "grad_norm": 2.234375, |
| "learning_rate": 0.00018061433291770306, |
| "loss": 0.9476, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.06740491092922485, |
| "grad_norm": 2.25, |
| "learning_rate": 0.00018060910576498158, |
| "loss": 0.9403, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.068608570052961, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.0001806037705283361, |
| "loss": 0.9219, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.06981222917669716, |
| "grad_norm": 2.265625, |
| "learning_rate": 0.0001805983272162909, |
| "loss": 0.9263, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.07101588830043332, |
| "grad_norm": 2.4375, |
| "learning_rate": 0.00018059277583754304, |
| "loss": 0.9205, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.07221954742416947, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.00018058711640096223, |
| "loss": 0.8964, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.07342320654790563, |
| "grad_norm": 2.21875, |
| "learning_rate": 0.00018058134891559078, |
| "loss": 0.9138, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.07462686567164178, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.00018057547339064362, |
| "loss": 0.9659, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.07583052479537795, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.00018056948983550834, |
| "loss": 0.8931, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.07703418391911411, |
| "grad_norm": 2.234375, |
| "learning_rate": 0.00018056339825974518, |
| "loss": 0.9025, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.07823784304285027, |
| "grad_norm": 1.921875, |
| "learning_rate": 0.00018055719867308685, |
| "loss": 0.9069, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.07944150216658642, |
| "grad_norm": 2.046875, |
| "learning_rate": 0.00018055089108543872, |
| "loss": 0.9294, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.08064516129032258, |
| "grad_norm": 2.453125, |
| "learning_rate": 0.00018054447550687873, |
| "loss": 0.9123, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.08184882041405873, |
| "grad_norm": 2.140625, |
| "learning_rate": 0.00018053795194765732, |
| "loss": 0.9084, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.08305247953779489, |
| "grad_norm": 2.15625, |
| "learning_rate": 0.00018053132041819745, |
| "loss": 0.8883, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.08425613866153106, |
| "grad_norm": 1.859375, |
| "learning_rate": 0.00018052458092909456, |
| "loss": 0.9296, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.08545979778526722, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00018051773349111671, |
| "loss": 0.8994, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.08666345690900337, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.00018051077811520431, |
| "loss": 0.9062, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.08786711603273953, |
| "grad_norm": 2.015625, |
| "learning_rate": 0.00018050371481247027, |
| "loss": 0.9067, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.08907077515647568, |
| "grad_norm": 1.828125, |
| "learning_rate": 0.00018049654359419994, |
| "loss": 0.902, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.09027443428021184, |
| "grad_norm": 2.125, |
| "learning_rate": 0.00018048926447185106, |
| "loss": 0.9134, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.091478093403948, |
| "grad_norm": 1.84375, |
| "learning_rate": 0.00018048187745705387, |
| "loss": 0.885, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.09268175252768417, |
| "grad_norm": 2.046875, |
| "learning_rate": 0.00018047438256161086, |
| "loss": 0.9, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.09388541165142032, |
| "grad_norm": 2.078125, |
| "learning_rate": 0.00018046677979749698, |
| "loss": 0.8892, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.09508907077515648, |
| "grad_norm": 1.953125, |
| "learning_rate": 0.00018045906917685947, |
| "loss": 0.8873, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.09629272989889263, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.000180451250712018, |
| "loss": 0.882, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.09749638902262879, |
| "grad_norm": 1.8828125, |
| "learning_rate": 0.00018044332441546437, |
| "loss": 0.8877, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.09870004814636495, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.00018043529029986285, |
| "loss": 0.8968, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.0999037072701011, |
| "grad_norm": 1.9375, |
| "learning_rate": 0.00018042714837804985, |
| "loss": 0.9009, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.10110736639383726, |
| "grad_norm": 2.046875, |
| "learning_rate": 0.0001804188986630341, |
| "loss": 0.8936, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.10231102551757343, |
| "grad_norm": 2.015625, |
| "learning_rate": 0.00018041054116799653, |
| "loss": 0.8864, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.10351468464130958, |
| "grad_norm": 2.25, |
| "learning_rate": 0.00018040207590629026, |
| "loss": 0.8341, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.10471834376504574, |
| "grad_norm": 1.8203125, |
| "learning_rate": 0.0001803935028914406, |
| "loss": 0.8809, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.1059220028887819, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.00018038482213714508, |
| "loss": 0.8482, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.10712566201251805, |
| "grad_norm": 2.09375, |
| "learning_rate": 0.00018037603365727323, |
| "loss": 0.8869, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.10832932113625421, |
| "grad_norm": 1.9765625, |
| "learning_rate": 0.00018036713746586689, |
| "loss": 0.8903, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.10953298025999036, |
| "grad_norm": 1.84375, |
| "learning_rate": 0.00018035813357713984, |
| "loss": 0.8944, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.11073663938372653, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00018034902200547796, |
| "loss": 0.8577, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.11194029850746269, |
| "grad_norm": 1.9609375, |
| "learning_rate": 0.00018033980276543928, |
| "loss": 0.8499, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.11314395763119885, |
| "grad_norm": 1.7734375, |
| "learning_rate": 0.0001803304758717537, |
| "loss": 0.8646, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.114347616754935, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.00018032104133932326, |
| "loss": 0.8994, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.11555127587867116, |
| "grad_norm": 1.7578125, |
| "learning_rate": 0.00018031149918322191, |
| "loss": 0.8491, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.11675493500240731, |
| "grad_norm": 2.046875, |
| "learning_rate": 0.0001803018494186956, |
| "loss": 0.8594, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.11795859412614347, |
| "grad_norm": 1.953125, |
| "learning_rate": 0.0001802920920611621, |
| "loss": 0.8103, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.11916225324987964, |
| "grad_norm": 1.890625, |
| "learning_rate": 0.00018028222712621126, |
| "loss": 0.8405, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.1203659123736158, |
| "grad_norm": 1.8984375, |
| "learning_rate": 0.00018027225462960463, |
| "loss": 0.8522, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.1203659123736158, |
| "eval_loss": 0.7550991773605347, |
| "eval_runtime": 2.3423, |
| "eval_samples_per_second": 85.387, |
| "eval_steps_per_second": 85.387, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.12156957149735195, |
| "grad_norm": 2.0, |
| "learning_rate": 0.00018026217458727575, |
| "loss": 0.8116, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.12277323062108811, |
| "grad_norm": 2.21875, |
| "learning_rate": 0.00018025198701532993, |
| "loss": 0.8449, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.12397688974482426, |
| "grad_norm": 1.890625, |
| "learning_rate": 0.00018024169193004433, |
| "loss": 0.8823, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.12518054886856042, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.00018023128934786784, |
| "loss": 0.8989, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.12638420799229658, |
| "grad_norm": 2.0, |
| "learning_rate": 0.0001802207792854211, |
| "loss": 0.8832, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.12758786711603273, |
| "grad_norm": 2.171875, |
| "learning_rate": 0.00018021016175949651, |
| "loss": 0.8406, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.1287915262397689, |
| "grad_norm": 1.9296875, |
| "learning_rate": 0.00018019943678705816, |
| "loss": 0.8324, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.12999518536350504, |
| "grad_norm": 1.8203125, |
| "learning_rate": 0.00018018860438524177, |
| "loss": 0.8331, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.1311988444872412, |
| "grad_norm": 1.9296875, |
| "learning_rate": 0.00018017766457135482, |
| "loss": 0.8508, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.13240250361097738, |
| "grad_norm": 1.90625, |
| "learning_rate": 0.00018016661736287623, |
| "loss": 0.8103, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.13360616273471354, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.0001801554627774567, |
| "loss": 0.849, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.1348098218584497, |
| "grad_norm": 1.765625, |
| "learning_rate": 0.0001801442008329183, |
| "loss": 0.8389, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.13601348098218585, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.00018013283154725486, |
| "loss": 0.8162, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.137217140105922, |
| "grad_norm": 1.765625, |
| "learning_rate": 0.00018012135493863146, |
| "loss": 0.8182, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.13842079922965816, |
| "grad_norm": 1.640625, |
| "learning_rate": 0.0001801097710253848, |
| "loss": 0.7734, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.13962445835339432, |
| "grad_norm": 1.8671875, |
| "learning_rate": 0.00018009807982602308, |
| "loss": 0.8289, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.14082811747713048, |
| "grad_norm": 1.984375, |
| "learning_rate": 0.00018008628135922578, |
| "loss": 0.8231, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.14203177660086663, |
| "grad_norm": 1.921875, |
| "learning_rate": 0.00018007437564384381, |
| "loss": 0.8323, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.1432354357246028, |
| "grad_norm": 1.9609375, |
| "learning_rate": 0.00018006236269889952, |
| "loss": 0.8316, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.14443909484833894, |
| "grad_norm": 1.9140625, |
| "learning_rate": 0.00018005024254358646, |
| "loss": 0.8063, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.1456427539720751, |
| "grad_norm": 1.71875, |
| "learning_rate": 0.00018003801519726954, |
| "loss": 0.8243, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.14684641309581126, |
| "grad_norm": 1.6953125, |
| "learning_rate": 0.00018002568067948494, |
| "loss": 0.8155, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.1480500722195474, |
| "grad_norm": 1.8046875, |
| "learning_rate": 0.00018001323900994, |
| "loss": 0.8342, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.14925373134328357, |
| "grad_norm": 1.8515625, |
| "learning_rate": 0.00018000069020851343, |
| "loss": 0.8569, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.15045739046701975, |
| "grad_norm": 1.96875, |
| "learning_rate": 0.00017998803429525488, |
| "loss": 0.8212, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.1516610495907559, |
| "grad_norm": 1.828125, |
| "learning_rate": 0.00017997527129038534, |
| "loss": 0.8478, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.15286470871449206, |
| "grad_norm": 1.8515625, |
| "learning_rate": 0.00017996240121429677, |
| "loss": 0.8347, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.15406836783822822, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.0001799494240875523, |
| "loss": 0.8067, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.15527202696196438, |
| "grad_norm": 1.9609375, |
| "learning_rate": 0.000179936339930886, |
| "loss": 0.8296, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.15647568608570053, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.000179923148765203, |
| "loss": 0.8329, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.1576793452094367, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.00017990985061157948, |
| "loss": 0.8557, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.15888300433317284, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.00017989644549126234, |
| "loss": 0.8177, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.160086663456909, |
| "grad_norm": 1.75, |
| "learning_rate": 0.00017988293342566965, |
| "loss": 0.8396, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.16129032258064516, |
| "grad_norm": 2.03125, |
| "learning_rate": 0.0001798693144363902, |
| "loss": 0.8214, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.1624939817043813, |
| "grad_norm": 1.7265625, |
| "learning_rate": 0.00017985558854518362, |
| "loss": 0.789, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.16369764082811747, |
| "grad_norm": 1.921875, |
| "learning_rate": 0.00017984175577398037, |
| "loss": 0.8046, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.16490129995185362, |
| "grad_norm": 1.828125, |
| "learning_rate": 0.0001798278161448817, |
| "loss": 0.8014, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.16610495907558978, |
| "grad_norm": 1.8203125, |
| "learning_rate": 0.0001798137696801595, |
| "loss": 0.8043, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.16730861819932596, |
| "grad_norm": 1.8515625, |
| "learning_rate": 0.00017979961640225648, |
| "loss": 0.7911, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.16851227732306212, |
| "grad_norm": 1.7734375, |
| "learning_rate": 0.00017978535633378595, |
| "loss": 0.7907, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.16971593644679828, |
| "grad_norm": 1.96875, |
| "learning_rate": 0.00017977098949753179, |
| "loss": 0.8338, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.17091959557053443, |
| "grad_norm": 1.9375, |
| "learning_rate": 0.00017975651591644855, |
| "loss": 0.8179, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.1721232546942706, |
| "grad_norm": 1.7734375, |
| "learning_rate": 0.00017974193561366133, |
| "loss": 0.8143, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.17332691381800674, |
| "grad_norm": 1.78125, |
| "learning_rate": 0.0001797272486124657, |
| "loss": 0.7901, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.1745305729417429, |
| "grad_norm": 1.8515625, |
| "learning_rate": 0.0001797124549363277, |
| "loss": 0.8494, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.17573423206547906, |
| "grad_norm": 1.890625, |
| "learning_rate": 0.00017969755460888387, |
| "loss": 0.8067, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.1769378911892152, |
| "grad_norm": 2.0625, |
| "learning_rate": 0.00017968254765394107, |
| "loss": 0.8148, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.17814155031295137, |
| "grad_norm": 1.640625, |
| "learning_rate": 0.0001796674340954766, |
| "loss": 0.7815, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.17934520943668752, |
| "grad_norm": 1.734375, |
| "learning_rate": 0.00017965221395763802, |
| "loss": 0.7736, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.18054886856042368, |
| "grad_norm": 1.71875, |
| "learning_rate": 0.00017963688726474326, |
| "loss": 0.7575, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.18175252768415984, |
| "grad_norm": 1.7109375, |
| "learning_rate": 0.00017962145404128038, |
| "loss": 0.7874, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.182956186807896, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.0001796059143119078, |
| "loss": 0.7536, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.18415984593163215, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.0001795902681014539, |
| "loss": 0.7967, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.18536350505536833, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.00017957451543491746, |
| "loss": 0.7517, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.1865671641791045, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.00017955865633746711, |
| "loss": 0.8332, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.18777082330284064, |
| "grad_norm": 1.9921875, |
| "learning_rate": 0.00017954269083444164, |
| "loss": 0.7853, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.1889744824265768, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.0001795266189513498, |
| "loss": 0.8002, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.19017814155031296, |
| "grad_norm": 1.8828125, |
| "learning_rate": 0.00017951044071387042, |
| "loss": 0.779, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.1913818006740491, |
| "grad_norm": 1.71875, |
| "learning_rate": 0.00017949415614785212, |
| "loss": 0.7482, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.19258545979778527, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.00017947776527931346, |
| "loss": 0.7858, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.19378911892152142, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.00017946126813444285, |
| "loss": 0.7877, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.19499277804525758, |
| "grad_norm": 1.78125, |
| "learning_rate": 0.00017944466473959853, |
| "loss": 0.7832, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.19619643716899374, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00017942795512130845, |
| "loss": 0.7673, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.1974000962927299, |
| "grad_norm": 1.78125, |
| "learning_rate": 0.00017941113930627025, |
| "loss": 0.7841, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.19860375541646605, |
| "grad_norm": 1.8203125, |
| "learning_rate": 0.00017939421732135135, |
| "loss": 0.778, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.1998074145402022, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.00017937718919358873, |
| "loss": 0.7848, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.20101107366393836, |
| "grad_norm": 1.84375, |
| "learning_rate": 0.00017936005495018897, |
| "loss": 0.7925, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.20221473278767452, |
| "grad_norm": 1.65625, |
| "learning_rate": 0.0001793428146185282, |
| "loss": 0.7661, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.2034183919114107, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.00017932546822615206, |
| "loss": 0.7855, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.20462205103514686, |
| "grad_norm": 1.7734375, |
| "learning_rate": 0.00017930801580077563, |
| "loss": 0.7898, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.205825710158883, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.00017929045737028336, |
| "loss": 0.8075, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.20702936928261917, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.0001792727929627292, |
| "loss": 0.7859, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.20823302840635532, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.0001792550226063363, |
| "loss": 0.8242, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.20943668753009148, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.00017923714632949716, |
| "loss": 0.7973, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.21064034665382764, |
| "grad_norm": 1.921875, |
| "learning_rate": 0.00017921916416077348, |
| "loss": 0.7716, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.2118440057775638, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00017920107612889616, |
| "loss": 0.7995, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.21304766490129995, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.0001791828822627652, |
| "loss": 0.7775, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.2142513240250361, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.0001791645825914498, |
| "loss": 0.7624, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.21545498314877226, |
| "grad_norm": 1.6875, |
| "learning_rate": 0.00017914617714418808, |
| "loss": 0.7662, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.21665864227250842, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.00017912766595038726, |
| "loss": 0.7519, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.21786230139624457, |
| "grad_norm": 1.796875, |
| "learning_rate": 0.00017910904903962349, |
| "loss": 0.7878, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.21906596051998073, |
| "grad_norm": 1.890625, |
| "learning_rate": 0.00017909032644164178, |
| "loss": 0.7837, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.2202696196437169, |
| "grad_norm": 1.7421875, |
| "learning_rate": 0.0001790714981863561, |
| "loss": 0.7594, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.22147327876745307, |
| "grad_norm": 1.734375, |
| "learning_rate": 0.0001790525643038491, |
| "loss": 0.7812, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.22267693789118922, |
| "grad_norm": 1.765625, |
| "learning_rate": 0.00017903352482437235, |
| "loss": 0.7765, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.22388059701492538, |
| "grad_norm": 1.921875, |
| "learning_rate": 0.00017901437977834605, |
| "loss": 0.7673, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.22508425613866154, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.00017899512919635898, |
| "loss": 0.7732, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.2262879152623977, |
| "grad_norm": 1.8828125, |
| "learning_rate": 0.00017897577310916875, |
| "loss": 0.7819, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.22749157438613385, |
| "grad_norm": 1.7734375, |
| "learning_rate": 0.00017895631154770135, |
| "loss": 0.7743, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.22869523350987, |
| "grad_norm": 1.6875, |
| "learning_rate": 0.00017893674454305143, |
| "loss": 0.7987, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.22989889263360616, |
| "grad_norm": 1.734375, |
| "learning_rate": 0.000178917072126482, |
| "loss": 0.7453, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.23110255175734232, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.0001788972943294245, |
| "loss": 0.7587, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.23230621088107847, |
| "grad_norm": 1.6875, |
| "learning_rate": 0.0001788774111834789, |
| "loss": 0.7719, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.23350987000481463, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.00017885742272041326, |
| "loss": 0.7624, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.23471352912855079, |
| "grad_norm": 1.6484375, |
| "learning_rate": 0.00017883732897216407, |
| "loss": 0.7362, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.23591718825228694, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.000178817129970836, |
| "loss": 0.7371, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.2371208473760231, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.00017879682574870184, |
| "loss": 0.7411, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.23832450649975928, |
| "grad_norm": 1.796875, |
| "learning_rate": 0.00017877641633820253, |
| "loss": 0.746, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.23952816562349544, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.0001787559017719471, |
| "loss": 0.7595, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.2407318247472316, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.00017873528208271254, |
| "loss": 0.7449, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.2407318247472316, |
| "eval_loss": 0.655630350112915, |
| "eval_runtime": 2.3436, |
| "eval_samples_per_second": 85.338, |
| "eval_steps_per_second": 85.338, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.24193548387096775, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.00017871455730344388, |
| "loss": 0.7731, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.2431391429947039, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00017869372746725398, |
| "loss": 0.7532, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.24434280211844006, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00017867279260742354, |
| "loss": 0.7577, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.24554646124217622, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00017865175275740117, |
| "loss": 0.7327, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.24675012036591237, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.00017863060795080308, |
| "loss": 0.7278, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.24795377948964853, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.0001786093582214133, |
| "loss": 0.7203, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.24915743861338469, |
| "grad_norm": 1.7890625, |
| "learning_rate": 0.00017858800360318343, |
| "loss": 0.7666, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.25036109773712084, |
| "grad_norm": 1.5625, |
| "learning_rate": 0.0001785665441302327, |
| "loss": 0.7579, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.251564756860857, |
| "grad_norm": 1.71875, |
| "learning_rate": 0.00017854497983684782, |
| "loss": 0.7048, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.25276841598459315, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.000178523310757483, |
| "loss": 0.743, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.2539720751083293, |
| "grad_norm": 1.6953125, |
| "learning_rate": 0.0001785015369267599, |
| "loss": 0.7023, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.25517573423206547, |
| "grad_norm": 1.890625, |
| "learning_rate": 0.0001784796583794675, |
| "loss": 0.7709, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.2563793933558016, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00017845767515056214, |
| "loss": 0.7612, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.2575830524795378, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00017843558727516733, |
| "loss": 0.7406, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.25878671160327393, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.0001784133947885739, |
| "loss": 0.7741, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.2599903707270101, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00017839109772623966, |
| "loss": 0.8016, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.26119402985074625, |
| "grad_norm": 1.8125, |
| "learning_rate": 0.0001783686961237897, |
| "loss": 0.712, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.2623976889744824, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00017834619001701597, |
| "loss": 0.7446, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.26360134809821856, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.0001783235794418775, |
| "loss": 0.7314, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.26480500722195477, |
| "grad_norm": 1.640625, |
| "learning_rate": 0.0001783008644345002, |
| "loss": 0.743, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.2660086663456909, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00017827804503117676, |
| "loss": 0.7261, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.2672123254694271, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.0001782551212683668, |
| "loss": 0.6977, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.26841598459316324, |
| "grad_norm": 1.78125, |
| "learning_rate": 0.00017823209318269662, |
| "loss": 0.7521, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.2696196437168994, |
| "grad_norm": 1.7109375, |
| "learning_rate": 0.00017820896081095918, |
| "loss": 0.7073, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.27082330284063555, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.00017818572419011402, |
| "loss": 0.7562, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.2720269619643717, |
| "grad_norm": 1.625, |
| "learning_rate": 0.0001781623833572874, |
| "loss": 0.7414, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.27323062108810786, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.00017813893834977196, |
| "loss": 0.7684, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.274434280211844, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.00017811538920502678, |
| "loss": 0.7436, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.2756379393355802, |
| "grad_norm": 1.6953125, |
| "learning_rate": 0.0001780917359606774, |
| "loss": 0.7491, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.27684159845931633, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.00017806797865451557, |
| "loss": 0.7647, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.2780452575830525, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00017804411732449946, |
| "loss": 0.7486, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.27924891670678864, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.0001780201520087533, |
| "loss": 0.7637, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.2804525758305248, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.00017799608274556757, |
| "loss": 0.703, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.28165623495426095, |
| "grad_norm": 1.6953125, |
| "learning_rate": 0.00017797190957339872, |
| "loss": 0.7377, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.2828598940779971, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00017794763253086934, |
| "loss": 0.7271, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.28406355320173327, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.00017792325165676788, |
| "loss": 0.7541, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.2852672123254694, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00017789876699004874, |
| "loss": 0.7331, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.2864708714492056, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.00017787417856983216, |
| "loss": 0.7607, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.28767453057294173, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.0001778494864354041, |
| "loss": 0.7371, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.2888781896966779, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00017782469062621627, |
| "loss": 0.744, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.29008184882041405, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00017779979118188603, |
| "loss": 0.7065, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.2912855079441502, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.00017777478814219632, |
| "loss": 0.7438, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.29248916706788636, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.00017774968154709558, |
| "loss": 0.7451, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.2936928261916225, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.00017772447143669766, |
| "loss": 0.7435, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.29489648531535867, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00017769915785128193, |
| "loss": 0.7337, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.2961001444390948, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.00017767374083129295, |
| "loss": 0.7463, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.297303803562831, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.00017764822041734066, |
| "loss": 0.7128, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.29850746268656714, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.0001776225966502001, |
| "loss": 0.7143, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.29971112181030335, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.0001775968695708115, |
| "loss": 0.7268, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.3009147809340395, |
| "grad_norm": 1.6640625, |
| "learning_rate": 0.00017757103922028008, |
| "loss": 0.7318, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.30211844005777566, |
| "grad_norm": 1.71875, |
| "learning_rate": 0.0001775451056398762, |
| "loss": 0.7249, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.3033220991815118, |
| "grad_norm": 1.5625, |
| "learning_rate": 0.00017751906887103502, |
| "loss": 0.7291, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.304525758305248, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.0001774929289553567, |
| "loss": 0.7048, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.30572941742898413, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.000177466685934606, |
| "loss": 0.7292, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.3069330765527203, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00017744033985071263, |
| "loss": 0.7248, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.30813673567645644, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00017741389074577086, |
| "loss": 0.7061, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.3093403948001926, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00017738733866203956, |
| "loss": 0.7451, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.31054405392392875, |
| "grad_norm": 1.734375, |
| "learning_rate": 0.00017736068364194218, |
| "loss": 0.7291, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.3117477130476649, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00017733392572806658, |
| "loss": 0.7464, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.31295137217140107, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.00017730706496316506, |
| "loss": 0.7026, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.3141550312951372, |
| "grad_norm": 1.78125, |
| "learning_rate": 0.00017728010139015426, |
| "loss": 0.7097, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.3153586904188734, |
| "grad_norm": 1.828125, |
| "learning_rate": 0.000177253035052115, |
| "loss": 0.727, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.31656234954260953, |
| "grad_norm": 1.7734375, |
| "learning_rate": 0.0001772258659922924, |
| "loss": 0.7524, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.3177660086663457, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.00017719859425409566, |
| "loss": 0.7328, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.31896966779008185, |
| "grad_norm": 1.71875, |
| "learning_rate": 0.000177171219881098, |
| "loss": 0.7341, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.320173326913818, |
| "grad_norm": 1.65625, |
| "learning_rate": 0.0001771437429170366, |
| "loss": 0.7141, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.32137698603755416, |
| "grad_norm": 1.6953125, |
| "learning_rate": 0.00017711616340581273, |
| "loss": 0.718, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.3225806451612903, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.00017708848139149128, |
| "loss": 0.6941, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.32378430428502647, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00017706069691830105, |
| "loss": 0.6899, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.3249879634087626, |
| "grad_norm": 1.6875, |
| "learning_rate": 0.00017703281003063448, |
| "loss": 0.6839, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.3261916225324988, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.0001770048207730477, |
| "loss": 0.7249, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.32739528165623494, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.00017697672919026036, |
| "loss": 0.71, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.3285989407799711, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00017694853532715558, |
| "loss": 0.7557, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.32980259990370725, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00017692023922877997, |
| "loss": 0.6683, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.3310062590274434, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.0001768918409403434, |
| "loss": 0.7334, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.33220991815117956, |
| "grad_norm": 1.6171875, |
| "learning_rate": 0.0001768633405072191, |
| "loss": 0.684, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.3334135772749157, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.0001768347379749434, |
| "loss": 0.6837, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.33461723639865193, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00017680603338921587, |
| "loss": 0.7148, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.3358208955223881, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.00017677722679589901, |
| "loss": 0.7273, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.33702455464612424, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00017674831824101845, |
| "loss": 0.7363, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.3382282137698604, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.0001767193077707626, |
| "loss": 0.68, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.33943187289359655, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00017669019543148278, |
| "loss": 0.7127, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.3406355320173327, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00017666098126969303, |
| "loss": 0.72, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.34183919114106887, |
| "grad_norm": 1.78125, |
| "learning_rate": 0.0001766316653320701, |
| "loss": 0.7284, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.343042850264805, |
| "grad_norm": 1.5625, |
| "learning_rate": 0.00017660224766545332, |
| "loss": 0.729, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.3442465093885412, |
| "grad_norm": 1.71875, |
| "learning_rate": 0.00017657272831684458, |
| "loss": 0.7067, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.34545016851227733, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.0001765431073334083, |
| "loss": 0.7222, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.3466538276360135, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00017651338476247117, |
| "loss": 0.7052, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.34785748675974965, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.0001764835606515222, |
| "loss": 0.6729, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.3490611458834858, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.00017645363504821275, |
| "loss": 0.6867, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.35026480500722196, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00017642360800035619, |
| "loss": 0.6774, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.3514684641309581, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.0001763934795559281, |
| "loss": 0.6915, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.35267212325469427, |
| "grad_norm": 1.5, |
| "learning_rate": 0.000176363249763066, |
| "loss": 0.7101, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.3538757823784304, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.0001763329186700693, |
| "loss": 0.7403, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.3550794415021666, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00017630248632539937, |
| "loss": 0.7208, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.35628310062590274, |
| "grad_norm": 1.640625, |
| "learning_rate": 0.00017627195277767924, |
| "loss": 0.714, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.3574867597496389, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00017624131807569376, |
| "loss": 0.6912, |
| "step": 1485 |
| }, |
| { |
| "epoch": 0.35869041887337505, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00017621058226838932, |
| "loss": 0.6765, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.3598940779971112, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.0001761797454048738, |
| "loss": 0.6883, |
| "step": 1495 |
| }, |
| { |
| "epoch": 0.36109773712084736, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.00017614880753441666, |
| "loss": 0.6799, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.36109773712084736, |
| "eval_loss": 0.603824257850647, |
| "eval_runtime": 2.3494, |
| "eval_samples_per_second": 85.128, |
| "eval_steps_per_second": 85.128, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.3623013962445835, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.00017611776870644867, |
| "loss": 0.7172, |
| "step": 1505 |
| }, |
| { |
| "epoch": 0.3635050553683197, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00017608662897056193, |
| "loss": 0.6978, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.36470871449205583, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.0001760553883765097, |
| "loss": 0.706, |
| "step": 1515 |
| }, |
| { |
| "epoch": 0.365912373615792, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00017602404697420653, |
| "loss": 0.6777, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.36711603273952814, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.0001759926048137279, |
| "loss": 0.6695, |
| "step": 1525 |
| }, |
| { |
| "epoch": 0.3683196918632643, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.0001759610619453103, |
| "loss": 0.7313, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.36952335098700045, |
| "grad_norm": 1.71875, |
| "learning_rate": 0.00017592941841935118, |
| "loss": 0.722, |
| "step": 1535 |
| }, |
| { |
| "epoch": 0.37072701011073667, |
| "grad_norm": 1.71875, |
| "learning_rate": 0.00017589767428640874, |
| "loss": 0.7167, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.3719306692344728, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.00017586582959720203, |
| "loss": 0.69, |
| "step": 1545 |
| }, |
| { |
| "epoch": 0.373134328358209, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00017583388440261066, |
| "loss": 0.691, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.37433798748194513, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00017580183875367486, |
| "loss": 0.7092, |
| "step": 1555 |
| }, |
| { |
| "epoch": 0.3755416466056813, |
| "grad_norm": 1.5625, |
| "learning_rate": 0.00017576969270159543, |
| "loss": 0.7057, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.37674530572941745, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00017573744629773342, |
| "loss": 0.6886, |
| "step": 1565 |
| }, |
| { |
| "epoch": 0.3779489648531536, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00017570509959361034, |
| "loss": 0.7254, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.37915262397688976, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.000175672652640908, |
| "loss": 0.7132, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.3803562831006259, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00017564010549146823, |
| "loss": 0.687, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.38155994222436207, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00017560745819729306, |
| "loss": 0.6896, |
| "step": 1585 |
| }, |
| { |
| "epoch": 0.3827636013480982, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00017557471081054452, |
| "loss": 0.7031, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.3839672604718344, |
| "grad_norm": 1.6015625, |
| "learning_rate": 0.00017554186338354451, |
| "loss": 0.6904, |
| "step": 1595 |
| }, |
| { |
| "epoch": 0.38517091959557054, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00017550891596877484, |
| "loss": 0.6994, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.3863745787193067, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.00017547586861887696, |
| "loss": 0.6923, |
| "step": 1605 |
| }, |
| { |
| "epoch": 0.38757823784304285, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.00017544272138665212, |
| "loss": 0.6936, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.388781896966779, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.0001754094743250611, |
| "loss": 0.7102, |
| "step": 1615 |
| }, |
| { |
| "epoch": 0.38998555609051516, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00017537612748722416, |
| "loss": 0.6807, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.3911892152142513, |
| "grad_norm": 1.65625, |
| "learning_rate": 0.00017534268092642098, |
| "loss": 0.7496, |
| "step": 1625 |
| }, |
| { |
| "epoch": 0.3923928743379875, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00017530913469609066, |
| "loss": 0.6886, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.39359653346172363, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00017527548884983137, |
| "loss": 0.7033, |
| "step": 1635 |
| }, |
| { |
| "epoch": 0.3948001925854598, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00017524174344140067, |
| "loss": 0.7094, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.39600385170919594, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.00017520789852471498, |
| "loss": 0.6735, |
| "step": 1645 |
| }, |
| { |
| "epoch": 0.3972075108329321, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00017517395415384983, |
| "loss": 0.6832, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.39841116995666825, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.00017513991038303964, |
| "loss": 0.6997, |
| "step": 1655 |
| }, |
| { |
| "epoch": 0.3996148290804044, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.0001751057672666776, |
| "loss": 0.6992, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.40081848820414057, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00017507152485931565, |
| "loss": 0.674, |
| "step": 1665 |
| }, |
| { |
| "epoch": 0.4020221473278767, |
| "grad_norm": 1.7109375, |
| "learning_rate": 0.00017503718321566444, |
| "loss": 0.67, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.4032258064516129, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.00017500274239059304, |
| "loss": 0.6838, |
| "step": 1675 |
| }, |
| { |
| "epoch": 0.40442946557534903, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.0001749682024391291, |
| "loss": 0.716, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.40563312469908525, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00017493356341645862, |
| "loss": 0.6694, |
| "step": 1685 |
| }, |
| { |
| "epoch": 0.4068367838228214, |
| "grad_norm": 1.6328125, |
| "learning_rate": 0.00017489882537792583, |
| "loss": 0.7015, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.40804044294655756, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.00017486398837903325, |
| "loss": 0.6801, |
| "step": 1695 |
| }, |
| { |
| "epoch": 0.4092441020702937, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00017482905247544146, |
| "loss": 0.6746, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.41044776119402987, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00017479401772296907, |
| "loss": 0.7102, |
| "step": 1705 |
| }, |
| { |
| "epoch": 0.411651420317766, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00017475888417759264, |
| "loss": 0.6631, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.4128550794415022, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.0001747236518954466, |
| "loss": 0.6714, |
| "step": 1715 |
| }, |
| { |
| "epoch": 0.41405873856523834, |
| "grad_norm": 1.578125, |
| "learning_rate": 0.00017468832093282303, |
| "loss": 0.6976, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.4152623976889745, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.00017465289134617183, |
| "loss": 0.6858, |
| "step": 1725 |
| }, |
| { |
| "epoch": 0.41646605681271065, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00017461736319210038, |
| "loss": 0.6579, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.4176697159364468, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00017458173652737353, |
| "loss": 0.6895, |
| "step": 1735 |
| }, |
| { |
| "epoch": 0.41887337506018296, |
| "grad_norm": 1.640625, |
| "learning_rate": 0.00017454601140891353, |
| "loss": 0.6781, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.4200770341839191, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00017451018789380008, |
| "loss": 0.6734, |
| "step": 1745 |
| }, |
| { |
| "epoch": 0.4212806933076553, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.0001744742660392699, |
| "loss": 0.687, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.42248435243139143, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.00017443824590271685, |
| "loss": 0.6909, |
| "step": 1755 |
| }, |
| { |
| "epoch": 0.4236880115551276, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.00017440212754169193, |
| "loss": 0.6658, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.42489167067886374, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00017436591101390304, |
| "loss": 0.6879, |
| "step": 1765 |
| }, |
| { |
| "epoch": 0.4260953298025999, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00017432959637721492, |
| "loss": 0.6712, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.42729898892633605, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00017429318368964897, |
| "loss": 0.6484, |
| "step": 1775 |
| }, |
| { |
| "epoch": 0.4285026480500722, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.0001742566730093834, |
| "loss": 0.6784, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.42970630717380837, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.0001742200643947529, |
| "loss": 0.699, |
| "step": 1785 |
| }, |
| { |
| "epoch": 0.4309099662975445, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.00017418335790424862, |
| "loss": 0.7127, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.4321136254212807, |
| "grad_norm": 1.625, |
| "learning_rate": 0.00017414655359651818, |
| "loss": 0.6998, |
| "step": 1795 |
| }, |
| { |
| "epoch": 0.43331728454501683, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00017410965153036537, |
| "loss": 0.6865, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.434520943668753, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.0001740726517647503, |
| "loss": 0.7226, |
| "step": 1805 |
| }, |
| { |
| "epoch": 0.43572460279248915, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00017403555435878903, |
| "loss": 0.6809, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.4369282619162253, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.00017399835937175376, |
| "loss": 0.6601, |
| "step": 1815 |
| }, |
| { |
| "epoch": 0.43813192103996146, |
| "grad_norm": 1.6796875, |
| "learning_rate": 0.00017396106686307252, |
| "loss": 0.6662, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.4393355801636976, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.00017392367689232922, |
| "loss": 0.6769, |
| "step": 1825 |
| }, |
| { |
| "epoch": 0.4405392392874338, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.00017388618951926337, |
| "loss": 0.6615, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.44174289841117, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00017384860480377026, |
| "loss": 0.6853, |
| "step": 1835 |
| }, |
| { |
| "epoch": 0.44294655753490614, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00017381092280590061, |
| "loss": 0.7043, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.4441502166586423, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00017377314358586055, |
| "loss": 0.662, |
| "step": 1845 |
| }, |
| { |
| "epoch": 0.44535387578237845, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00017373526720401163, |
| "loss": 0.6932, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.4465575349061146, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.0001736972937208706, |
| "loss": 0.6811, |
| "step": 1855 |
| }, |
| { |
| "epoch": 0.44776119402985076, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00017365922319710934, |
| "loss": 0.6901, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.4489648531535869, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00017362105569355477, |
| "loss": 0.6312, |
| "step": 1865 |
| }, |
| { |
| "epoch": 0.4501685122773231, |
| "grad_norm": 1.703125, |
| "learning_rate": 0.0001735827912711888, |
| "loss": 0.6867, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.45137217140105923, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.00017354442999114816, |
| "loss": 0.6822, |
| "step": 1875 |
| }, |
| { |
| "epoch": 0.4525758305247954, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00017350597191472436, |
| "loss": 0.672, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.45377948964853154, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.00017346741710336352, |
| "loss": 0.6874, |
| "step": 1885 |
| }, |
| { |
| "epoch": 0.4549831487722677, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00017342876561866636, |
| "loss": 0.6388, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.45618680789600385, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00017339001752238805, |
| "loss": 0.6422, |
| "step": 1895 |
| }, |
| { |
| "epoch": 0.45739046701974, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00017335117287643807, |
| "loss": 0.7195, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.45859412614347617, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00017331223174288027, |
| "loss": 0.6871, |
| "step": 1905 |
| }, |
| { |
| "epoch": 0.4597977852672123, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.00017327319418393257, |
| "loss": 0.6816, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.4610014443909485, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.000173234060261967, |
| "loss": 0.6728, |
| "step": 1915 |
| }, |
| { |
| "epoch": 0.46220510351468463, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.00017319483003950948, |
| "loss": 0.6706, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.4634087626384208, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.0001731555035792399, |
| "loss": 0.707, |
| "step": 1925 |
| }, |
| { |
| "epoch": 0.46461242176215695, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.0001731160809439918, |
| "loss": 0.6717, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.4658160808858931, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00017307656219675257, |
| "loss": 0.6665, |
| "step": 1935 |
| }, |
| { |
| "epoch": 0.46701974000962926, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00017303694740066294, |
| "loss": 0.6663, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.4682233991333654, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00017299723661901717, |
| "loss": 0.663, |
| "step": 1945 |
| }, |
| { |
| "epoch": 0.46942705825710157, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.000172957429915263, |
| "loss": 0.6483, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.4706307173808377, |
| "grad_norm": 1.375, |
| "learning_rate": 0.0001729175273530013, |
| "loss": 0.6526, |
| "step": 1955 |
| }, |
| { |
| "epoch": 0.4718343765045739, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.0001728775289959861, |
| "loss": 0.6336, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.47303803562831004, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.0001728374349081246, |
| "loss": 0.6845, |
| "step": 1965 |
| }, |
| { |
| "epoch": 0.4742416947520462, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00017279724515347682, |
| "loss": 0.6943, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.4754453538757824, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00017275695979625564, |
| "loss": 0.678, |
| "step": 1975 |
| }, |
| { |
| "epoch": 0.47664901299951856, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00017271657890082683, |
| "loss": 0.6725, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.4778526721232547, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00017267610253170865, |
| "loss": 0.6427, |
| "step": 1985 |
| }, |
| { |
| "epoch": 0.4790563312469909, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00017263553075357195, |
| "loss": 0.66, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.48025999037072703, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00017259486363124013, |
| "loss": 0.6455, |
| "step": 1995 |
| }, |
| { |
| "epoch": 0.4814636494944632, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.0001725541012296887, |
| "loss": 0.6376, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.4814636494944632, |
| "eval_loss": 0.5736396312713623, |
| "eval_runtime": 2.3497, |
| "eval_samples_per_second": 85.118, |
| "eval_steps_per_second": 85.118, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.48266730861819934, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.00017251324361404558, |
| "loss": 0.6659, |
| "step": 2005 |
| }, |
| { |
| "epoch": 0.4838709677419355, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.0001724722908495908, |
| "loss": 0.6464, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.48507462686567165, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.0001724312430017563, |
| "loss": 0.6679, |
| "step": 2015 |
| }, |
| { |
| "epoch": 0.4862782859894078, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.0001723901001361261, |
| "loss": 0.6484, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.48748194511314397, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.0001723488623184359, |
| "loss": 0.6647, |
| "step": 2025 |
| }, |
| { |
| "epoch": 0.4886856042368801, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.0001723075296145732, |
| "loss": 0.6346, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.4898892633606163, |
| "grad_norm": 1.5703125, |
| "learning_rate": 0.00017226610209057698, |
| "loss": 0.6896, |
| "step": 2035 |
| }, |
| { |
| "epoch": 0.49109292248435243, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.00017222457981263793, |
| "loss": 0.6608, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.4922965816080886, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.00017218296284709795, |
| "loss": 0.6392, |
| "step": 2045 |
| }, |
| { |
| "epoch": 0.49350024073182475, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.00017214125126045022, |
| "loss": 0.6745, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.4947038998555609, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00017209944511933925, |
| "loss": 0.6423, |
| "step": 2055 |
| }, |
| { |
| "epoch": 0.49590755897929706, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.0001720575444905605, |
| "loss": 0.6368, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.4971112181030332, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00017201554944106044, |
| "loss": 0.6609, |
| "step": 2065 |
| }, |
| { |
| "epoch": 0.49831487722676937, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.0001719734600379364, |
| "loss": 0.6643, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.4995185363505055, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00017193127634843643, |
| "loss": 0.6324, |
| "step": 2075 |
| }, |
| { |
| "epoch": 0.5007221954742417, |
| "grad_norm": 1.671875, |
| "learning_rate": 0.00017188899843995927, |
| "loss": 0.6465, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.5019258545979779, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.00017184662638005418, |
| "loss": 0.6856, |
| "step": 2085 |
| }, |
| { |
| "epoch": 0.503129513721714, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00017180416023642085, |
| "loss": 0.6859, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.5043331728454502, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00017176160007690926, |
| "loss": 0.6251, |
| "step": 2095 |
| }, |
| { |
| "epoch": 0.5055368319691863, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00017171894596951974, |
| "loss": 0.6829, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.5067404910929225, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00017167619798240247, |
| "loss": 0.6331, |
| "step": 2105 |
| }, |
| { |
| "epoch": 0.5079441502166586, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.00017163335618385788, |
| "loss": 0.6639, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.5091478093403948, |
| "grad_norm": 1.5859375, |
| "learning_rate": 0.00017159042064233615, |
| "loss": 0.673, |
| "step": 2115 |
| }, |
| { |
| "epoch": 0.5103514684641309, |
| "grad_norm": 1.609375, |
| "learning_rate": 0.0001715473914264373, |
| "loss": 0.6561, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.5115551275878671, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00017150426860491089, |
| "loss": 0.6932, |
| "step": 2125 |
| }, |
| { |
| "epoch": 0.5127587867116032, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00017146105224665622, |
| "loss": 0.604, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.5139624458353395, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00017141774242072195, |
| "loss": 0.6275, |
| "step": 2135 |
| }, |
| { |
| "epoch": 0.5151661049590756, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00017137433919630604, |
| "loss": 0.6817, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.5163697640828118, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.00017133084264275576, |
| "loss": 0.6489, |
| "step": 2145 |
| }, |
| { |
| "epoch": 0.5175734232065479, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.0001712872528295674, |
| "loss": 0.6387, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.5187770823302841, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00017124356982638637, |
| "loss": 0.6671, |
| "step": 2155 |
| }, |
| { |
| "epoch": 0.5199807414540202, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00017119979370300683, |
| "loss": 0.6469, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.5211844005777564, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00017115592452937188, |
| "loss": 0.6686, |
| "step": 2165 |
| }, |
| { |
| "epoch": 0.5223880597014925, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.00017111196237557316, |
| "loss": 0.6767, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.5235917188252287, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00017106790731185096, |
| "loss": 0.6494, |
| "step": 2175 |
| }, |
| { |
| "epoch": 0.5247953779489648, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00017102375940859397, |
| "loss": 0.6521, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.525999037072701, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00017097951873633918, |
| "loss": 0.6689, |
| "step": 2185 |
| }, |
| { |
| "epoch": 0.5272026961964371, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00017093518536577193, |
| "loss": 0.65, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.5284063553201733, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.0001708907593677255, |
| "loss": 0.6565, |
| "step": 2195 |
| }, |
| { |
| "epoch": 0.5296100144439095, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.0001708462408131813, |
| "loss": 0.6561, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.5308136735676456, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.0001708016297732685, |
| "loss": 0.6656, |
| "step": 2205 |
| }, |
| { |
| "epoch": 0.5320173326913819, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00017075692631926418, |
| "loss": 0.6479, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.533220991815118, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00017071213052259295, |
| "loss": 0.6401, |
| "step": 2215 |
| }, |
| { |
| "epoch": 0.5344246509388542, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00017066724245482702, |
| "loss": 0.6677, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.5356283100625903, |
| "grad_norm": 1.546875, |
| "learning_rate": 0.00017062226218768608, |
| "loss": 0.6617, |
| "step": 2225 |
| }, |
| { |
| "epoch": 0.5368319691863265, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.00017057718979303696, |
| "loss": 0.6479, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.5380356283100626, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00017053202534289384, |
| "loss": 0.6772, |
| "step": 2235 |
| }, |
| { |
| "epoch": 0.5392392874337988, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00017048676890941796, |
| "loss": 0.6362, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.5404429465575349, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00017044142056491746, |
| "loss": 0.629, |
| "step": 2245 |
| }, |
| { |
| "epoch": 0.5416466056812711, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00017039598038184737, |
| "loss": 0.6257, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.5428502648050072, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.0001703504484328095, |
| "loss": 0.6523, |
| "step": 2255 |
| }, |
| { |
| "epoch": 0.5440539239287434, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.00017030482479055216, |
| "loss": 0.6657, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.5452575830524795, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.0001702591095279703, |
| "loss": 0.6787, |
| "step": 2265 |
| }, |
| { |
| "epoch": 0.5464612421762157, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00017021330271810515, |
| "loss": 0.6605, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.5476649012999518, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.00017016740443414428, |
| "loss": 0.6444, |
| "step": 2275 |
| }, |
| { |
| "epoch": 0.548868560423688, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.00017012141474942135, |
| "loss": 0.6792, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.5500722195474241, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.0001700753337374161, |
| "loss": 0.6539, |
| "step": 2285 |
| }, |
| { |
| "epoch": 0.5512758786711603, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.0001700291614717542, |
| "loss": 0.6465, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.5524795377948964, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00016998289802620705, |
| "loss": 0.6388, |
| "step": 2295 |
| }, |
| { |
| "epoch": 0.5536831969186327, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00016993654347469183, |
| "loss": 0.6497, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.5548868560423688, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.00016989009789127114, |
| "loss": 0.6617, |
| "step": 2305 |
| }, |
| { |
| "epoch": 0.556090515166105, |
| "grad_norm": 1.515625, |
| "learning_rate": 0.00016984356135015322, |
| "loss": 0.6336, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.5572941742898411, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.00016979693392569145, |
| "loss": 0.628, |
| "step": 2315 |
| }, |
| { |
| "epoch": 0.5584978334135773, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00016975021569238456, |
| "loss": 0.6733, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.5597014925373134, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00016970340672487627, |
| "loss": 0.6282, |
| "step": 2325 |
| }, |
| { |
| "epoch": 0.5609051516610496, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00016965650709795536, |
| "loss": 0.64, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.5621088107847857, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00016960951688655535, |
| "loss": 0.6105, |
| "step": 2335 |
| }, |
| { |
| "epoch": 0.5633124699085219, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00016956243616575458, |
| "loss": 0.6637, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.5645161290322581, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.00016951526501077598, |
| "loss": 0.6459, |
| "step": 2345 |
| }, |
| { |
| "epoch": 0.5657197881559942, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00016946800349698697, |
| "loss": 0.6501, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.5669234472797304, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00016942065169989933, |
| "loss": 0.6138, |
| "step": 2355 |
| }, |
| { |
| "epoch": 0.5681271064034665, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00016937320969516906, |
| "loss": 0.6259, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.5693307655272027, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00016932567755859634, |
| "loss": 0.6153, |
| "step": 2365 |
| }, |
| { |
| "epoch": 0.5705344246509388, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00016927805536612534, |
| "loss": 0.6106, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.571738083774675, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00016923034319384414, |
| "loss": 0.6342, |
| "step": 2375 |
| }, |
| { |
| "epoch": 0.5729417428984112, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00016918254111798455, |
| "loss": 0.627, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.5741454020221474, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00016913464921492199, |
| "loss": 0.6348, |
| "step": 2385 |
| }, |
| { |
| "epoch": 0.5753490611458835, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.0001690866675611755, |
| "loss": 0.6824, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.5765527202696197, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00016903859623340742, |
| "loss": 0.6375, |
| "step": 2395 |
| }, |
| { |
| "epoch": 0.5777563793933558, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00016899043530842344, |
| "loss": 0.6473, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.578960038517092, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00016894218486317235, |
| "loss": 0.6524, |
| "step": 2405 |
| }, |
| { |
| "epoch": 0.5801636976408281, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.000168893844974746, |
| "loss": 0.6084, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.5813673567645643, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00016884541572037917, |
| "loss": 0.612, |
| "step": 2415 |
| }, |
| { |
| "epoch": 0.5825710158883004, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00016879689717744936, |
| "loss": 0.6657, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.5837746750120366, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00016874828942347673, |
| "loss": 0.629, |
| "step": 2425 |
| }, |
| { |
| "epoch": 0.5849783341357727, |
| "grad_norm": 2.203125, |
| "learning_rate": 0.00016869959253612408, |
| "loss": 0.6586, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.5861819932595089, |
| "grad_norm": 1.5546875, |
| "learning_rate": 0.0001686508065931965, |
| "loss": 0.6274, |
| "step": 2435 |
| }, |
| { |
| "epoch": 0.587385652383245, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00016860193167264148, |
| "loss": 0.6241, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.5885893115069812, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00016855296785254852, |
| "loss": 0.6426, |
| "step": 2445 |
| }, |
| { |
| "epoch": 0.5897929706307173, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00016850391521114937, |
| "loss": 0.6492, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.5909966297544536, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.0001684547738268175, |
| "loss": 0.6299, |
| "step": 2455 |
| }, |
| { |
| "epoch": 0.5922002888781897, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00016840554377806823, |
| "loss": 0.6246, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.5934039480019259, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.0001683562251435586, |
| "loss": 0.6388, |
| "step": 2465 |
| }, |
| { |
| "epoch": 0.594607607125662, |
| "grad_norm": 1.53125, |
| "learning_rate": 0.00016830681800208714, |
| "loss": 0.6423, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.5958112662493982, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.0001682573224325938, |
| "loss": 0.6324, |
| "step": 2475 |
| }, |
| { |
| "epoch": 0.5970149253731343, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00016820773851415976, |
| "loss": 0.6533, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.5982185844968705, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.0001681580663260075, |
| "loss": 0.7084, |
| "step": 2485 |
| }, |
| { |
| "epoch": 0.5994222436206067, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00016810830594750034, |
| "loss": 0.618, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.6006259027443428, |
| "grad_norm": 1.5625, |
| "learning_rate": 0.0001680584574581427, |
| "loss": 0.6244, |
| "step": 2495 |
| }, |
| { |
| "epoch": 0.601829561868079, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00016800852093757965, |
| "loss": 0.6373, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.601829561868079, |
| "eval_loss": 0.5356536507606506, |
| "eval_runtime": 2.3478, |
| "eval_samples_per_second": 85.185, |
| "eval_steps_per_second": 85.185, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.6030332209918151, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.00016795849646559694, |
| "loss": 0.6164, |
| "step": 2505 |
| }, |
| { |
| "epoch": 0.6042368801155513, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00016790838412212086, |
| "loss": 0.6377, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.6054405392392874, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.00016785818398721807, |
| "loss": 0.6171, |
| "step": 2515 |
| }, |
| { |
| "epoch": 0.6066441983630236, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00016780789614109556, |
| "loss": 0.649, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.6078478574867597, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.0001677575206641004, |
| "loss": 0.644, |
| "step": 2525 |
| }, |
| { |
| "epoch": 0.609051516610496, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00016770705763671966, |
| "loss": 0.6246, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.610255175734232, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00016765650713958035, |
| "loss": 0.6154, |
| "step": 2535 |
| }, |
| { |
| "epoch": 0.6114588348579683, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.00016760586925344923, |
| "loss": 0.6131, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.6126624939817044, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00016755514405923256, |
| "loss": 0.6289, |
| "step": 2545 |
| }, |
| { |
| "epoch": 0.6138661531054406, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00016750433163797632, |
| "loss": 0.6327, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.6150698122291767, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00016745343207086566, |
| "loss": 0.6132, |
| "step": 2555 |
| }, |
| { |
| "epoch": 0.6162734713529129, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00016740244543922504, |
| "loss": 0.613, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.617477130476649, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00016735137182451808, |
| "loss": 0.6422, |
| "step": 2565 |
| }, |
| { |
| "epoch": 0.6186807896003852, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00016730021130834729, |
| "loss": 0.6259, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.6198844487241213, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00016724896397245402, |
| "loss": 0.6017, |
| "step": 2575 |
| }, |
| { |
| "epoch": 0.6210881078478575, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00016719762989871845, |
| "loss": 0.5948, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.6222917669715936, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00016714620916915922, |
| "loss": 0.6597, |
| "step": 2585 |
| }, |
| { |
| "epoch": 0.6234954260953298, |
| "grad_norm": 1.46875, |
| "learning_rate": 0.0001670947018659335, |
| "loss": 0.6211, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.6246990852190659, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00016704310807133673, |
| "loss": 0.6293, |
| "step": 2595 |
| }, |
| { |
| "epoch": 0.6259027443428021, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.0001669914278678026, |
| "loss": 0.6254, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.6271064034665382, |
| "grad_norm": 1.5078125, |
| "learning_rate": 0.0001669396613379028, |
| "loss": 0.633, |
| "step": 2605 |
| }, |
| { |
| "epoch": 0.6283100625902744, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00016688780856434696, |
| "loss": 0.6533, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.6295137217140105, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00016683586962998258, |
| "loss": 0.641, |
| "step": 2615 |
| }, |
| { |
| "epoch": 0.6307173808377468, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00016678384461779472, |
| "loss": 0.6204, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.6319210399614829, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.000166731733610906, |
| "loss": 0.6634, |
| "step": 2625 |
| }, |
| { |
| "epoch": 0.6331246990852191, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00016667953669257648, |
| "loss": 0.6318, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.6343283582089553, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00016662725394620345, |
| "loss": 0.6196, |
| "step": 2635 |
| }, |
| { |
| "epoch": 0.6355320173326914, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00016657488545532132, |
| "loss": 0.6651, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.6367356764564276, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00016652243130360153, |
| "loss": 0.6361, |
| "step": 2645 |
| }, |
| { |
| "epoch": 0.6379393355801637, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.00016646989157485237, |
| "loss": 0.6251, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.6391429947038999, |
| "grad_norm": 1.5390625, |
| "learning_rate": 0.00016641726635301883, |
| "loss": 0.6421, |
| "step": 2655 |
| }, |
| { |
| "epoch": 0.640346653827636, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00016636455572218255, |
| "loss": 0.6256, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.6415503129513722, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00016631175976656158, |
| "loss": 0.6211, |
| "step": 2665 |
| }, |
| { |
| "epoch": 0.6427539720751083, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00016625887857051036, |
| "loss": 0.6005, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.6439576311988445, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00016620591221851947, |
| "loss": 0.6192, |
| "step": 2675 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 1.5625, |
| "learning_rate": 0.00016615286079521555, |
| "loss": 0.63, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.6463649494463168, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.0001660997243853612, |
| "loss": 0.6259, |
| "step": 2685 |
| }, |
| { |
| "epoch": 0.6475686085700529, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00016604650307385475, |
| "loss": 0.6238, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.6487722676937892, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.0001659931969457302, |
| "loss": 0.6168, |
| "step": 2695 |
| }, |
| { |
| "epoch": 0.6499759268175253, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00016593980608615717, |
| "loss": 0.6238, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.6511795859412615, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00016588633058044045, |
| "loss": 0.6764, |
| "step": 2705 |
| }, |
| { |
| "epoch": 0.6523832450649976, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00016583277051402027, |
| "loss": 0.6082, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.6535869041887338, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00016577912597247183, |
| "loss": 0.6378, |
| "step": 2715 |
| }, |
| { |
| "epoch": 0.6547905633124699, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.0001657253970415054, |
| "loss": 0.62, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.6559942224362061, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.000165671583806966, |
| "loss": 0.6159, |
| "step": 2725 |
| }, |
| { |
| "epoch": 0.6571978815599422, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.0001656176863548334, |
| "loss": 0.6557, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.6584015406836784, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00016556370477122196, |
| "loss": 0.6616, |
| "step": 2735 |
| }, |
| { |
| "epoch": 0.6596051998074145, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00016550963914238031, |
| "loss": 0.6135, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.6608088589311507, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00016545548955469158, |
| "loss": 0.5916, |
| "step": 2745 |
| }, |
| { |
| "epoch": 0.6620125180548868, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.0001654012560946728, |
| "loss": 0.6324, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.663216177178623, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.0001653469388489753, |
| "loss": 0.6404, |
| "step": 2755 |
| }, |
| { |
| "epoch": 0.6644198363023591, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00016529253790438395, |
| "loss": 0.6192, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.6656234954260953, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.00016523805334781762, |
| "loss": 0.6326, |
| "step": 2765 |
| }, |
| { |
| "epoch": 0.6668271545498314, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.00016518348526632865, |
| "loss": 0.635, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.6680308136735676, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.0001651288337471028, |
| "loss": 0.6192, |
| "step": 2775 |
| }, |
| { |
| "epoch": 0.6692344727973039, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00016507409887745922, |
| "loss": 0.6423, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.67043813192104, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00016501928074485016, |
| "loss": 0.6264, |
| "step": 2785 |
| }, |
| { |
| "epoch": 0.6716417910447762, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00016496437943686103, |
| "loss": 0.6028, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.6728454501685123, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00016490939504120998, |
| "loss": 0.6289, |
| "step": 2795 |
| }, |
| { |
| "epoch": 0.6740491092922485, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00016485432764574797, |
| "loss": 0.6272, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.6752527684159846, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.00016479917733845862, |
| "loss": 0.6095, |
| "step": 2805 |
| }, |
| { |
| "epoch": 0.6764564275397208, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00016474394420745798, |
| "loss": 0.6311, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.6776600866634569, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.0001646886283409944, |
| "loss": 0.6475, |
| "step": 2815 |
| }, |
| { |
| "epoch": 0.6788637457871931, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.0001646332298274485, |
| "loss": 0.63, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.6800674049109292, |
| "grad_norm": 1.59375, |
| "learning_rate": 0.00016457774875533283, |
| "loss": 0.6144, |
| "step": 2825 |
| }, |
| { |
| "epoch": 0.6812710640346654, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.000164522185213292, |
| "loss": 0.6001, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.6824747231584015, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.0001644665392901023, |
| "loss": 0.621, |
| "step": 2835 |
| }, |
| { |
| "epoch": 0.6836783822821377, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00016441081107467157, |
| "loss": 0.639, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.6848820414058738, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00016435500065603933, |
| "loss": 0.6184, |
| "step": 2845 |
| }, |
| { |
| "epoch": 0.68608570052961, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.0001642991081233762, |
| "loss": 0.6164, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.6872893596533461, |
| "grad_norm": 1.4921875, |
| "learning_rate": 0.00016424313356598423, |
| "loss": 0.5994, |
| "step": 2855 |
| }, |
| { |
| "epoch": 0.6884930187770824, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00016418707707329636, |
| "loss": 0.6175, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.6896966779008185, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.0001641309387348765, |
| "loss": 0.6209, |
| "step": 2865 |
| }, |
| { |
| "epoch": 0.6909003370245547, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00016407471864041926, |
| "loss": 0.6126, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.6921039961482908, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00016401841687975005, |
| "loss": 0.605, |
| "step": 2875 |
| }, |
| { |
| "epoch": 0.693307655272027, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00016396203354282458, |
| "loss": 0.6187, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.6945113143957631, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00016390556871972902, |
| "loss": 0.6374, |
| "step": 2885 |
| }, |
| { |
| "epoch": 0.6957149735194993, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00016384902250067963, |
| "loss": 0.6246, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.6969186326432354, |
| "grad_norm": 1.5, |
| "learning_rate": 0.0001637923949760228, |
| "loss": 0.6356, |
| "step": 2895 |
| }, |
| { |
| "epoch": 0.6981222917669716, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.00016373568623623478, |
| "loss": 0.5897, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.6993259508907077, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00016367889637192162, |
| "loss": 0.63, |
| "step": 2905 |
| }, |
| { |
| "epoch": 0.7005296100144439, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00016362202547381892, |
| "loss": 0.5854, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.70173326913818, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.00016356507363279188, |
| "loss": 0.6286, |
| "step": 2915 |
| }, |
| { |
| "epoch": 0.7029369282619162, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.0001635080409398349, |
| "loss": 0.5537, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.7041405873856523, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00016345092748607158, |
| "loss": 0.6272, |
| "step": 2925 |
| }, |
| { |
| "epoch": 0.7053442465093885, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00016339373336275457, |
| "loss": 0.5848, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.7065479056331248, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00016333645866126544, |
| "loss": 0.5941, |
| "step": 2935 |
| }, |
| { |
| "epoch": 0.7077515647568609, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00016327910347311452, |
| "loss": 0.583, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.7089552238805971, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00016322166788994063, |
| "loss": 0.6296, |
| "step": 2945 |
| }, |
| { |
| "epoch": 0.7101588830043332, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00016316415200351116, |
| "loss": 0.6183, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.7113625421280694, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00016310655590572173, |
| "loss": 0.604, |
| "step": 2955 |
| }, |
| { |
| "epoch": 0.7125662012518055, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.0001630488796885961, |
| "loss": 0.5714, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.7137698603755417, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00016299112344428615, |
| "loss": 0.5934, |
| "step": 2965 |
| }, |
| { |
| "epoch": 0.7149735194992778, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00016293328726507152, |
| "loss": 0.612, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.716177178623014, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.00016287537124335958, |
| "loss": 0.5918, |
| "step": 2975 |
| }, |
| { |
| "epoch": 0.7173808377467501, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00016281737547168534, |
| "loss": 0.6047, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.7185844968704863, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00016275930004271114, |
| "loss": 0.6189, |
| "step": 2985 |
| }, |
| { |
| "epoch": 0.7197881559942224, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.0001627011450492266, |
| "loss": 0.6126, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.7209918151179586, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.0001626429105841485, |
| "loss": 0.5967, |
| "step": 2995 |
| }, |
| { |
| "epoch": 0.7221954742416947, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00016258459674052067, |
| "loss": 0.5948, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.7221954742416947, |
| "eval_loss": 0.5183995962142944, |
| "eval_runtime": 2.351, |
| "eval_samples_per_second": 85.07, |
| "eval_steps_per_second": 85.07, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.7233991333654309, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00016252620361151358, |
| "loss": 0.6386, |
| "step": 3005 |
| }, |
| { |
| "epoch": 0.724602792489167, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00016246773129042453, |
| "loss": 0.6711, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.7258064516129032, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.0001624091798706773, |
| "loss": 0.6083, |
| "step": 3015 |
| }, |
| { |
| "epoch": 0.7270101107366393, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00016235054944582203, |
| "loss": 0.5965, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.7282137698603756, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.0001622918401095351, |
| "loss": 0.593, |
| "step": 3025 |
| }, |
| { |
| "epoch": 0.7294174289841117, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.000162233051955619, |
| "loss": 0.6226, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.7306210881078479, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00016217418507800214, |
| "loss": 0.5963, |
| "step": 3035 |
| }, |
| { |
| "epoch": 0.731824747231584, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.0001621152395707387, |
| "loss": 0.5788, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.7330284063553202, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00016205621552800847, |
| "loss": 0.5831, |
| "step": 3045 |
| }, |
| { |
| "epoch": 0.7342320654790563, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00016199711304411673, |
| "loss": 0.6308, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.7354357246027925, |
| "grad_norm": 1.5234375, |
| "learning_rate": 0.00016193793221349413, |
| "loss": 0.5763, |
| "step": 3055 |
| }, |
| { |
| "epoch": 0.7366393837265286, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.00016187867313069643, |
| "loss": 0.622, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.7378430428502648, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00016181933589040447, |
| "loss": 0.6127, |
| "step": 3065 |
| }, |
| { |
| "epoch": 0.7390467019740009, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00016175992058742398, |
| "loss": 0.6363, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.7402503610977371, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00016170042731668537, |
| "loss": 0.6344, |
| "step": 3075 |
| }, |
| { |
| "epoch": 0.7414540202214733, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00016164085617324358, |
| "loss": 0.6138, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.7426576793452094, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00016158120725227808, |
| "loss": 0.5386, |
| "step": 3085 |
| }, |
| { |
| "epoch": 0.7438613384689456, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00016152148064909255, |
| "loss": 0.6276, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.7450649975926817, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00016146167645911478, |
| "loss": 0.5944, |
| "step": 3095 |
| }, |
| { |
| "epoch": 0.746268656716418, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00016140179477789653, |
| "loss": 0.6163, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.7474723158401541, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00016134183570111336, |
| "loss": 0.6032, |
| "step": 3105 |
| }, |
| { |
| "epoch": 0.7486759749638903, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00016128179932456456, |
| "loss": 0.569, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.7498796340876264, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.0001612216857441728, |
| "loss": 0.6101, |
| "step": 3115 |
| }, |
| { |
| "epoch": 0.7510832932113626, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00016116149505598424, |
| "loss": 0.5896, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.7522869523350987, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00016110122735616815, |
| "loss": 0.5981, |
| "step": 3125 |
| }, |
| { |
| "epoch": 0.7534906114588349, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.0001610408827410168, |
| "loss": 0.6139, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.754694270582571, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.0001609804613069455, |
| "loss": 0.5773, |
| "step": 3135 |
| }, |
| { |
| "epoch": 0.7558979297063072, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00016091996315049215, |
| "loss": 0.6342, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.7571015888300433, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.00016085938836831736, |
| "loss": 0.5896, |
| "step": 3145 |
| }, |
| { |
| "epoch": 0.7583052479537795, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00016079873705720404, |
| "loss": 0.6124, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.7595089070775156, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00016073800931405748, |
| "loss": 0.5761, |
| "step": 3155 |
| }, |
| { |
| "epoch": 0.7607125662012518, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00016067720523590503, |
| "loss": 0.6186, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.7619162253249879, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00016061632491989604, |
| "loss": 0.604, |
| "step": 3165 |
| }, |
| { |
| "epoch": 0.7631198844487241, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00016055536846330161, |
| "loss": 0.5822, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.7643235435724602, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00016049433596351453, |
| "loss": 0.6034, |
| "step": 3175 |
| }, |
| { |
| "epoch": 0.7655272026961965, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.00016043322751804913, |
| "loss": 0.5935, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.7667308618199326, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00016037204322454101, |
| "loss": 0.6009, |
| "step": 3185 |
| }, |
| { |
| "epoch": 0.7679345209436688, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.000160310783180747, |
| "loss": 0.6069, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.7691381800674049, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00016024944748454495, |
| "loss": 0.6134, |
| "step": 3195 |
| }, |
| { |
| "epoch": 0.7703418391911411, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00016018803623393354, |
| "loss": 0.6333, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.7715454983148772, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00016012654952703227, |
| "loss": 0.5912, |
| "step": 3205 |
| }, |
| { |
| "epoch": 0.7727491574386134, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00016006498746208106, |
| "loss": 0.6249, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.7739528165623495, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.0001600033501374404, |
| "loss": 0.6048, |
| "step": 3215 |
| }, |
| { |
| "epoch": 0.7751564756860857, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00015994163765159085, |
| "loss": 0.5762, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.7763601348098219, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00015987985010313318, |
| "loss": 0.6221, |
| "step": 3225 |
| }, |
| { |
| "epoch": 0.777563793933558, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00015981798759078807, |
| "loss": 0.6514, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.7787674530572942, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00015975605021339588, |
| "loss": 0.6116, |
| "step": 3235 |
| }, |
| { |
| "epoch": 0.7799711121810303, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00015969403806991679, |
| "loss": 0.5857, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.7811747713047665, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.0001596319512594302, |
| "loss": 0.5562, |
| "step": 3245 |
| }, |
| { |
| "epoch": 0.7823784304285026, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00015956978988113492, |
| "loss": 0.6025, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.7835820895522388, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.00015950755403434897, |
| "loss": 0.5709, |
| "step": 3255 |
| }, |
| { |
| "epoch": 0.784785748675975, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00015944524381850922, |
| "loss": 0.6152, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.7859894077997112, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00015938285933317145, |
| "loss": 0.5825, |
| "step": 3265 |
| }, |
| { |
| "epoch": 0.7871930669234473, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00015932040067801007, |
| "loss": 0.5696, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.7883967260471835, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00015925786795281799, |
| "loss": 0.6222, |
| "step": 3275 |
| }, |
| { |
| "epoch": 0.7896003851709196, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00015919526125750646, |
| "loss": 0.5668, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.7908040442946558, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00015913258069210495, |
| "loss": 0.6167, |
| "step": 3285 |
| }, |
| { |
| "epoch": 0.7920077034183919, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00015906982635676092, |
| "loss": 0.5837, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.7932113625421281, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00015900699835173977, |
| "loss": 0.6018, |
| "step": 3295 |
| }, |
| { |
| "epoch": 0.7944150216658642, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.00015894409677742445, |
| "loss": 0.5993, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.7956186807896004, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00015888112173431565, |
| "loss": 0.5965, |
| "step": 3305 |
| }, |
| { |
| "epoch": 0.7968223399133365, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00015881807332303126, |
| "loss": 0.5797, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.7980259990370727, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00015875495164430653, |
| "loss": 0.5991, |
| "step": 3315 |
| }, |
| { |
| "epoch": 0.7992296581608088, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.0001586917567989937, |
| "loss": 0.6081, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.800433317284545, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00015862848888806194, |
| "loss": 0.6085, |
| "step": 3325 |
| }, |
| { |
| "epoch": 0.8016369764082811, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00015856514801259713, |
| "loss": 0.5954, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.8028406355320173, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00015850173427380177, |
| "loss": 0.5854, |
| "step": 3335 |
| }, |
| { |
| "epoch": 0.8040442946557534, |
| "grad_norm": 1.5, |
| "learning_rate": 0.0001584382477729948, |
| "loss": 0.6125, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.8052479537794897, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00015837468861161133, |
| "loss": 0.5994, |
| "step": 3345 |
| }, |
| { |
| "epoch": 0.8064516129032258, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.0001583110568912026, |
| "loss": 0.5539, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.807655272026962, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00015824735271343577, |
| "loss": 0.5923, |
| "step": 3355 |
| }, |
| { |
| "epoch": 0.8088589311506981, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00015818357618009384, |
| "loss": 0.5316, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.8100625902744343, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00015811972739307532, |
| "loss": 0.574, |
| "step": 3365 |
| }, |
| { |
| "epoch": 0.8112662493981705, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00015805580645439422, |
| "loss": 0.591, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.8124699085219066, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00015799181346617985, |
| "loss": 0.595, |
| "step": 3375 |
| }, |
| { |
| "epoch": 0.8136735676456428, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00015792774853067652, |
| "loss": 0.6029, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.8148772267693789, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00015786361175024362, |
| "loss": 0.5517, |
| "step": 3385 |
| }, |
| { |
| "epoch": 0.8160808858931151, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.00015779940322735529, |
| "loss": 0.6423, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.8172845450168512, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.00015773512306460025, |
| "loss": 0.5942, |
| "step": 3395 |
| }, |
| { |
| "epoch": 0.8184882041405874, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.0001576707713646818, |
| "loss": 0.5914, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.8196918632643235, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.00015760634823041737, |
| "loss": 0.6314, |
| "step": 3405 |
| }, |
| { |
| "epoch": 0.8208955223880597, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00015754185376473865, |
| "loss": 0.5919, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.8220991815117958, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.0001574772880706913, |
| "loss": 0.5966, |
| "step": 3415 |
| }, |
| { |
| "epoch": 0.823302840635532, |
| "grad_norm": 1.375, |
| "learning_rate": 0.0001574126512514347, |
| "loss": 0.5682, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.8245064997592682, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00015734794341024192, |
| "loss": 0.6198, |
| "step": 3425 |
| }, |
| { |
| "epoch": 0.8257101588830044, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00015728316465049954, |
| "loss": 0.6307, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.8269138180067405, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00015721831507570734, |
| "loss": 0.5518, |
| "step": 3435 |
| }, |
| { |
| "epoch": 0.8281174771304767, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.0001571533947894784, |
| "loss": 0.561, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.8293211362542128, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00015708840389553862, |
| "loss": 0.5705, |
| "step": 3445 |
| }, |
| { |
| "epoch": 0.830524795377949, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00015702334249772688, |
| "loss": 0.5753, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.8317284545016851, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00015695821069999452, |
| "loss": 0.6035, |
| "step": 3455 |
| }, |
| { |
| "epoch": 0.8329321136254213, |
| "grad_norm": 1.4375, |
| "learning_rate": 0.0001568930086064055, |
| "loss": 0.6083, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.8341357727491574, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00015682773632113604, |
| "loss": 0.5731, |
| "step": 3465 |
| }, |
| { |
| "epoch": 0.8353394318728936, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00015676239394847458, |
| "loss": 0.5819, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.8365430909966297, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00015669698159282135, |
| "loss": 0.5685, |
| "step": 3475 |
| }, |
| { |
| "epoch": 0.8377467501203659, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00015663149935868866, |
| "loss": 0.5876, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.838950409244102, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00015656594735070025, |
| "loss": 0.5996, |
| "step": 3485 |
| }, |
| { |
| "epoch": 0.8401540683678382, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00015650032567359146, |
| "loss": 0.5421, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.8413577274915743, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00015643463443220886, |
| "loss": 0.6113, |
| "step": 3495 |
| }, |
| { |
| "epoch": 0.8425613866153105, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.0001563688737315103, |
| "loss": 0.6228, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.8425613866153105, |
| "eval_loss": 0.49500542879104614, |
| "eval_runtime": 2.3464, |
| "eval_samples_per_second": 85.235, |
| "eval_steps_per_second": 85.235, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.8437650457390466, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00015630304367656445, |
| "loss": 0.5976, |
| "step": 3505 |
| }, |
| { |
| "epoch": 0.8449687048627829, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.0001562371443725509, |
| "loss": 0.5939, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.8461723639865191, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00015617117592475982, |
| "loss": 0.5936, |
| "step": 3515 |
| }, |
| { |
| "epoch": 0.8473760231102552, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00015610513843859189, |
| "loss": 0.5781, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.8485796822339914, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00015603903201955808, |
| "loss": 0.6042, |
| "step": 3525 |
| }, |
| { |
| "epoch": 0.8497833413577275, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.0001559728567732795, |
| "loss": 0.588, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.8509870004814637, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.00015590661280548724, |
| "loss": 0.6451, |
| "step": 3535 |
| }, |
| { |
| "epoch": 0.8521906596051998, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00015584030022202218, |
| "loss": 0.5958, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.853394318728936, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.0001557739191288348, |
| "loss": 0.5927, |
| "step": 3545 |
| }, |
| { |
| "epoch": 0.8545979778526721, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00015570746963198512, |
| "loss": 0.5637, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.8558016369764083, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00015564095183764234, |
| "loss": 0.6069, |
| "step": 3555 |
| }, |
| { |
| "epoch": 0.8570052961001444, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00015557436585208489, |
| "loss": 0.5895, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.8582089552238806, |
| "grad_norm": 1.25, |
| "learning_rate": 0.0001555077117817001, |
| "loss": 0.6186, |
| "step": 3565 |
| }, |
| { |
| "epoch": 0.8594126143476167, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00015544098973298406, |
| "loss": 0.5915, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.8606162734713529, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00015537419981254152, |
| "loss": 0.595, |
| "step": 3575 |
| }, |
| { |
| "epoch": 0.861819932595089, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.00015530734212708568, |
| "loss": 0.5774, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.8630235917188253, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00015524041678343793, |
| "loss": 0.5634, |
| "step": 3585 |
| }, |
| { |
| "epoch": 0.8642272508425614, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.0001551734238885278, |
| "loss": 0.5802, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.8654309099662976, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00015510636354939284, |
| "loss": 0.6019, |
| "step": 3595 |
| }, |
| { |
| "epoch": 0.8666345690900337, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.0001550392358731782, |
| "loss": 0.5786, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.8678382282137699, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.0001549720409671368, |
| "loss": 0.5844, |
| "step": 3605 |
| }, |
| { |
| "epoch": 0.869041887337506, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00015490477893862875, |
| "loss": 0.5678, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.8702455464612422, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.0001548374498951216, |
| "loss": 0.6117, |
| "step": 3615 |
| }, |
| { |
| "epoch": 0.8714492055849783, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00015477005394418992, |
| "loss": 0.5746, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.8726528647087145, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00015470259119351512, |
| "loss": 0.5975, |
| "step": 3625 |
| }, |
| { |
| "epoch": 0.8738565238324506, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00015463506175088536, |
| "loss": 0.6064, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.8750601829561868, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00015456746572419544, |
| "loss": 0.5696, |
| "step": 3635 |
| }, |
| { |
| "epoch": 0.8762638420799229, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00015449980322144644, |
| "loss": 0.6189, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.8774675012036591, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00015443207435074572, |
| "loss": 0.5896, |
| "step": 3645 |
| }, |
| { |
| "epoch": 0.8786711603273952, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00015436427922030667, |
| "loss": 0.5845, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.8798748194511314, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00015429641793844844, |
| "loss": 0.5737, |
| "step": 3655 |
| }, |
| { |
| "epoch": 0.8810784785748677, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00015422849061359607, |
| "loss": 0.5527, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.8822821376986038, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00015416049735427994, |
| "loss": 0.5435, |
| "step": 3665 |
| }, |
| { |
| "epoch": 0.88348579682234, |
| "grad_norm": 1.25, |
| "learning_rate": 0.0001540924382691359, |
| "loss": 0.614, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.8846894559460761, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00015402431346690483, |
| "loss": 0.5945, |
| "step": 3675 |
| }, |
| { |
| "epoch": 0.8858931150698123, |
| "grad_norm": 1.3671875, |
| "learning_rate": 0.00015395612305643282, |
| "loss": 0.5986, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.8870967741935484, |
| "grad_norm": 1.25, |
| "learning_rate": 0.0001538878671466706, |
| "loss": 0.5526, |
| "step": 3685 |
| }, |
| { |
| "epoch": 0.8883004333172846, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.0001538195458466736, |
| "loss": 0.5799, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.8895040924410207, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.00015375115926560178, |
| "loss": 0.586, |
| "step": 3695 |
| }, |
| { |
| "epoch": 0.8907077515647569, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00015368270751271937, |
| "loss": 0.5983, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.891911410688493, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.0001536141906973947, |
| "loss": 0.5521, |
| "step": 3705 |
| }, |
| { |
| "epoch": 0.8931150698122292, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00015354560892910006, |
| "loss": 0.581, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.8943187289359653, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00015347696231741157, |
| "loss": 0.584, |
| "step": 3715 |
| }, |
| { |
| "epoch": 0.8955223880597015, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.00015340825097200891, |
| "loss": 0.5708, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.8967260471834376, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.0001533394750026752, |
| "loss": 0.5845, |
| "step": 3725 |
| }, |
| { |
| "epoch": 0.8979297063071738, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00015327063451929675, |
| "loss": 0.5956, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.8991333654309099, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00015320172963186312, |
| "loss": 0.5435, |
| "step": 3735 |
| }, |
| { |
| "epoch": 0.9003370245546461, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00015313276045046656, |
| "loss": 0.5731, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.9015406836783822, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00015306372708530217, |
| "loss": 0.5524, |
| "step": 3745 |
| }, |
| { |
| "epoch": 0.9027443428021185, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00015299462964666756, |
| "loss": 0.578, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.9039480019258546, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00015292546824496276, |
| "loss": 0.5885, |
| "step": 3755 |
| }, |
| { |
| "epoch": 0.9051516610495908, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00015285624299068992, |
| "loss": 0.5916, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.9063553201733269, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00015278695399445328, |
| "loss": 0.6174, |
| "step": 3765 |
| }, |
| { |
| "epoch": 0.9075589792970631, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00015271760136695888, |
| "loss": 0.5735, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.9087626384207992, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.0001526481852190144, |
| "loss": 0.583, |
| "step": 3775 |
| }, |
| { |
| "epoch": 0.9099662975445354, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00015257870566152916, |
| "loss": 0.5883, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.9111699566682715, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00015250916280551356, |
| "loss": 0.579, |
| "step": 3785 |
| }, |
| { |
| "epoch": 0.9123736157920077, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00015243955676207932, |
| "loss": 0.5754, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.9135772749157438, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00015236988764243906, |
| "loss": 0.561, |
| "step": 3795 |
| }, |
| { |
| "epoch": 0.91478093403948, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00015230015555790614, |
| "loss": 0.596, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.9159845931632162, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00015223036061989454, |
| "loss": 0.5806, |
| "step": 3805 |
| }, |
| { |
| "epoch": 0.9171882522869523, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00015216050293991875, |
| "loss": 0.6107, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.9183919114106885, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00015209058262959338, |
| "loss": 0.5624, |
| "step": 3815 |
| }, |
| { |
| "epoch": 0.9195955705344246, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00015202059980063315, |
| "loss": 0.6023, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.9207992296581609, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.00015195055456485276, |
| "loss": 0.5684, |
| "step": 3825 |
| }, |
| { |
| "epoch": 0.922002888781897, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00015188044703416646, |
| "loss": 0.589, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.9232065479056332, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00015181027732058812, |
| "loss": 0.5965, |
| "step": 3835 |
| }, |
| { |
| "epoch": 0.9244102070293693, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00015174004553623096, |
| "loss": 0.5394, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.9256138661531055, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.0001516697517933074, |
| "loss": 0.5685, |
| "step": 3845 |
| }, |
| { |
| "epoch": 0.9268175252768416, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00015159939620412881, |
| "loss": 0.5824, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.9280211844005778, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00015152897888110538, |
| "loss": 0.6205, |
| "step": 3855 |
| }, |
| { |
| "epoch": 0.9292248435243139, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00015145849993674596, |
| "loss": 0.5724, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.9304285026480501, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00015138795948365773, |
| "loss": 0.593, |
| "step": 3865 |
| }, |
| { |
| "epoch": 0.9316321617717862, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.0001513173576345464, |
| "loss": 0.605, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.9328358208955224, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00015124669450221554, |
| "loss": 0.5699, |
| "step": 3875 |
| }, |
| { |
| "epoch": 0.9340394800192585, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00015117597019956674, |
| "loss": 0.5892, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.9352431391429947, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00015110518483959929, |
| "loss": 0.5434, |
| "step": 3885 |
| }, |
| { |
| "epoch": 0.9364467982667308, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00015103433853541008, |
| "loss": 0.5682, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.937650457390467, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00015096343140019332, |
| "loss": 0.565, |
| "step": 3895 |
| }, |
| { |
| "epoch": 0.9388541165142031, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.0001508924635472404, |
| "loss": 0.5885, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.9400577756379394, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.0001508214350899399, |
| "loss": 0.5608, |
| "step": 3905 |
| }, |
| { |
| "epoch": 0.9412614347616755, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.0001507503461417769, |
| "loss": 0.6009, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.9424650938854117, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.0001506791968163334, |
| "loss": 0.5904, |
| "step": 3915 |
| }, |
| { |
| "epoch": 0.9436687530091478, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00015060798722728782, |
| "loss": 0.5937, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.944872412132884, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.0001505367174884148, |
| "loss": 0.5483, |
| "step": 3925 |
| }, |
| { |
| "epoch": 0.9460760712566201, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00015046538771358508, |
| "loss": 0.5637, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.9472797303803563, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00015039399801676536, |
| "loss": 0.5642, |
| "step": 3935 |
| }, |
| { |
| "epoch": 0.9484833895040924, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.0001503225485120181, |
| "loss": 0.5515, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.9496870486278286, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.0001502510393135013, |
| "loss": 0.5478, |
| "step": 3945 |
| }, |
| { |
| "epoch": 0.9508907077515648, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00015017947053546835, |
| "loss": 0.5527, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.9520943668753009, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00015010784229226772, |
| "loss": 0.5404, |
| "step": 3955 |
| }, |
| { |
| "epoch": 0.9532980259990371, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00015003615469834301, |
| "loss": 0.5384, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.9545016851227732, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00014996440786823272, |
| "loss": 0.5844, |
| "step": 3965 |
| }, |
| { |
| "epoch": 0.9557053442465094, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00014989260191656975, |
| "loss": 0.5613, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.9569090033702455, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.0001498207369580817, |
| "loss": 0.5627, |
| "step": 3975 |
| }, |
| { |
| "epoch": 0.9581126624939817, |
| "grad_norm": 1.40625, |
| "learning_rate": 0.0001497488131075903, |
| "loss": 0.5782, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.9593163216177178, |
| "grad_norm": 1.5, |
| "learning_rate": 0.00014967683048001146, |
| "loss": 0.5931, |
| "step": 3985 |
| }, |
| { |
| "epoch": 0.9605199807414541, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00014960478919035495, |
| "loss": 0.5425, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.9617236398651902, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00014953268935372427, |
| "loss": 0.5878, |
| "step": 3995 |
| }, |
| { |
| "epoch": 0.9629272989889264, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00014946053108531648, |
| "loss": 0.5478, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.9629272989889264, |
| "eval_loss": 0.47590532898902893, |
| "eval_runtime": 2.3469, |
| "eval_samples_per_second": 85.22, |
| "eval_steps_per_second": 85.22, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.9641309581126625, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00014938831450042202, |
| "loss": 0.6152, |
| "step": 4005 |
| }, |
| { |
| "epoch": 0.9653346172363987, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00014931603971442444, |
| "loss": 0.5709, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.9665382763601348, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00014924370684280035, |
| "loss": 0.5675, |
| "step": 4015 |
| }, |
| { |
| "epoch": 0.967741935483871, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00014917131600111912, |
| "loss": 0.5769, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.9689455946076071, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.0001490988673050428, |
| "loss": 0.5611, |
| "step": 4025 |
| }, |
| { |
| "epoch": 0.9701492537313433, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.0001490263608703258, |
| "loss": 0.5722, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.9713529128550794, |
| "grad_norm": 1.4296875, |
| "learning_rate": 0.00014895379681281477, |
| "loss": 0.5755, |
| "step": 4035 |
| }, |
| { |
| "epoch": 0.9725565719788156, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00014888117524844855, |
| "loss": 0.5695, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.9737602311025517, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.0001488084962932577, |
| "loss": 0.561, |
| "step": 4045 |
| }, |
| { |
| "epoch": 0.9749638902262879, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00014873576006336472, |
| "loss": 0.5846, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.976167549350024, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00014866296667498329, |
| "loss": 0.5544, |
| "step": 4055 |
| }, |
| { |
| "epoch": 0.9773712084737602, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.0001485901162444186, |
| "loss": 0.5687, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.9785748675974963, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00014851720888806706, |
| "loss": 0.5482, |
| "step": 4065 |
| }, |
| { |
| "epoch": 0.9797785267212326, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00014844424472241582, |
| "loss": 0.5589, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.9809821858449687, |
| "grad_norm": 1.484375, |
| "learning_rate": 0.00014837122386404303, |
| "loss": 0.6161, |
| "step": 4075 |
| }, |
| { |
| "epoch": 0.9821858449687049, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.0001482981464296173, |
| "loss": 0.5831, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.983389504092441, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00014822501253589746, |
| "loss": 0.5635, |
| "step": 4085 |
| }, |
| { |
| "epoch": 0.9845931632161772, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.0001481518222997329, |
| "loss": 0.5789, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.9857968223399133, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00014807857583806283, |
| "loss": 0.5644, |
| "step": 4095 |
| }, |
| { |
| "epoch": 0.9870004814636495, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.00014800527326791626, |
| "loss": 0.5779, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.9882041405873857, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00014793191470641192, |
| "loss": 0.5736, |
| "step": 4105 |
| }, |
| { |
| "epoch": 0.9894077997111218, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00014785850027075804, |
| "loss": 0.5663, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.990611458834858, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00014778503007825203, |
| "loss": 0.5569, |
| "step": 4115 |
| }, |
| { |
| "epoch": 0.9918151179585941, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.0001477115042462804, |
| "loss": 0.5779, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.9930187770823303, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00014763792289231858, |
| "loss": 0.5641, |
| "step": 4125 |
| }, |
| { |
| "epoch": 0.9942224362060664, |
| "grad_norm": 1.25, |
| "learning_rate": 0.0001475642861339308, |
| "loss": 0.5579, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.9954260953298026, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.0001474905940887696, |
| "loss": 0.5679, |
| "step": 4135 |
| }, |
| { |
| "epoch": 0.9966297544535387, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00014741684687457607, |
| "loss": 0.5502, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.997833413577275, |
| "grad_norm": 1.4140625, |
| "learning_rate": 0.0001473430446091793, |
| "loss": 0.5893, |
| "step": 4145 |
| }, |
| { |
| "epoch": 0.999037072701011, |
| "grad_norm": 1.4765625, |
| "learning_rate": 0.0001472691874104964, |
| "loss": 0.5679, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.9997592681752527, |
| "eval_loss": 0.472748339176178, |
| "eval_runtime": 2.335, |
| "eval_samples_per_second": 85.653, |
| "eval_steps_per_second": 85.653, |
| "step": 4153 |
| }, |
| { |
| "epoch": 1.0002407318247473, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.0001471952753965323, |
| "loss": 0.5774, |
| "step": 4155 |
| }, |
| { |
| "epoch": 1.0014443909484834, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00014712130868537934, |
| "loss": 0.5232, |
| "step": 4160 |
| }, |
| { |
| "epoch": 1.0026480500722195, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00014704728739521744, |
| "loss": 0.5089, |
| "step": 4165 |
| }, |
| { |
| "epoch": 1.0038517091959558, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00014697321164431365, |
| "loss": 0.5132, |
| "step": 4170 |
| }, |
| { |
| "epoch": 1.0050553683196919, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.000146899081551022, |
| "loss": 0.5083, |
| "step": 4175 |
| }, |
| { |
| "epoch": 1.006259027443428, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00014682489723378335, |
| "loss": 0.5214, |
| "step": 4180 |
| }, |
| { |
| "epoch": 1.007462686567164, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00014675065881112533, |
| "loss": 0.5504, |
| "step": 4185 |
| }, |
| { |
| "epoch": 1.0086663456909004, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00014667636640166177, |
| "loss": 0.5096, |
| "step": 4190 |
| }, |
| { |
| "epoch": 1.0098700048146365, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.000146602020124093, |
| "loss": 0.5299, |
| "step": 4195 |
| }, |
| { |
| "epoch": 1.0110736639383726, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00014652762009720527, |
| "loss": 0.5343, |
| "step": 4200 |
| }, |
| { |
| "epoch": 1.0122773230621087, |
| "grad_norm": 1.375, |
| "learning_rate": 0.00014645316643987073, |
| "loss": 0.5527, |
| "step": 4205 |
| }, |
| { |
| "epoch": 1.013480982185845, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00014637865927104727, |
| "loss": 0.5428, |
| "step": 4210 |
| }, |
| { |
| "epoch": 1.0146846413095811, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00014630409870977824, |
| "loss": 0.5018, |
| "step": 4215 |
| }, |
| { |
| "epoch": 1.0158883004333172, |
| "grad_norm": 1.25, |
| "learning_rate": 0.0001462294848751923, |
| "loss": 0.5233, |
| "step": 4220 |
| }, |
| { |
| "epoch": 1.0170919595570533, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00014615481788650322, |
| "loss": 0.5147, |
| "step": 4225 |
| }, |
| { |
| "epoch": 1.0182956186807897, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00014608009786300973, |
| "loss": 0.533, |
| "step": 4230 |
| }, |
| { |
| "epoch": 1.0194992778045258, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00014600532492409525, |
| "loss": 0.5167, |
| "step": 4235 |
| }, |
| { |
| "epoch": 1.0207029369282619, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00014593049918922778, |
| "loss": 0.522, |
| "step": 4240 |
| }, |
| { |
| "epoch": 1.021906596051998, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00014585562077795968, |
| "loss": 0.5444, |
| "step": 4245 |
| }, |
| { |
| "epoch": 1.0231102551757343, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.0001457806898099274, |
| "loss": 0.5221, |
| "step": 4250 |
| }, |
| { |
| "epoch": 1.0243139142994704, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00014570570640485149, |
| "loss": 0.5383, |
| "step": 4255 |
| }, |
| { |
| "epoch": 1.0255175734232065, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00014563067068253617, |
| "loss": 0.5406, |
| "step": 4260 |
| }, |
| { |
| "epoch": 1.0267212325469428, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00014555558276286928, |
| "loss": 0.5441, |
| "step": 4265 |
| }, |
| { |
| "epoch": 1.027924891670679, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.0001454804427658221, |
| "loss": 0.5236, |
| "step": 4270 |
| }, |
| { |
| "epoch": 1.029128550794415, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00014540525081144904, |
| "loss": 0.5415, |
| "step": 4275 |
| }, |
| { |
| "epoch": 1.030332209918151, |
| "grad_norm": 1.125, |
| "learning_rate": 0.0001453300070198876, |
| "loss": 0.5187, |
| "step": 4280 |
| }, |
| { |
| "epoch": 1.0315358690418874, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00014525471151135804, |
| "loss": 0.5211, |
| "step": 4285 |
| }, |
| { |
| "epoch": 1.0327395281656235, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.0001451793644061633, |
| "loss": 0.4976, |
| "step": 4290 |
| }, |
| { |
| "epoch": 1.0339431872893596, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00014510396582468878, |
| "loss": 0.4808, |
| "step": 4295 |
| }, |
| { |
| "epoch": 1.0351468464130957, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00014502851588740203, |
| "loss": 0.4813, |
| "step": 4300 |
| }, |
| { |
| "epoch": 1.036350505536832, |
| "grad_norm": 1.3828125, |
| "learning_rate": 0.0001449530147148527, |
| "loss": 0.5307, |
| "step": 4305 |
| }, |
| { |
| "epoch": 1.0375541646605682, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00014487746242767231, |
| "loss": 0.5252, |
| "step": 4310 |
| }, |
| { |
| "epoch": 1.0387578237843043, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00014480185914657413, |
| "loss": 0.5193, |
| "step": 4315 |
| }, |
| { |
| "epoch": 1.0399614829080404, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00014472620499235276, |
| "loss": 0.5147, |
| "step": 4320 |
| }, |
| { |
| "epoch": 1.0411651420317767, |
| "grad_norm": 1.125, |
| "learning_rate": 0.0001446505000858841, |
| "loss": 0.539, |
| "step": 4325 |
| }, |
| { |
| "epoch": 1.0423688011555128, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00014457474454812524, |
| "loss": 0.5113, |
| "step": 4330 |
| }, |
| { |
| "epoch": 1.0435724602792489, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00014449893850011406, |
| "loss": 0.4988, |
| "step": 4335 |
| }, |
| { |
| "epoch": 1.044776119402985, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.0001444230820629693, |
| "loss": 0.5076, |
| "step": 4340 |
| }, |
| { |
| "epoch": 1.0459797785267213, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00014434717535788998, |
| "loss": 0.5036, |
| "step": 4345 |
| }, |
| { |
| "epoch": 1.0471834376504574, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00014427121850615562, |
| "loss": 0.5293, |
| "step": 4350 |
| }, |
| { |
| "epoch": 1.0483870967741935, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00014419521162912577, |
| "loss": 0.495, |
| "step": 4355 |
| }, |
| { |
| "epoch": 1.0495907558979296, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00014411915484823998, |
| "loss": 0.4997, |
| "step": 4360 |
| }, |
| { |
| "epoch": 1.050794415021666, |
| "grad_norm": 1.421875, |
| "learning_rate": 0.0001440430482850174, |
| "loss": 0.5098, |
| "step": 4365 |
| }, |
| { |
| "epoch": 1.051998074145402, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00014396689206105692, |
| "loss": 0.5052, |
| "step": 4370 |
| }, |
| { |
| "epoch": 1.0532017332691381, |
| "grad_norm": 1.125, |
| "learning_rate": 0.00014389068629803658, |
| "loss": 0.5218, |
| "step": 4375 |
| }, |
| { |
| "epoch": 1.0544053923928742, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00014381443111771372, |
| "loss": 0.4885, |
| "step": 4380 |
| }, |
| { |
| "epoch": 1.0556090515166106, |
| "grad_norm": 1.4609375, |
| "learning_rate": 0.00014373812664192446, |
| "loss": 0.5334, |
| "step": 4385 |
| }, |
| { |
| "epoch": 1.0568127106403467, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.0001436617729925839, |
| "loss": 0.5438, |
| "step": 4390 |
| }, |
| { |
| "epoch": 1.0580163697640828, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00014358537029168555, |
| "loss": 0.5318, |
| "step": 4395 |
| }, |
| { |
| "epoch": 1.059220028887819, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.0001435089186613014, |
| "loss": 0.5104, |
| "step": 4400 |
| }, |
| { |
| "epoch": 1.0604236880115552, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00014343241822358145, |
| "loss": 0.5239, |
| "step": 4405 |
| }, |
| { |
| "epoch": 1.0616273471352913, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00014335586910075383, |
| "loss": 0.5177, |
| "step": 4410 |
| }, |
| { |
| "epoch": 1.0628310062590274, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00014327927141512446, |
| "loss": 0.5196, |
| "step": 4415 |
| }, |
| { |
| "epoch": 1.0640346653827637, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00014320262528907677, |
| "loss": 0.5273, |
| "step": 4420 |
| }, |
| { |
| "epoch": 1.0652383245064998, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00014312593084507167, |
| "loss": 0.5357, |
| "step": 4425 |
| }, |
| { |
| "epoch": 1.066441983630236, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00014304918820564713, |
| "loss": 0.5211, |
| "step": 4430 |
| }, |
| { |
| "epoch": 1.067645642753972, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00014297239749341828, |
| "loss": 0.561, |
| "step": 4435 |
| }, |
| { |
| "epoch": 1.0688493018777083, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00014289555883107696, |
| "loss": 0.5179, |
| "step": 4440 |
| }, |
| { |
| "epoch": 1.0700529610014444, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00014281867234139164, |
| "loss": 0.5178, |
| "step": 4445 |
| }, |
| { |
| "epoch": 1.0712566201251805, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00014274173814720733, |
| "loss": 0.5149, |
| "step": 4450 |
| }, |
| { |
| "epoch": 1.0724602792489166, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00014266475637144502, |
| "loss": 0.5222, |
| "step": 4455 |
| }, |
| { |
| "epoch": 1.073663938372653, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.0001425877271371019, |
| "loss": 0.4933, |
| "step": 4460 |
| }, |
| { |
| "epoch": 1.074867597496389, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00014251065056725094, |
| "loss": 0.5129, |
| "step": 4465 |
| }, |
| { |
| "epoch": 1.0760712566201251, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.00014243352678504074, |
| "loss": 0.5195, |
| "step": 4470 |
| }, |
| { |
| "epoch": 1.0772749157438612, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00014235635591369536, |
| "loss": 0.51, |
| "step": 4475 |
| }, |
| { |
| "epoch": 1.0784785748675976, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00014227913807651402, |
| "loss": 0.4842, |
| "step": 4480 |
| }, |
| { |
| "epoch": 1.0796822339913337, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00014220187339687108, |
| "loss": 0.5143, |
| "step": 4485 |
| }, |
| { |
| "epoch": 1.0808858931150698, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00014212456199821566, |
| "loss": 0.5289, |
| "step": 4490 |
| }, |
| { |
| "epoch": 1.0820895522388059, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00014204720400407155, |
| "loss": 0.5034, |
| "step": 4495 |
| }, |
| { |
| "epoch": 1.0832932113625422, |
| "grad_norm": 1.0703125, |
| "learning_rate": 0.000141969799538037, |
| "loss": 0.5247, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.0832932113625422, |
| "eval_loss": 0.4665265679359436, |
| "eval_runtime": 2.3497, |
| "eval_samples_per_second": 85.118, |
| "eval_steps_per_second": 85.118, |
| "step": 4500 |
| }, |
| { |
| "epoch": 1.0844968704862783, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00014189234872378457, |
| "loss": 0.5059, |
| "step": 4505 |
| }, |
| { |
| "epoch": 1.0857005296100144, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00014181485168506067, |
| "loss": 0.5525, |
| "step": 4510 |
| }, |
| { |
| "epoch": 1.0869041887337505, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00014173730854568582, |
| "loss": 0.5133, |
| "step": 4515 |
| }, |
| { |
| "epoch": 1.0881078478574868, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.000141659719429554, |
| "loss": 0.5797, |
| "step": 4520 |
| }, |
| { |
| "epoch": 1.089311506981223, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00014158208446063278, |
| "loss": 0.5077, |
| "step": 4525 |
| }, |
| { |
| "epoch": 1.090515166104959, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.0001415044037629629, |
| "loss": 0.5252, |
| "step": 4530 |
| }, |
| { |
| "epoch": 1.0917188252286953, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00014142667746065826, |
| "loss": 0.5603, |
| "step": 4535 |
| }, |
| { |
| "epoch": 1.0929224843524314, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00014134890567790552, |
| "loss": 0.5004, |
| "step": 4540 |
| }, |
| { |
| "epoch": 1.0941261434761675, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00014127108853896405, |
| "loss": 0.5274, |
| "step": 4545 |
| }, |
| { |
| "epoch": 1.0953298025999036, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.0001411932261681657, |
| "loss": 0.4984, |
| "step": 4550 |
| }, |
| { |
| "epoch": 1.09653346172364, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00014111531868991458, |
| "loss": 0.5327, |
| "step": 4555 |
| }, |
| { |
| "epoch": 1.097737120847376, |
| "grad_norm": 1.3984375, |
| "learning_rate": 0.0001410373662286869, |
| "loss": 0.5464, |
| "step": 4560 |
| }, |
| { |
| "epoch": 1.0989407799711122, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00014095936890903062, |
| "loss": 0.5222, |
| "step": 4565 |
| }, |
| { |
| "epoch": 1.1001444390948483, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00014088132685556554, |
| "loss": 0.5265, |
| "step": 4570 |
| }, |
| { |
| "epoch": 1.1013480982185846, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00014080324019298283, |
| "loss": 0.5023, |
| "step": 4575 |
| }, |
| { |
| "epoch": 1.1025517573423207, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00014072510904604497, |
| "loss": 0.4935, |
| "step": 4580 |
| }, |
| { |
| "epoch": 1.1037554164660568, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.0001406469335395855, |
| "loss": 0.5368, |
| "step": 4585 |
| }, |
| { |
| "epoch": 1.104959075589793, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00014056871379850884, |
| "loss": 0.522, |
| "step": 4590 |
| }, |
| { |
| "epoch": 1.1061627347135292, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00014049044994779005, |
| "loss": 0.491, |
| "step": 4595 |
| }, |
| { |
| "epoch": 1.1073663938372653, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00014041214211247475, |
| "loss": 0.5186, |
| "step": 4600 |
| }, |
| { |
| "epoch": 1.1085700529610014, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00014033379041767874, |
| "loss": 0.4796, |
| "step": 4605 |
| }, |
| { |
| "epoch": 1.1097737120847375, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.000140255394988588, |
| "loss": 0.5297, |
| "step": 4610 |
| }, |
| { |
| "epoch": 1.1109773712084738, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.0001401769559504583, |
| "loss": 0.5243, |
| "step": 4615 |
| }, |
| { |
| "epoch": 1.11218103033221, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.0001400984734286151, |
| "loss": 0.4982, |
| "step": 4620 |
| }, |
| { |
| "epoch": 1.113384689455946, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00014001994754845337, |
| "loss": 0.5347, |
| "step": 4625 |
| }, |
| { |
| "epoch": 1.1145883485796821, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.0001399413784354373, |
| "loss": 0.549, |
| "step": 4630 |
| }, |
| { |
| "epoch": 1.1157920077034185, |
| "grad_norm": 1.453125, |
| "learning_rate": 0.00013986276621510025, |
| "loss": 0.5295, |
| "step": 4635 |
| }, |
| { |
| "epoch": 1.1169956668271546, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00013978411101304438, |
| "loss": 0.5119, |
| "step": 4640 |
| }, |
| { |
| "epoch": 1.1181993259508907, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00013970541295494048, |
| "loss": 0.527, |
| "step": 4645 |
| }, |
| { |
| "epoch": 1.1194029850746268, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00013962667216652796, |
| "loss": 0.4967, |
| "step": 4650 |
| }, |
| { |
| "epoch": 1.120606644198363, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00013954788877361434, |
| "loss": 0.5054, |
| "step": 4655 |
| }, |
| { |
| "epoch": 1.1218103033220992, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00013946906290207534, |
| "loss": 0.5073, |
| "step": 4660 |
| }, |
| { |
| "epoch": 1.1230139624458353, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.00013939019467785452, |
| "loss": 0.5399, |
| "step": 4665 |
| }, |
| { |
| "epoch": 1.1242176215695716, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00013931128422696296, |
| "loss": 0.483, |
| "step": 4670 |
| }, |
| { |
| "epoch": 1.1254212806933077, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.0001392323316754794, |
| "loss": 0.5337, |
| "step": 4675 |
| }, |
| { |
| "epoch": 1.1266249398170438, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00013915333714954986, |
| "loss": 0.5077, |
| "step": 4680 |
| }, |
| { |
| "epoch": 1.12782859894078, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00013907430077538723, |
| "loss": 0.5053, |
| "step": 4685 |
| }, |
| { |
| "epoch": 1.129032258064516, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00013899522267927142, |
| "loss": 0.5212, |
| "step": 4690 |
| }, |
| { |
| "epoch": 1.1302359171882523, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00013891610298754896, |
| "loss": 0.5059, |
| "step": 4695 |
| }, |
| { |
| "epoch": 1.1314395763119884, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.0001388369418266328, |
| "loss": 0.5154, |
| "step": 4700 |
| }, |
| { |
| "epoch": 1.1326432354357245, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00013875773932300225, |
| "loss": 0.4965, |
| "step": 4705 |
| }, |
| { |
| "epoch": 1.1338468945594609, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.0001386784956032025, |
| "loss": 0.5163, |
| "step": 4710 |
| }, |
| { |
| "epoch": 1.135050553683197, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.0001385992107938448, |
| "loss": 0.5221, |
| "step": 4715 |
| }, |
| { |
| "epoch": 1.136254212806933, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00013851988502160595, |
| "loss": 0.5394, |
| "step": 4720 |
| }, |
| { |
| "epoch": 1.1374578719306692, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.0001384405184132281, |
| "loss": 0.517, |
| "step": 4725 |
| }, |
| { |
| "epoch": 1.1386615310544055, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00013836111109551884, |
| "loss": 0.5179, |
| "step": 4730 |
| }, |
| { |
| "epoch": 1.1398651901781416, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00013828166319535066, |
| "loss": 0.5361, |
| "step": 4735 |
| }, |
| { |
| "epoch": 1.1410688493018777, |
| "grad_norm": 1.125, |
| "learning_rate": 0.00013820217483966095, |
| "loss": 0.498, |
| "step": 4740 |
| }, |
| { |
| "epoch": 1.1422725084256138, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00013812264615545175, |
| "loss": 0.5299, |
| "step": 4745 |
| }, |
| { |
| "epoch": 1.14347616754935, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00013804307726978946, |
| "loss": 0.5236, |
| "step": 4750 |
| }, |
| { |
| "epoch": 1.1446798266730862, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00013796346830980484, |
| "loss": 0.5062, |
| "step": 4755 |
| }, |
| { |
| "epoch": 1.1458834857968223, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.0001378838194026925, |
| "loss": 0.5279, |
| "step": 4760 |
| }, |
| { |
| "epoch": 1.1470871449205584, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00013780413067571103, |
| "loss": 0.5, |
| "step": 4765 |
| }, |
| { |
| "epoch": 1.1482908040442947, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.0001377244022561826, |
| "loss": 0.5132, |
| "step": 4770 |
| }, |
| { |
| "epoch": 1.1494944631680308, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00013764463427149278, |
| "loss": 0.4908, |
| "step": 4775 |
| }, |
| { |
| "epoch": 1.150698122291767, |
| "grad_norm": 1.359375, |
| "learning_rate": 0.00013756482684909032, |
| "loss": 0.5248, |
| "step": 4780 |
| }, |
| { |
| "epoch": 1.151901781415503, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00013748498011648705, |
| "loss": 0.5286, |
| "step": 4785 |
| }, |
| { |
| "epoch": 1.1531054405392394, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.0001374050942012576, |
| "loss": 0.5167, |
| "step": 4790 |
| }, |
| { |
| "epoch": 1.1543090996629755, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00013732516923103917, |
| "loss": 0.5104, |
| "step": 4795 |
| }, |
| { |
| "epoch": 1.1555127587867116, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00013724520533353138, |
| "loss": 0.4978, |
| "step": 4800 |
| }, |
| { |
| "epoch": 1.1567164179104479, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00013716520263649598, |
| "loss": 0.5154, |
| "step": 4805 |
| }, |
| { |
| "epoch": 1.157920077034184, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00013708516126775683, |
| "loss": 0.5168, |
| "step": 4810 |
| }, |
| { |
| "epoch": 1.15912373615792, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.0001370050813551995, |
| "loss": 0.5273, |
| "step": 4815 |
| }, |
| { |
| "epoch": 1.1603273952816562, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00013692496302677122, |
| "loss": 0.5251, |
| "step": 4820 |
| }, |
| { |
| "epoch": 1.1615310544053923, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00013684480641048044, |
| "loss": 0.5318, |
| "step": 4825 |
| }, |
| { |
| "epoch": 1.1627347135291286, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00013676461163439696, |
| "loss": 0.4715, |
| "step": 4830 |
| }, |
| { |
| "epoch": 1.1639383726528647, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00013668437882665143, |
| "loss": 0.502, |
| "step": 4835 |
| }, |
| { |
| "epoch": 1.1651420317766008, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00013660410811543533, |
| "loss": 0.5194, |
| "step": 4840 |
| }, |
| { |
| "epoch": 1.1663456909003371, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00013652379962900068, |
| "loss": 0.5127, |
| "step": 4845 |
| }, |
| { |
| "epoch": 1.1675493500240732, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00013644345349565984, |
| "loss": 0.535, |
| "step": 4850 |
| }, |
| { |
| "epoch": 1.1687530091478093, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00013636306984378537, |
| "loss": 0.4666, |
| "step": 4855 |
| }, |
| { |
| "epoch": 1.1699566682715454, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.0001362826488018097, |
| "loss": 0.5204, |
| "step": 4860 |
| }, |
| { |
| "epoch": 1.1711603273952818, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.00013620219049822503, |
| "loss": 0.4957, |
| "step": 4865 |
| }, |
| { |
| "epoch": 1.1723639865190179, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00013612169506158314, |
| "loss": 0.4948, |
| "step": 4870 |
| }, |
| { |
| "epoch": 1.173567645642754, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00013604116262049508, |
| "loss": 0.5377, |
| "step": 4875 |
| }, |
| { |
| "epoch": 1.17477130476649, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00013596059330363107, |
| "loss": 0.5262, |
| "step": 4880 |
| }, |
| { |
| "epoch": 1.1759749638902264, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.0001358799872397202, |
| "loss": 0.5281, |
| "step": 4885 |
| }, |
| { |
| "epoch": 1.1771786230139625, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.0001357993445575503, |
| "loss": 0.4923, |
| "step": 4890 |
| }, |
| { |
| "epoch": 1.1783822821376986, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.0001357186653859677, |
| "loss": 0.4956, |
| "step": 4895 |
| }, |
| { |
| "epoch": 1.1795859412614347, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.0001356379498538771, |
| "loss": 0.4837, |
| "step": 4900 |
| }, |
| { |
| "epoch": 1.180789600385171, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00013555719809024109, |
| "loss": 0.5027, |
| "step": 4905 |
| }, |
| { |
| "epoch": 1.181993259508907, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.0001354764102240804, |
| "loss": 0.5593, |
| "step": 4910 |
| }, |
| { |
| "epoch": 1.1831969186326432, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.0001353955863844733, |
| "loss": 0.4874, |
| "step": 4915 |
| }, |
| { |
| "epoch": 1.1844005777563793, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00013531472670055557, |
| "loss": 0.513, |
| "step": 4920 |
| }, |
| { |
| "epoch": 1.1856042368801156, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00013523383130152023, |
| "loss": 0.5019, |
| "step": 4925 |
| }, |
| { |
| "epoch": 1.1868078960038517, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00013515290031661744, |
| "loss": 0.5332, |
| "step": 4930 |
| }, |
| { |
| "epoch": 1.1880115551275878, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.0001350719338751541, |
| "loss": 0.5233, |
| "step": 4935 |
| }, |
| { |
| "epoch": 1.1892152142513241, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00013499093210649388, |
| "loss": 0.4874, |
| "step": 4940 |
| }, |
| { |
| "epoch": 1.1904188733750602, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00013490989514005684, |
| "loss": 0.5264, |
| "step": 4945 |
| }, |
| { |
| "epoch": 1.1916225324987963, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00013482882310531926, |
| "loss": 0.5037, |
| "step": 4950 |
| }, |
| { |
| "epoch": 1.1928261916225325, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00013474771613181347, |
| "loss": 0.4813, |
| "step": 4955 |
| }, |
| { |
| "epoch": 1.1940298507462686, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00013466657434912764, |
| "loss": 0.5354, |
| "step": 4960 |
| }, |
| { |
| "epoch": 1.1952335098700049, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.0001345853978869055, |
| "loss": 0.5012, |
| "step": 4965 |
| }, |
| { |
| "epoch": 1.196437168993741, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00013450418687484627, |
| "loss": 0.4819, |
| "step": 4970 |
| }, |
| { |
| "epoch": 1.197640828117477, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00013442294144270427, |
| "loss": 0.5128, |
| "step": 4975 |
| }, |
| { |
| "epoch": 1.1988444872412134, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00013434166172028895, |
| "loss": 0.5017, |
| "step": 4980 |
| }, |
| { |
| "epoch": 1.2000481463649495, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.0001342603478374644, |
| "loss": 0.5189, |
| "step": 4985 |
| }, |
| { |
| "epoch": 1.2012518054886856, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.0001341789999241494, |
| "loss": 0.4999, |
| "step": 4990 |
| }, |
| { |
| "epoch": 1.2024554646124217, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00013409761811031707, |
| "loss": 0.5187, |
| "step": 4995 |
| }, |
| { |
| "epoch": 1.2036591237361578, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00013401620252599466, |
| "loss": 0.5358, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.2036591237361578, |
| "eval_loss": 0.4536990225315094, |
| "eval_runtime": 2.3545, |
| "eval_samples_per_second": 84.944, |
| "eval_steps_per_second": 84.944, |
| "step": 5000 |
| }, |
| { |
| "epoch": 1.2048627828598941, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.0001339347533012634, |
| "loss": 0.4842, |
| "step": 5005 |
| }, |
| { |
| "epoch": 1.2060664419836302, |
| "grad_norm": 1.125, |
| "learning_rate": 0.0001338532705662583, |
| "loss": 0.4826, |
| "step": 5010 |
| }, |
| { |
| "epoch": 1.2072701011073663, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00013377175445116786, |
| "loss": 0.5059, |
| "step": 5015 |
| }, |
| { |
| "epoch": 1.2084737602311026, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00013369020508623394, |
| "loss": 0.504, |
| "step": 5020 |
| }, |
| { |
| "epoch": 1.2096774193548387, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.00013360862260175153, |
| "loss": 0.5286, |
| "step": 5025 |
| }, |
| { |
| "epoch": 1.2108810784785748, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.0001335270071280685, |
| "loss": 0.518, |
| "step": 5030 |
| }, |
| { |
| "epoch": 1.212084737602311, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.0001334453587955855, |
| "loss": 0.5122, |
| "step": 5035 |
| }, |
| { |
| "epoch": 1.2132883967260473, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00013336367773475558, |
| "loss": 0.5292, |
| "step": 5040 |
| }, |
| { |
| "epoch": 1.2144920558497834, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00013328196407608414, |
| "loss": 0.5213, |
| "step": 5045 |
| }, |
| { |
| "epoch": 1.2156957149735195, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00013320021795012871, |
| "loss": 0.5076, |
| "step": 5050 |
| }, |
| { |
| "epoch": 1.2168993740972556, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.0001331184394874986, |
| "loss": 0.5195, |
| "step": 5055 |
| }, |
| { |
| "epoch": 1.218103033220992, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.00013303662881885487, |
| "loss": 0.5016, |
| "step": 5060 |
| }, |
| { |
| "epoch": 1.219306692344728, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.00013295478607490996, |
| "loss": 0.4838, |
| "step": 5065 |
| }, |
| { |
| "epoch": 1.220510351468464, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00013287291138642761, |
| "loss": 0.5046, |
| "step": 5070 |
| }, |
| { |
| "epoch": 1.2217140105922002, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.00013279100488422255, |
| "loss": 0.5018, |
| "step": 5075 |
| }, |
| { |
| "epoch": 1.2229176697159365, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00013270906669916042, |
| "loss": 0.5398, |
| "step": 5080 |
| }, |
| { |
| "epoch": 1.2241213288396726, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00013262709696215742, |
| "loss": 0.4706, |
| "step": 5085 |
| }, |
| { |
| "epoch": 1.2253249879634087, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00013254509580418015, |
| "loss": 0.5214, |
| "step": 5090 |
| }, |
| { |
| "epoch": 1.2265286470871448, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00013246306335624547, |
| "loss": 0.5027, |
| "step": 5095 |
| }, |
| { |
| "epoch": 1.2277323062108811, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00013238099974942022, |
| "loss": 0.5232, |
| "step": 5100 |
| }, |
| { |
| "epoch": 1.2289359653346172, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00013229890511482094, |
| "loss": 0.4929, |
| "step": 5105 |
| }, |
| { |
| "epoch": 1.2301396244583533, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.0001322167795836139, |
| "loss": 0.5032, |
| "step": 5110 |
| }, |
| { |
| "epoch": 1.2313432835820897, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00013213462328701454, |
| "loss": 0.5256, |
| "step": 5115 |
| }, |
| { |
| "epoch": 1.2325469427058258, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00013205243635628765, |
| "loss": 0.5033, |
| "step": 5120 |
| }, |
| { |
| "epoch": 1.2337506018295619, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00013197021892274685, |
| "loss": 0.4942, |
| "step": 5125 |
| }, |
| { |
| "epoch": 1.234954260953298, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.0001318879711177545, |
| "loss": 0.4887, |
| "step": 5130 |
| }, |
| { |
| "epoch": 1.236157920077034, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.00013180569307272156, |
| "loss": 0.4551, |
| "step": 5135 |
| }, |
| { |
| "epoch": 1.2373615792007704, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00013172338491910718, |
| "loss": 0.5323, |
| "step": 5140 |
| }, |
| { |
| "epoch": 1.2385652383245065, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00013164104678841873, |
| "loss": 0.5175, |
| "step": 5145 |
| }, |
| { |
| "epoch": 1.2397688974482426, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00013155867881221145, |
| "loss": 0.5004, |
| "step": 5150 |
| }, |
| { |
| "epoch": 1.240972556571979, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00013147628112208825, |
| "loss": 0.4757, |
| "step": 5155 |
| }, |
| { |
| "epoch": 1.242176215695715, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.0001313938538496995, |
| "loss": 0.5226, |
| "step": 5160 |
| }, |
| { |
| "epoch": 1.2433798748194511, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00013131139712674282, |
| "loss": 0.496, |
| "step": 5165 |
| }, |
| { |
| "epoch": 1.2445835339431872, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00013122891108496295, |
| "loss": 0.5161, |
| "step": 5170 |
| }, |
| { |
| "epoch": 1.2457871930669235, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00013114639585615144, |
| "loss": 0.498, |
| "step": 5175 |
| }, |
| { |
| "epoch": 1.2469908521906596, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.0001310638515721465, |
| "loss": 0.517, |
| "step": 5180 |
| }, |
| { |
| "epoch": 1.2481945113143957, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00013098127836483266, |
| "loss": 0.4963, |
| "step": 5185 |
| }, |
| { |
| "epoch": 1.2493981704381318, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.0001308986763661408, |
| "loss": 0.4906, |
| "step": 5190 |
| }, |
| { |
| "epoch": 1.2506018295618682, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00013081604570804771, |
| "loss": 0.5061, |
| "step": 5195 |
| }, |
| { |
| "epoch": 1.2518054886856043, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.00013073338652257603, |
| "loss": 0.5145, |
| "step": 5200 |
| }, |
| { |
| "epoch": 1.2530091478093404, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.0001306506989417939, |
| "loss": 0.5046, |
| "step": 5205 |
| }, |
| { |
| "epoch": 1.2542128069330767, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00013056798309781493, |
| "loss": 0.5017, |
| "step": 5210 |
| }, |
| { |
| "epoch": 1.2554164660568128, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.0001304852391227978, |
| "loss": 0.4801, |
| "step": 5215 |
| }, |
| { |
| "epoch": 1.2566201251805489, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00013040246714894616, |
| "loss": 0.5122, |
| "step": 5220 |
| }, |
| { |
| "epoch": 1.257823784304285, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00013031966730850844, |
| "loss": 0.4989, |
| "step": 5225 |
| }, |
| { |
| "epoch": 1.259027443428021, |
| "grad_norm": 1.125, |
| "learning_rate": 0.0001302368397337776, |
| "loss": 0.4914, |
| "step": 5230 |
| }, |
| { |
| "epoch": 1.2602311025517574, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00013015398455709073, |
| "loss": 0.5161, |
| "step": 5235 |
| }, |
| { |
| "epoch": 1.2614347616754935, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.0001300711019108293, |
| "loss": 0.515, |
| "step": 5240 |
| }, |
| { |
| "epoch": 1.2626384207992296, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00012998819192741845, |
| "loss": 0.5001, |
| "step": 5245 |
| }, |
| { |
| "epoch": 1.263842079922966, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.0001299052547393271, |
| "loss": 0.5229, |
| "step": 5250 |
| }, |
| { |
| "epoch": 1.265045739046702, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00012982229047906764, |
| "loss": 0.5002, |
| "step": 5255 |
| }, |
| { |
| "epoch": 1.2662493981704381, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.0001297392992791957, |
| "loss": 0.5128, |
| "step": 5260 |
| }, |
| { |
| "epoch": 1.2674530572941742, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.0001296562812723099, |
| "loss": 0.5437, |
| "step": 5265 |
| }, |
| { |
| "epoch": 1.2686567164179103, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00012957323659105172, |
| "loss": 0.504, |
| "step": 5270 |
| }, |
| { |
| "epoch": 1.2698603755416467, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00012949016536810534, |
| "loss": 0.4892, |
| "step": 5275 |
| }, |
| { |
| "epoch": 1.2710640346653828, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00012940706773619722, |
| "loss": 0.5222, |
| "step": 5280 |
| }, |
| { |
| "epoch": 1.2722676937891189, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.0001293239438280961, |
| "loss": 0.5247, |
| "step": 5285 |
| }, |
| { |
| "epoch": 1.2734713529128552, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00012924079377661267, |
| "loss": 0.4893, |
| "step": 5290 |
| }, |
| { |
| "epoch": 1.2746750120365913, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00012915761771459938, |
| "loss": 0.5049, |
| "step": 5295 |
| }, |
| { |
| "epoch": 1.2758786711603274, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00012907441577495027, |
| "loss": 0.5179, |
| "step": 5300 |
| }, |
| { |
| "epoch": 1.2770823302840635, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00012899118809060072, |
| "loss": 0.499, |
| "step": 5305 |
| }, |
| { |
| "epoch": 1.2782859894077996, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00012890793479452726, |
| "loss": 0.5587, |
| "step": 5310 |
| }, |
| { |
| "epoch": 1.279489648531536, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00012882465601974722, |
| "loss": 0.4966, |
| "step": 5315 |
| }, |
| { |
| "epoch": 1.280693307655272, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00012874135189931883, |
| "loss": 0.5036, |
| "step": 5320 |
| }, |
| { |
| "epoch": 1.281896966779008, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00012865802256634067, |
| "loss": 0.5005, |
| "step": 5325 |
| }, |
| { |
| "epoch": 1.2831006259027444, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.0001285746681539517, |
| "loss": 0.5167, |
| "step": 5330 |
| }, |
| { |
| "epoch": 1.2843042850264805, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00012849128879533083, |
| "loss": 0.5216, |
| "step": 5335 |
| }, |
| { |
| "epoch": 1.2855079441502166, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00012840788462369695, |
| "loss": 0.5221, |
| "step": 5340 |
| }, |
| { |
| "epoch": 1.286711603273953, |
| "grad_norm": 1.3203125, |
| "learning_rate": 0.00012832445577230854, |
| "loss": 0.4875, |
| "step": 5345 |
| }, |
| { |
| "epoch": 1.287915262397689, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00012824100237446352, |
| "loss": 0.4996, |
| "step": 5350 |
| }, |
| { |
| "epoch": 1.2891189215214252, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.000128157524563499, |
| "loss": 0.5301, |
| "step": 5355 |
| }, |
| { |
| "epoch": 1.2903225806451613, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00012807402247279115, |
| "loss": 0.5297, |
| "step": 5360 |
| }, |
| { |
| "epoch": 1.2915262397688974, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00012799049623575488, |
| "loss": 0.4997, |
| "step": 5365 |
| }, |
| { |
| "epoch": 1.2927298988926337, |
| "grad_norm": 1.0234375, |
| "learning_rate": 0.00012790694598584373, |
| "loss": 0.4706, |
| "step": 5370 |
| }, |
| { |
| "epoch": 1.2939335580163698, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00012782337185654957, |
| "loss": 0.4911, |
| "step": 5375 |
| }, |
| { |
| "epoch": 1.2951372171401059, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.0001277397739814024, |
| "loss": 0.5139, |
| "step": 5380 |
| }, |
| { |
| "epoch": 1.2963408762638422, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.0001276561524939703, |
| "loss": 0.5281, |
| "step": 5385 |
| }, |
| { |
| "epoch": 1.2975445353875783, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00012757250752785886, |
| "loss": 0.518, |
| "step": 5390 |
| }, |
| { |
| "epoch": 1.2987481945113144, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00012748883921671132, |
| "loss": 0.5116, |
| "step": 5395 |
| }, |
| { |
| "epoch": 1.2999518536350505, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00012740514769420824, |
| "loss": 0.4575, |
| "step": 5400 |
| }, |
| { |
| "epoch": 1.3011555127587866, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00012732143309406713, |
| "loss": 0.5117, |
| "step": 5405 |
| }, |
| { |
| "epoch": 1.302359171882523, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.0001272376955500426, |
| "loss": 0.5249, |
| "step": 5410 |
| }, |
| { |
| "epoch": 1.303562831006259, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00012715393519592563, |
| "loss": 0.4805, |
| "step": 5415 |
| }, |
| { |
| "epoch": 1.3047664901299951, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00012707015216554388, |
| "loss": 0.4965, |
| "step": 5420 |
| }, |
| { |
| "epoch": 1.3059701492537314, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00012698634659276113, |
| "loss": 0.4759, |
| "step": 5425 |
| }, |
| { |
| "epoch": 1.3071738083774675, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.0001269025186114772, |
| "loss": 0.4972, |
| "step": 5430 |
| }, |
| { |
| "epoch": 1.3083774675012037, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.0001268186683556277, |
| "loss": 0.5039, |
| "step": 5435 |
| }, |
| { |
| "epoch": 1.3095811266249398, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00012673479595918388, |
| "loss": 0.5127, |
| "step": 5440 |
| }, |
| { |
| "epoch": 1.3107847857486759, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.0001266509015561523, |
| "loss": 0.5255, |
| "step": 5445 |
| }, |
| { |
| "epoch": 1.3119884448724122, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00012656698528057473, |
| "loss": 0.4879, |
| "step": 5450 |
| }, |
| { |
| "epoch": 1.3131921039961483, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00012648304726652787, |
| "loss": 0.4959, |
| "step": 5455 |
| }, |
| { |
| "epoch": 1.3143957631198844, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.0001263990876481231, |
| "loss": 0.5016, |
| "step": 5460 |
| }, |
| { |
| "epoch": 1.3155994222436207, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00012631510655950644, |
| "loss": 0.5054, |
| "step": 5465 |
| }, |
| { |
| "epoch": 1.3168030813673568, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00012623110413485808, |
| "loss": 0.4687, |
| "step": 5470 |
| }, |
| { |
| "epoch": 1.318006740491093, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.0001261470805083924, |
| "loss": 0.4957, |
| "step": 5475 |
| }, |
| { |
| "epoch": 1.319210399614829, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.00012606303581435757, |
| "loss": 0.4938, |
| "step": 5480 |
| }, |
| { |
| "epoch": 1.320414058738565, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00012597897018703548, |
| "loss": 0.4824, |
| "step": 5485 |
| }, |
| { |
| "epoch": 1.3216177178623014, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00012589488376074152, |
| "loss": 0.5155, |
| "step": 5490 |
| }, |
| { |
| "epoch": 1.3228213769860375, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00012581077666982415, |
| "loss": 0.4871, |
| "step": 5495 |
| }, |
| { |
| "epoch": 1.3240250361097736, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.000125726649048665, |
| "loss": 0.503, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.3240250361097736, |
| "eval_loss": 0.4387615919113159, |
| "eval_runtime": 2.3516, |
| "eval_samples_per_second": 85.047, |
| "eval_steps_per_second": 85.047, |
| "step": 5500 |
| }, |
| { |
| "epoch": 1.32522869523351, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.0001256425010316784, |
| "loss": 0.515, |
| "step": 5505 |
| }, |
| { |
| "epoch": 1.326432354357246, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00012555833275331135, |
| "loss": 0.5221, |
| "step": 5510 |
| }, |
| { |
| "epoch": 1.3276360134809821, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00012547414434804316, |
| "loss": 0.4718, |
| "step": 5515 |
| }, |
| { |
| "epoch": 1.3288396726047185, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00012538993595038537, |
| "loss": 0.4931, |
| "step": 5520 |
| }, |
| { |
| "epoch": 1.3300433317284546, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.0001253057076948813, |
| "loss": 0.5102, |
| "step": 5525 |
| }, |
| { |
| "epoch": 1.3312469908521907, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.0001252214597161062, |
| "loss": 0.4856, |
| "step": 5530 |
| }, |
| { |
| "epoch": 1.3324506499759268, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.0001251371921486667, |
| "loss": 0.4995, |
| "step": 5535 |
| }, |
| { |
| "epoch": 1.3336543090996629, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00012505290512720082, |
| "loss": 0.4825, |
| "step": 5540 |
| }, |
| { |
| "epoch": 1.3348579682233992, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0001249685987863776, |
| "loss": 0.5072, |
| "step": 5545 |
| }, |
| { |
| "epoch": 1.3360616273471353, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.0001248842732608969, |
| "loss": 0.5069, |
| "step": 5550 |
| }, |
| { |
| "epoch": 1.3372652864708714, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00012479992868548936, |
| "loss": 0.4969, |
| "step": 5555 |
| }, |
| { |
| "epoch": 1.3384689455946077, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00012471556519491592, |
| "loss": 0.5154, |
| "step": 5560 |
| }, |
| { |
| "epoch": 1.3396726047183438, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.0001246311829239679, |
| "loss": 0.5276, |
| "step": 5565 |
| }, |
| { |
| "epoch": 1.34087626384208, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.0001245467820074665, |
| "loss": 0.4811, |
| "step": 5570 |
| }, |
| { |
| "epoch": 1.342079922965816, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00012446236258026276, |
| "loss": 0.5205, |
| "step": 5575 |
| }, |
| { |
| "epoch": 1.3432835820895521, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00012437792477723724, |
| "loss": 0.4981, |
| "step": 5580 |
| }, |
| { |
| "epoch": 1.3444872412132884, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00012429346873329993, |
| "loss": 0.4763, |
| "step": 5585 |
| }, |
| { |
| "epoch": 1.3456909003370245, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.0001242089945833899, |
| "loss": 0.4831, |
| "step": 5590 |
| }, |
| { |
| "epoch": 1.3468945594607606, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00012412450246247528, |
| "loss": 0.538, |
| "step": 5595 |
| }, |
| { |
| "epoch": 1.348098218584497, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00012403999250555273, |
| "loss": 0.5154, |
| "step": 5600 |
| }, |
| { |
| "epoch": 1.349301877708233, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00012395546484764757, |
| "loss": 0.4815, |
| "step": 5605 |
| }, |
| { |
| "epoch": 1.3505055368319692, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00012387091962381324, |
| "loss": 0.4986, |
| "step": 5610 |
| }, |
| { |
| "epoch": 1.3517091959557053, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.0001237863569691314, |
| "loss": 0.4928, |
| "step": 5615 |
| }, |
| { |
| "epoch": 1.3529128550794414, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00012370177701871149, |
| "loss": 0.4881, |
| "step": 5620 |
| }, |
| { |
| "epoch": 1.3541165142031777, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00012361717990769057, |
| "loss": 0.5174, |
| "step": 5625 |
| }, |
| { |
| "epoch": 1.3553201733269138, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00012353256577123315, |
| "loss": 0.4902, |
| "step": 5630 |
| }, |
| { |
| "epoch": 1.35652383245065, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.0001234479347445309, |
| "loss": 0.4853, |
| "step": 5635 |
| }, |
| { |
| "epoch": 1.3577274915743862, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00012336328696280254, |
| "loss": 0.5045, |
| "step": 5640 |
| }, |
| { |
| "epoch": 1.3589311506981223, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.0001232786225612935, |
| "loss": 0.5301, |
| "step": 5645 |
| }, |
| { |
| "epoch": 1.3601348098218584, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00012319394167527583, |
| "loss": 0.528, |
| "step": 5650 |
| }, |
| { |
| "epoch": 1.3613384689455947, |
| "grad_norm": 1.390625, |
| "learning_rate": 0.0001231092444400478, |
| "loss": 0.493, |
| "step": 5655 |
| }, |
| { |
| "epoch": 1.3625421280693308, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.000123024530990934, |
| "loss": 0.4953, |
| "step": 5660 |
| }, |
| { |
| "epoch": 1.363745787193067, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00012293980146328468, |
| "loss": 0.493, |
| "step": 5665 |
| }, |
| { |
| "epoch": 1.364949446316803, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00012285505599247598, |
| "loss": 0.487, |
| "step": 5670 |
| }, |
| { |
| "epoch": 1.3661531054405391, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00012277029471390944, |
| "loss": 0.4955, |
| "step": 5675 |
| }, |
| { |
| "epoch": 1.3673567645642755, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.0001226855177630118, |
| "loss": 0.4748, |
| "step": 5680 |
| }, |
| { |
| "epoch": 1.3685604236880116, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00012260072527523491, |
| "loss": 0.5096, |
| "step": 5685 |
| }, |
| { |
| "epoch": 1.3697640828117477, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00012251591738605548, |
| "loss": 0.4603, |
| "step": 5690 |
| }, |
| { |
| "epoch": 1.370967741935484, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00012243109423097472, |
| "loss": 0.5298, |
| "step": 5695 |
| }, |
| { |
| "epoch": 1.37217140105922, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00012234625594551832, |
| "loss": 0.5053, |
| "step": 5700 |
| }, |
| { |
| "epoch": 1.3733750601829562, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.0001222614026652361, |
| "loss": 0.4938, |
| "step": 5705 |
| }, |
| { |
| "epoch": 1.3745787193066923, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00012217653452570185, |
| "loss": 0.4735, |
| "step": 5710 |
| }, |
| { |
| "epoch": 1.3757823784304284, |
| "grad_norm": 1.34375, |
| "learning_rate": 0.00012209165166251308, |
| "loss": 0.5206, |
| "step": 5715 |
| }, |
| { |
| "epoch": 1.3769860375541647, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.00012200675421129087, |
| "loss": 0.484, |
| "step": 5720 |
| }, |
| { |
| "epoch": 1.3781896966779008, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00012192184230767959, |
| "loss": 0.5006, |
| "step": 5725 |
| }, |
| { |
| "epoch": 1.379393355801637, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.0001218369160873467, |
| "loss": 0.4873, |
| "step": 5730 |
| }, |
| { |
| "epoch": 1.3805970149253732, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.0001217519756859825, |
| "loss": 0.5143, |
| "step": 5735 |
| }, |
| { |
| "epoch": 1.3818006740491093, |
| "grad_norm": 1.125, |
| "learning_rate": 0.00012166702123929998, |
| "loss": 0.5015, |
| "step": 5740 |
| }, |
| { |
| "epoch": 1.3830043331728454, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.00012158205288303457, |
| "loss": 0.467, |
| "step": 5745 |
| }, |
| { |
| "epoch": 1.3842079922965815, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00012149707075294393, |
| "loss": 0.5055, |
| "step": 5750 |
| }, |
| { |
| "epoch": 1.3854116514203176, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.00012141207498480772, |
| "loss": 0.5042, |
| "step": 5755 |
| }, |
| { |
| "epoch": 1.386615310544054, |
| "grad_norm": 1.3125, |
| "learning_rate": 0.00012132706571442743, |
| "loss": 0.5204, |
| "step": 5760 |
| }, |
| { |
| "epoch": 1.38781896966779, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00012124204307762598, |
| "loss": 0.4756, |
| "step": 5765 |
| }, |
| { |
| "epoch": 1.3890226287915262, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00012115700721024783, |
| "loss": 0.4829, |
| "step": 5770 |
| }, |
| { |
| "epoch": 1.3902262879152625, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00012107195824815846, |
| "loss": 0.5153, |
| "step": 5775 |
| }, |
| { |
| "epoch": 1.3914299470389986, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00012098689632724433, |
| "loss": 0.4649, |
| "step": 5780 |
| }, |
| { |
| "epoch": 1.3926336061627347, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00012090182158341258, |
| "loss": 0.5136, |
| "step": 5785 |
| }, |
| { |
| "epoch": 1.393837265286471, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00012081673415259083, |
| "loss": 0.4893, |
| "step": 5790 |
| }, |
| { |
| "epoch": 1.395040924410207, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00012073163417072698, |
| "loss": 0.4856, |
| "step": 5795 |
| }, |
| { |
| "epoch": 1.3962445835339432, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.000120646521773789, |
| "loss": 0.4911, |
| "step": 5800 |
| }, |
| { |
| "epoch": 1.3974482426576793, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.00012056139709776468, |
| "loss": 0.5176, |
| "step": 5805 |
| }, |
| { |
| "epoch": 1.3986519017814154, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00012047626027866143, |
| "loss": 0.5161, |
| "step": 5810 |
| }, |
| { |
| "epoch": 1.3998555609051517, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00012039111145250606, |
| "loss": 0.5026, |
| "step": 5815 |
| }, |
| { |
| "epoch": 1.4010592200288878, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00012030595075534455, |
| "loss": 0.4906, |
| "step": 5820 |
| }, |
| { |
| "epoch": 1.402262879152624, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00012022077832324187, |
| "loss": 0.4992, |
| "step": 5825 |
| }, |
| { |
| "epoch": 1.4034665382763603, |
| "grad_norm": 1.125, |
| "learning_rate": 0.00012013559429228176, |
| "loss": 0.5064, |
| "step": 5830 |
| }, |
| { |
| "epoch": 1.4046701974000964, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00012005039879856641, |
| "loss": 0.5087, |
| "step": 5835 |
| }, |
| { |
| "epoch": 1.4058738565238325, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00011996519197821648, |
| "loss": 0.4936, |
| "step": 5840 |
| }, |
| { |
| "epoch": 1.4070775156475686, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00011987997396737051, |
| "loss": 0.4609, |
| "step": 5845 |
| }, |
| { |
| "epoch": 1.4082811747713047, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.00011979474490218507, |
| "loss": 0.4639, |
| "step": 5850 |
| }, |
| { |
| "epoch": 1.409484833895041, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00011970950491883439, |
| "loss": 0.4731, |
| "step": 5855 |
| }, |
| { |
| "epoch": 1.410688493018777, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00011962425415351009, |
| "loss": 0.467, |
| "step": 5860 |
| }, |
| { |
| "epoch": 1.4118921521425132, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00011953899274242107, |
| "loss": 0.4699, |
| "step": 5865 |
| }, |
| { |
| "epoch": 1.4130958112662495, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00011945372082179316, |
| "loss": 0.477, |
| "step": 5870 |
| }, |
| { |
| "epoch": 1.4142994703899856, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00011936843852786902, |
| "loss": 0.4717, |
| "step": 5875 |
| }, |
| { |
| "epoch": 1.4155031295137217, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00011928314599690795, |
| "loss": 0.4816, |
| "step": 5880 |
| }, |
| { |
| "epoch": 1.4167067886374578, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00011919784336518553, |
| "loss": 0.4641, |
| "step": 5885 |
| }, |
| { |
| "epoch": 1.417910447761194, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00011911253076899347, |
| "loss": 0.4853, |
| "step": 5890 |
| }, |
| { |
| "epoch": 1.4191141068849302, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.0001190272083446395, |
| "loss": 0.4879, |
| "step": 5895 |
| }, |
| { |
| "epoch": 1.4203177660086663, |
| "grad_norm": 1.125, |
| "learning_rate": 0.00011894187622844685, |
| "loss": 0.4662, |
| "step": 5900 |
| }, |
| { |
| "epoch": 1.4215214251324024, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.0001188565345567545, |
| "loss": 0.4762, |
| "step": 5905 |
| }, |
| { |
| "epoch": 1.4227250842561388, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.0001187711834659165, |
| "loss": 0.4984, |
| "step": 5910 |
| }, |
| { |
| "epoch": 1.4239287433798749, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.00011868582309230205, |
| "loss": 0.5133, |
| "step": 5915 |
| }, |
| { |
| "epoch": 1.425132402503611, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00011860045357229508, |
| "loss": 0.4922, |
| "step": 5920 |
| }, |
| { |
| "epoch": 1.4263360616273473, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00011851507504229425, |
| "loss": 0.4734, |
| "step": 5925 |
| }, |
| { |
| "epoch": 1.4275397207510834, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00011842968763871255, |
| "loss": 0.4833, |
| "step": 5930 |
| }, |
| { |
| "epoch": 1.4287433798748195, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.00011834429149797717, |
| "loss": 0.4907, |
| "step": 5935 |
| }, |
| { |
| "epoch": 1.4299470389985556, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.00011825888675652923, |
| "loss": 0.4982, |
| "step": 5940 |
| }, |
| { |
| "epoch": 1.4311506981222917, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00011817347355082364, |
| "loss": 0.493, |
| "step": 5945 |
| }, |
| { |
| "epoch": 1.432354357246028, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00011808805201732878, |
| "loss": 0.4807, |
| "step": 5950 |
| }, |
| { |
| "epoch": 1.433558016369764, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.0001180026222925264, |
| "loss": 0.5133, |
| "step": 5955 |
| }, |
| { |
| "epoch": 1.4347616754935002, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00011791718451291126, |
| "loss": 0.5, |
| "step": 5960 |
| }, |
| { |
| "epoch": 1.4359653346172365, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00011783173881499106, |
| "loss": 0.4896, |
| "step": 5965 |
| }, |
| { |
| "epoch": 1.4371689937409726, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00011774628533528607, |
| "loss": 0.469, |
| "step": 5970 |
| }, |
| { |
| "epoch": 1.4383726528647087, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00011766082421032915, |
| "loss": 0.4911, |
| "step": 5975 |
| }, |
| { |
| "epoch": 1.4395763119884448, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00011757535557666514, |
| "loss": 0.511, |
| "step": 5980 |
| }, |
| { |
| "epoch": 1.440779971112181, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00011748987957085109, |
| "loss": 0.4991, |
| "step": 5985 |
| }, |
| { |
| "epoch": 1.4419836302359172, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.00011740439632945572, |
| "loss": 0.4857, |
| "step": 5990 |
| }, |
| { |
| "epoch": 1.4431872893596533, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00011731890598905935, |
| "loss": 0.5143, |
| "step": 5995 |
| }, |
| { |
| "epoch": 1.4443909484833894, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00011723340868625362, |
| "loss": 0.5001, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.4443909484833894, |
| "eval_loss": 0.4288594126701355, |
| "eval_runtime": 2.3521, |
| "eval_samples_per_second": 85.03, |
| "eval_steps_per_second": 85.03, |
| "step": 6000 |
| }, |
| { |
| "epoch": 1.4455946076071258, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00011714790455764127, |
| "loss": 0.4968, |
| "step": 6005 |
| }, |
| { |
| "epoch": 1.4467982667308619, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00011706239373983604, |
| "loss": 0.4785, |
| "step": 6010 |
| }, |
| { |
| "epoch": 1.448001925854598, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00011697687636946226, |
| "loss": 0.478, |
| "step": 6015 |
| }, |
| { |
| "epoch": 1.449205584978334, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00011689135258315475, |
| "loss": 0.4734, |
| "step": 6020 |
| }, |
| { |
| "epoch": 1.4504092441020702, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.00011680582251755865, |
| "loss": 0.5085, |
| "step": 6025 |
| }, |
| { |
| "epoch": 1.4516129032258065, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.00011672028630932902, |
| "loss": 0.4644, |
| "step": 6030 |
| }, |
| { |
| "epoch": 1.4528165623495426, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.0001166347440951308, |
| "loss": 0.4911, |
| "step": 6035 |
| }, |
| { |
| "epoch": 1.4540202214732787, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00011654919601163857, |
| "loss": 0.455, |
| "step": 6040 |
| }, |
| { |
| "epoch": 1.455223880597015, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.00011646364219553618, |
| "loss": 0.4715, |
| "step": 6045 |
| }, |
| { |
| "epoch": 1.4564275397207511, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00011637808278351675, |
| "loss": 0.4931, |
| "step": 6050 |
| }, |
| { |
| "epoch": 1.4576311988444872, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00011629251791228225, |
| "loss": 0.521, |
| "step": 6055 |
| }, |
| { |
| "epoch": 1.4588348579682233, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.0001162069477185434, |
| "loss": 0.4943, |
| "step": 6060 |
| }, |
| { |
| "epoch": 1.4600385170919594, |
| "grad_norm": 1.125, |
| "learning_rate": 0.00011612137233901946, |
| "loss": 0.4828, |
| "step": 6065 |
| }, |
| { |
| "epoch": 1.4612421762156957, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00011603579191043795, |
| "loss": 0.4734, |
| "step": 6070 |
| }, |
| { |
| "epoch": 1.4624458353394318, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00011595020656953443, |
| "loss": 0.488, |
| "step": 6075 |
| }, |
| { |
| "epoch": 1.463649494463168, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.0001158646164530524, |
| "loss": 0.5259, |
| "step": 6080 |
| }, |
| { |
| "epoch": 1.4648531535869043, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.0001157790216977428, |
| "loss": 0.4766, |
| "step": 6085 |
| }, |
| { |
| "epoch": 1.4660568127106404, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.00011569342244036422, |
| "loss": 0.472, |
| "step": 6090 |
| }, |
| { |
| "epoch": 1.4672604718343765, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00011560781881768228, |
| "loss": 0.4969, |
| "step": 6095 |
| }, |
| { |
| "epoch": 1.4684641309581128, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00011552221096646963, |
| "loss": 0.4899, |
| "step": 6100 |
| }, |
| { |
| "epoch": 1.469667790081849, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00011543659902350574, |
| "loss": 0.4906, |
| "step": 6105 |
| }, |
| { |
| "epoch": 1.470871449205585, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.0001153509831255764, |
| "loss": 0.4671, |
| "step": 6110 |
| }, |
| { |
| "epoch": 1.472075108329321, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00011526536340947398, |
| "loss": 0.504, |
| "step": 6115 |
| }, |
| { |
| "epoch": 1.4732787674530572, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00011517974001199682, |
| "loss": 0.5031, |
| "step": 6120 |
| }, |
| { |
| "epoch": 1.4744824265767935, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00011509411306994916, |
| "loss": 0.4837, |
| "step": 6125 |
| }, |
| { |
| "epoch": 1.4756860857005296, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00011500848272014087, |
| "loss": 0.4904, |
| "step": 6130 |
| }, |
| { |
| "epoch": 1.4768897448242657, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00011492284909938733, |
| "loss": 0.5054, |
| "step": 6135 |
| }, |
| { |
| "epoch": 1.478093403948002, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00011483721234450912, |
| "loss": 0.513, |
| "step": 6140 |
| }, |
| { |
| "epoch": 1.4792970630717381, |
| "grad_norm": 1.125, |
| "learning_rate": 0.00011475157259233177, |
| "loss": 0.5087, |
| "step": 6145 |
| }, |
| { |
| "epoch": 1.4805007221954742, |
| "grad_norm": 1.3046875, |
| "learning_rate": 0.00011466592997968568, |
| "loss": 0.5239, |
| "step": 6150 |
| }, |
| { |
| "epoch": 1.4817043813192103, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.00011458028464340583, |
| "loss": 0.452, |
| "step": 6155 |
| }, |
| { |
| "epoch": 1.4829080404429464, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.0001144946367203315, |
| "loss": 0.4743, |
| "step": 6160 |
| }, |
| { |
| "epoch": 1.4841116995666828, |
| "grad_norm": 1.265625, |
| "learning_rate": 0.00011440898634730602, |
| "loss": 0.499, |
| "step": 6165 |
| }, |
| { |
| "epoch": 1.4853153586904189, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00011432333366117681, |
| "loss": 0.513, |
| "step": 6170 |
| }, |
| { |
| "epoch": 1.486519017814155, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00011423767879879492, |
| "loss": 0.489, |
| "step": 6175 |
| }, |
| { |
| "epoch": 1.4877226769378913, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00011415202189701486, |
| "loss": 0.4873, |
| "step": 6180 |
| }, |
| { |
| "epoch": 1.4889263360616274, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00011406636309269433, |
| "loss": 0.4876, |
| "step": 6185 |
| }, |
| { |
| "epoch": 1.4901299951853635, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00011398070252269422, |
| "loss": 0.4835, |
| "step": 6190 |
| }, |
| { |
| "epoch": 1.4913336543090996, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.0001138950403238781, |
| "loss": 0.4885, |
| "step": 6195 |
| }, |
| { |
| "epoch": 1.4925373134328357, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00011380937663311228, |
| "loss": 0.5127, |
| "step": 6200 |
| }, |
| { |
| "epoch": 1.493740972556572, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.0001137237115872653, |
| "loss": 0.4743, |
| "step": 6205 |
| }, |
| { |
| "epoch": 1.4949446316803081, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.000113638045323208, |
| "loss": 0.5125, |
| "step": 6210 |
| }, |
| { |
| "epoch": 1.4961482908040442, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00011355237797781306, |
| "loss": 0.4641, |
| "step": 6215 |
| }, |
| { |
| "epoch": 1.4973519499277805, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00011346670968795497, |
| "loss": 0.489, |
| "step": 6220 |
| }, |
| { |
| "epoch": 1.4985556090515166, |
| "grad_norm": 1.4453125, |
| "learning_rate": 0.00011338104059050968, |
| "loss": 0.4705, |
| "step": 6225 |
| }, |
| { |
| "epoch": 1.4997592681752527, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00011329537082235446, |
| "loss": 0.4795, |
| "step": 6230 |
| }, |
| { |
| "epoch": 1.500962927298989, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.00011320970052036765, |
| "loss": 0.4708, |
| "step": 6235 |
| }, |
| { |
| "epoch": 1.502166586422725, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00011312402982142839, |
| "loss": 0.4994, |
| "step": 6240 |
| }, |
| { |
| "epoch": 1.5033702455464613, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00011303835886241655, |
| "loss": 0.4612, |
| "step": 6245 |
| }, |
| { |
| "epoch": 1.5045739046701974, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00011295268778021228, |
| "loss": 0.4355, |
| "step": 6250 |
| }, |
| { |
| "epoch": 1.5057775637939335, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00011286701671169608, |
| "loss": 0.4846, |
| "step": 6255 |
| }, |
| { |
| "epoch": 1.5069812229176698, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00011278134579374833, |
| "loss": 0.489, |
| "step": 6260 |
| }, |
| { |
| "epoch": 1.5081848820414059, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00011269567516324919, |
| "loss": 0.4974, |
| "step": 6265 |
| }, |
| { |
| "epoch": 1.509388541165142, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00011261000495707838, |
| "loss": 0.5069, |
| "step": 6270 |
| }, |
| { |
| "epoch": 1.5105922002888783, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00011252433531211492, |
| "loss": 0.494, |
| "step": 6275 |
| }, |
| { |
| "epoch": 1.5117958594126142, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00011243866636523691, |
| "loss": 0.4812, |
| "step": 6280 |
| }, |
| { |
| "epoch": 1.5129995185363505, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00011235299825332142, |
| "loss": 0.5093, |
| "step": 6285 |
| }, |
| { |
| "epoch": 1.5142031776600868, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00011226733111324412, |
| "loss": 0.4921, |
| "step": 6290 |
| }, |
| { |
| "epoch": 1.5154068367838227, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00011218166508187913, |
| "loss": 0.4636, |
| "step": 6295 |
| }, |
| { |
| "epoch": 1.516610495907559, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00011209600029609879, |
| "loss": 0.4773, |
| "step": 6300 |
| }, |
| { |
| "epoch": 1.5178141550312951, |
| "grad_norm": 1.3515625, |
| "learning_rate": 0.00011201033689277348, |
| "loss": 0.4698, |
| "step": 6305 |
| }, |
| { |
| "epoch": 1.5190178141550312, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00011192467500877134, |
| "loss": 0.4947, |
| "step": 6310 |
| }, |
| { |
| "epoch": 1.5202214732787676, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00011183901478095815, |
| "loss": 0.5108, |
| "step": 6315 |
| }, |
| { |
| "epoch": 1.5214251324025037, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00011175335634619695, |
| "loss": 0.5156, |
| "step": 6320 |
| }, |
| { |
| "epoch": 1.5226287915262398, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.000111667699841348, |
| "loss": 0.4832, |
| "step": 6325 |
| }, |
| { |
| "epoch": 1.523832450649976, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00011158204540326837, |
| "loss": 0.4964, |
| "step": 6330 |
| }, |
| { |
| "epoch": 1.525036109773712, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00011149639316881197, |
| "loss": 0.4952, |
| "step": 6335 |
| }, |
| { |
| "epoch": 1.5262397688974483, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00011141074327482907, |
| "loss": 0.4872, |
| "step": 6340 |
| }, |
| { |
| "epoch": 1.5274434280211844, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00011132509585816626, |
| "loss": 0.4515, |
| "step": 6345 |
| }, |
| { |
| "epoch": 1.5286470871449205, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00011123945105566613, |
| "loss": 0.4838, |
| "step": 6350 |
| }, |
| { |
| "epoch": 1.5298507462686568, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00011115380900416715, |
| "loss": 0.4489, |
| "step": 6355 |
| }, |
| { |
| "epoch": 1.531054405392393, |
| "grad_norm": 1.125, |
| "learning_rate": 0.00011106816984050336, |
| "loss": 0.4812, |
| "step": 6360 |
| }, |
| { |
| "epoch": 1.532258064516129, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00011098253370150419, |
| "loss": 0.4542, |
| "step": 6365 |
| }, |
| { |
| "epoch": 1.5334617236398653, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00011089690072399423, |
| "loss": 0.4335, |
| "step": 6370 |
| }, |
| { |
| "epoch": 1.5346653827636012, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.00011081127104479302, |
| "loss": 0.5006, |
| "step": 6375 |
| }, |
| { |
| "epoch": 1.5358690418873375, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.00011072564480071481, |
| "loss": 0.442, |
| "step": 6380 |
| }, |
| { |
| "epoch": 1.5370727010110736, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00011064002212856843, |
| "loss": 0.4852, |
| "step": 6385 |
| }, |
| { |
| "epoch": 1.5382763601348097, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.00011055440316515699, |
| "loss": 0.4801, |
| "step": 6390 |
| }, |
| { |
| "epoch": 1.539480019258546, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00011046878804727757, |
| "loss": 0.51, |
| "step": 6395 |
| }, |
| { |
| "epoch": 1.5406836783822822, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00011038317691172122, |
| "loss": 0.4616, |
| "step": 6400 |
| }, |
| { |
| "epoch": 1.5418873375060183, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00011029756989527258, |
| "loss": 0.4843, |
| "step": 6405 |
| }, |
| { |
| "epoch": 1.5430909966297546, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00011021196713470973, |
| "loss": 0.4987, |
| "step": 6410 |
| }, |
| { |
| "epoch": 1.5442946557534905, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00011012636876680393, |
| "loss": 0.4663, |
| "step": 6415 |
| }, |
| { |
| "epoch": 1.5454983148772268, |
| "grad_norm": 1.046875, |
| "learning_rate": 0.00011004077492831943, |
| "loss": 0.5064, |
| "step": 6420 |
| }, |
| { |
| "epoch": 1.5467019740009629, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.0001099551857560132, |
| "loss": 0.4703, |
| "step": 6425 |
| }, |
| { |
| "epoch": 1.547905633124699, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.00010986960138663487, |
| "loss": 0.5145, |
| "step": 6430 |
| }, |
| { |
| "epoch": 1.5491092922484353, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00010978402195692629, |
| "loss": 0.4831, |
| "step": 6435 |
| }, |
| { |
| "epoch": 1.5503129513721714, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.00010969844760362141, |
| "loss": 0.4762, |
| "step": 6440 |
| }, |
| { |
| "epoch": 1.5515166104959075, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.00010961287846344617, |
| "loss": 0.476, |
| "step": 6445 |
| }, |
| { |
| "epoch": 1.5527202696196438, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00010952731467311808, |
| "loss": 0.4894, |
| "step": 6450 |
| }, |
| { |
| "epoch": 1.55392392874338, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00010944175636934618, |
| "loss": 0.483, |
| "step": 6455 |
| }, |
| { |
| "epoch": 1.555127587867116, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.0001093562036888307, |
| "loss": 0.4539, |
| "step": 6460 |
| }, |
| { |
| "epoch": 1.5563312469908523, |
| "grad_norm": 1.296875, |
| "learning_rate": 0.00010927065676826285, |
| "loss": 0.4881, |
| "step": 6465 |
| }, |
| { |
| "epoch": 1.5575349061145882, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00010918511574432468, |
| "loss": 0.4733, |
| "step": 6470 |
| }, |
| { |
| "epoch": 1.5587385652383245, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00010909958075368888, |
| "loss": 0.4957, |
| "step": 6475 |
| }, |
| { |
| "epoch": 1.5599422243620606, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00010901405193301842, |
| "loss": 0.4715, |
| "step": 6480 |
| }, |
| { |
| "epoch": 1.5611458834857967, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.00010892852941896639, |
| "loss": 0.4872, |
| "step": 6485 |
| }, |
| { |
| "epoch": 1.562349542609533, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00010884301334817582, |
| "loss": 0.5172, |
| "step": 6490 |
| }, |
| { |
| "epoch": 1.5635532017332692, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00010875750385727954, |
| "loss": 0.4736, |
| "step": 6495 |
| }, |
| { |
| "epoch": 1.5647568608570053, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.0001086720010828998, |
| "loss": 0.5011, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.5647568608570053, |
| "eval_loss": 0.4124249219894409, |
| "eval_runtime": 2.3537, |
| "eval_samples_per_second": 84.974, |
| "eval_steps_per_second": 84.974, |
| "step": 6500 |
| }, |
| { |
| "epoch": 1.5659605199807416, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00010858650516164806, |
| "loss": 0.4938, |
| "step": 6505 |
| }, |
| { |
| "epoch": 1.5671641791044775, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00010850101623012493, |
| "loss": 0.4868, |
| "step": 6510 |
| }, |
| { |
| "epoch": 1.5683678382282138, |
| "grad_norm": 1.28125, |
| "learning_rate": 0.00010841553442491976, |
| "loss": 0.4827, |
| "step": 6515 |
| }, |
| { |
| "epoch": 1.56957149735195, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00010833005988261058, |
| "loss": 0.4808, |
| "step": 6520 |
| }, |
| { |
| "epoch": 1.570775156475686, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.00010824459273976385, |
| "loss": 0.4448, |
| "step": 6525 |
| }, |
| { |
| "epoch": 1.5719788155994223, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00010815913313293407, |
| "loss": 0.5128, |
| "step": 6530 |
| }, |
| { |
| "epoch": 1.5731824747231584, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00010807368119866385, |
| "loss": 0.4939, |
| "step": 6535 |
| }, |
| { |
| "epoch": 1.5743861338468945, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.0001079882370734834, |
| "loss": 0.4713, |
| "step": 6540 |
| }, |
| { |
| "epoch": 1.5755897929706308, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00010790280089391061, |
| "loss": 0.4881, |
| "step": 6545 |
| }, |
| { |
| "epoch": 1.5767934520943667, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00010781737279645057, |
| "loss": 0.4741, |
| "step": 6550 |
| }, |
| { |
| "epoch": 1.577997111218103, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00010773195291759545, |
| "loss": 0.4708, |
| "step": 6555 |
| }, |
| { |
| "epoch": 1.5792007703418391, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00010764654139382435, |
| "loss": 0.467, |
| "step": 6560 |
| }, |
| { |
| "epoch": 1.5804044294655752, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00010756113836160297, |
| "loss": 0.4721, |
| "step": 6565 |
| }, |
| { |
| "epoch": 1.5816080885893116, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00010747574395738347, |
| "loss": 0.4874, |
| "step": 6570 |
| }, |
| { |
| "epoch": 1.5828117477130477, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00010739035831760425, |
| "loss": 0.5051, |
| "step": 6575 |
| }, |
| { |
| "epoch": 1.5840154068367838, |
| "grad_norm": 1.1015625, |
| "learning_rate": 0.00010730498157868957, |
| "loss": 0.4875, |
| "step": 6580 |
| }, |
| { |
| "epoch": 1.58521906596052, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00010721961387704972, |
| "loss": 0.4675, |
| "step": 6585 |
| }, |
| { |
| "epoch": 1.5864227250842562, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00010713425534908028, |
| "loss": 0.4837, |
| "step": 6590 |
| }, |
| { |
| "epoch": 1.5876263842079923, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00010704890613116234, |
| "loss": 0.4638, |
| "step": 6595 |
| }, |
| { |
| "epoch": 1.5888300433317286, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00010696356635966209, |
| "loss": 0.4632, |
| "step": 6600 |
| }, |
| { |
| "epoch": 1.5900337024554645, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00010687823617093058, |
| "loss": 0.4934, |
| "step": 6605 |
| }, |
| { |
| "epoch": 1.5912373615792008, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00010679291570130362, |
| "loss": 0.476, |
| "step": 6610 |
| }, |
| { |
| "epoch": 1.592441020702937, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00010670760508710139, |
| "loss": 0.5106, |
| "step": 6615 |
| }, |
| { |
| "epoch": 1.593644679826673, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00010662230446462838, |
| "loss": 0.4749, |
| "step": 6620 |
| }, |
| { |
| "epoch": 1.5948483389504093, |
| "grad_norm": 1.2734375, |
| "learning_rate": 0.0001065370139701732, |
| "loss": 0.4272, |
| "step": 6625 |
| }, |
| { |
| "epoch": 1.5960519980741454, |
| "grad_norm": 1.0625, |
| "learning_rate": 0.0001064517337400081, |
| "loss": 0.4753, |
| "step": 6630 |
| }, |
| { |
| "epoch": 1.5972556571978815, |
| "grad_norm": 1.125, |
| "learning_rate": 0.0001063664639103891, |
| "loss": 0.4324, |
| "step": 6635 |
| }, |
| { |
| "epoch": 1.5984593163216179, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00010628120461755546, |
| "loss": 0.4633, |
| "step": 6640 |
| }, |
| { |
| "epoch": 1.5996629754453537, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.0001061959559977297, |
| "loss": 0.4943, |
| "step": 6645 |
| }, |
| { |
| "epoch": 1.60086663456909, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00010611071818711735, |
| "loss": 0.4756, |
| "step": 6650 |
| }, |
| { |
| "epoch": 1.6020702936928262, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.0001060254913219064, |
| "loss": 0.4873, |
| "step": 6655 |
| }, |
| { |
| "epoch": 1.6032739528165623, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00010594027553826772, |
| "loss": 0.5023, |
| "step": 6660 |
| }, |
| { |
| "epoch": 1.6044776119402986, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.00010585507097235414, |
| "loss": 0.5088, |
| "step": 6665 |
| }, |
| { |
| "epoch": 1.6056812710640347, |
| "grad_norm": 1.2890625, |
| "learning_rate": 0.00010576987776030081, |
| "loss": 0.5043, |
| "step": 6670 |
| }, |
| { |
| "epoch": 1.6068849301877708, |
| "grad_norm": 1.0859375, |
| "learning_rate": 0.0001056846960382246, |
| "loss": 0.4875, |
| "step": 6675 |
| }, |
| { |
| "epoch": 1.608088589311507, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00010559952594222405, |
| "loss": 0.522, |
| "step": 6680 |
| }, |
| { |
| "epoch": 1.609292248435243, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00010551436760837918, |
| "loss": 0.4958, |
| "step": 6685 |
| }, |
| { |
| "epoch": 1.6104959075589793, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00010542922117275117, |
| "loss": 0.4595, |
| "step": 6690 |
| }, |
| { |
| "epoch": 1.6116995666827154, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.00010534408677138218, |
| "loss": 0.4668, |
| "step": 6695 |
| }, |
| { |
| "epoch": 1.6129032258064515, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00010525896454029514, |
| "loss": 0.4807, |
| "step": 6700 |
| }, |
| { |
| "epoch": 1.6141068849301878, |
| "grad_norm": 1.125, |
| "learning_rate": 0.00010517385461549358, |
| "loss": 0.4873, |
| "step": 6705 |
| }, |
| { |
| "epoch": 1.615310544053924, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00010508875713296138, |
| "loss": 0.4647, |
| "step": 6710 |
| }, |
| { |
| "epoch": 1.61651420317766, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.00010500367222866245, |
| "loss": 0.427, |
| "step": 6715 |
| }, |
| { |
| "epoch": 1.6177178623013964, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00010491860003854065, |
| "loss": 0.5016, |
| "step": 6720 |
| }, |
| { |
| "epoch": 1.6189215214251322, |
| "grad_norm": 1.1875, |
| "learning_rate": 0.00010483354069851956, |
| "loss": 0.4748, |
| "step": 6725 |
| }, |
| { |
| "epoch": 1.6201251805488686, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00010474849434450218, |
| "loss": 0.4419, |
| "step": 6730 |
| }, |
| { |
| "epoch": 1.6213288396726049, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00010466346111237082, |
| "loss": 0.4921, |
| "step": 6735 |
| }, |
| { |
| "epoch": 1.6225324987963408, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00010457844113798672, |
| "loss": 0.4725, |
| "step": 6740 |
| }, |
| { |
| "epoch": 1.623736157920077, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.00010449343455719007, |
| "loss": 0.4811, |
| "step": 6745 |
| }, |
| { |
| "epoch": 1.6249398170438132, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00010440844150579957, |
| "loss": 0.4848, |
| "step": 6750 |
| }, |
| { |
| "epoch": 1.6261434761675493, |
| "grad_norm": 1.0546875, |
| "learning_rate": 0.0001043234621196123, |
| "loss": 0.4992, |
| "step": 6755 |
| }, |
| { |
| "epoch": 1.6273471352912856, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00010423849653440359, |
| "loss": 0.484, |
| "step": 6760 |
| }, |
| { |
| "epoch": 1.6285507944150217, |
| "grad_norm": 1.3359375, |
| "learning_rate": 0.00010415354488592661, |
| "loss": 0.4921, |
| "step": 6765 |
| }, |
| { |
| "epoch": 1.6297544535387578, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00010406860730991234, |
| "loss": 0.4796, |
| "step": 6770 |
| }, |
| { |
| "epoch": 1.6309581126624941, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00010398368394206926, |
| "loss": 0.4552, |
| "step": 6775 |
| }, |
| { |
| "epoch": 1.63216177178623, |
| "grad_norm": 1.328125, |
| "learning_rate": 0.00010389877491808314, |
| "loss": 0.4549, |
| "step": 6780 |
| }, |
| { |
| "epoch": 1.6333654309099663, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00010381388037361683, |
| "loss": 0.4465, |
| "step": 6785 |
| }, |
| { |
| "epoch": 1.6345690900337024, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.00010372900044431003, |
| "loss": 0.4927, |
| "step": 6790 |
| }, |
| { |
| "epoch": 1.6357727491574385, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00010364413526577915, |
| "loss": 0.4789, |
| "step": 6795 |
| }, |
| { |
| "epoch": 1.6369764082811749, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00010355928497361696, |
| "loss": 0.4934, |
| "step": 6800 |
| }, |
| { |
| "epoch": 1.638180067404911, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00010347444970339244, |
| "loss": 0.4821, |
| "step": 6805 |
| }, |
| { |
| "epoch": 1.639383726528647, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00010338962959065069, |
| "loss": 0.461, |
| "step": 6810 |
| }, |
| { |
| "epoch": 1.6405873856523834, |
| "grad_norm": 1.234375, |
| "learning_rate": 0.00010330482477091243, |
| "loss": 0.4625, |
| "step": 6815 |
| }, |
| { |
| "epoch": 1.6417910447761193, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00010322003537967406, |
| "loss": 0.4604, |
| "step": 6820 |
| }, |
| { |
| "epoch": 1.6429947038998556, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.0001031352615524073, |
| "loss": 0.4654, |
| "step": 6825 |
| }, |
| { |
| "epoch": 1.6441983630235917, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00010305050342455897, |
| "loss": 0.486, |
| "step": 6830 |
| }, |
| { |
| "epoch": 1.6454020221473278, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00010296576113155084, |
| "loss": 0.4581, |
| "step": 6835 |
| }, |
| { |
| "epoch": 1.646605681271064, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.00010288103480877935, |
| "loss": 0.4845, |
| "step": 6840 |
| }, |
| { |
| "epoch": 1.6478093403948002, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.00010279632459161548, |
| "loss": 0.4575, |
| "step": 6845 |
| }, |
| { |
| "epoch": 1.6490129995185363, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00010271163061540445, |
| "loss": 0.5021, |
| "step": 6850 |
| }, |
| { |
| "epoch": 1.6502166586422726, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00010262695301546543, |
| "loss": 0.4466, |
| "step": 6855 |
| }, |
| { |
| "epoch": 1.6514203177660085, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.00010254229192709164, |
| "loss": 0.4566, |
| "step": 6860 |
| }, |
| { |
| "epoch": 1.6526239768897448, |
| "grad_norm": 1.1640625, |
| "learning_rate": 0.0001024576474855497, |
| "loss": 0.4657, |
| "step": 6865 |
| }, |
| { |
| "epoch": 1.653827636013481, |
| "grad_norm": 1.078125, |
| "learning_rate": 0.00010237301982607981, |
| "loss": 0.5034, |
| "step": 6870 |
| }, |
| { |
| "epoch": 1.655031295137217, |
| "grad_norm": 1.1328125, |
| "learning_rate": 0.0001022884090838952, |
| "loss": 0.4257, |
| "step": 6875 |
| }, |
| { |
| "epoch": 1.6562349542609534, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.00010220381539418218, |
| "loss": 0.4822, |
| "step": 6880 |
| }, |
| { |
| "epoch": 1.6574386133846895, |
| "grad_norm": 1.140625, |
| "learning_rate": 0.00010211923889209983, |
| "loss": 0.5075, |
| "step": 6885 |
| }, |
| { |
| "epoch": 1.6586422725084256, |
| "grad_norm": 1.2265625, |
| "learning_rate": 0.0001020346797127796, |
| "loss": 0.4946, |
| "step": 6890 |
| }, |
| { |
| "epoch": 1.6598459316321619, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.0001019501379913255, |
| "loss": 0.4642, |
| "step": 6895 |
| }, |
| { |
| "epoch": 1.661049590755898, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00010186561386281345, |
| "loss": 0.4639, |
| "step": 6900 |
| }, |
| { |
| "epoch": 1.662253249879634, |
| "grad_norm": 1.1171875, |
| "learning_rate": 0.0001017811074622914, |
| "loss": 0.4764, |
| "step": 6905 |
| }, |
| { |
| "epoch": 1.6634569090033704, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00010169661892477887, |
| "loss": 0.4607, |
| "step": 6910 |
| }, |
| { |
| "epoch": 1.6646605681271063, |
| "grad_norm": 1.171875, |
| "learning_rate": 0.00010161214838526686, |
| "loss": 0.4905, |
| "step": 6915 |
| }, |
| { |
| "epoch": 1.6658642272508426, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.00010152769597871774, |
| "loss": 0.4737, |
| "step": 6920 |
| }, |
| { |
| "epoch": 1.6670678863745787, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00010144326184006468, |
| "loss": 0.4716, |
| "step": 6925 |
| }, |
| { |
| "epoch": 1.6682715454983148, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.0001013588461042119, |
| "loss": 0.5117, |
| "step": 6930 |
| }, |
| { |
| "epoch": 1.6694752046220511, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00010127444890603404, |
| "loss": 0.4528, |
| "step": 6935 |
| }, |
| { |
| "epoch": 1.6706788637457872, |
| "grad_norm": 1.25, |
| "learning_rate": 0.00010119007038037618, |
| "loss": 0.4587, |
| "step": 6940 |
| }, |
| { |
| "epoch": 1.6718825228695233, |
| "grad_norm": 1.21875, |
| "learning_rate": 0.00010110571066205365, |
| "loss": 0.4969, |
| "step": 6945 |
| }, |
| { |
| "epoch": 1.6730861819932596, |
| "grad_norm": 1.1953125, |
| "learning_rate": 0.00010102136988585153, |
| "loss": 0.4338, |
| "step": 6950 |
| }, |
| { |
| "epoch": 1.6742898411169955, |
| "grad_norm": 1.203125, |
| "learning_rate": 0.00010093704818652488, |
| "loss": 0.4835, |
| "step": 6955 |
| }, |
| { |
| "epoch": 1.6754935002407318, |
| "grad_norm": 1.25, |
| "learning_rate": 0.0001008527456987981, |
| "loss": 0.448, |
| "step": 6960 |
| }, |
| { |
| "epoch": 1.676697159364468, |
| "grad_norm": 1.2109375, |
| "learning_rate": 0.00010076846255736495, |
| "loss": 0.494, |
| "step": 6965 |
| }, |
| { |
| "epoch": 1.677900818488204, |
| "grad_norm": 1.09375, |
| "learning_rate": 0.00010068419889688831, |
| "loss": 0.4518, |
| "step": 6970 |
| }, |
| { |
| "epoch": 1.6791044776119404, |
| "grad_norm": 1.2578125, |
| "learning_rate": 0.00010059995485199988, |
| "loss": 0.4678, |
| "step": 6975 |
| }, |
| { |
| "epoch": 1.6803081367356765, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00010051573055730008, |
| "loss": 0.4377, |
| "step": 6980 |
| }, |
| { |
| "epoch": 1.6815117958594126, |
| "grad_norm": 1.1796875, |
| "learning_rate": 0.00010043152614735774, |
| "loss": 0.4676, |
| "step": 6985 |
| }, |
| { |
| "epoch": 1.682715454983149, |
| "grad_norm": 1.2421875, |
| "learning_rate": 0.00010034734175670987, |
| "loss": 0.4992, |
| "step": 6990 |
| }, |
| { |
| "epoch": 1.6839191141068848, |
| "grad_norm": 1.125, |
| "learning_rate": 0.00010026317751986162, |
| "loss": 0.4562, |
| "step": 6995 |
| }, |
| { |
| "epoch": 1.685122773230621, |
| "grad_norm": 1.15625, |
| "learning_rate": 0.00010017903357128577, |
| "loss": 0.4707, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.685122773230621, |
| "eval_loss": 0.40290209650993347, |
| "eval_runtime": 2.3488, |
| "eval_samples_per_second": 85.151, |
| "eval_steps_per_second": 85.151, |
| "step": 7000 |
| }, |
| { |
| "epoch": 1.6863264323543572, |
| "grad_norm": 1.1484375, |
| "learning_rate": 0.0001000949100454229, |
| "loss": 0.4751, |
| "step": 7005 |
| }, |
| { |
| "epoch": 1.6875300914780933, |
| "grad_norm": 1.109375, |
| "learning_rate": 0.00010001080707668073, |
| "loss": 0.4645, |
| "step": 7010 |
| }, |
| { |
| "epoch": 1.6887337506018296, |
| "grad_norm": 1.1953125, |
| "learning_rate": 9.99267247994343e-05, |
| "loss": 0.4886, |
| "step": 7015 |
| }, |
| { |
| "epoch": 1.6899374097255657, |
| "grad_norm": 1.1328125, |
| "learning_rate": 9.984266334802554e-05, |
| "loss": 0.4251, |
| "step": 7020 |
| }, |
| { |
| "epoch": 1.6911410688493018, |
| "grad_norm": 1.0625, |
| "learning_rate": 9.975862285676306e-05, |
| "loss": 0.4545, |
| "step": 7025 |
| }, |
| { |
| "epoch": 1.6923447279730381, |
| "grad_norm": 1.1171875, |
| "learning_rate": 9.967460345992207e-05, |
| "loss": 0.4555, |
| "step": 7030 |
| }, |
| { |
| "epoch": 1.6935483870967742, |
| "grad_norm": 1.21875, |
| "learning_rate": 9.959060529174396e-05, |
| "loss": 0.4874, |
| "step": 7035 |
| }, |
| { |
| "epoch": 1.6947520462205103, |
| "grad_norm": 1.125, |
| "learning_rate": 9.950662848643635e-05, |
| "loss": 0.4654, |
| "step": 7040 |
| }, |
| { |
| "epoch": 1.6959557053442467, |
| "grad_norm": 1.265625, |
| "learning_rate": 9.942267317817261e-05, |
| "loss": 0.4806, |
| "step": 7045 |
| }, |
| { |
| "epoch": 1.6971593644679825, |
| "grad_norm": 1.15625, |
| "learning_rate": 9.933873950109178e-05, |
| "loss": 0.4521, |
| "step": 7050 |
| }, |
| { |
| "epoch": 1.6983630235917189, |
| "grad_norm": 1.1640625, |
| "learning_rate": 9.925482758929841e-05, |
| "loss": 0.445, |
| "step": 7055 |
| }, |
| { |
| "epoch": 1.699566682715455, |
| "grad_norm": 1.21875, |
| "learning_rate": 9.917093757686215e-05, |
| "loss": 0.4613, |
| "step": 7060 |
| }, |
| { |
| "epoch": 1.700770341839191, |
| "grad_norm": 1.09375, |
| "learning_rate": 9.908706959781779e-05, |
| "loss": 0.4411, |
| "step": 7065 |
| }, |
| { |
| "epoch": 1.7019740009629274, |
| "grad_norm": 1.15625, |
| "learning_rate": 9.900322378616488e-05, |
| "loss": 0.4746, |
| "step": 7070 |
| }, |
| { |
| "epoch": 1.7031776600866635, |
| "grad_norm": 1.1953125, |
| "learning_rate": 9.891940027586747e-05, |
| "loss": 0.4972, |
| "step": 7075 |
| }, |
| { |
| "epoch": 1.7043813192103996, |
| "grad_norm": 1.046875, |
| "learning_rate": 9.883559920085408e-05, |
| "loss": 0.4312, |
| "step": 7080 |
| }, |
| { |
| "epoch": 1.705584978334136, |
| "grad_norm": 1.0859375, |
| "learning_rate": 9.875182069501733e-05, |
| "loss": 0.4539, |
| "step": 7085 |
| }, |
| { |
| "epoch": 1.7067886374578718, |
| "grad_norm": 1.234375, |
| "learning_rate": 9.866806489221378e-05, |
| "loss": 0.4543, |
| "step": 7090 |
| }, |
| { |
| "epoch": 1.7079922965816081, |
| "grad_norm": 1.140625, |
| "learning_rate": 9.858433192626381e-05, |
| "loss": 0.4403, |
| "step": 7095 |
| }, |
| { |
| "epoch": 1.7091959557053442, |
| "grad_norm": 1.125, |
| "learning_rate": 9.850062193095115e-05, |
| "loss": 0.4739, |
| "step": 7100 |
| }, |
| { |
| "epoch": 1.7103996148290803, |
| "grad_norm": 1.234375, |
| "learning_rate": 9.841693504002295e-05, |
| "loss": 0.4529, |
| "step": 7105 |
| }, |
| { |
| "epoch": 1.7116032739528166, |
| "grad_norm": 1.2265625, |
| "learning_rate": 9.833327138718938e-05, |
| "loss": 0.485, |
| "step": 7110 |
| }, |
| { |
| "epoch": 1.7128069330765527, |
| "grad_norm": 1.1484375, |
| "learning_rate": 9.824963110612354e-05, |
| "loss": 0.4641, |
| "step": 7115 |
| }, |
| { |
| "epoch": 1.7140105922002888, |
| "grad_norm": 1.1015625, |
| "learning_rate": 9.816601433046117e-05, |
| "loss": 0.4743, |
| "step": 7120 |
| }, |
| { |
| "epoch": 1.7152142513240252, |
| "grad_norm": 1.1484375, |
| "learning_rate": 9.808242119380038e-05, |
| "loss": 0.4361, |
| "step": 7125 |
| }, |
| { |
| "epoch": 1.716417910447761, |
| "grad_norm": 1.46875, |
| "learning_rate": 9.799885182970167e-05, |
| "loss": 0.5017, |
| "step": 7130 |
| }, |
| { |
| "epoch": 1.7176215695714974, |
| "grad_norm": 1.2734375, |
| "learning_rate": 9.791530637168735e-05, |
| "loss": 0.4815, |
| "step": 7135 |
| }, |
| { |
| "epoch": 1.7188252286952335, |
| "grad_norm": 1.1328125, |
| "learning_rate": 9.783178495324173e-05, |
| "loss": 0.4753, |
| "step": 7140 |
| }, |
| { |
| "epoch": 1.7200288878189696, |
| "grad_norm": 1.171875, |
| "learning_rate": 9.774828770781058e-05, |
| "loss": 0.4376, |
| "step": 7145 |
| }, |
| { |
| "epoch": 1.7212325469427059, |
| "grad_norm": 1.015625, |
| "learning_rate": 9.766481476880111e-05, |
| "loss": 0.4664, |
| "step": 7150 |
| }, |
| { |
| "epoch": 1.722436206066442, |
| "grad_norm": 1.1015625, |
| "learning_rate": 9.75813662695817e-05, |
| "loss": 0.4686, |
| "step": 7155 |
| }, |
| { |
| "epoch": 1.723639865190178, |
| "grad_norm": 1.234375, |
| "learning_rate": 9.749794234348159e-05, |
| "loss": 0.4841, |
| "step": 7160 |
| }, |
| { |
| "epoch": 1.7248435243139144, |
| "grad_norm": 1.09375, |
| "learning_rate": 9.741454312379094e-05, |
| "loss": 0.4352, |
| "step": 7165 |
| }, |
| { |
| "epoch": 1.7260471834376505, |
| "grad_norm": 1.1328125, |
| "learning_rate": 9.733116874376018e-05, |
| "loss": 0.4566, |
| "step": 7170 |
| }, |
| { |
| "epoch": 1.7272508425613866, |
| "grad_norm": 1.25, |
| "learning_rate": 9.724781933660033e-05, |
| "loss": 0.5107, |
| "step": 7175 |
| }, |
| { |
| "epoch": 1.728454501685123, |
| "grad_norm": 1.234375, |
| "learning_rate": 9.716449503548231e-05, |
| "loss": 0.445, |
| "step": 7180 |
| }, |
| { |
| "epoch": 1.7296581608088588, |
| "grad_norm": 1.2265625, |
| "learning_rate": 9.708119597353697e-05, |
| "loss": 0.4901, |
| "step": 7185 |
| }, |
| { |
| "epoch": 1.7308618199325951, |
| "grad_norm": 1.21875, |
| "learning_rate": 9.699792228385495e-05, |
| "loss": 0.5007, |
| "step": 7190 |
| }, |
| { |
| "epoch": 1.7320654790563312, |
| "grad_norm": 1.125, |
| "learning_rate": 9.691467409948614e-05, |
| "loss": 0.444, |
| "step": 7195 |
| }, |
| { |
| "epoch": 1.7332691381800673, |
| "grad_norm": 1.1171875, |
| "learning_rate": 9.683145155343987e-05, |
| "loss": 0.4686, |
| "step": 7200 |
| }, |
| { |
| "epoch": 1.7344727973038037, |
| "grad_norm": 1.1171875, |
| "learning_rate": 9.674825477868445e-05, |
| "loss": 0.4509, |
| "step": 7205 |
| }, |
| { |
| "epoch": 1.7356764564275398, |
| "grad_norm": 1.171875, |
| "learning_rate": 9.666508390814692e-05, |
| "loss": 0.4759, |
| "step": 7210 |
| }, |
| { |
| "epoch": 1.7368801155512759, |
| "grad_norm": 1.2890625, |
| "learning_rate": 9.658193907471311e-05, |
| "loss": 0.4304, |
| "step": 7215 |
| }, |
| { |
| "epoch": 1.7380837746750122, |
| "grad_norm": 1.1796875, |
| "learning_rate": 9.649882041122704e-05, |
| "loss": 0.4717, |
| "step": 7220 |
| }, |
| { |
| "epoch": 1.739287433798748, |
| "grad_norm": 1.21875, |
| "learning_rate": 9.641572805049113e-05, |
| "loss": 0.4895, |
| "step": 7225 |
| }, |
| { |
| "epoch": 1.7404910929224844, |
| "grad_norm": 1.2734375, |
| "learning_rate": 9.633266212526563e-05, |
| "loss": 0.4505, |
| "step": 7230 |
| }, |
| { |
| "epoch": 1.7416947520462205, |
| "grad_norm": 1.1953125, |
| "learning_rate": 9.624962276826856e-05, |
| "loss": 0.4761, |
| "step": 7235 |
| }, |
| { |
| "epoch": 1.7428984111699566, |
| "grad_norm": 1.171875, |
| "learning_rate": 9.616661011217558e-05, |
| "loss": 0.4738, |
| "step": 7240 |
| }, |
| { |
| "epoch": 1.744102070293693, |
| "grad_norm": 1.046875, |
| "learning_rate": 9.608362428961959e-05, |
| "loss": 0.4677, |
| "step": 7245 |
| }, |
| { |
| "epoch": 1.745305729417429, |
| "grad_norm": 1.4296875, |
| "learning_rate": 9.600066543319069e-05, |
| "loss": 0.4675, |
| "step": 7250 |
| }, |
| { |
| "epoch": 1.746509388541165, |
| "grad_norm": 1.1328125, |
| "learning_rate": 9.591773367543587e-05, |
| "loss": 0.4501, |
| "step": 7255 |
| }, |
| { |
| "epoch": 1.7477130476649014, |
| "grad_norm": 1.125, |
| "learning_rate": 9.583482914885877e-05, |
| "loss": 0.4613, |
| "step": 7260 |
| }, |
| { |
| "epoch": 1.7489167067886373, |
| "grad_norm": 1.171875, |
| "learning_rate": 9.575195198591965e-05, |
| "loss": 0.478, |
| "step": 7265 |
| }, |
| { |
| "epoch": 1.7501203659123736, |
| "grad_norm": 1.1015625, |
| "learning_rate": 9.566910231903487e-05, |
| "loss": 0.4599, |
| "step": 7270 |
| }, |
| { |
| "epoch": 1.7513240250361097, |
| "grad_norm": 1.1015625, |
| "learning_rate": 9.558628028057707e-05, |
| "loss": 0.4457, |
| "step": 7275 |
| }, |
| { |
| "epoch": 1.7525276841598458, |
| "grad_norm": 1.125, |
| "learning_rate": 9.550348600287457e-05, |
| "loss": 0.4465, |
| "step": 7280 |
| }, |
| { |
| "epoch": 1.7537313432835822, |
| "grad_norm": 1.1171875, |
| "learning_rate": 9.542071961821139e-05, |
| "loss": 0.4525, |
| "step": 7285 |
| }, |
| { |
| "epoch": 1.7549350024073183, |
| "grad_norm": 1.1328125, |
| "learning_rate": 9.533798125882704e-05, |
| "loss": 0.451, |
| "step": 7290 |
| }, |
| { |
| "epoch": 1.7561386615310544, |
| "grad_norm": 1.0625, |
| "learning_rate": 9.525527105691617e-05, |
| "loss": 0.446, |
| "step": 7295 |
| }, |
| { |
| "epoch": 1.7573423206547907, |
| "grad_norm": 1.1328125, |
| "learning_rate": 9.51725891446285e-05, |
| "loss": 0.4422, |
| "step": 7300 |
| }, |
| { |
| "epoch": 1.7585459797785266, |
| "grad_norm": 1.09375, |
| "learning_rate": 9.508993565406854e-05, |
| "loss": 0.4566, |
| "step": 7305 |
| }, |
| { |
| "epoch": 1.7597496389022629, |
| "grad_norm": 1.1640625, |
| "learning_rate": 9.500731071729535e-05, |
| "loss": 0.493, |
| "step": 7310 |
| }, |
| { |
| "epoch": 1.7609532980259992, |
| "grad_norm": 1.03125, |
| "learning_rate": 9.492471446632245e-05, |
| "loss": 0.4364, |
| "step": 7315 |
| }, |
| { |
| "epoch": 1.762156957149735, |
| "grad_norm": 1.1640625, |
| "learning_rate": 9.484214703311737e-05, |
| "loss": 0.4506, |
| "step": 7320 |
| }, |
| { |
| "epoch": 1.7633606162734714, |
| "grad_norm": 1.125, |
| "learning_rate": 9.475960854960182e-05, |
| "loss": 0.4845, |
| "step": 7325 |
| }, |
| { |
| "epoch": 1.7645642753972075, |
| "grad_norm": 1.140625, |
| "learning_rate": 9.46770991476511e-05, |
| "loss": 0.448, |
| "step": 7330 |
| }, |
| { |
| "epoch": 1.7657679345209436, |
| "grad_norm": 1.1015625, |
| "learning_rate": 9.459461895909403e-05, |
| "loss": 0.4888, |
| "step": 7335 |
| }, |
| { |
| "epoch": 1.76697159364468, |
| "grad_norm": 1.1484375, |
| "learning_rate": 9.451216811571288e-05, |
| "loss": 0.4663, |
| "step": 7340 |
| }, |
| { |
| "epoch": 1.768175252768416, |
| "grad_norm": 1.2265625, |
| "learning_rate": 9.44297467492429e-05, |
| "loss": 0.4857, |
| "step": 7345 |
| }, |
| { |
| "epoch": 1.7693789118921521, |
| "grad_norm": 1.1484375, |
| "learning_rate": 9.434735499137234e-05, |
| "loss": 0.446, |
| "step": 7350 |
| }, |
| { |
| "epoch": 1.7705825710158885, |
| "grad_norm": 1.0703125, |
| "learning_rate": 9.426499297374213e-05, |
| "loss": 0.4694, |
| "step": 7355 |
| }, |
| { |
| "epoch": 1.7717862301396243, |
| "grad_norm": 1.15625, |
| "learning_rate": 9.418266082794563e-05, |
| "loss": 0.4568, |
| "step": 7360 |
| }, |
| { |
| "epoch": 1.7729898892633607, |
| "grad_norm": 1.2421875, |
| "learning_rate": 9.41003586855285e-05, |
| "loss": 0.4902, |
| "step": 7365 |
| }, |
| { |
| "epoch": 1.7741935483870968, |
| "grad_norm": 1.1796875, |
| "learning_rate": 9.401808667798844e-05, |
| "loss": 0.4444, |
| "step": 7370 |
| }, |
| { |
| "epoch": 1.7753972075108329, |
| "grad_norm": 1.2109375, |
| "learning_rate": 9.393584493677512e-05, |
| "loss": 0.4728, |
| "step": 7375 |
| }, |
| { |
| "epoch": 1.7766008666345692, |
| "grad_norm": 1.1953125, |
| "learning_rate": 9.385363359328963e-05, |
| "loss": 0.4763, |
| "step": 7380 |
| }, |
| { |
| "epoch": 1.7778045257583053, |
| "grad_norm": 1.1171875, |
| "learning_rate": 9.377145277888473e-05, |
| "loss": 0.4566, |
| "step": 7385 |
| }, |
| { |
| "epoch": 1.7790081848820414, |
| "grad_norm": 1.078125, |
| "learning_rate": 9.368930262486427e-05, |
| "loss": 0.4589, |
| "step": 7390 |
| }, |
| { |
| "epoch": 1.7802118440057777, |
| "grad_norm": 1.0625, |
| "learning_rate": 9.360718326248307e-05, |
| "loss": 0.4537, |
| "step": 7395 |
| }, |
| { |
| "epoch": 1.7814155031295136, |
| "grad_norm": 1.0703125, |
| "learning_rate": 9.352509482294692e-05, |
| "loss": 0.4597, |
| "step": 7400 |
| }, |
| { |
| "epoch": 1.78261916225325, |
| "grad_norm": 1.28125, |
| "learning_rate": 9.344303743741201e-05, |
| "loss": 0.4747, |
| "step": 7405 |
| }, |
| { |
| "epoch": 1.783822821376986, |
| "grad_norm": 1.109375, |
| "learning_rate": 9.336101123698506e-05, |
| "loss": 0.4555, |
| "step": 7410 |
| }, |
| { |
| "epoch": 1.785026480500722, |
| "grad_norm": 1.0703125, |
| "learning_rate": 9.327901635272294e-05, |
| "loss": 0.4594, |
| "step": 7415 |
| }, |
| { |
| "epoch": 1.7862301396244584, |
| "grad_norm": 1.109375, |
| "learning_rate": 9.319705291563237e-05, |
| "loss": 0.474, |
| "step": 7420 |
| }, |
| { |
| "epoch": 1.7874337987481945, |
| "grad_norm": 1.1171875, |
| "learning_rate": 9.311512105667e-05, |
| "loss": 0.4418, |
| "step": 7425 |
| }, |
| { |
| "epoch": 1.7886374578719306, |
| "grad_norm": 1.09375, |
| "learning_rate": 9.303322090674185e-05, |
| "loss": 0.4606, |
| "step": 7430 |
| }, |
| { |
| "epoch": 1.789841116995667, |
| "grad_norm": 1.2265625, |
| "learning_rate": 9.295135259670342e-05, |
| "loss": 0.4831, |
| "step": 7435 |
| }, |
| { |
| "epoch": 1.7910447761194028, |
| "grad_norm": 1.1796875, |
| "learning_rate": 9.286951625735929e-05, |
| "loss": 0.4549, |
| "step": 7440 |
| }, |
| { |
| "epoch": 1.7922484352431391, |
| "grad_norm": 1.1796875, |
| "learning_rate": 9.278771201946291e-05, |
| "loss": 0.4608, |
| "step": 7445 |
| }, |
| { |
| "epoch": 1.7934520943668752, |
| "grad_norm": 1.2578125, |
| "learning_rate": 9.270594001371654e-05, |
| "loss": 0.4771, |
| "step": 7450 |
| }, |
| { |
| "epoch": 1.7946557534906113, |
| "grad_norm": 1.1171875, |
| "learning_rate": 9.262420037077078e-05, |
| "loss": 0.4548, |
| "step": 7455 |
| }, |
| { |
| "epoch": 1.7958594126143477, |
| "grad_norm": 1.203125, |
| "learning_rate": 9.254249322122474e-05, |
| "loss": 0.4413, |
| "step": 7460 |
| }, |
| { |
| "epoch": 1.7970630717380838, |
| "grad_norm": 1.140625, |
| "learning_rate": 9.246081869562543e-05, |
| "loss": 0.4634, |
| "step": 7465 |
| }, |
| { |
| "epoch": 1.7982667308618199, |
| "grad_norm": 1.2265625, |
| "learning_rate": 9.237917692446784e-05, |
| "loss": 0.4422, |
| "step": 7470 |
| }, |
| { |
| "epoch": 1.7994703899855562, |
| "grad_norm": 1.3046875, |
| "learning_rate": 9.229756803819458e-05, |
| "loss": 0.478, |
| "step": 7475 |
| }, |
| { |
| "epoch": 1.8006740491092923, |
| "grad_norm": 1.0703125, |
| "learning_rate": 9.221599216719573e-05, |
| "loss": 0.4449, |
| "step": 7480 |
| }, |
| { |
| "epoch": 1.8018777082330284, |
| "grad_norm": 1.140625, |
| "learning_rate": 9.213444944180858e-05, |
| "loss": 0.4792, |
| "step": 7485 |
| }, |
| { |
| "epoch": 1.8030813673567647, |
| "grad_norm": 1.1796875, |
| "learning_rate": 9.205293999231758e-05, |
| "loss": 0.4494, |
| "step": 7490 |
| }, |
| { |
| "epoch": 1.8042850264805006, |
| "grad_norm": 1.21875, |
| "learning_rate": 9.197146394895386e-05, |
| "loss": 0.4398, |
| "step": 7495 |
| }, |
| { |
| "epoch": 1.805488685604237, |
| "grad_norm": 1.03125, |
| "learning_rate": 9.189002144189533e-05, |
| "loss": 0.4357, |
| "step": 7500 |
| }, |
| { |
| "epoch": 1.805488685604237, |
| "eval_loss": 0.39568030834198, |
| "eval_runtime": 2.3503, |
| "eval_samples_per_second": 85.097, |
| "eval_steps_per_second": 85.097, |
| "step": 7500 |
| }, |
| { |
| "epoch": 1.806692344727973, |
| "grad_norm": 1.203125, |
| "learning_rate": 9.180861260126613e-05, |
| "loss": 0.49, |
| "step": 7505 |
| }, |
| { |
| "epoch": 1.8078960038517091, |
| "grad_norm": 1.1640625, |
| "learning_rate": 9.172723755713681e-05, |
| "loss": 0.4472, |
| "step": 7510 |
| }, |
| { |
| "epoch": 1.8090996629754454, |
| "grad_norm": 1.1640625, |
| "learning_rate": 9.164589643952379e-05, |
| "loss": 0.4608, |
| "step": 7515 |
| }, |
| { |
| "epoch": 1.8103033220991815, |
| "grad_norm": 1.1328125, |
| "learning_rate": 9.156458937838935e-05, |
| "loss": 0.4465, |
| "step": 7520 |
| }, |
| { |
| "epoch": 1.8115069812229176, |
| "grad_norm": 1.1796875, |
| "learning_rate": 9.148331650364132e-05, |
| "loss": 0.4368, |
| "step": 7525 |
| }, |
| { |
| "epoch": 1.812710640346654, |
| "grad_norm": 1.09375, |
| "learning_rate": 9.14020779451329e-05, |
| "loss": 0.4707, |
| "step": 7530 |
| }, |
| { |
| "epoch": 1.8139142994703898, |
| "grad_norm": 1.1796875, |
| "learning_rate": 9.132087383266247e-05, |
| "loss": 0.496, |
| "step": 7535 |
| }, |
| { |
| "epoch": 1.8151179585941262, |
| "grad_norm": 1.265625, |
| "learning_rate": 9.123970429597343e-05, |
| "loss": 0.4682, |
| "step": 7540 |
| }, |
| { |
| "epoch": 1.8163216177178623, |
| "grad_norm": 1.1640625, |
| "learning_rate": 9.115856946475389e-05, |
| "loss": 0.4773, |
| "step": 7545 |
| }, |
| { |
| "epoch": 1.8175252768415984, |
| "grad_norm": 1.09375, |
| "learning_rate": 9.107746946863647e-05, |
| "loss": 0.4476, |
| "step": 7550 |
| }, |
| { |
| "epoch": 1.8187289359653347, |
| "grad_norm": 1.203125, |
| "learning_rate": 9.099640443719819e-05, |
| "loss": 0.4957, |
| "step": 7555 |
| }, |
| { |
| "epoch": 1.8199325950890708, |
| "grad_norm": 1.171875, |
| "learning_rate": 9.091537449996017e-05, |
| "loss": 0.4474, |
| "step": 7560 |
| }, |
| { |
| "epoch": 1.821136254212807, |
| "grad_norm": 1.1875, |
| "learning_rate": 9.083437978638755e-05, |
| "loss": 0.4621, |
| "step": 7565 |
| }, |
| { |
| "epoch": 1.8223399133365432, |
| "grad_norm": 1.1015625, |
| "learning_rate": 9.075342042588907e-05, |
| "loss": 0.4471, |
| "step": 7570 |
| }, |
| { |
| "epoch": 1.823543572460279, |
| "grad_norm": 1.2578125, |
| "learning_rate": 9.067249654781703e-05, |
| "loss": 0.4626, |
| "step": 7575 |
| }, |
| { |
| "epoch": 1.8247472315840154, |
| "grad_norm": 1.2265625, |
| "learning_rate": 9.059160828146701e-05, |
| "loss": 0.458, |
| "step": 7580 |
| }, |
| { |
| "epoch": 1.8259508907077515, |
| "grad_norm": 1.1328125, |
| "learning_rate": 9.051075575607781e-05, |
| "loss": 0.4532, |
| "step": 7585 |
| }, |
| { |
| "epoch": 1.8271545498314876, |
| "grad_norm": 1.2421875, |
| "learning_rate": 9.042993910083096e-05, |
| "loss": 0.4586, |
| "step": 7590 |
| }, |
| { |
| "epoch": 1.828358208955224, |
| "grad_norm": 1.0546875, |
| "learning_rate": 9.03491584448508e-05, |
| "loss": 0.4783, |
| "step": 7595 |
| }, |
| { |
| "epoch": 1.82956186807896, |
| "grad_norm": 1.2265625, |
| "learning_rate": 9.02684139172041e-05, |
| "loss": 0.4357, |
| "step": 7600 |
| }, |
| { |
| "epoch": 1.8307655272026961, |
| "grad_norm": 1.2734375, |
| "learning_rate": 9.018770564689988e-05, |
| "loss": 0.4525, |
| "step": 7605 |
| }, |
| { |
| "epoch": 1.8319691863264325, |
| "grad_norm": 1.109375, |
| "learning_rate": 9.010703376288933e-05, |
| "loss": 0.47, |
| "step": 7610 |
| }, |
| { |
| "epoch": 1.8331728454501686, |
| "grad_norm": 1.0390625, |
| "learning_rate": 9.00263983940654e-05, |
| "loss": 0.4505, |
| "step": 7615 |
| }, |
| { |
| "epoch": 1.8343765045739047, |
| "grad_norm": 1.2578125, |
| "learning_rate": 8.994579966926274e-05, |
| "loss": 0.4681, |
| "step": 7620 |
| }, |
| { |
| "epoch": 1.835580163697641, |
| "grad_norm": 1.21875, |
| "learning_rate": 8.986523771725747e-05, |
| "loss": 0.4418, |
| "step": 7625 |
| }, |
| { |
| "epoch": 1.8367838228213769, |
| "grad_norm": 1.2421875, |
| "learning_rate": 8.978471266676691e-05, |
| "loss": 0.4744, |
| "step": 7630 |
| }, |
| { |
| "epoch": 1.8379874819451132, |
| "grad_norm": 1.28125, |
| "learning_rate": 8.970422464644951e-05, |
| "loss": 0.4439, |
| "step": 7635 |
| }, |
| { |
| "epoch": 1.8391911410688493, |
| "grad_norm": 1.109375, |
| "learning_rate": 8.962377378490439e-05, |
| "loss": 0.4812, |
| "step": 7640 |
| }, |
| { |
| "epoch": 1.8403948001925854, |
| "grad_norm": 1.1171875, |
| "learning_rate": 8.954336021067146e-05, |
| "loss": 0.4641, |
| "step": 7645 |
| }, |
| { |
| "epoch": 1.8415984593163217, |
| "grad_norm": 1.1328125, |
| "learning_rate": 8.946298405223105e-05, |
| "loss": 0.4797, |
| "step": 7650 |
| }, |
| { |
| "epoch": 1.8428021184400578, |
| "grad_norm": 1.1640625, |
| "learning_rate": 8.938264543800356e-05, |
| "loss": 0.4551, |
| "step": 7655 |
| }, |
| { |
| "epoch": 1.844005777563794, |
| "grad_norm": 1.203125, |
| "learning_rate": 8.930234449634958e-05, |
| "loss": 0.4606, |
| "step": 7660 |
| }, |
| { |
| "epoch": 1.8452094366875302, |
| "grad_norm": 1.1953125, |
| "learning_rate": 8.922208135556936e-05, |
| "loss": 0.4816, |
| "step": 7665 |
| }, |
| { |
| "epoch": 1.8464130958112661, |
| "grad_norm": 1.140625, |
| "learning_rate": 8.914185614390285e-05, |
| "loss": 0.4715, |
| "step": 7670 |
| }, |
| { |
| "epoch": 1.8476167549350024, |
| "grad_norm": 1.0625, |
| "learning_rate": 8.90616689895294e-05, |
| "loss": 0.4704, |
| "step": 7675 |
| }, |
| { |
| "epoch": 1.8488204140587385, |
| "grad_norm": 1.078125, |
| "learning_rate": 8.898152002056751e-05, |
| "loss": 0.4278, |
| "step": 7680 |
| }, |
| { |
| "epoch": 1.8500240731824746, |
| "grad_norm": 1.1328125, |
| "learning_rate": 8.89014093650747e-05, |
| "loss": 0.4681, |
| "step": 7685 |
| }, |
| { |
| "epoch": 1.851227732306211, |
| "grad_norm": 1.25, |
| "learning_rate": 8.88213371510472e-05, |
| "loss": 0.4809, |
| "step": 7690 |
| }, |
| { |
| "epoch": 1.852431391429947, |
| "grad_norm": 1.1015625, |
| "learning_rate": 8.874130350641992e-05, |
| "loss": 0.4337, |
| "step": 7695 |
| }, |
| { |
| "epoch": 1.8536350505536832, |
| "grad_norm": 1.234375, |
| "learning_rate": 8.866130855906615e-05, |
| "loss": 0.467, |
| "step": 7700 |
| }, |
| { |
| "epoch": 1.8548387096774195, |
| "grad_norm": 1.203125, |
| "learning_rate": 8.858135243679725e-05, |
| "loss": 0.4486, |
| "step": 7705 |
| }, |
| { |
| "epoch": 1.8560423688011554, |
| "grad_norm": 1.0859375, |
| "learning_rate": 8.850143526736265e-05, |
| "loss": 0.4463, |
| "step": 7710 |
| }, |
| { |
| "epoch": 1.8572460279248917, |
| "grad_norm": 1.140625, |
| "learning_rate": 8.842155717844943e-05, |
| "loss": 0.4714, |
| "step": 7715 |
| }, |
| { |
| "epoch": 1.8584496870486278, |
| "grad_norm": 1.2109375, |
| "learning_rate": 8.834171829768236e-05, |
| "loss": 0.4678, |
| "step": 7720 |
| }, |
| { |
| "epoch": 1.8596533461723639, |
| "grad_norm": 1.15625, |
| "learning_rate": 8.826191875262353e-05, |
| "loss": 0.4437, |
| "step": 7725 |
| }, |
| { |
| "epoch": 1.8608570052961002, |
| "grad_norm": 1.2734375, |
| "learning_rate": 8.818215867077208e-05, |
| "loss": 0.4542, |
| "step": 7730 |
| }, |
| { |
| "epoch": 1.8620606644198363, |
| "grad_norm": 1.203125, |
| "learning_rate": 8.810243817956424e-05, |
| "loss": 0.4759, |
| "step": 7735 |
| }, |
| { |
| "epoch": 1.8632643235435724, |
| "grad_norm": 1.1171875, |
| "learning_rate": 8.802275740637285e-05, |
| "loss": 0.4581, |
| "step": 7740 |
| }, |
| { |
| "epoch": 1.8644679826673087, |
| "grad_norm": 1.0859375, |
| "learning_rate": 8.794311647850739e-05, |
| "loss": 0.4238, |
| "step": 7745 |
| }, |
| { |
| "epoch": 1.8656716417910446, |
| "grad_norm": 1.21875, |
| "learning_rate": 8.78635155232137e-05, |
| "loss": 0.4728, |
| "step": 7750 |
| }, |
| { |
| "epoch": 1.866875300914781, |
| "grad_norm": 1.21875, |
| "learning_rate": 8.778395466767365e-05, |
| "loss": 0.4451, |
| "step": 7755 |
| }, |
| { |
| "epoch": 1.8680789600385173, |
| "grad_norm": 1.359375, |
| "learning_rate": 8.770443403900507e-05, |
| "loss": 0.457, |
| "step": 7760 |
| }, |
| { |
| "epoch": 1.8692826191622531, |
| "grad_norm": 1.1640625, |
| "learning_rate": 8.762495376426158e-05, |
| "loss": 0.4512, |
| "step": 7765 |
| }, |
| { |
| "epoch": 1.8704862782859895, |
| "grad_norm": 1.1640625, |
| "learning_rate": 8.754551397043226e-05, |
| "loss": 0.4497, |
| "step": 7770 |
| }, |
| { |
| "epoch": 1.8716899374097256, |
| "grad_norm": 1.09375, |
| "learning_rate": 8.746611478444157e-05, |
| "loss": 0.4503, |
| "step": 7775 |
| }, |
| { |
| "epoch": 1.8728935965334617, |
| "grad_norm": 1.2890625, |
| "learning_rate": 8.738675633314902e-05, |
| "loss": 0.442, |
| "step": 7780 |
| }, |
| { |
| "epoch": 1.874097255657198, |
| "grad_norm": 1.171875, |
| "learning_rate": 8.730743874334909e-05, |
| "loss": 0.4566, |
| "step": 7785 |
| }, |
| { |
| "epoch": 1.875300914780934, |
| "grad_norm": 1.21875, |
| "learning_rate": 8.722816214177093e-05, |
| "loss": 0.4417, |
| "step": 7790 |
| }, |
| { |
| "epoch": 1.8765045739046702, |
| "grad_norm": 1.125, |
| "learning_rate": 8.714892665507825e-05, |
| "loss": 0.4951, |
| "step": 7795 |
| }, |
| { |
| "epoch": 1.8777082330284065, |
| "grad_norm": 1.265625, |
| "learning_rate": 8.706973240986906e-05, |
| "loss": 0.4616, |
| "step": 7800 |
| }, |
| { |
| "epoch": 1.8789118921521424, |
| "grad_norm": 1.1484375, |
| "learning_rate": 8.699057953267541e-05, |
| "loss": 0.4855, |
| "step": 7805 |
| }, |
| { |
| "epoch": 1.8801155512758787, |
| "grad_norm": 1.1171875, |
| "learning_rate": 8.691146814996337e-05, |
| "loss": 0.4607, |
| "step": 7810 |
| }, |
| { |
| "epoch": 1.8813192103996148, |
| "grad_norm": 1.109375, |
| "learning_rate": 8.683239838813261e-05, |
| "loss": 0.4378, |
| "step": 7815 |
| }, |
| { |
| "epoch": 1.882522869523351, |
| "grad_norm": 1.2265625, |
| "learning_rate": 8.675337037351635e-05, |
| "loss": 0.4872, |
| "step": 7820 |
| }, |
| { |
| "epoch": 1.8837265286470872, |
| "grad_norm": 1.1484375, |
| "learning_rate": 8.667438423238108e-05, |
| "loss": 0.4662, |
| "step": 7825 |
| }, |
| { |
| "epoch": 1.8849301877708233, |
| "grad_norm": 1.140625, |
| "learning_rate": 8.659544009092642e-05, |
| "loss": 0.4819, |
| "step": 7830 |
| }, |
| { |
| "epoch": 1.8861338468945594, |
| "grad_norm": 1.265625, |
| "learning_rate": 8.651653807528489e-05, |
| "loss": 0.4656, |
| "step": 7835 |
| }, |
| { |
| "epoch": 1.8873375060182958, |
| "grad_norm": 1.15625, |
| "learning_rate": 8.643767831152166e-05, |
| "loss": 0.4723, |
| "step": 7840 |
| }, |
| { |
| "epoch": 1.8885411651420316, |
| "grad_norm": 1.1953125, |
| "learning_rate": 8.635886092563444e-05, |
| "loss": 0.4626, |
| "step": 7845 |
| }, |
| { |
| "epoch": 1.889744824265768, |
| "grad_norm": 1.25, |
| "learning_rate": 8.628008604355316e-05, |
| "loss": 0.4311, |
| "step": 7850 |
| }, |
| { |
| "epoch": 1.890948483389504, |
| "grad_norm": 1.1875, |
| "learning_rate": 8.620135379113992e-05, |
| "loss": 0.4525, |
| "step": 7855 |
| }, |
| { |
| "epoch": 1.8921521425132402, |
| "grad_norm": 1.203125, |
| "learning_rate": 8.61226642941887e-05, |
| "loss": 0.4317, |
| "step": 7860 |
| }, |
| { |
| "epoch": 1.8933558016369765, |
| "grad_norm": 1.125, |
| "learning_rate": 8.60440176784251e-05, |
| "loss": 0.4601, |
| "step": 7865 |
| }, |
| { |
| "epoch": 1.8945594607607126, |
| "grad_norm": 1.1484375, |
| "learning_rate": 8.59654140695063e-05, |
| "loss": 0.4347, |
| "step": 7870 |
| }, |
| { |
| "epoch": 1.8957631198844487, |
| "grad_norm": 1.140625, |
| "learning_rate": 8.588685359302063e-05, |
| "loss": 0.4727, |
| "step": 7875 |
| }, |
| { |
| "epoch": 1.896966779008185, |
| "grad_norm": 1.0625, |
| "learning_rate": 8.580833637448769e-05, |
| "loss": 0.4099, |
| "step": 7880 |
| }, |
| { |
| "epoch": 1.8981704381319209, |
| "grad_norm": 1.171875, |
| "learning_rate": 8.572986253935781e-05, |
| "loss": 0.4509, |
| "step": 7885 |
| }, |
| { |
| "epoch": 1.8993740972556572, |
| "grad_norm": 1.078125, |
| "learning_rate": 8.565143221301208e-05, |
| "loss": 0.4716, |
| "step": 7890 |
| }, |
| { |
| "epoch": 1.9005777563793933, |
| "grad_norm": 1.09375, |
| "learning_rate": 8.557304552076206e-05, |
| "loss": 0.4672, |
| "step": 7895 |
| }, |
| { |
| "epoch": 1.9017814155031294, |
| "grad_norm": 1.0546875, |
| "learning_rate": 8.549470258784957e-05, |
| "loss": 0.4225, |
| "step": 7900 |
| }, |
| { |
| "epoch": 1.9029850746268657, |
| "grad_norm": 1.2421875, |
| "learning_rate": 8.541640353944654e-05, |
| "loss": 0.4699, |
| "step": 7905 |
| }, |
| { |
| "epoch": 1.9041887337506018, |
| "grad_norm": 1.1640625, |
| "learning_rate": 8.53381485006548e-05, |
| "loss": 0.4432, |
| "step": 7910 |
| }, |
| { |
| "epoch": 1.905392392874338, |
| "grad_norm": 1.109375, |
| "learning_rate": 8.525993759650581e-05, |
| "loss": 0.4421, |
| "step": 7915 |
| }, |
| { |
| "epoch": 1.9065960519980742, |
| "grad_norm": 1.1875, |
| "learning_rate": 8.518177095196058e-05, |
| "loss": 0.466, |
| "step": 7920 |
| }, |
| { |
| "epoch": 1.9077997111218103, |
| "grad_norm": 1.2578125, |
| "learning_rate": 8.510364869190934e-05, |
| "loss": 0.4965, |
| "step": 7925 |
| }, |
| { |
| "epoch": 1.9090033702455464, |
| "grad_norm": 1.1953125, |
| "learning_rate": 8.502557094117142e-05, |
| "loss": 0.4582, |
| "step": 7930 |
| }, |
| { |
| "epoch": 1.9102070293692828, |
| "grad_norm": 1.2578125, |
| "learning_rate": 8.494753782449512e-05, |
| "loss": 0.4507, |
| "step": 7935 |
| }, |
| { |
| "epoch": 1.9114106884930187, |
| "grad_norm": 1.09375, |
| "learning_rate": 8.486954946655726e-05, |
| "loss": 0.4378, |
| "step": 7940 |
| }, |
| { |
| "epoch": 1.912614347616755, |
| "grad_norm": 1.2109375, |
| "learning_rate": 8.479160599196334e-05, |
| "loss": 0.4681, |
| "step": 7945 |
| }, |
| { |
| "epoch": 1.913818006740491, |
| "grad_norm": 1.1640625, |
| "learning_rate": 8.471370752524695e-05, |
| "loss": 0.4373, |
| "step": 7950 |
| }, |
| { |
| "epoch": 1.9150216658642272, |
| "grad_norm": 1.25, |
| "learning_rate": 8.463585419086999e-05, |
| "loss": 0.4622, |
| "step": 7955 |
| }, |
| { |
| "epoch": 1.9162253249879635, |
| "grad_norm": 1.1015625, |
| "learning_rate": 8.455804611322205e-05, |
| "loss": 0.4669, |
| "step": 7960 |
| }, |
| { |
| "epoch": 1.9174289841116996, |
| "grad_norm": 1.140625, |
| "learning_rate": 8.448028341662049e-05, |
| "loss": 0.4499, |
| "step": 7965 |
| }, |
| { |
| "epoch": 1.9186326432354357, |
| "grad_norm": 0.98046875, |
| "learning_rate": 8.440256622531019e-05, |
| "loss": 0.4136, |
| "step": 7970 |
| }, |
| { |
| "epoch": 1.919836302359172, |
| "grad_norm": 1.0703125, |
| "learning_rate": 8.432489466346327e-05, |
| "loss": 0.4402, |
| "step": 7975 |
| }, |
| { |
| "epoch": 1.921039961482908, |
| "grad_norm": 1.296875, |
| "learning_rate": 8.4247268855179e-05, |
| "loss": 0.4496, |
| "step": 7980 |
| }, |
| { |
| "epoch": 1.9222436206066442, |
| "grad_norm": 1.140625, |
| "learning_rate": 8.416968892448354e-05, |
| "loss": 0.4861, |
| "step": 7985 |
| }, |
| { |
| "epoch": 1.9234472797303803, |
| "grad_norm": 1.1484375, |
| "learning_rate": 8.409215499532969e-05, |
| "loss": 0.4525, |
| "step": 7990 |
| }, |
| { |
| "epoch": 1.9246509388541164, |
| "grad_norm": 1.171875, |
| "learning_rate": 8.401466719159685e-05, |
| "loss": 0.4661, |
| "step": 7995 |
| }, |
| { |
| "epoch": 1.9258545979778527, |
| "grad_norm": 1.09375, |
| "learning_rate": 8.393722563709057e-05, |
| "loss": 0.436, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.9258545979778527, |
| "eval_loss": 0.38646742701530457, |
| "eval_runtime": 2.3414, |
| "eval_samples_per_second": 85.418, |
| "eval_steps_per_second": 85.418, |
| "step": 8000 |
| }, |
| { |
| "epoch": 1.9270582571015888, |
| "grad_norm": 1.328125, |
| "learning_rate": 8.385983045554271e-05, |
| "loss": 0.4722, |
| "step": 8005 |
| }, |
| { |
| "epoch": 1.928261916225325, |
| "grad_norm": 1.1328125, |
| "learning_rate": 8.378248177061085e-05, |
| "loss": 0.4317, |
| "step": 8010 |
| }, |
| { |
| "epoch": 1.9294655753490613, |
| "grad_norm": 1.2109375, |
| "learning_rate": 8.370517970587842e-05, |
| "loss": 0.4586, |
| "step": 8015 |
| }, |
| { |
| "epoch": 1.9306692344727971, |
| "grad_norm": 1.265625, |
| "learning_rate": 8.362792438485427e-05, |
| "loss": 0.4688, |
| "step": 8020 |
| }, |
| { |
| "epoch": 1.9318728935965335, |
| "grad_norm": 1.125, |
| "learning_rate": 8.355071593097257e-05, |
| "loss": 0.4873, |
| "step": 8025 |
| }, |
| { |
| "epoch": 1.9330765527202696, |
| "grad_norm": 1.1484375, |
| "learning_rate": 8.347355446759267e-05, |
| "loss": 0.489, |
| "step": 8030 |
| }, |
| { |
| "epoch": 1.9342802118440057, |
| "grad_norm": 1.1328125, |
| "learning_rate": 8.339644011799883e-05, |
| "loss": 0.4165, |
| "step": 8035 |
| }, |
| { |
| "epoch": 1.935483870967742, |
| "grad_norm": 1.1875, |
| "learning_rate": 8.331937300539996e-05, |
| "loss": 0.4854, |
| "step": 8040 |
| }, |
| { |
| "epoch": 1.936687530091478, |
| "grad_norm": 1.25, |
| "learning_rate": 8.324235325292962e-05, |
| "loss": 0.4409, |
| "step": 8045 |
| }, |
| { |
| "epoch": 1.9378911892152142, |
| "grad_norm": 1.2421875, |
| "learning_rate": 8.316538098364553e-05, |
| "loss": 0.4517, |
| "step": 8050 |
| }, |
| { |
| "epoch": 1.9390948483389505, |
| "grad_norm": 1.1328125, |
| "learning_rate": 8.308845632052976e-05, |
| "loss": 0.4462, |
| "step": 8055 |
| }, |
| { |
| "epoch": 1.9402985074626866, |
| "grad_norm": 1.1015625, |
| "learning_rate": 8.301157938648811e-05, |
| "loss": 0.4558, |
| "step": 8060 |
| }, |
| { |
| "epoch": 1.9415021665864227, |
| "grad_norm": 1.125, |
| "learning_rate": 8.293475030435027e-05, |
| "loss": 0.4457, |
| "step": 8065 |
| }, |
| { |
| "epoch": 1.942705825710159, |
| "grad_norm": 1.109375, |
| "learning_rate": 8.28579691968694e-05, |
| "loss": 0.429, |
| "step": 8070 |
| }, |
| { |
| "epoch": 1.943909484833895, |
| "grad_norm": 1.1484375, |
| "learning_rate": 8.278123618672199e-05, |
| "loss": 0.4432, |
| "step": 8075 |
| }, |
| { |
| "epoch": 1.9451131439576312, |
| "grad_norm": 1.2578125, |
| "learning_rate": 8.270455139650777e-05, |
| "loss": 0.4277, |
| "step": 8080 |
| }, |
| { |
| "epoch": 1.9463168030813673, |
| "grad_norm": 1.171875, |
| "learning_rate": 8.262791494874934e-05, |
| "loss": 0.4068, |
| "step": 8085 |
| }, |
| { |
| "epoch": 1.9475204622051034, |
| "grad_norm": 1.0703125, |
| "learning_rate": 8.255132696589212e-05, |
| "loss": 0.4259, |
| "step": 8090 |
| }, |
| { |
| "epoch": 1.9487241213288398, |
| "grad_norm": 1.015625, |
| "learning_rate": 8.247478757030408e-05, |
| "loss": 0.4313, |
| "step": 8095 |
| }, |
| { |
| "epoch": 1.9499277804525759, |
| "grad_norm": 1.375, |
| "learning_rate": 8.239829688427552e-05, |
| "loss": 0.4646, |
| "step": 8100 |
| }, |
| { |
| "epoch": 1.951131439576312, |
| "grad_norm": 1.0390625, |
| "learning_rate": 8.232185503001901e-05, |
| "loss": 0.4925, |
| "step": 8105 |
| }, |
| { |
| "epoch": 1.9523350987000483, |
| "grad_norm": 1.15625, |
| "learning_rate": 8.224546212966896e-05, |
| "loss": 0.4387, |
| "step": 8110 |
| }, |
| { |
| "epoch": 1.9535387578237842, |
| "grad_norm": 1.109375, |
| "learning_rate": 8.216911830528171e-05, |
| "loss": 0.4515, |
| "step": 8115 |
| }, |
| { |
| "epoch": 1.9547424169475205, |
| "grad_norm": 1.0859375, |
| "learning_rate": 8.209282367883507e-05, |
| "loss": 0.431, |
| "step": 8120 |
| }, |
| { |
| "epoch": 1.9559460760712566, |
| "grad_norm": 1.234375, |
| "learning_rate": 8.20165783722283e-05, |
| "loss": 0.4543, |
| "step": 8125 |
| }, |
| { |
| "epoch": 1.9571497351949927, |
| "grad_norm": 1.34375, |
| "learning_rate": 8.19403825072819e-05, |
| "loss": 0.4505, |
| "step": 8130 |
| }, |
| { |
| "epoch": 1.958353394318729, |
| "grad_norm": 1.1953125, |
| "learning_rate": 8.186423620573726e-05, |
| "loss": 0.4771, |
| "step": 8135 |
| }, |
| { |
| "epoch": 1.9595570534424651, |
| "grad_norm": 1.0703125, |
| "learning_rate": 8.17881395892567e-05, |
| "loss": 0.4488, |
| "step": 8140 |
| }, |
| { |
| "epoch": 1.9607607125662012, |
| "grad_norm": 1.203125, |
| "learning_rate": 8.17120927794231e-05, |
| "loss": 0.4494, |
| "step": 8145 |
| }, |
| { |
| "epoch": 1.9619643716899375, |
| "grad_norm": 1.2265625, |
| "learning_rate": 8.163609589773973e-05, |
| "loss": 0.4186, |
| "step": 8150 |
| }, |
| { |
| "epoch": 1.9631680308136734, |
| "grad_norm": 1.1953125, |
| "learning_rate": 8.15601490656302e-05, |
| "loss": 0.4626, |
| "step": 8155 |
| }, |
| { |
| "epoch": 1.9643716899374097, |
| "grad_norm": 1.1640625, |
| "learning_rate": 8.148425240443799e-05, |
| "loss": 0.4411, |
| "step": 8160 |
| }, |
| { |
| "epoch": 1.9655753490611458, |
| "grad_norm": 1.171875, |
| "learning_rate": 8.140840603542657e-05, |
| "loss": 0.4392, |
| "step": 8165 |
| }, |
| { |
| "epoch": 1.966779008184882, |
| "grad_norm": 1.078125, |
| "learning_rate": 8.133261007977897e-05, |
| "loss": 0.443, |
| "step": 8170 |
| }, |
| { |
| "epoch": 1.9679826673086183, |
| "grad_norm": 1.1171875, |
| "learning_rate": 8.125686465859771e-05, |
| "loss": 0.4552, |
| "step": 8175 |
| }, |
| { |
| "epoch": 1.9691863264323544, |
| "grad_norm": 1.0703125, |
| "learning_rate": 8.118116989290456e-05, |
| "loss": 0.4753, |
| "step": 8180 |
| }, |
| { |
| "epoch": 1.9703899855560905, |
| "grad_norm": 1.1875, |
| "learning_rate": 8.11055259036403e-05, |
| "loss": 0.4581, |
| "step": 8185 |
| }, |
| { |
| "epoch": 1.9715936446798268, |
| "grad_norm": 1.171875, |
| "learning_rate": 8.102993281166469e-05, |
| "loss": 0.4529, |
| "step": 8190 |
| }, |
| { |
| "epoch": 1.9727973038035629, |
| "grad_norm": 1.078125, |
| "learning_rate": 8.095439073775611e-05, |
| "loss": 0.4403, |
| "step": 8195 |
| }, |
| { |
| "epoch": 1.974000962927299, |
| "grad_norm": 1.1484375, |
| "learning_rate": 8.087889980261139e-05, |
| "loss": 0.4428, |
| "step": 8200 |
| }, |
| { |
| "epoch": 1.9752046220510353, |
| "grad_norm": 1.1875, |
| "learning_rate": 8.080346012684573e-05, |
| "loss": 0.4792, |
| "step": 8205 |
| }, |
| { |
| "epoch": 1.9764082811747712, |
| "grad_norm": 1.21875, |
| "learning_rate": 8.072807183099237e-05, |
| "loss": 0.4788, |
| "step": 8210 |
| }, |
| { |
| "epoch": 1.9776119402985075, |
| "grad_norm": 1.078125, |
| "learning_rate": 8.065273503550247e-05, |
| "loss": 0.4231, |
| "step": 8215 |
| }, |
| { |
| "epoch": 1.9788155994222436, |
| "grad_norm": 1.140625, |
| "learning_rate": 8.057744986074497e-05, |
| "loss": 0.4536, |
| "step": 8220 |
| }, |
| { |
| "epoch": 1.9800192585459797, |
| "grad_norm": 1.0546875, |
| "learning_rate": 8.050221642700624e-05, |
| "loss": 0.4245, |
| "step": 8225 |
| }, |
| { |
| "epoch": 1.981222917669716, |
| "grad_norm": 1.171875, |
| "learning_rate": 8.042703485449003e-05, |
| "loss": 0.4479, |
| "step": 8230 |
| }, |
| { |
| "epoch": 1.9824265767934521, |
| "grad_norm": 1.171875, |
| "learning_rate": 8.035190526331718e-05, |
| "loss": 0.4422, |
| "step": 8235 |
| }, |
| { |
| "epoch": 1.9836302359171882, |
| "grad_norm": 1.0546875, |
| "learning_rate": 8.027682777352556e-05, |
| "loss": 0.4483, |
| "step": 8240 |
| }, |
| { |
| "epoch": 1.9848338950409246, |
| "grad_norm": 1.1640625, |
| "learning_rate": 8.020180250506978e-05, |
| "loss": 0.4408, |
| "step": 8245 |
| }, |
| { |
| "epoch": 1.9860375541646604, |
| "grad_norm": 1.1484375, |
| "learning_rate": 8.01268295778209e-05, |
| "loss": 0.4745, |
| "step": 8250 |
| }, |
| { |
| "epoch": 1.9872412132883968, |
| "grad_norm": 1.0234375, |
| "learning_rate": 8.005190911156654e-05, |
| "loss": 0.4414, |
| "step": 8255 |
| }, |
| { |
| "epoch": 1.9884448724121329, |
| "grad_norm": 1.1328125, |
| "learning_rate": 7.997704122601031e-05, |
| "loss": 0.4462, |
| "step": 8260 |
| }, |
| { |
| "epoch": 1.989648531535869, |
| "grad_norm": 1.1171875, |
| "learning_rate": 7.990222604077194e-05, |
| "loss": 0.4435, |
| "step": 8265 |
| }, |
| { |
| "epoch": 1.9908521906596053, |
| "grad_norm": 1.0859375, |
| "learning_rate": 7.982746367538694e-05, |
| "loss": 0.465, |
| "step": 8270 |
| }, |
| { |
| "epoch": 1.9920558497833414, |
| "grad_norm": 1.15625, |
| "learning_rate": 7.975275424930638e-05, |
| "loss": 0.4433, |
| "step": 8275 |
| }, |
| { |
| "epoch": 1.9932595089070775, |
| "grad_norm": 1.1953125, |
| "learning_rate": 7.967809788189675e-05, |
| "loss": 0.4534, |
| "step": 8280 |
| }, |
| { |
| "epoch": 1.9944631680308138, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.96034946924398e-05, |
| "loss": 0.4665, |
| "step": 8285 |
| }, |
| { |
| "epoch": 1.9956668271545497, |
| "grad_norm": 1.1953125, |
| "learning_rate": 7.952894480013234e-05, |
| "loss": 0.456, |
| "step": 8290 |
| }, |
| { |
| "epoch": 1.996870486278286, |
| "grad_norm": 1.0703125, |
| "learning_rate": 7.945444832408592e-05, |
| "loss": 0.4459, |
| "step": 8295 |
| }, |
| { |
| "epoch": 1.998074145402022, |
| "grad_norm": 1.1171875, |
| "learning_rate": 7.938000538332684e-05, |
| "loss": 0.4198, |
| "step": 8300 |
| }, |
| { |
| "epoch": 1.9992778045257582, |
| "grad_norm": 1.1171875, |
| "learning_rate": 7.930561609679588e-05, |
| "loss": 0.4287, |
| "step": 8305 |
| }, |
| { |
| "epoch": 1.9995185363505055, |
| "eval_loss": 0.3824025094509125, |
| "eval_runtime": 2.3407, |
| "eval_samples_per_second": 85.446, |
| "eval_steps_per_second": 85.446, |
| "step": 8306 |
| }, |
| { |
| "epoch": 2.0004814636494945, |
| "grad_norm": 1.140625, |
| "learning_rate": 7.923128058334797e-05, |
| "loss": 0.4334, |
| "step": 8310 |
| }, |
| { |
| "epoch": 2.0016851227732304, |
| "grad_norm": 1.078125, |
| "learning_rate": 7.915699896175228e-05, |
| "loss": 0.3898, |
| "step": 8315 |
| }, |
| { |
| "epoch": 2.0028887818969667, |
| "grad_norm": 1.0234375, |
| "learning_rate": 7.908277135069175e-05, |
| "loss": 0.4385, |
| "step": 8320 |
| }, |
| { |
| "epoch": 2.004092441020703, |
| "grad_norm": 1.125, |
| "learning_rate": 7.900859786876308e-05, |
| "loss": 0.4072, |
| "step": 8325 |
| }, |
| { |
| "epoch": 2.005296100144439, |
| "grad_norm": 1.09375, |
| "learning_rate": 7.893447863447651e-05, |
| "loss": 0.3937, |
| "step": 8330 |
| }, |
| { |
| "epoch": 2.0064997592681753, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.886041376625557e-05, |
| "loss": 0.4192, |
| "step": 8335 |
| }, |
| { |
| "epoch": 2.0077034183919116, |
| "grad_norm": 1.1796875, |
| "learning_rate": 7.87864033824369e-05, |
| "loss": 0.4446, |
| "step": 8340 |
| }, |
| { |
| "epoch": 2.0089070775156475, |
| "grad_norm": 1.015625, |
| "learning_rate": 7.871244760127011e-05, |
| "loss": 0.4018, |
| "step": 8345 |
| }, |
| { |
| "epoch": 2.0101107366393838, |
| "grad_norm": 1.140625, |
| "learning_rate": 7.863854654091763e-05, |
| "loss": 0.4013, |
| "step": 8350 |
| }, |
| { |
| "epoch": 2.01131439576312, |
| "grad_norm": 1.0859375, |
| "learning_rate": 7.85647003194544e-05, |
| "loss": 0.4188, |
| "step": 8355 |
| }, |
| { |
| "epoch": 2.012518054886856, |
| "grad_norm": 1.140625, |
| "learning_rate": 7.84909090548677e-05, |
| "loss": 0.3993, |
| "step": 8360 |
| }, |
| { |
| "epoch": 2.0137217140105923, |
| "grad_norm": 1.1015625, |
| "learning_rate": 7.841717286505714e-05, |
| "loss": 0.4066, |
| "step": 8365 |
| }, |
| { |
| "epoch": 2.014925373134328, |
| "grad_norm": 1.09375, |
| "learning_rate": 7.834349186783416e-05, |
| "loss": 0.3951, |
| "step": 8370 |
| }, |
| { |
| "epoch": 2.0161290322580645, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.826986618092216e-05, |
| "loss": 0.4215, |
| "step": 8375 |
| }, |
| { |
| "epoch": 2.017332691381801, |
| "grad_norm": 1.015625, |
| "learning_rate": 7.819629592195612e-05, |
| "loss": 0.3705, |
| "step": 8380 |
| }, |
| { |
| "epoch": 2.0185363505055367, |
| "grad_norm": 1.2421875, |
| "learning_rate": 7.812278120848243e-05, |
| "loss": 0.424, |
| "step": 8385 |
| }, |
| { |
| "epoch": 2.019740009629273, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.804932215795877e-05, |
| "loss": 0.3994, |
| "step": 8390 |
| }, |
| { |
| "epoch": 2.0209436687530093, |
| "grad_norm": 1.1640625, |
| "learning_rate": 7.797591888775384e-05, |
| "loss": 0.3877, |
| "step": 8395 |
| }, |
| { |
| "epoch": 2.0221473278767452, |
| "grad_norm": 1.1796875, |
| "learning_rate": 7.790257151514726e-05, |
| "loss": 0.4092, |
| "step": 8400 |
| }, |
| { |
| "epoch": 2.0233509870004815, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.782928015732937e-05, |
| "loss": 0.4255, |
| "step": 8405 |
| }, |
| { |
| "epoch": 2.0245546461242174, |
| "grad_norm": 1.0546875, |
| "learning_rate": 7.775604493140092e-05, |
| "loss": 0.4021, |
| "step": 8410 |
| }, |
| { |
| "epoch": 2.0257583052479537, |
| "grad_norm": 1.078125, |
| "learning_rate": 7.768286595437306e-05, |
| "loss": 0.3863, |
| "step": 8415 |
| }, |
| { |
| "epoch": 2.02696196437169, |
| "grad_norm": 1.046875, |
| "learning_rate": 7.760974334316702e-05, |
| "loss": 0.3939, |
| "step": 8420 |
| }, |
| { |
| "epoch": 2.028165623495426, |
| "grad_norm": 1.0859375, |
| "learning_rate": 7.753667721461396e-05, |
| "loss": 0.4396, |
| "step": 8425 |
| }, |
| { |
| "epoch": 2.0293692826191623, |
| "grad_norm": 1.0703125, |
| "learning_rate": 7.74636676854549e-05, |
| "loss": 0.3985, |
| "step": 8430 |
| }, |
| { |
| "epoch": 2.0305729417428986, |
| "grad_norm": 1.0546875, |
| "learning_rate": 7.739071487234031e-05, |
| "loss": 0.4124, |
| "step": 8435 |
| }, |
| { |
| "epoch": 2.0317766008666345, |
| "grad_norm": 1.21875, |
| "learning_rate": 7.731781889183004e-05, |
| "loss": 0.4312, |
| "step": 8440 |
| }, |
| { |
| "epoch": 2.032980259990371, |
| "grad_norm": 1.1015625, |
| "learning_rate": 7.724497986039326e-05, |
| "loss": 0.4224, |
| "step": 8445 |
| }, |
| { |
| "epoch": 2.0341839191141067, |
| "grad_norm": 1.3203125, |
| "learning_rate": 7.7172197894408e-05, |
| "loss": 0.4149, |
| "step": 8450 |
| }, |
| { |
| "epoch": 2.035387578237843, |
| "grad_norm": 1.0390625, |
| "learning_rate": 7.709947311016122e-05, |
| "loss": 0.4007, |
| "step": 8455 |
| }, |
| { |
| "epoch": 2.0365912373615793, |
| "grad_norm": 1.1953125, |
| "learning_rate": 7.702680562384848e-05, |
| "loss": 0.4025, |
| "step": 8460 |
| }, |
| { |
| "epoch": 2.037794896485315, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.695419555157379e-05, |
| "loss": 0.4085, |
| "step": 8465 |
| }, |
| { |
| "epoch": 2.0389985556090515, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.68816430093494e-05, |
| "loss": 0.413, |
| "step": 8470 |
| }, |
| { |
| "epoch": 2.040202214732788, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.680914811309574e-05, |
| "loss": 0.3988, |
| "step": 8475 |
| }, |
| { |
| "epoch": 2.0414058738565237, |
| "grad_norm": 1.1640625, |
| "learning_rate": 7.673671097864102e-05, |
| "loss": 0.3933, |
| "step": 8480 |
| }, |
| { |
| "epoch": 2.04260953298026, |
| "grad_norm": 1.09375, |
| "learning_rate": 7.666433172172121e-05, |
| "loss": 0.4104, |
| "step": 8485 |
| }, |
| { |
| "epoch": 2.043813192103996, |
| "grad_norm": 1.0703125, |
| "learning_rate": 7.659201045797986e-05, |
| "loss": 0.4226, |
| "step": 8490 |
| }, |
| { |
| "epoch": 2.0450168512277322, |
| "grad_norm": 1.171875, |
| "learning_rate": 7.651974730296779e-05, |
| "loss": 0.399, |
| "step": 8495 |
| }, |
| { |
| "epoch": 2.0462205103514686, |
| "grad_norm": 1.203125, |
| "learning_rate": 7.644754237214296e-05, |
| "loss": 0.417, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.0462205103514686, |
| "eval_loss": 0.38362109661102295, |
| "eval_runtime": 2.3419, |
| "eval_samples_per_second": 85.401, |
| "eval_steps_per_second": 85.401, |
| "step": 8500 |
| }, |
| { |
| "epoch": 2.0474241694752044, |
| "grad_norm": 1.1328125, |
| "learning_rate": 7.637539578087033e-05, |
| "loss": 0.4, |
| "step": 8505 |
| }, |
| { |
| "epoch": 2.0486278285989408, |
| "grad_norm": 1.2109375, |
| "learning_rate": 7.63033076444217e-05, |
| "loss": 0.4213, |
| "step": 8510 |
| }, |
| { |
| "epoch": 2.049831487722677, |
| "grad_norm": 1.078125, |
| "learning_rate": 7.623127807797544e-05, |
| "loss": 0.4076, |
| "step": 8515 |
| }, |
| { |
| "epoch": 2.051035146846413, |
| "grad_norm": 1.0234375, |
| "learning_rate": 7.615930719661628e-05, |
| "loss": 0.4243, |
| "step": 8520 |
| }, |
| { |
| "epoch": 2.0522388059701493, |
| "grad_norm": 1.1875, |
| "learning_rate": 7.608739511533531e-05, |
| "loss": 0.3875, |
| "step": 8525 |
| }, |
| { |
| "epoch": 2.0534424650938856, |
| "grad_norm": 0.98046875, |
| "learning_rate": 7.601554194902953e-05, |
| "loss": 0.3694, |
| "step": 8530 |
| }, |
| { |
| "epoch": 2.0546461242176215, |
| "grad_norm": 1.0703125, |
| "learning_rate": 7.594374781250192e-05, |
| "loss": 0.4145, |
| "step": 8535 |
| }, |
| { |
| "epoch": 2.055849783341358, |
| "grad_norm": 1.0625, |
| "learning_rate": 7.58720128204611e-05, |
| "loss": 0.397, |
| "step": 8540 |
| }, |
| { |
| "epoch": 2.0570534424650937, |
| "grad_norm": 1.21875, |
| "learning_rate": 7.580033708752123e-05, |
| "loss": 0.3991, |
| "step": 8545 |
| }, |
| { |
| "epoch": 2.05825710158883, |
| "grad_norm": 1.1171875, |
| "learning_rate": 7.572872072820169e-05, |
| "loss": 0.3759, |
| "step": 8550 |
| }, |
| { |
| "epoch": 2.0594607607125663, |
| "grad_norm": 1.234375, |
| "learning_rate": 7.56571638569271e-05, |
| "loss": 0.4309, |
| "step": 8555 |
| }, |
| { |
| "epoch": 2.060664419836302, |
| "grad_norm": 1.1328125, |
| "learning_rate": 7.558566658802695e-05, |
| "loss": 0.4206, |
| "step": 8560 |
| }, |
| { |
| "epoch": 2.0618680789600385, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.551422903573566e-05, |
| "loss": 0.4061, |
| "step": 8565 |
| }, |
| { |
| "epoch": 2.063071738083775, |
| "grad_norm": 1.140625, |
| "learning_rate": 7.5442851314192e-05, |
| "loss": 0.4041, |
| "step": 8570 |
| }, |
| { |
| "epoch": 2.0642753972075107, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.537153353743936e-05, |
| "loss": 0.3775, |
| "step": 8575 |
| }, |
| { |
| "epoch": 2.065479056331247, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.530027581942522e-05, |
| "loss": 0.4169, |
| "step": 8580 |
| }, |
| { |
| "epoch": 2.066682715454983, |
| "grad_norm": 1.09375, |
| "learning_rate": 7.522907827400114e-05, |
| "loss": 0.4011, |
| "step": 8585 |
| }, |
| { |
| "epoch": 2.0678863745787193, |
| "grad_norm": 1.125, |
| "learning_rate": 7.515794101492259e-05, |
| "loss": 0.4128, |
| "step": 8590 |
| }, |
| { |
| "epoch": 2.0690900337024556, |
| "grad_norm": 1.1328125, |
| "learning_rate": 7.508686415584865e-05, |
| "loss": 0.3937, |
| "step": 8595 |
| }, |
| { |
| "epoch": 2.0702936928261915, |
| "grad_norm": 1.125, |
| "learning_rate": 7.501584781034188e-05, |
| "loss": 0.4162, |
| "step": 8600 |
| }, |
| { |
| "epoch": 2.071497351949928, |
| "grad_norm": 1.125, |
| "learning_rate": 7.494489209186828e-05, |
| "loss": 0.4208, |
| "step": 8605 |
| }, |
| { |
| "epoch": 2.072701011073664, |
| "grad_norm": 1.1953125, |
| "learning_rate": 7.48739971137968e-05, |
| "loss": 0.4115, |
| "step": 8610 |
| }, |
| { |
| "epoch": 2.0739046701974, |
| "grad_norm": 1.140625, |
| "learning_rate": 7.480316298939953e-05, |
| "loss": 0.4011, |
| "step": 8615 |
| }, |
| { |
| "epoch": 2.0751083293211363, |
| "grad_norm": 1.1640625, |
| "learning_rate": 7.473238983185118e-05, |
| "loss": 0.3847, |
| "step": 8620 |
| }, |
| { |
| "epoch": 2.076311988444872, |
| "grad_norm": 1.2890625, |
| "learning_rate": 7.466167775422916e-05, |
| "loss": 0.4411, |
| "step": 8625 |
| }, |
| { |
| "epoch": 2.0775156475686085, |
| "grad_norm": 1.09375, |
| "learning_rate": 7.45910268695132e-05, |
| "loss": 0.4003, |
| "step": 8630 |
| }, |
| { |
| "epoch": 2.078719306692345, |
| "grad_norm": 1.21875, |
| "learning_rate": 7.45204372905853e-05, |
| "loss": 0.4185, |
| "step": 8635 |
| }, |
| { |
| "epoch": 2.0799229658160807, |
| "grad_norm": 1.09375, |
| "learning_rate": 7.444990913022957e-05, |
| "loss": 0.4144, |
| "step": 8640 |
| }, |
| { |
| "epoch": 2.081126624939817, |
| "grad_norm": 1.1015625, |
| "learning_rate": 7.437944250113187e-05, |
| "loss": 0.3826, |
| "step": 8645 |
| }, |
| { |
| "epoch": 2.0823302840635534, |
| "grad_norm": 1.1015625, |
| "learning_rate": 7.430903751587986e-05, |
| "loss": 0.3917, |
| "step": 8650 |
| }, |
| { |
| "epoch": 2.0835339431872892, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.423869428696263e-05, |
| "loss": 0.3866, |
| "step": 8655 |
| }, |
| { |
| "epoch": 2.0847376023110256, |
| "grad_norm": 1.0859375, |
| "learning_rate": 7.41684129267706e-05, |
| "loss": 0.41, |
| "step": 8660 |
| }, |
| { |
| "epoch": 2.085941261434762, |
| "grad_norm": 1.21875, |
| "learning_rate": 7.409819354759541e-05, |
| "loss": 0.3893, |
| "step": 8665 |
| }, |
| { |
| "epoch": 2.0871449205584978, |
| "grad_norm": 1.1875, |
| "learning_rate": 7.40280362616296e-05, |
| "loss": 0.4038, |
| "step": 8670 |
| }, |
| { |
| "epoch": 2.088348579682234, |
| "grad_norm": 1.1640625, |
| "learning_rate": 7.395794118096655e-05, |
| "loss": 0.4041, |
| "step": 8675 |
| }, |
| { |
| "epoch": 2.08955223880597, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.38879084176002e-05, |
| "loss": 0.3963, |
| "step": 8680 |
| }, |
| { |
| "epoch": 2.0907558979297063, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.381793808342497e-05, |
| "loss": 0.3936, |
| "step": 8685 |
| }, |
| { |
| "epoch": 2.0919595570534426, |
| "grad_norm": 1.125, |
| "learning_rate": 7.374803029023554e-05, |
| "loss": 0.4216, |
| "step": 8690 |
| }, |
| { |
| "epoch": 2.0931632161771785, |
| "grad_norm": 1.1953125, |
| "learning_rate": 7.367818514972661e-05, |
| "loss": 0.3984, |
| "step": 8695 |
| }, |
| { |
| "epoch": 2.094366875300915, |
| "grad_norm": 1.171875, |
| "learning_rate": 7.360840277349284e-05, |
| "loss": 0.3999, |
| "step": 8700 |
| }, |
| { |
| "epoch": 2.095570534424651, |
| "grad_norm": 1.0859375, |
| "learning_rate": 7.353868327302854e-05, |
| "loss": 0.3724, |
| "step": 8705 |
| }, |
| { |
| "epoch": 2.096774193548387, |
| "grad_norm": 1.125, |
| "learning_rate": 7.346902675972762e-05, |
| "loss": 0.3835, |
| "step": 8710 |
| }, |
| { |
| "epoch": 2.0979778526721233, |
| "grad_norm": 1.2265625, |
| "learning_rate": 7.339943334488335e-05, |
| "loss": 0.4211, |
| "step": 8715 |
| }, |
| { |
| "epoch": 2.099181511795859, |
| "grad_norm": 1.09375, |
| "learning_rate": 7.332990313968812e-05, |
| "loss": 0.4278, |
| "step": 8720 |
| }, |
| { |
| "epoch": 2.1003851709195955, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.326043625523341e-05, |
| "loss": 0.4328, |
| "step": 8725 |
| }, |
| { |
| "epoch": 2.101588830043332, |
| "grad_norm": 1.171875, |
| "learning_rate": 7.319103280250947e-05, |
| "loss": 0.4016, |
| "step": 8730 |
| }, |
| { |
| "epoch": 2.1027924891670677, |
| "grad_norm": 1.21875, |
| "learning_rate": 7.312169289240526e-05, |
| "loss": 0.4046, |
| "step": 8735 |
| }, |
| { |
| "epoch": 2.103996148290804, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.305241663570813e-05, |
| "loss": 0.4007, |
| "step": 8740 |
| }, |
| { |
| "epoch": 2.1051998074145404, |
| "grad_norm": 1.0390625, |
| "learning_rate": 7.298320414310382e-05, |
| "loss": 0.3916, |
| "step": 8745 |
| }, |
| { |
| "epoch": 2.1064034665382763, |
| "grad_norm": 1.140625, |
| "learning_rate": 7.291405552517615e-05, |
| "loss": 0.3941, |
| "step": 8750 |
| }, |
| { |
| "epoch": 2.1076071256620126, |
| "grad_norm": 1.1875, |
| "learning_rate": 7.284497089240689e-05, |
| "loss": 0.3787, |
| "step": 8755 |
| }, |
| { |
| "epoch": 2.1088107847857485, |
| "grad_norm": 1.1953125, |
| "learning_rate": 7.277595035517552e-05, |
| "loss": 0.3666, |
| "step": 8760 |
| }, |
| { |
| "epoch": 2.110014443909485, |
| "grad_norm": 1.21875, |
| "learning_rate": 7.270699402375924e-05, |
| "loss": 0.3993, |
| "step": 8765 |
| }, |
| { |
| "epoch": 2.111218103033221, |
| "grad_norm": 1.1640625, |
| "learning_rate": 7.263810200833252e-05, |
| "loss": 0.4071, |
| "step": 8770 |
| }, |
| { |
| "epoch": 2.112421762156957, |
| "grad_norm": 1.203125, |
| "learning_rate": 7.25692744189672e-05, |
| "loss": 0.4174, |
| "step": 8775 |
| }, |
| { |
| "epoch": 2.1136254212806933, |
| "grad_norm": 1.1171875, |
| "learning_rate": 7.250051136563206e-05, |
| "loss": 0.4113, |
| "step": 8780 |
| }, |
| { |
| "epoch": 2.1148290804044296, |
| "grad_norm": 1.1640625, |
| "learning_rate": 7.243181295819291e-05, |
| "loss": 0.4229, |
| "step": 8785 |
| }, |
| { |
| "epoch": 2.1160327395281655, |
| "grad_norm": 1.125, |
| "learning_rate": 7.236317930641211e-05, |
| "loss": 0.4348, |
| "step": 8790 |
| }, |
| { |
| "epoch": 2.117236398651902, |
| "grad_norm": 1.1875, |
| "learning_rate": 7.229461051994869e-05, |
| "loss": 0.4058, |
| "step": 8795 |
| }, |
| { |
| "epoch": 2.118440057775638, |
| "grad_norm": 1.09375, |
| "learning_rate": 7.222610670835802e-05, |
| "loss": 0.3793, |
| "step": 8800 |
| }, |
| { |
| "epoch": 2.119643716899374, |
| "grad_norm": 1.1484375, |
| "learning_rate": 7.215766798109156e-05, |
| "loss": 0.3985, |
| "step": 8805 |
| }, |
| { |
| "epoch": 2.1208473760231104, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.208929444749692e-05, |
| "loss": 0.425, |
| "step": 8810 |
| }, |
| { |
| "epoch": 2.1220510351468462, |
| "grad_norm": 1.2734375, |
| "learning_rate": 7.202098621681746e-05, |
| "loss": 0.4327, |
| "step": 8815 |
| }, |
| { |
| "epoch": 2.1232546942705826, |
| "grad_norm": 1.2265625, |
| "learning_rate": 7.195274339819216e-05, |
| "loss": 0.4094, |
| "step": 8820 |
| }, |
| { |
| "epoch": 2.124458353394319, |
| "grad_norm": 1.0703125, |
| "learning_rate": 7.188456610065564e-05, |
| "loss": 0.4066, |
| "step": 8825 |
| }, |
| { |
| "epoch": 2.1256620125180548, |
| "grad_norm": 1.1796875, |
| "learning_rate": 7.181645443313766e-05, |
| "loss": 0.4166, |
| "step": 8830 |
| }, |
| { |
| "epoch": 2.126865671641791, |
| "grad_norm": 1.0625, |
| "learning_rate": 7.17484085044633e-05, |
| "loss": 0.3624, |
| "step": 8835 |
| }, |
| { |
| "epoch": 2.1280693307655274, |
| "grad_norm": 1.2421875, |
| "learning_rate": 7.168042842335241e-05, |
| "loss": 0.414, |
| "step": 8840 |
| }, |
| { |
| "epoch": 2.1292729898892633, |
| "grad_norm": 1.0390625, |
| "learning_rate": 7.161251429841979e-05, |
| "loss": 0.405, |
| "step": 8845 |
| }, |
| { |
| "epoch": 2.1304766490129996, |
| "grad_norm": 1.2578125, |
| "learning_rate": 7.15446662381748e-05, |
| "loss": 0.3779, |
| "step": 8850 |
| }, |
| { |
| "epoch": 2.1316803081367355, |
| "grad_norm": 1.1953125, |
| "learning_rate": 7.147688435102122e-05, |
| "loss": 0.3961, |
| "step": 8855 |
| }, |
| { |
| "epoch": 2.132883967260472, |
| "grad_norm": 1.0703125, |
| "learning_rate": 7.140916874525718e-05, |
| "loss": 0.3901, |
| "step": 8860 |
| }, |
| { |
| "epoch": 2.134087626384208, |
| "grad_norm": 1.0859375, |
| "learning_rate": 7.134151952907482e-05, |
| "loss": 0.4149, |
| "step": 8865 |
| }, |
| { |
| "epoch": 2.135291285507944, |
| "grad_norm": 1.078125, |
| "learning_rate": 7.127393681056024e-05, |
| "loss": 0.3962, |
| "step": 8870 |
| }, |
| { |
| "epoch": 2.1364949446316803, |
| "grad_norm": 1.0625, |
| "learning_rate": 7.120642069769334e-05, |
| "loss": 0.3885, |
| "step": 8875 |
| }, |
| { |
| "epoch": 2.1376986037554166, |
| "grad_norm": 1.1015625, |
| "learning_rate": 7.113897129834749e-05, |
| "loss": 0.3891, |
| "step": 8880 |
| }, |
| { |
| "epoch": 2.1389022628791525, |
| "grad_norm": 1.09375, |
| "learning_rate": 7.107158872028961e-05, |
| "loss": 0.4024, |
| "step": 8885 |
| }, |
| { |
| "epoch": 2.140105922002889, |
| "grad_norm": 1.09375, |
| "learning_rate": 7.100427307117975e-05, |
| "loss": 0.3853, |
| "step": 8890 |
| }, |
| { |
| "epoch": 2.1413095811266247, |
| "grad_norm": 1.2890625, |
| "learning_rate": 7.093702445857105e-05, |
| "loss": 0.3992, |
| "step": 8895 |
| }, |
| { |
| "epoch": 2.142513240250361, |
| "grad_norm": 1.1875, |
| "learning_rate": 7.08698429899096e-05, |
| "loss": 0.4195, |
| "step": 8900 |
| }, |
| { |
| "epoch": 2.1437168993740974, |
| "grad_norm": 1.171875, |
| "learning_rate": 7.080272877253412e-05, |
| "loss": 0.3901, |
| "step": 8905 |
| }, |
| { |
| "epoch": 2.1449205584978333, |
| "grad_norm": 1.234375, |
| "learning_rate": 7.073568191367596e-05, |
| "loss": 0.4154, |
| "step": 8910 |
| }, |
| { |
| "epoch": 2.1461242176215696, |
| "grad_norm": 1.171875, |
| "learning_rate": 7.066870252045878e-05, |
| "loss": 0.4217, |
| "step": 8915 |
| }, |
| { |
| "epoch": 2.147327876745306, |
| "grad_norm": 1.15625, |
| "learning_rate": 7.06017906998985e-05, |
| "loss": 0.3903, |
| "step": 8920 |
| }, |
| { |
| "epoch": 2.1485315358690418, |
| "grad_norm": 1.1796875, |
| "learning_rate": 7.053494655890306e-05, |
| "loss": 0.3995, |
| "step": 8925 |
| }, |
| { |
| "epoch": 2.149735194992778, |
| "grad_norm": 1.21875, |
| "learning_rate": 7.046817020427223e-05, |
| "loss": 0.3909, |
| "step": 8930 |
| }, |
| { |
| "epoch": 2.1509388541165144, |
| "grad_norm": 1.1796875, |
| "learning_rate": 7.040146174269755e-05, |
| "loss": 0.3965, |
| "step": 8935 |
| }, |
| { |
| "epoch": 2.1521425132402503, |
| "grad_norm": 1.1640625, |
| "learning_rate": 7.033482128076199e-05, |
| "loss": 0.406, |
| "step": 8940 |
| }, |
| { |
| "epoch": 2.1533461723639866, |
| "grad_norm": 1.109375, |
| "learning_rate": 7.026824892493997e-05, |
| "loss": 0.4156, |
| "step": 8945 |
| }, |
| { |
| "epoch": 2.1545498314877225, |
| "grad_norm": 1.0859375, |
| "learning_rate": 7.020174478159706e-05, |
| "loss": 0.3951, |
| "step": 8950 |
| }, |
| { |
| "epoch": 2.155753490611459, |
| "grad_norm": 1.2109375, |
| "learning_rate": 7.013530895698976e-05, |
| "loss": 0.3775, |
| "step": 8955 |
| }, |
| { |
| "epoch": 2.156957149735195, |
| "grad_norm": 1.1796875, |
| "learning_rate": 7.006894155726557e-05, |
| "loss": 0.4095, |
| "step": 8960 |
| }, |
| { |
| "epoch": 2.158160808858931, |
| "grad_norm": 1.1796875, |
| "learning_rate": 7.00026426884625e-05, |
| "loss": 0.3725, |
| "step": 8965 |
| }, |
| { |
| "epoch": 2.1593644679826673, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.993641245650924e-05, |
| "loss": 0.4198, |
| "step": 8970 |
| }, |
| { |
| "epoch": 2.1605681271064037, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.987025096722466e-05, |
| "loss": 0.3956, |
| "step": 8975 |
| }, |
| { |
| "epoch": 2.1617717862301395, |
| "grad_norm": 1.140625, |
| "learning_rate": 6.980415832631785e-05, |
| "loss": 0.3812, |
| "step": 8980 |
| }, |
| { |
| "epoch": 2.162975445353876, |
| "grad_norm": 1.09375, |
| "learning_rate": 6.973813463938794e-05, |
| "loss": 0.3799, |
| "step": 8985 |
| }, |
| { |
| "epoch": 2.1641791044776117, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.967218001192384e-05, |
| "loss": 0.4223, |
| "step": 8990 |
| }, |
| { |
| "epoch": 2.165382763601348, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.960629454930416e-05, |
| "loss": 0.3907, |
| "step": 8995 |
| }, |
| { |
| "epoch": 2.1665864227250844, |
| "grad_norm": 1.2578125, |
| "learning_rate": 6.954047835679694e-05, |
| "loss": 0.4085, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.1665864227250844, |
| "eval_loss": 0.37983912229537964, |
| "eval_runtime": 2.3776, |
| "eval_samples_per_second": 84.12, |
| "eval_steps_per_second": 84.12, |
| "step": 9000 |
| }, |
| { |
| "epoch": 2.1677900818488203, |
| "grad_norm": 1.0703125, |
| "learning_rate": 6.947473153955961e-05, |
| "loss": 0.3915, |
| "step": 9005 |
| }, |
| { |
| "epoch": 2.1689937409725566, |
| "grad_norm": 1.171875, |
| "learning_rate": 6.940905420263873e-05, |
| "loss": 0.4191, |
| "step": 9010 |
| }, |
| { |
| "epoch": 2.170197400096293, |
| "grad_norm": 1.1875, |
| "learning_rate": 6.934344645096984e-05, |
| "loss": 0.394, |
| "step": 9015 |
| }, |
| { |
| "epoch": 2.171401059220029, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.927790838937733e-05, |
| "loss": 0.3988, |
| "step": 9020 |
| }, |
| { |
| "epoch": 2.172604718343765, |
| "grad_norm": 1.2109375, |
| "learning_rate": 6.921244012257418e-05, |
| "loss": 0.3957, |
| "step": 9025 |
| }, |
| { |
| "epoch": 2.173808377467501, |
| "grad_norm": 1.0234375, |
| "learning_rate": 6.914704175516192e-05, |
| "loss": 0.421, |
| "step": 9030 |
| }, |
| { |
| "epoch": 2.1750120365912373, |
| "grad_norm": 1.1953125, |
| "learning_rate": 6.90817133916304e-05, |
| "loss": 0.4044, |
| "step": 9035 |
| }, |
| { |
| "epoch": 2.1762156957149736, |
| "grad_norm": 1.1796875, |
| "learning_rate": 6.901645513635753e-05, |
| "loss": 0.4083, |
| "step": 9040 |
| }, |
| { |
| "epoch": 2.1774193548387095, |
| "grad_norm": 1.1015625, |
| "learning_rate": 6.895126709360935e-05, |
| "loss": 0.4041, |
| "step": 9045 |
| }, |
| { |
| "epoch": 2.178623013962446, |
| "grad_norm": 1.1875, |
| "learning_rate": 6.88861493675396e-05, |
| "loss": 0.4073, |
| "step": 9050 |
| }, |
| { |
| "epoch": 2.179826673086182, |
| "grad_norm": 1.203125, |
| "learning_rate": 6.882110206218971e-05, |
| "loss": 0.3887, |
| "step": 9055 |
| }, |
| { |
| "epoch": 2.181030332209918, |
| "grad_norm": 1.125, |
| "learning_rate": 6.875612528148862e-05, |
| "loss": 0.3788, |
| "step": 9060 |
| }, |
| { |
| "epoch": 2.1822339913336544, |
| "grad_norm": 1.2109375, |
| "learning_rate": 6.869121912925255e-05, |
| "loss": 0.4137, |
| "step": 9065 |
| }, |
| { |
| "epoch": 2.1834376504573907, |
| "grad_norm": 1.1328125, |
| "learning_rate": 6.862638370918493e-05, |
| "loss": 0.3982, |
| "step": 9070 |
| }, |
| { |
| "epoch": 2.1846413095811266, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.856161912487608e-05, |
| "loss": 0.4012, |
| "step": 9075 |
| }, |
| { |
| "epoch": 2.185844968704863, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.849692547980326e-05, |
| "loss": 0.4167, |
| "step": 9080 |
| }, |
| { |
| "epoch": 2.1870486278285988, |
| "grad_norm": 1.0390625, |
| "learning_rate": 6.843230287733031e-05, |
| "loss": 0.3923, |
| "step": 9085 |
| }, |
| { |
| "epoch": 2.188252286952335, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.836775142070754e-05, |
| "loss": 0.4036, |
| "step": 9090 |
| }, |
| { |
| "epoch": 2.1894559460760714, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.83032712130717e-05, |
| "loss": 0.4049, |
| "step": 9095 |
| }, |
| { |
| "epoch": 2.1906596051998073, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.823886235744558e-05, |
| "loss": 0.4188, |
| "step": 9100 |
| }, |
| { |
| "epoch": 2.1918632643235436, |
| "grad_norm": 1.3046875, |
| "learning_rate": 6.817452495673803e-05, |
| "loss": 0.4083, |
| "step": 9105 |
| }, |
| { |
| "epoch": 2.19306692344728, |
| "grad_norm": 1.09375, |
| "learning_rate": 6.811025911374376e-05, |
| "loss": 0.3937, |
| "step": 9110 |
| }, |
| { |
| "epoch": 2.194270582571016, |
| "grad_norm": 1.0546875, |
| "learning_rate": 6.804606493114308e-05, |
| "loss": 0.4093, |
| "step": 9115 |
| }, |
| { |
| "epoch": 2.195474241694752, |
| "grad_norm": 1.234375, |
| "learning_rate": 6.798194251150184e-05, |
| "loss": 0.4323, |
| "step": 9120 |
| }, |
| { |
| "epoch": 2.196677900818488, |
| "grad_norm": 1.125, |
| "learning_rate": 6.791789195727122e-05, |
| "loss": 0.4055, |
| "step": 9125 |
| }, |
| { |
| "epoch": 2.1978815599422243, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.785391337078763e-05, |
| "loss": 0.4068, |
| "step": 9130 |
| }, |
| { |
| "epoch": 2.1990852190659607, |
| "grad_norm": 1.1328125, |
| "learning_rate": 6.77900068542724e-05, |
| "loss": 0.4158, |
| "step": 9135 |
| }, |
| { |
| "epoch": 2.2002888781896965, |
| "grad_norm": 1.140625, |
| "learning_rate": 6.772617250983177e-05, |
| "loss": 0.412, |
| "step": 9140 |
| }, |
| { |
| "epoch": 2.201492537313433, |
| "grad_norm": 1.1875, |
| "learning_rate": 6.766241043945668e-05, |
| "loss": 0.4121, |
| "step": 9145 |
| }, |
| { |
| "epoch": 2.202696196437169, |
| "grad_norm": 1.2265625, |
| "learning_rate": 6.759872074502254e-05, |
| "loss": 0.426, |
| "step": 9150 |
| }, |
| { |
| "epoch": 2.203899855560905, |
| "grad_norm": 1.0625, |
| "learning_rate": 6.753510352828918e-05, |
| "loss": 0.4078, |
| "step": 9155 |
| }, |
| { |
| "epoch": 2.2051035146846414, |
| "grad_norm": 1.171875, |
| "learning_rate": 6.747155889090062e-05, |
| "loss": 0.4036, |
| "step": 9160 |
| }, |
| { |
| "epoch": 2.2063071738083773, |
| "grad_norm": 1.1953125, |
| "learning_rate": 6.740808693438485e-05, |
| "loss": 0.3795, |
| "step": 9165 |
| }, |
| { |
| "epoch": 2.2075108329321136, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.734468776015385e-05, |
| "loss": 0.3897, |
| "step": 9170 |
| }, |
| { |
| "epoch": 2.20871449205585, |
| "grad_norm": 1.1875, |
| "learning_rate": 6.728136146950318e-05, |
| "loss": 0.3972, |
| "step": 9175 |
| }, |
| { |
| "epoch": 2.209918151179586, |
| "grad_norm": 1.046875, |
| "learning_rate": 6.721810816361207e-05, |
| "loss": 0.3788, |
| "step": 9180 |
| }, |
| { |
| "epoch": 2.211121810303322, |
| "grad_norm": 1.2578125, |
| "learning_rate": 6.715492794354308e-05, |
| "loss": 0.4073, |
| "step": 9185 |
| }, |
| { |
| "epoch": 2.2123254694270584, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.709182091024196e-05, |
| "loss": 0.3923, |
| "step": 9190 |
| }, |
| { |
| "epoch": 2.2135291285507943, |
| "grad_norm": 1.09375, |
| "learning_rate": 6.702878716453764e-05, |
| "loss": 0.3862, |
| "step": 9195 |
| }, |
| { |
| "epoch": 2.2147327876745306, |
| "grad_norm": 1.3125, |
| "learning_rate": 6.696582680714181e-05, |
| "loss": 0.4135, |
| "step": 9200 |
| }, |
| { |
| "epoch": 2.215936446798267, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.690293993864907e-05, |
| "loss": 0.3883, |
| "step": 9205 |
| }, |
| { |
| "epoch": 2.217140105922003, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.684012665953645e-05, |
| "loss": 0.3833, |
| "step": 9210 |
| }, |
| { |
| "epoch": 2.218343765045739, |
| "grad_norm": 1.3359375, |
| "learning_rate": 6.677738707016347e-05, |
| "loss": 0.4578, |
| "step": 9215 |
| }, |
| { |
| "epoch": 2.219547424169475, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.671472127077195e-05, |
| "loss": 0.3936, |
| "step": 9220 |
| }, |
| { |
| "epoch": 2.2207510832932114, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.665212936148573e-05, |
| "loss": 0.4076, |
| "step": 9225 |
| }, |
| { |
| "epoch": 2.2219547424169477, |
| "grad_norm": 1.234375, |
| "learning_rate": 6.658961144231066e-05, |
| "loss": 0.4269, |
| "step": 9230 |
| }, |
| { |
| "epoch": 2.2231584015406836, |
| "grad_norm": 1.0625, |
| "learning_rate": 6.652716761313433e-05, |
| "loss": 0.407, |
| "step": 9235 |
| }, |
| { |
| "epoch": 2.22436206066442, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.646479797372603e-05, |
| "loss": 0.3887, |
| "step": 9240 |
| }, |
| { |
| "epoch": 2.2255657197881558, |
| "grad_norm": 1.1015625, |
| "learning_rate": 6.640250262373638e-05, |
| "loss": 0.3935, |
| "step": 9245 |
| }, |
| { |
| "epoch": 2.226769378911892, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.63402816626974e-05, |
| "loss": 0.39, |
| "step": 9250 |
| }, |
| { |
| "epoch": 2.2279730380356284, |
| "grad_norm": 1.0703125, |
| "learning_rate": 6.627813519002227e-05, |
| "loss": 0.388, |
| "step": 9255 |
| }, |
| { |
| "epoch": 2.2291766971593643, |
| "grad_norm": 1.1953125, |
| "learning_rate": 6.621606330500507e-05, |
| "loss": 0.4123, |
| "step": 9260 |
| }, |
| { |
| "epoch": 2.2303803562831006, |
| "grad_norm": 1.2421875, |
| "learning_rate": 6.615406610682079e-05, |
| "loss": 0.3847, |
| "step": 9265 |
| }, |
| { |
| "epoch": 2.231584015406837, |
| "grad_norm": 1.1015625, |
| "learning_rate": 6.609214369452507e-05, |
| "loss": 0.3888, |
| "step": 9270 |
| }, |
| { |
| "epoch": 2.232787674530573, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.603029616705402e-05, |
| "loss": 0.3791, |
| "step": 9275 |
| }, |
| { |
| "epoch": 2.233991333654309, |
| "grad_norm": 1.0625, |
| "learning_rate": 6.596852362322418e-05, |
| "loss": 0.4157, |
| "step": 9280 |
| }, |
| { |
| "epoch": 2.2351949927780455, |
| "grad_norm": 1.09375, |
| "learning_rate": 6.590682616173218e-05, |
| "loss": 0.3867, |
| "step": 9285 |
| }, |
| { |
| "epoch": 2.2363986519017813, |
| "grad_norm": 1.2734375, |
| "learning_rate": 6.58452038811548e-05, |
| "loss": 0.4108, |
| "step": 9290 |
| }, |
| { |
| "epoch": 2.2376023110255177, |
| "grad_norm": 1.140625, |
| "learning_rate": 6.578365687994862e-05, |
| "loss": 0.4389, |
| "step": 9295 |
| }, |
| { |
| "epoch": 2.2388059701492535, |
| "grad_norm": 1.109375, |
| "learning_rate": 6.572218525644996e-05, |
| "loss": 0.4346, |
| "step": 9300 |
| }, |
| { |
| "epoch": 2.24000962927299, |
| "grad_norm": 1.046875, |
| "learning_rate": 6.566078910887475e-05, |
| "loss": 0.388, |
| "step": 9305 |
| }, |
| { |
| "epoch": 2.241213288396726, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.559946853531826e-05, |
| "loss": 0.3964, |
| "step": 9310 |
| }, |
| { |
| "epoch": 2.242416947520462, |
| "grad_norm": 1.1015625, |
| "learning_rate": 6.553822363375506e-05, |
| "loss": 0.3974, |
| "step": 9315 |
| }, |
| { |
| "epoch": 2.2436206066441984, |
| "grad_norm": 1.171875, |
| "learning_rate": 6.547705450203882e-05, |
| "loss": 0.4184, |
| "step": 9320 |
| }, |
| { |
| "epoch": 2.2448242657679347, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.541596123790211e-05, |
| "loss": 0.4027, |
| "step": 9325 |
| }, |
| { |
| "epoch": 2.2460279248916706, |
| "grad_norm": 1.1875, |
| "learning_rate": 6.535494393895632e-05, |
| "loss": 0.4171, |
| "step": 9330 |
| }, |
| { |
| "epoch": 2.247231584015407, |
| "grad_norm": 1.140625, |
| "learning_rate": 6.529400270269144e-05, |
| "loss": 0.3795, |
| "step": 9335 |
| }, |
| { |
| "epoch": 2.2484352431391432, |
| "grad_norm": 1.1328125, |
| "learning_rate": 6.523313762647595e-05, |
| "loss": 0.426, |
| "step": 9340 |
| }, |
| { |
| "epoch": 2.249638902262879, |
| "grad_norm": 1.125, |
| "learning_rate": 6.517234880755663e-05, |
| "loss": 0.4125, |
| "step": 9345 |
| }, |
| { |
| "epoch": 2.2508425613866154, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.511163634305844e-05, |
| "loss": 0.422, |
| "step": 9350 |
| }, |
| { |
| "epoch": 2.2520462205103513, |
| "grad_norm": 1.1328125, |
| "learning_rate": 6.505100032998434e-05, |
| "loss": 0.3871, |
| "step": 9355 |
| }, |
| { |
| "epoch": 2.2532498796340876, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.499044086521509e-05, |
| "loss": 0.4342, |
| "step": 9360 |
| }, |
| { |
| "epoch": 2.254453538757824, |
| "grad_norm": 1.09375, |
| "learning_rate": 6.492995804550923e-05, |
| "loss": 0.3903, |
| "step": 9365 |
| }, |
| { |
| "epoch": 2.25565719788156, |
| "grad_norm": 1.125, |
| "learning_rate": 6.486955196750282e-05, |
| "loss": 0.3923, |
| "step": 9370 |
| }, |
| { |
| "epoch": 2.256860857005296, |
| "grad_norm": 1.1015625, |
| "learning_rate": 6.480922272770924e-05, |
| "loss": 0.3975, |
| "step": 9375 |
| }, |
| { |
| "epoch": 2.258064516129032, |
| "grad_norm": 1.1875, |
| "learning_rate": 6.474897042251918e-05, |
| "loss": 0.4123, |
| "step": 9380 |
| }, |
| { |
| "epoch": 2.2592681752527684, |
| "grad_norm": 1.0625, |
| "learning_rate": 6.468879514820038e-05, |
| "loss": 0.4151, |
| "step": 9385 |
| }, |
| { |
| "epoch": 2.2604718343765047, |
| "grad_norm": 1.203125, |
| "learning_rate": 6.462869700089747e-05, |
| "loss": 0.3918, |
| "step": 9390 |
| }, |
| { |
| "epoch": 2.2616754935002406, |
| "grad_norm": 1.3046875, |
| "learning_rate": 6.456867607663193e-05, |
| "loss": 0.3907, |
| "step": 9395 |
| }, |
| { |
| "epoch": 2.262879152623977, |
| "grad_norm": 1.09375, |
| "learning_rate": 6.450873247130179e-05, |
| "loss": 0.3894, |
| "step": 9400 |
| }, |
| { |
| "epoch": 2.264082811747713, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.444886628068155e-05, |
| "loss": 0.3874, |
| "step": 9405 |
| }, |
| { |
| "epoch": 2.265286470871449, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.438907760042204e-05, |
| "loss": 0.3837, |
| "step": 9410 |
| }, |
| { |
| "epoch": 2.2664901299951854, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.432936652605025e-05, |
| "loss": 0.4001, |
| "step": 9415 |
| }, |
| { |
| "epoch": 2.2676937891189217, |
| "grad_norm": 1.140625, |
| "learning_rate": 6.426973315296917e-05, |
| "loss": 0.4088, |
| "step": 9420 |
| }, |
| { |
| "epoch": 2.2688974482426576, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.421017757645764e-05, |
| "loss": 0.4058, |
| "step": 9425 |
| }, |
| { |
| "epoch": 2.270101107366394, |
| "grad_norm": 1.09375, |
| "learning_rate": 6.415069989167023e-05, |
| "loss": 0.3805, |
| "step": 9430 |
| }, |
| { |
| "epoch": 2.27130476649013, |
| "grad_norm": 1.25, |
| "learning_rate": 6.409130019363698e-05, |
| "loss": 0.4057, |
| "step": 9435 |
| }, |
| { |
| "epoch": 2.272508425613866, |
| "grad_norm": 1.1015625, |
| "learning_rate": 6.403197857726344e-05, |
| "loss": 0.401, |
| "step": 9440 |
| }, |
| { |
| "epoch": 2.2737120847376024, |
| "grad_norm": 1.0625, |
| "learning_rate": 6.397273513733029e-05, |
| "loss": 0.3913, |
| "step": 9445 |
| }, |
| { |
| "epoch": 2.2749157438613383, |
| "grad_norm": 1.2578125, |
| "learning_rate": 6.39135699684934e-05, |
| "loss": 0.423, |
| "step": 9450 |
| }, |
| { |
| "epoch": 2.2761194029850746, |
| "grad_norm": 1.09375, |
| "learning_rate": 6.385448316528354e-05, |
| "loss": 0.3915, |
| "step": 9455 |
| }, |
| { |
| "epoch": 2.277323062108811, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.379547482210624e-05, |
| "loss": 0.3817, |
| "step": 9460 |
| }, |
| { |
| "epoch": 2.278526721232547, |
| "grad_norm": 1.234375, |
| "learning_rate": 6.373654503324175e-05, |
| "loss": 0.3925, |
| "step": 9465 |
| }, |
| { |
| "epoch": 2.279730380356283, |
| "grad_norm": 1.1328125, |
| "learning_rate": 6.367769389284472e-05, |
| "loss": 0.4074, |
| "step": 9470 |
| }, |
| { |
| "epoch": 2.2809340394800195, |
| "grad_norm": 1.09375, |
| "learning_rate": 6.361892149494422e-05, |
| "loss": 0.4199, |
| "step": 9475 |
| }, |
| { |
| "epoch": 2.2821376986037554, |
| "grad_norm": 1.3671875, |
| "learning_rate": 6.356022793344349e-05, |
| "loss": 0.4211, |
| "step": 9480 |
| }, |
| { |
| "epoch": 2.2833413577274917, |
| "grad_norm": 1.2734375, |
| "learning_rate": 6.350161330211973e-05, |
| "loss": 0.3933, |
| "step": 9485 |
| }, |
| { |
| "epoch": 2.2845450168512276, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.344307769462417e-05, |
| "loss": 0.4069, |
| "step": 9490 |
| }, |
| { |
| "epoch": 2.285748675974964, |
| "grad_norm": 1.171875, |
| "learning_rate": 6.338462120448164e-05, |
| "loss": 0.4346, |
| "step": 9495 |
| }, |
| { |
| "epoch": 2.2869523350987, |
| "grad_norm": 1.2265625, |
| "learning_rate": 6.332624392509068e-05, |
| "loss": 0.4202, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.2869523350987, |
| "eval_loss": 0.3756787180900574, |
| "eval_runtime": 2.344, |
| "eval_samples_per_second": 85.324, |
| "eval_steps_per_second": 85.324, |
| "step": 9500 |
| }, |
| { |
| "epoch": 2.288155994222436, |
| "grad_norm": 1.1875, |
| "learning_rate": 6.326794594972318e-05, |
| "loss": 0.3907, |
| "step": 9505 |
| }, |
| { |
| "epoch": 2.2893596533461724, |
| "grad_norm": 1.0546875, |
| "learning_rate": 6.320972737152439e-05, |
| "loss": 0.3857, |
| "step": 9510 |
| }, |
| { |
| "epoch": 2.2905633124699083, |
| "grad_norm": 1.2265625, |
| "learning_rate": 6.315158828351266e-05, |
| "loss": 0.3914, |
| "step": 9515 |
| }, |
| { |
| "epoch": 2.2917669715936446, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.309352877857933e-05, |
| "loss": 0.4172, |
| "step": 9520 |
| }, |
| { |
| "epoch": 2.292970630717381, |
| "grad_norm": 1.125, |
| "learning_rate": 6.303554894948859e-05, |
| "loss": 0.3929, |
| "step": 9525 |
| }, |
| { |
| "epoch": 2.294174289841117, |
| "grad_norm": 1.125, |
| "learning_rate": 6.297764888887743e-05, |
| "loss": 0.4198, |
| "step": 9530 |
| }, |
| { |
| "epoch": 2.295377948964853, |
| "grad_norm": 1.1796875, |
| "learning_rate": 6.291982868925522e-05, |
| "loss": 0.4168, |
| "step": 9535 |
| }, |
| { |
| "epoch": 2.2965816080885895, |
| "grad_norm": 1.0703125, |
| "learning_rate": 6.286208844300387e-05, |
| "loss": 0.4076, |
| "step": 9540 |
| }, |
| { |
| "epoch": 2.2977852672123253, |
| "grad_norm": 1.0625, |
| "learning_rate": 6.280442824237744e-05, |
| "loss": 0.4014, |
| "step": 9545 |
| }, |
| { |
| "epoch": 2.2989889263360617, |
| "grad_norm": 1.2421875, |
| "learning_rate": 6.274684817950216e-05, |
| "loss": 0.4014, |
| "step": 9550 |
| }, |
| { |
| "epoch": 2.300192585459798, |
| "grad_norm": 1.046875, |
| "learning_rate": 6.268934834637626e-05, |
| "loss": 0.392, |
| "step": 9555 |
| }, |
| { |
| "epoch": 2.301396244583534, |
| "grad_norm": 1.1953125, |
| "learning_rate": 6.26319288348697e-05, |
| "loss": 0.4122, |
| "step": 9560 |
| }, |
| { |
| "epoch": 2.30259990370727, |
| "grad_norm": 1.1953125, |
| "learning_rate": 6.257458973672414e-05, |
| "loss": 0.3761, |
| "step": 9565 |
| }, |
| { |
| "epoch": 2.303803562831006, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.251733114355274e-05, |
| "loss": 0.4226, |
| "step": 9570 |
| }, |
| { |
| "epoch": 2.3050072219547424, |
| "grad_norm": 1.109375, |
| "learning_rate": 6.246015314684007e-05, |
| "loss": 0.376, |
| "step": 9575 |
| }, |
| { |
| "epoch": 2.3062108810784787, |
| "grad_norm": 1.1328125, |
| "learning_rate": 6.240305583794193e-05, |
| "loss": 0.4237, |
| "step": 9580 |
| }, |
| { |
| "epoch": 2.3074145402022146, |
| "grad_norm": 1.1328125, |
| "learning_rate": 6.234603930808515e-05, |
| "loss": 0.3921, |
| "step": 9585 |
| }, |
| { |
| "epoch": 2.308618199325951, |
| "grad_norm": 1.1015625, |
| "learning_rate": 6.228910364836756e-05, |
| "loss": 0.4114, |
| "step": 9590 |
| }, |
| { |
| "epoch": 2.3098218584496872, |
| "grad_norm": 1.1328125, |
| "learning_rate": 6.223224894975772e-05, |
| "loss": 0.4328, |
| "step": 9595 |
| }, |
| { |
| "epoch": 2.311025517573423, |
| "grad_norm": 1.125, |
| "learning_rate": 6.217547530309486e-05, |
| "loss": 0.4088, |
| "step": 9600 |
| }, |
| { |
| "epoch": 2.3122291766971594, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.211878279908876e-05, |
| "loss": 0.3963, |
| "step": 9605 |
| }, |
| { |
| "epoch": 2.3134328358208958, |
| "grad_norm": 1.125, |
| "learning_rate": 6.206217152831948e-05, |
| "loss": 0.3749, |
| "step": 9610 |
| }, |
| { |
| "epoch": 2.3146364949446316, |
| "grad_norm": 1.1640625, |
| "learning_rate": 6.200564158123732e-05, |
| "loss": 0.3965, |
| "step": 9615 |
| }, |
| { |
| "epoch": 2.315840154068368, |
| "grad_norm": 1.1328125, |
| "learning_rate": 6.19491930481626e-05, |
| "loss": 0.4348, |
| "step": 9620 |
| }, |
| { |
| "epoch": 2.317043813192104, |
| "grad_norm": 1.203125, |
| "learning_rate": 6.189282601928567e-05, |
| "loss": 0.3908, |
| "step": 9625 |
| }, |
| { |
| "epoch": 2.31824747231584, |
| "grad_norm": 1.0859375, |
| "learning_rate": 6.183654058466657e-05, |
| "loss": 0.4179, |
| "step": 9630 |
| }, |
| { |
| "epoch": 2.3194511314395765, |
| "grad_norm": 1.1328125, |
| "learning_rate": 6.178033683423499e-05, |
| "loss": 0.3856, |
| "step": 9635 |
| }, |
| { |
| "epoch": 2.3206547905633124, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.172421485779012e-05, |
| "loss": 0.3889, |
| "step": 9640 |
| }, |
| { |
| "epoch": 2.3218584496870487, |
| "grad_norm": 1.0, |
| "learning_rate": 6.166817474500047e-05, |
| "loss": 0.3951, |
| "step": 9645 |
| }, |
| { |
| "epoch": 2.3230621088107846, |
| "grad_norm": 1.234375, |
| "learning_rate": 6.161221658540383e-05, |
| "loss": 0.4012, |
| "step": 9650 |
| }, |
| { |
| "epoch": 2.324265767934521, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.155634046840693e-05, |
| "loss": 0.3987, |
| "step": 9655 |
| }, |
| { |
| "epoch": 2.325469427058257, |
| "grad_norm": 1.1796875, |
| "learning_rate": 6.150054648328552e-05, |
| "loss": 0.3923, |
| "step": 9660 |
| }, |
| { |
| "epoch": 2.326673086181993, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.144483471918406e-05, |
| "loss": 0.3805, |
| "step": 9665 |
| }, |
| { |
| "epoch": 2.3278767453057294, |
| "grad_norm": 1.2421875, |
| "learning_rate": 6.138920526511571e-05, |
| "loss": 0.4214, |
| "step": 9670 |
| }, |
| { |
| "epoch": 2.3290804044294657, |
| "grad_norm": 1.109375, |
| "learning_rate": 6.133365820996204e-05, |
| "loss": 0.4041, |
| "step": 9675 |
| }, |
| { |
| "epoch": 2.3302840635532016, |
| "grad_norm": 1.0859375, |
| "learning_rate": 6.127819364247298e-05, |
| "loss": 0.4002, |
| "step": 9680 |
| }, |
| { |
| "epoch": 2.331487722676938, |
| "grad_norm": 1.140625, |
| "learning_rate": 6.122281165126675e-05, |
| "loss": 0.4156, |
| "step": 9685 |
| }, |
| { |
| "epoch": 2.3326913818006743, |
| "grad_norm": 1.046875, |
| "learning_rate": 6.116751232482955e-05, |
| "loss": 0.3841, |
| "step": 9690 |
| }, |
| { |
| "epoch": 2.33389504092441, |
| "grad_norm": 1.203125, |
| "learning_rate": 6.111229575151552e-05, |
| "loss": 0.3953, |
| "step": 9695 |
| }, |
| { |
| "epoch": 2.3350987000481465, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.105716201954659e-05, |
| "loss": 0.4067, |
| "step": 9700 |
| }, |
| { |
| "epoch": 2.3363023591718823, |
| "grad_norm": 1.1484375, |
| "learning_rate": 6.1002111217012335e-05, |
| "loss": 0.3868, |
| "step": 9705 |
| }, |
| { |
| "epoch": 2.3375060182956187, |
| "grad_norm": 1.1953125, |
| "learning_rate": 6.094714343186982e-05, |
| "loss": 0.3945, |
| "step": 9710 |
| }, |
| { |
| "epoch": 2.338709677419355, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.089225875194349e-05, |
| "loss": 0.3912, |
| "step": 9715 |
| }, |
| { |
| "epoch": 2.339913336543091, |
| "grad_norm": 1.125, |
| "learning_rate": 6.083745726492499e-05, |
| "loss": 0.3853, |
| "step": 9720 |
| }, |
| { |
| "epoch": 2.341116995666827, |
| "grad_norm": 1.265625, |
| "learning_rate": 6.078273905837305e-05, |
| "loss": 0.4007, |
| "step": 9725 |
| }, |
| { |
| "epoch": 2.3423206547905635, |
| "grad_norm": 1.15625, |
| "learning_rate": 6.072810421971328e-05, |
| "loss": 0.403, |
| "step": 9730 |
| }, |
| { |
| "epoch": 2.3435243139142994, |
| "grad_norm": 1.125, |
| "learning_rate": 6.067355283623823e-05, |
| "loss": 0.4096, |
| "step": 9735 |
| }, |
| { |
| "epoch": 2.3447279730380357, |
| "grad_norm": 1.078125, |
| "learning_rate": 6.0619084995106985e-05, |
| "loss": 0.3982, |
| "step": 9740 |
| }, |
| { |
| "epoch": 2.345931632161772, |
| "grad_norm": 1.1015625, |
| "learning_rate": 6.0564700783345177e-05, |
| "loss": 0.3865, |
| "step": 9745 |
| }, |
| { |
| "epoch": 2.347135291285508, |
| "grad_norm": 1.140625, |
| "learning_rate": 6.051040028784489e-05, |
| "loss": 0.4359, |
| "step": 9750 |
| }, |
| { |
| "epoch": 2.3483389504092442, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.045618359536434e-05, |
| "loss": 0.3731, |
| "step": 9755 |
| }, |
| { |
| "epoch": 2.34954260953298, |
| "grad_norm": 1.125, |
| "learning_rate": 6.0402050792527914e-05, |
| "loss": 0.3772, |
| "step": 9760 |
| }, |
| { |
| "epoch": 2.3507462686567164, |
| "grad_norm": 1.28125, |
| "learning_rate": 6.0348001965825986e-05, |
| "loss": 0.398, |
| "step": 9765 |
| }, |
| { |
| "epoch": 2.3519499277804528, |
| "grad_norm": 1.1015625, |
| "learning_rate": 6.02940372016147e-05, |
| "loss": 0.3803, |
| "step": 9770 |
| }, |
| { |
| "epoch": 2.3531535869041886, |
| "grad_norm": 1.2890625, |
| "learning_rate": 6.0240156586115926e-05, |
| "loss": 0.3942, |
| "step": 9775 |
| }, |
| { |
| "epoch": 2.354357246027925, |
| "grad_norm": 1.2109375, |
| "learning_rate": 6.0186360205417044e-05, |
| "loss": 0.3934, |
| "step": 9780 |
| }, |
| { |
| "epoch": 2.355560905151661, |
| "grad_norm": 1.0859375, |
| "learning_rate": 6.0132648145470914e-05, |
| "loss": 0.4019, |
| "step": 9785 |
| }, |
| { |
| "epoch": 2.356764564275397, |
| "grad_norm": 1.21875, |
| "learning_rate": 6.0079020492095636e-05, |
| "loss": 0.4345, |
| "step": 9790 |
| }, |
| { |
| "epoch": 2.3579682233991335, |
| "grad_norm": 1.1171875, |
| "learning_rate": 6.002547733097443e-05, |
| "loss": 0.3959, |
| "step": 9795 |
| }, |
| { |
| "epoch": 2.3591718825228694, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.9972018747655576e-05, |
| "loss": 0.3901, |
| "step": 9800 |
| }, |
| { |
| "epoch": 2.3603755416466057, |
| "grad_norm": 1.125, |
| "learning_rate": 5.9918644827552145e-05, |
| "loss": 0.3859, |
| "step": 9805 |
| }, |
| { |
| "epoch": 2.361579200770342, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.9865355655942e-05, |
| "loss": 0.3907, |
| "step": 9810 |
| }, |
| { |
| "epoch": 2.362782859894078, |
| "grad_norm": 1.046875, |
| "learning_rate": 5.98121513179676e-05, |
| "loss": 0.4194, |
| "step": 9815 |
| }, |
| { |
| "epoch": 2.363986519017814, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.9759031898635787e-05, |
| "loss": 0.4023, |
| "step": 9820 |
| }, |
| { |
| "epoch": 2.3651901781415505, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.9705997482817825e-05, |
| "loss": 0.415, |
| "step": 9825 |
| }, |
| { |
| "epoch": 2.3663938372652864, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.96530481552491e-05, |
| "loss": 0.3732, |
| "step": 9830 |
| }, |
| { |
| "epoch": 2.3675974963890227, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.9600184000529025e-05, |
| "loss": 0.4401, |
| "step": 9835 |
| }, |
| { |
| "epoch": 2.3688011555127586, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.954740510312103e-05, |
| "loss": 0.402, |
| "step": 9840 |
| }, |
| { |
| "epoch": 2.370004814636495, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.9494711547352204e-05, |
| "loss": 0.3932, |
| "step": 9845 |
| }, |
| { |
| "epoch": 2.3712084737602313, |
| "grad_norm": 1.3046875, |
| "learning_rate": 5.94421034174134e-05, |
| "loss": 0.4304, |
| "step": 9850 |
| }, |
| { |
| "epoch": 2.372412132883967, |
| "grad_norm": 1.03125, |
| "learning_rate": 5.9389580797358866e-05, |
| "loss": 0.3816, |
| "step": 9855 |
| }, |
| { |
| "epoch": 2.3736157920077035, |
| "grad_norm": 1.1015625, |
| "learning_rate": 5.933714377110631e-05, |
| "loss": 0.382, |
| "step": 9860 |
| }, |
| { |
| "epoch": 2.3748194511314393, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.928479242243668e-05, |
| "loss": 0.3949, |
| "step": 9865 |
| }, |
| { |
| "epoch": 2.3760231102551757, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.923252683499397e-05, |
| "loss": 0.3954, |
| "step": 9870 |
| }, |
| { |
| "epoch": 2.377226769378912, |
| "grad_norm": 1.296875, |
| "learning_rate": 5.9180347092285224e-05, |
| "loss": 0.3966, |
| "step": 9875 |
| }, |
| { |
| "epoch": 2.3784304285026483, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.912825327768029e-05, |
| "loss": 0.4403, |
| "step": 9880 |
| }, |
| { |
| "epoch": 2.379634087626384, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.907624547441171e-05, |
| "loss": 0.3975, |
| "step": 9885 |
| }, |
| { |
| "epoch": 2.3808377467501205, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.9024323765574615e-05, |
| "loss": 0.3871, |
| "step": 9890 |
| }, |
| { |
| "epoch": 2.3820414058738564, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.8972488234126614e-05, |
| "loss": 0.4278, |
| "step": 9895 |
| }, |
| { |
| "epoch": 2.3832450649975927, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.89207389628876e-05, |
| "loss": 0.3843, |
| "step": 9900 |
| }, |
| { |
| "epoch": 2.384448724121329, |
| "grad_norm": 1.125, |
| "learning_rate": 5.88690760345396e-05, |
| "loss": 0.3924, |
| "step": 9905 |
| }, |
| { |
| "epoch": 2.385652383245065, |
| "grad_norm": 1.28125, |
| "learning_rate": 5.881749953162678e-05, |
| "loss": 0.3939, |
| "step": 9910 |
| }, |
| { |
| "epoch": 2.3868560423688012, |
| "grad_norm": 1.125, |
| "learning_rate": 5.8766009536555125e-05, |
| "loss": 0.4135, |
| "step": 9915 |
| }, |
| { |
| "epoch": 2.388059701492537, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.871460613159248e-05, |
| "loss": 0.3938, |
| "step": 9920 |
| }, |
| { |
| "epoch": 2.3892633606162734, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.866328939886831e-05, |
| "loss": 0.3952, |
| "step": 9925 |
| }, |
| { |
| "epoch": 2.3904670197400097, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.861205942037359e-05, |
| "loss": 0.3796, |
| "step": 9930 |
| }, |
| { |
| "epoch": 2.3916706788637456, |
| "grad_norm": 0.9921875, |
| "learning_rate": 5.856091627796069e-05, |
| "loss": 0.3852, |
| "step": 9935 |
| }, |
| { |
| "epoch": 2.392874337987482, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.850986005334323e-05, |
| "loss": 0.3841, |
| "step": 9940 |
| }, |
| { |
| "epoch": 2.3940779971112183, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.845889082809597e-05, |
| "loss": 0.3893, |
| "step": 9945 |
| }, |
| { |
| "epoch": 2.395281656234954, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.84080086836547e-05, |
| "loss": 0.4146, |
| "step": 9950 |
| }, |
| { |
| "epoch": 2.3964853153586905, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.835721370131599e-05, |
| "loss": 0.3858, |
| "step": 9955 |
| }, |
| { |
| "epoch": 2.397688974482427, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.8306505962237257e-05, |
| "loss": 0.4173, |
| "step": 9960 |
| }, |
| { |
| "epoch": 2.3988926336061627, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.825588554743643e-05, |
| "loss": 0.4106, |
| "step": 9965 |
| }, |
| { |
| "epoch": 2.400096292729899, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.820535253779196e-05, |
| "loss": 0.3961, |
| "step": 9970 |
| }, |
| { |
| "epoch": 2.401299951853635, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.815490701404266e-05, |
| "loss": 0.4355, |
| "step": 9975 |
| }, |
| { |
| "epoch": 2.402503610977371, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.8104549056787514e-05, |
| "loss": 0.386, |
| "step": 9980 |
| }, |
| { |
| "epoch": 2.4037072701011075, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.805427874648568e-05, |
| "loss": 0.4063, |
| "step": 9985 |
| }, |
| { |
| "epoch": 2.4049109292248434, |
| "grad_norm": 1.0234375, |
| "learning_rate": 5.800409616345619e-05, |
| "loss": 0.3664, |
| "step": 9990 |
| }, |
| { |
| "epoch": 2.4061145883485797, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.7954001387877925e-05, |
| "loss": 0.3639, |
| "step": 9995 |
| }, |
| { |
| "epoch": 2.4073182474723156, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.790399449978955e-05, |
| "loss": 0.4059, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.4073182474723156, |
| "eval_loss": 0.36942535638809204, |
| "eval_runtime": 2.3535, |
| "eval_samples_per_second": 84.981, |
| "eval_steps_per_second": 84.981, |
| "step": 10000 |
| }, |
| { |
| "epoch": 2.408521906596052, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.78540755790892e-05, |
| "loss": 0.3825, |
| "step": 10005 |
| }, |
| { |
| "epoch": 2.4097255657197882, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.7804244705534553e-05, |
| "loss": 0.3795, |
| "step": 10010 |
| }, |
| { |
| "epoch": 2.410929224843524, |
| "grad_norm": 1.046875, |
| "learning_rate": 5.775450195874255e-05, |
| "loss": 0.3916, |
| "step": 10015 |
| }, |
| { |
| "epoch": 2.4121328839672604, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.770484741818935e-05, |
| "loss": 0.394, |
| "step": 10020 |
| }, |
| { |
| "epoch": 2.4133365430909968, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.765528116321019e-05, |
| "loss": 0.3868, |
| "step": 10025 |
| }, |
| { |
| "epoch": 2.4145402022147326, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.760580327299922e-05, |
| "loss": 0.416, |
| "step": 10030 |
| }, |
| { |
| "epoch": 2.415743861338469, |
| "grad_norm": 1.1015625, |
| "learning_rate": 5.7556413826609464e-05, |
| "loss": 0.3958, |
| "step": 10035 |
| }, |
| { |
| "epoch": 2.4169475204622053, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.750711290295257e-05, |
| "loss": 0.3945, |
| "step": 10040 |
| }, |
| { |
| "epoch": 2.418151179585941, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.7457900580798746e-05, |
| "loss": 0.423, |
| "step": 10045 |
| }, |
| { |
| "epoch": 2.4193548387096775, |
| "grad_norm": 1.2734375, |
| "learning_rate": 5.7408776938776725e-05, |
| "loss": 0.4141, |
| "step": 10050 |
| }, |
| { |
| "epoch": 2.4205584978334134, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.7359742055373445e-05, |
| "loss": 0.3918, |
| "step": 10055 |
| }, |
| { |
| "epoch": 2.4217621569571497, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.731079600893412e-05, |
| "loss": 0.4105, |
| "step": 10060 |
| }, |
| { |
| "epoch": 2.422965816080886, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.726193887766194e-05, |
| "loss": 0.3852, |
| "step": 10065 |
| }, |
| { |
| "epoch": 2.424169475204622, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.721317073961811e-05, |
| "loss": 0.384, |
| "step": 10070 |
| }, |
| { |
| "epoch": 2.425373134328358, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.716449167272162e-05, |
| "loss": 0.3943, |
| "step": 10075 |
| }, |
| { |
| "epoch": 2.4265767934520945, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.71159017547491e-05, |
| "loss": 0.3999, |
| "step": 10080 |
| }, |
| { |
| "epoch": 2.4277804525758304, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.706740106333482e-05, |
| "loss": 0.4005, |
| "step": 10085 |
| }, |
| { |
| "epoch": 2.4289841116995667, |
| "grad_norm": 1.0625, |
| "learning_rate": 5.701898967597045e-05, |
| "loss": 0.3771, |
| "step": 10090 |
| }, |
| { |
| "epoch": 2.430187770823303, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.697066767000494e-05, |
| "loss": 0.3732, |
| "step": 10095 |
| }, |
| { |
| "epoch": 2.431391429947039, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.6922435122644514e-05, |
| "loss": 0.4007, |
| "step": 10100 |
| }, |
| { |
| "epoch": 2.4325950890707753, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.687429211095238e-05, |
| "loss": 0.4237, |
| "step": 10105 |
| }, |
| { |
| "epoch": 2.433798748194511, |
| "grad_norm": 1.0, |
| "learning_rate": 5.6826238711848764e-05, |
| "loss": 0.3836, |
| "step": 10110 |
| }, |
| { |
| "epoch": 2.4350024073182475, |
| "grad_norm": 1.28125, |
| "learning_rate": 5.677827500211065e-05, |
| "loss": 0.4104, |
| "step": 10115 |
| }, |
| { |
| "epoch": 2.436206066441984, |
| "grad_norm": 1.25, |
| "learning_rate": 5.673040105837176e-05, |
| "loss": 0.3988, |
| "step": 10120 |
| }, |
| { |
| "epoch": 2.4374097255657197, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.668261695712239e-05, |
| "loss": 0.394, |
| "step": 10125 |
| }, |
| { |
| "epoch": 2.438613384689456, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.663492277470928e-05, |
| "loss": 0.3955, |
| "step": 10130 |
| }, |
| { |
| "epoch": 2.439817043813192, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.658731858733551e-05, |
| "loss": 0.3973, |
| "step": 10135 |
| }, |
| { |
| "epoch": 2.441020702936928, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.653980447106035e-05, |
| "loss": 0.3938, |
| "step": 10140 |
| }, |
| { |
| "epoch": 2.4422243620606645, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.649238050179922e-05, |
| "loss": 0.4094, |
| "step": 10145 |
| }, |
| { |
| "epoch": 2.4434280211844004, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.6445046755323414e-05, |
| "loss": 0.4017, |
| "step": 10150 |
| }, |
| { |
| "epoch": 2.4446316803081367, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.639780330726015e-05, |
| "loss": 0.3798, |
| "step": 10155 |
| }, |
| { |
| "epoch": 2.445835339431873, |
| "grad_norm": 1.265625, |
| "learning_rate": 5.635065023309236e-05, |
| "loss": 0.4175, |
| "step": 10160 |
| }, |
| { |
| "epoch": 2.447038998555609, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.6303587608158535e-05, |
| "loss": 0.3966, |
| "step": 10165 |
| }, |
| { |
| "epoch": 2.4482426576793452, |
| "grad_norm": 1.1015625, |
| "learning_rate": 5.625661550765272e-05, |
| "loss": 0.4021, |
| "step": 10170 |
| }, |
| { |
| "epoch": 2.4494463168030816, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.620973400662427e-05, |
| "loss": 0.4069, |
| "step": 10175 |
| }, |
| { |
| "epoch": 2.4506499759268174, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.616294317997779e-05, |
| "loss": 0.4031, |
| "step": 10180 |
| }, |
| { |
| "epoch": 2.4518536350505538, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.611624310247307e-05, |
| "loss": 0.3878, |
| "step": 10185 |
| }, |
| { |
| "epoch": 2.4530572941742896, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.606963384872483e-05, |
| "loss": 0.3885, |
| "step": 10190 |
| }, |
| { |
| "epoch": 2.454260953298026, |
| "grad_norm": 1.09375, |
| "learning_rate": 5.6023115493202725e-05, |
| "loss": 0.4029, |
| "step": 10195 |
| }, |
| { |
| "epoch": 2.4554646124217623, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.5976688110231156e-05, |
| "loss": 0.3871, |
| "step": 10200 |
| }, |
| { |
| "epoch": 2.456668271545498, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.5930351773989164e-05, |
| "loss": 0.3943, |
| "step": 10205 |
| }, |
| { |
| "epoch": 2.4578719306692345, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.588410655851036e-05, |
| "loss": 0.4, |
| "step": 10210 |
| }, |
| { |
| "epoch": 2.459075589792971, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.5837952537682724e-05, |
| "loss": 0.4252, |
| "step": 10215 |
| }, |
| { |
| "epoch": 2.4602792489167067, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.579188978524856e-05, |
| "loss": 0.4152, |
| "step": 10220 |
| }, |
| { |
| "epoch": 2.461482908040443, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.574591837480434e-05, |
| "loss": 0.3914, |
| "step": 10225 |
| }, |
| { |
| "epoch": 2.4626865671641793, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.570003837980058e-05, |
| "loss": 0.3967, |
| "step": 10230 |
| }, |
| { |
| "epoch": 2.463890226287915, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.565424987354178e-05, |
| "loss": 0.3968, |
| "step": 10235 |
| }, |
| { |
| "epoch": 2.4650938854116515, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.56085529291862e-05, |
| "loss": 0.3802, |
| "step": 10240 |
| }, |
| { |
| "epoch": 2.4662975445353874, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.5562947619745864e-05, |
| "loss": 0.4232, |
| "step": 10245 |
| }, |
| { |
| "epoch": 2.4675012036591237, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.551743401808637e-05, |
| "loss": 0.3887, |
| "step": 10250 |
| }, |
| { |
| "epoch": 2.46870486278286, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.5472012196926744e-05, |
| "loss": 0.4019, |
| "step": 10255 |
| }, |
| { |
| "epoch": 2.469908521906596, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.5426682228839476e-05, |
| "loss": 0.4006, |
| "step": 10260 |
| }, |
| { |
| "epoch": 2.4711121810303323, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.538144418625017e-05, |
| "loss": 0.4005, |
| "step": 10265 |
| }, |
| { |
| "epoch": 2.472315840154068, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.533629814143768e-05, |
| "loss": 0.403, |
| "step": 10270 |
| }, |
| { |
| "epoch": 2.4735194992778045, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.529124416653376e-05, |
| "loss": 0.413, |
| "step": 10275 |
| }, |
| { |
| "epoch": 2.474723158401541, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.524628233352315e-05, |
| "loss": 0.3953, |
| "step": 10280 |
| }, |
| { |
| "epoch": 2.4759268175252767, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.5201412714243324e-05, |
| "loss": 0.3951, |
| "step": 10285 |
| }, |
| { |
| "epoch": 2.477130476649013, |
| "grad_norm": 1.265625, |
| "learning_rate": 5.5156635380384427e-05, |
| "loss": 0.3856, |
| "step": 10290 |
| }, |
| { |
| "epoch": 2.4783341357727493, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.511195040348916e-05, |
| "loss": 0.3876, |
| "step": 10295 |
| }, |
| { |
| "epoch": 2.479537794896485, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.506735785495266e-05, |
| "loss": 0.3904, |
| "step": 10300 |
| }, |
| { |
| "epoch": 2.4807414540202215, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.50228578060224e-05, |
| "loss": 0.4175, |
| "step": 10305 |
| }, |
| { |
| "epoch": 2.481945113143958, |
| "grad_norm": 1.125, |
| "learning_rate": 5.4978450327798036e-05, |
| "loss": 0.3739, |
| "step": 10310 |
| }, |
| { |
| "epoch": 2.4831487722676937, |
| "grad_norm": 1.2734375, |
| "learning_rate": 5.4934135491231325e-05, |
| "loss": 0.4026, |
| "step": 10315 |
| }, |
| { |
| "epoch": 2.48435243139143, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.488991336712604e-05, |
| "loss": 0.3891, |
| "step": 10320 |
| }, |
| { |
| "epoch": 2.485556090515166, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.4845784026137775e-05, |
| "loss": 0.4144, |
| "step": 10325 |
| }, |
| { |
| "epoch": 2.4867597496389022, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.4801747538773916e-05, |
| "loss": 0.3816, |
| "step": 10330 |
| }, |
| { |
| "epoch": 2.4879634087626386, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.47578039753935e-05, |
| "loss": 0.3974, |
| "step": 10335 |
| }, |
| { |
| "epoch": 2.4891670678863744, |
| "grad_norm": 1.3125, |
| "learning_rate": 5.471395340620705e-05, |
| "loss": 0.3987, |
| "step": 10340 |
| }, |
| { |
| "epoch": 2.4903707270101108, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.467019590127654e-05, |
| "loss": 0.435, |
| "step": 10345 |
| }, |
| { |
| "epoch": 2.491574386133847, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.462653153051525e-05, |
| "loss": 0.375, |
| "step": 10350 |
| }, |
| { |
| "epoch": 2.492778045257583, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.4582960363687656e-05, |
| "loss": 0.3907, |
| "step": 10355 |
| }, |
| { |
| "epoch": 2.4939817043813193, |
| "grad_norm": 1.1015625, |
| "learning_rate": 5.453948247040931e-05, |
| "loss": 0.4016, |
| "step": 10360 |
| }, |
| { |
| "epoch": 2.4951853635050556, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.4496097920146724e-05, |
| "loss": 0.4186, |
| "step": 10365 |
| }, |
| { |
| "epoch": 2.4963890226287915, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.445280678221731e-05, |
| "loss": 0.4148, |
| "step": 10370 |
| }, |
| { |
| "epoch": 2.497592681752528, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.4409609125789176e-05, |
| "loss": 0.4069, |
| "step": 10375 |
| }, |
| { |
| "epoch": 2.4987963408762637, |
| "grad_norm": 1.28125, |
| "learning_rate": 5.436650501988115e-05, |
| "loss": 0.39, |
| "step": 10380 |
| }, |
| { |
| "epoch": 2.5, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.4323494533362504e-05, |
| "loss": 0.4253, |
| "step": 10385 |
| }, |
| { |
| "epoch": 2.5012036591237363, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.428057773495297e-05, |
| "loss": 0.3715, |
| "step": 10390 |
| }, |
| { |
| "epoch": 2.502407318247472, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.4237754693222645e-05, |
| "loss": 0.44, |
| "step": 10395 |
| }, |
| { |
| "epoch": 2.5036109773712085, |
| "grad_norm": 1.09375, |
| "learning_rate": 5.41950254765917e-05, |
| "loss": 0.3835, |
| "step": 10400 |
| }, |
| { |
| "epoch": 2.5048146364949444, |
| "grad_norm": 1.125, |
| "learning_rate": 5.415239015333052e-05, |
| "loss": 0.384, |
| "step": 10405 |
| }, |
| { |
| "epoch": 2.5060182956186807, |
| "grad_norm": 1.046875, |
| "learning_rate": 5.41098487915594e-05, |
| "loss": 0.3911, |
| "step": 10410 |
| }, |
| { |
| "epoch": 2.507221954742417, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.406740145924852e-05, |
| "loss": 0.3934, |
| "step": 10415 |
| }, |
| { |
| "epoch": 2.5084256138661534, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.4025048224217864e-05, |
| "loss": 0.4006, |
| "step": 10420 |
| }, |
| { |
| "epoch": 2.5096292729898892, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.3982789154137016e-05, |
| "loss": 0.4099, |
| "step": 10425 |
| }, |
| { |
| "epoch": 2.5108329321136256, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.394062431652516e-05, |
| "loss": 0.4089, |
| "step": 10430 |
| }, |
| { |
| "epoch": 2.5120365912373614, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.389855377875087e-05, |
| "loss": 0.3837, |
| "step": 10435 |
| }, |
| { |
| "epoch": 2.5132402503610978, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.3856577608032104e-05, |
| "loss": 0.38, |
| "step": 10440 |
| }, |
| { |
| "epoch": 2.514443909484834, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.381469587143603e-05, |
| "loss": 0.414, |
| "step": 10445 |
| }, |
| { |
| "epoch": 2.51564756860857, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.37729086358789e-05, |
| "loss": 0.3798, |
| "step": 10450 |
| }, |
| { |
| "epoch": 2.5168512277323063, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.373121596812603e-05, |
| "loss": 0.3993, |
| "step": 10455 |
| }, |
| { |
| "epoch": 2.518054886856042, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.3689617934791586e-05, |
| "loss": 0.3905, |
| "step": 10460 |
| }, |
| { |
| "epoch": 2.5192585459797785, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.364811460233859e-05, |
| "loss": 0.396, |
| "step": 10465 |
| }, |
| { |
| "epoch": 2.520462205103515, |
| "grad_norm": 1.0703125, |
| "learning_rate": 5.3606706037078704e-05, |
| "loss": 0.3868, |
| "step": 10470 |
| }, |
| { |
| "epoch": 2.5216658642272507, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.356539230517222e-05, |
| "loss": 0.4062, |
| "step": 10475 |
| }, |
| { |
| "epoch": 2.522869523350987, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.352417347262789e-05, |
| "loss": 0.3789, |
| "step": 10480 |
| }, |
| { |
| "epoch": 2.524073182474723, |
| "grad_norm": 1.125, |
| "learning_rate": 5.3483049605302805e-05, |
| "loss": 0.386, |
| "step": 10485 |
| }, |
| { |
| "epoch": 2.525276841598459, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.34420207689024e-05, |
| "loss": 0.381, |
| "step": 10490 |
| }, |
| { |
| "epoch": 2.5264805007221955, |
| "grad_norm": 1.078125, |
| "learning_rate": 5.340108702898024e-05, |
| "loss": 0.3819, |
| "step": 10495 |
| }, |
| { |
| "epoch": 2.527684159845932, |
| "grad_norm": 1.0390625, |
| "learning_rate": 5.336024845093793e-05, |
| "loss": 0.3858, |
| "step": 10500 |
| }, |
| { |
| "epoch": 2.527684159845932, |
| "eval_loss": 0.36653754115104675, |
| "eval_runtime": 2.3578, |
| "eval_samples_per_second": 84.826, |
| "eval_steps_per_second": 84.826, |
| "step": 10500 |
| }, |
| { |
| "epoch": 2.5288878189696677, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.331950510002507e-05, |
| "loss": 0.3956, |
| "step": 10505 |
| }, |
| { |
| "epoch": 2.530091478093404, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.327885704133906e-05, |
| "loss": 0.4002, |
| "step": 10510 |
| }, |
| { |
| "epoch": 2.53129513721714, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.3238304339825105e-05, |
| "loss": 0.3865, |
| "step": 10515 |
| }, |
| { |
| "epoch": 2.5324987963408763, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.319784706027602e-05, |
| "loss": 0.3823, |
| "step": 10520 |
| }, |
| { |
| "epoch": 2.5337024554646126, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.3157485267332136e-05, |
| "loss": 0.3943, |
| "step": 10525 |
| }, |
| { |
| "epoch": 2.5349061145883485, |
| "grad_norm": 1.125, |
| "learning_rate": 5.31172190254813e-05, |
| "loss": 0.3661, |
| "step": 10530 |
| }, |
| { |
| "epoch": 2.536109773712085, |
| "grad_norm": 1.1015625, |
| "learning_rate": 5.30770483990586e-05, |
| "loss": 0.3944, |
| "step": 10535 |
| }, |
| { |
| "epoch": 2.5373134328358207, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.3036973452246435e-05, |
| "loss": 0.3947, |
| "step": 10540 |
| }, |
| { |
| "epoch": 2.538517091959557, |
| "grad_norm": 1.125, |
| "learning_rate": 5.299699424907428e-05, |
| "loss": 0.3935, |
| "step": 10545 |
| }, |
| { |
| "epoch": 2.5397207510832933, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.295711085341864e-05, |
| "loss": 0.3934, |
| "step": 10550 |
| }, |
| { |
| "epoch": 2.5409244102070296, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.2917323329003e-05, |
| "loss": 0.4008, |
| "step": 10555 |
| }, |
| { |
| "epoch": 2.5421280693307655, |
| "grad_norm": 1.3359375, |
| "learning_rate": 5.2877631739397565e-05, |
| "loss": 0.3874, |
| "step": 10560 |
| }, |
| { |
| "epoch": 2.543331728454502, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.2838036148019377e-05, |
| "loss": 0.3848, |
| "step": 10565 |
| }, |
| { |
| "epoch": 2.5445353875782377, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.279853661813198e-05, |
| "loss": 0.3887, |
| "step": 10570 |
| }, |
| { |
| "epoch": 2.545739046701974, |
| "grad_norm": 1.125, |
| "learning_rate": 5.2759133212845574e-05, |
| "loss": 0.3643, |
| "step": 10575 |
| }, |
| { |
| "epoch": 2.5469427058257104, |
| "grad_norm": 1.0703125, |
| "learning_rate": 5.271982599511664e-05, |
| "loss": 0.4055, |
| "step": 10580 |
| }, |
| { |
| "epoch": 2.5481463649494462, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.268061502774804e-05, |
| "loss": 0.4327, |
| "step": 10585 |
| }, |
| { |
| "epoch": 2.5493500240731826, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.2641500373388875e-05, |
| "loss": 0.3987, |
| "step": 10590 |
| }, |
| { |
| "epoch": 2.5505536831969184, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.2602482094534305e-05, |
| "loss": 0.4275, |
| "step": 10595 |
| }, |
| { |
| "epoch": 2.5517573423206548, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.2563560253525564e-05, |
| "loss": 0.3902, |
| "step": 10600 |
| }, |
| { |
| "epoch": 2.552961001444391, |
| "grad_norm": 1.328125, |
| "learning_rate": 5.2524734912549775e-05, |
| "loss": 0.4187, |
| "step": 10605 |
| }, |
| { |
| "epoch": 2.554164660568127, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.248600613363985e-05, |
| "loss": 0.3815, |
| "step": 10610 |
| }, |
| { |
| "epoch": 2.5553683196918633, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.244737397867449e-05, |
| "loss": 0.3901, |
| "step": 10615 |
| }, |
| { |
| "epoch": 2.556571978815599, |
| "grad_norm": 1.125, |
| "learning_rate": 5.2408838509377936e-05, |
| "loss": 0.3754, |
| "step": 10620 |
| }, |
| { |
| "epoch": 2.5577756379393355, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.237039978732003e-05, |
| "loss": 0.3847, |
| "step": 10625 |
| }, |
| { |
| "epoch": 2.558979297063072, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.233205787391596e-05, |
| "loss": 0.4114, |
| "step": 10630 |
| }, |
| { |
| "epoch": 2.560182956186808, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.229381283042629e-05, |
| "loss": 0.4031, |
| "step": 10635 |
| }, |
| { |
| "epoch": 2.561386615310544, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.2255664717956805e-05, |
| "loss": 0.3975, |
| "step": 10640 |
| }, |
| { |
| "epoch": 2.5625902744342803, |
| "grad_norm": 1.0625, |
| "learning_rate": 5.2217613597458384e-05, |
| "loss": 0.4185, |
| "step": 10645 |
| }, |
| { |
| "epoch": 2.563793933558016, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.217965952972697e-05, |
| "loss": 0.3871, |
| "step": 10650 |
| }, |
| { |
| "epoch": 2.5649975926817525, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.214180257540346e-05, |
| "loss": 0.3932, |
| "step": 10655 |
| }, |
| { |
| "epoch": 2.566201251805489, |
| "grad_norm": 1.0625, |
| "learning_rate": 5.210404279497353e-05, |
| "loss": 0.3969, |
| "step": 10660 |
| }, |
| { |
| "epoch": 2.5674049109292247, |
| "grad_norm": 1.21875, |
| "learning_rate": 5.206638024876766e-05, |
| "loss": 0.4153, |
| "step": 10665 |
| }, |
| { |
| "epoch": 2.568608570052961, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.202881499696091e-05, |
| "loss": 0.3996, |
| "step": 10670 |
| }, |
| { |
| "epoch": 2.569812229176697, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.199134709957295e-05, |
| "loss": 0.3858, |
| "step": 10675 |
| }, |
| { |
| "epoch": 2.5710158883004333, |
| "grad_norm": 1.125, |
| "learning_rate": 5.195397661646787e-05, |
| "loss": 0.3619, |
| "step": 10680 |
| }, |
| { |
| "epoch": 2.5722195474241696, |
| "grad_norm": 1.234375, |
| "learning_rate": 5.191670360735409e-05, |
| "loss": 0.3843, |
| "step": 10685 |
| }, |
| { |
| "epoch": 2.573423206547906, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.187952813178438e-05, |
| "loss": 0.4048, |
| "step": 10690 |
| }, |
| { |
| "epoch": 2.574626865671642, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.184245024915557e-05, |
| "loss": 0.4132, |
| "step": 10695 |
| }, |
| { |
| "epoch": 2.575830524795378, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.1805470018708635e-05, |
| "loss": 0.3826, |
| "step": 10700 |
| }, |
| { |
| "epoch": 2.577034183919114, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.176858749952851e-05, |
| "loss": 0.3977, |
| "step": 10705 |
| }, |
| { |
| "epoch": 2.5782378430428503, |
| "grad_norm": 1.2578125, |
| "learning_rate": 5.1731802750543986e-05, |
| "loss": 0.4028, |
| "step": 10710 |
| }, |
| { |
| "epoch": 2.5794415021665866, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.169511583052767e-05, |
| "loss": 0.4002, |
| "step": 10715 |
| }, |
| { |
| "epoch": 2.5806451612903225, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.165852679809585e-05, |
| "loss": 0.4226, |
| "step": 10720 |
| }, |
| { |
| "epoch": 2.581848820414059, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.1622035711708453e-05, |
| "loss": 0.3834, |
| "step": 10725 |
| }, |
| { |
| "epoch": 2.5830524795377947, |
| "grad_norm": 1.0703125, |
| "learning_rate": 5.158564262966883e-05, |
| "loss": 0.4073, |
| "step": 10730 |
| }, |
| { |
| "epoch": 2.584256138661531, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.154934761012382e-05, |
| "loss": 0.4149, |
| "step": 10735 |
| }, |
| { |
| "epoch": 2.5854597977852674, |
| "grad_norm": 1.15625, |
| "learning_rate": 5.151315071106359e-05, |
| "loss": 0.3749, |
| "step": 10740 |
| }, |
| { |
| "epoch": 2.5866634569090032, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.147705199032145e-05, |
| "loss": 0.4127, |
| "step": 10745 |
| }, |
| { |
| "epoch": 2.5878671160327396, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.144105150557392e-05, |
| "loss": 0.3747, |
| "step": 10750 |
| }, |
| { |
| "epoch": 2.5890707751564754, |
| "grad_norm": 1.0546875, |
| "learning_rate": 5.1405149314340575e-05, |
| "loss": 0.399, |
| "step": 10755 |
| }, |
| { |
| "epoch": 2.5902744342802118, |
| "grad_norm": 1.2109375, |
| "learning_rate": 5.136934547398388e-05, |
| "loss": 0.3917, |
| "step": 10760 |
| }, |
| { |
| "epoch": 2.591478093403948, |
| "grad_norm": 1.2265625, |
| "learning_rate": 5.133364004170922e-05, |
| "loss": 0.3636, |
| "step": 10765 |
| }, |
| { |
| "epoch": 2.5926817525276844, |
| "grad_norm": 1.25, |
| "learning_rate": 5.129803307456468e-05, |
| "loss": 0.3905, |
| "step": 10770 |
| }, |
| { |
| "epoch": 2.5938854116514203, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.126252462944111e-05, |
| "loss": 0.3639, |
| "step": 10775 |
| }, |
| { |
| "epoch": 2.5950890707751566, |
| "grad_norm": 1.2890625, |
| "learning_rate": 5.12271147630719e-05, |
| "loss": 0.425, |
| "step": 10780 |
| }, |
| { |
| "epoch": 2.5962927298988925, |
| "grad_norm": 1.125, |
| "learning_rate": 5.1191803532032915e-05, |
| "loss": 0.3718, |
| "step": 10785 |
| }, |
| { |
| "epoch": 2.597496389022629, |
| "grad_norm": 1.1796875, |
| "learning_rate": 5.1156590992742465e-05, |
| "loss": 0.4002, |
| "step": 10790 |
| }, |
| { |
| "epoch": 2.598700048146365, |
| "grad_norm": 1.0625, |
| "learning_rate": 5.112147720146115e-05, |
| "loss": 0.3949, |
| "step": 10795 |
| }, |
| { |
| "epoch": 2.599903707270101, |
| "grad_norm": 1.1171875, |
| "learning_rate": 5.1086462214291816e-05, |
| "loss": 0.3942, |
| "step": 10800 |
| }, |
| { |
| "epoch": 2.6011073663938373, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.1051546087179426e-05, |
| "loss": 0.3761, |
| "step": 10805 |
| }, |
| { |
| "epoch": 2.602311025517573, |
| "grad_norm": 1.1953125, |
| "learning_rate": 5.101672887591101e-05, |
| "loss": 0.409, |
| "step": 10810 |
| }, |
| { |
| "epoch": 2.6035146846413095, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.0982010636115545e-05, |
| "loss": 0.3918, |
| "step": 10815 |
| }, |
| { |
| "epoch": 2.604718343765046, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.094739142326389e-05, |
| "loss": 0.3767, |
| "step": 10820 |
| }, |
| { |
| "epoch": 2.605922002888782, |
| "grad_norm": 1.046875, |
| "learning_rate": 5.0912871292668646e-05, |
| "loss": 0.4152, |
| "step": 10825 |
| }, |
| { |
| "epoch": 2.607125662012518, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.087845029948413e-05, |
| "loss": 0.4117, |
| "step": 10830 |
| }, |
| { |
| "epoch": 2.6083293211362544, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.0844128498706314e-05, |
| "loss": 0.4213, |
| "step": 10835 |
| }, |
| { |
| "epoch": 2.6095329802599903, |
| "grad_norm": 1.171875, |
| "learning_rate": 5.08099059451726e-05, |
| "loss": 0.3752, |
| "step": 10840 |
| }, |
| { |
| "epoch": 2.6107366393837266, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.077578269356184e-05, |
| "loss": 0.408, |
| "step": 10845 |
| }, |
| { |
| "epoch": 2.611940298507463, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.0741758798394284e-05, |
| "loss": 0.4146, |
| "step": 10850 |
| }, |
| { |
| "epoch": 2.6131439576311988, |
| "grad_norm": 1.125, |
| "learning_rate": 5.070783431403136e-05, |
| "loss": 0.399, |
| "step": 10855 |
| }, |
| { |
| "epoch": 2.614347616754935, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.06740092946757e-05, |
| "loss": 0.402, |
| "step": 10860 |
| }, |
| { |
| "epoch": 2.615551275878671, |
| "grad_norm": 1.0625, |
| "learning_rate": 5.064028379437105e-05, |
| "loss": 0.4006, |
| "step": 10865 |
| }, |
| { |
| "epoch": 2.6167549350024073, |
| "grad_norm": 1.125, |
| "learning_rate": 5.060665786700206e-05, |
| "loss": 0.3777, |
| "step": 10870 |
| }, |
| { |
| "epoch": 2.6179585941261436, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.057313156629439e-05, |
| "loss": 0.4108, |
| "step": 10875 |
| }, |
| { |
| "epoch": 2.6191622532498795, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.053970494581444e-05, |
| "loss": 0.4123, |
| "step": 10880 |
| }, |
| { |
| "epoch": 2.620365912373616, |
| "grad_norm": 1.2421875, |
| "learning_rate": 5.05063780589694e-05, |
| "loss": 0.4031, |
| "step": 10885 |
| }, |
| { |
| "epoch": 2.6215695714973517, |
| "grad_norm": 1.140625, |
| "learning_rate": 5.04731509590071e-05, |
| "loss": 0.383, |
| "step": 10890 |
| }, |
| { |
| "epoch": 2.622773230621088, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.0440023699015906e-05, |
| "loss": 0.3954, |
| "step": 10895 |
| }, |
| { |
| "epoch": 2.6239768897448243, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.040699633192469e-05, |
| "loss": 0.3613, |
| "step": 10900 |
| }, |
| { |
| "epoch": 2.6251805488685607, |
| "grad_norm": 1.109375, |
| "learning_rate": 5.037406891050272e-05, |
| "loss": 0.3987, |
| "step": 10905 |
| }, |
| { |
| "epoch": 2.6263842079922965, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.0341241487359573e-05, |
| "loss": 0.4063, |
| "step": 10910 |
| }, |
| { |
| "epoch": 2.627587867116033, |
| "grad_norm": 1.1328125, |
| "learning_rate": 5.0308514114945074e-05, |
| "loss": 0.383, |
| "step": 10915 |
| }, |
| { |
| "epoch": 2.6287915262397687, |
| "grad_norm": 1.203125, |
| "learning_rate": 5.0275886845549155e-05, |
| "loss": 0.3873, |
| "step": 10920 |
| }, |
| { |
| "epoch": 2.629995185363505, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.0243359731301835e-05, |
| "loss": 0.3911, |
| "step": 10925 |
| }, |
| { |
| "epoch": 2.6311988444872414, |
| "grad_norm": 1.1015625, |
| "learning_rate": 5.0210932824173105e-05, |
| "loss": 0.3908, |
| "step": 10930 |
| }, |
| { |
| "epoch": 2.6324025036109773, |
| "grad_norm": 1.1875, |
| "learning_rate": 5.0178606175972834e-05, |
| "loss": 0.4026, |
| "step": 10935 |
| }, |
| { |
| "epoch": 2.6336061627347136, |
| "grad_norm": 1.0859375, |
| "learning_rate": 5.0146379838350745e-05, |
| "loss": 0.4183, |
| "step": 10940 |
| }, |
| { |
| "epoch": 2.6348098218584495, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.011425386279626e-05, |
| "loss": 0.398, |
| "step": 10945 |
| }, |
| { |
| "epoch": 2.636013480982186, |
| "grad_norm": 1.1484375, |
| "learning_rate": 5.0082228300638444e-05, |
| "loss": 0.4177, |
| "step": 10950 |
| }, |
| { |
| "epoch": 2.637217140105922, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.005030320304591e-05, |
| "loss": 0.3913, |
| "step": 10955 |
| }, |
| { |
| "epoch": 2.638420799229658, |
| "grad_norm": 1.1640625, |
| "learning_rate": 5.001847862102683e-05, |
| "loss": 0.3883, |
| "step": 10960 |
| }, |
| { |
| "epoch": 2.6396244583533943, |
| "grad_norm": 1.265625, |
| "learning_rate": 4.9986754605428706e-05, |
| "loss": 0.4112, |
| "step": 10965 |
| }, |
| { |
| "epoch": 2.64082811747713, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.9955131206938366e-05, |
| "loss": 0.4387, |
| "step": 10970 |
| }, |
| { |
| "epoch": 2.6420317766008665, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.9923608476081915e-05, |
| "loss": 0.3736, |
| "step": 10975 |
| }, |
| { |
| "epoch": 2.643235435724603, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.9892186463224584e-05, |
| "loss": 0.3938, |
| "step": 10980 |
| }, |
| { |
| "epoch": 2.644439094848339, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.98608652185707e-05, |
| "loss": 0.3771, |
| "step": 10985 |
| }, |
| { |
| "epoch": 2.645642753972075, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.982964479216359e-05, |
| "loss": 0.4309, |
| "step": 10990 |
| }, |
| { |
| "epoch": 2.6468464130958114, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.9798525233885485e-05, |
| "loss": 0.4135, |
| "step": 10995 |
| }, |
| { |
| "epoch": 2.6480500722195472, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.9767506593457463e-05, |
| "loss": 0.4049, |
| "step": 11000 |
| }, |
| { |
| "epoch": 2.6480500722195472, |
| "eval_loss": 0.3635382056236267, |
| "eval_runtime": 2.3559, |
| "eval_samples_per_second": 84.894, |
| "eval_steps_per_second": 84.894, |
| "step": 11000 |
| }, |
| { |
| "epoch": 2.6492537313432836, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.973658892043935e-05, |
| "loss": 0.3901, |
| "step": 11005 |
| }, |
| { |
| "epoch": 2.65045739046702, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.9705772264229656e-05, |
| "loss": 0.3768, |
| "step": 11010 |
| }, |
| { |
| "epoch": 2.6516610495907558, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.9675056674065534e-05, |
| "loss": 0.4015, |
| "step": 11015 |
| }, |
| { |
| "epoch": 2.652864708714492, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.964444219902259e-05, |
| "loss": 0.4087, |
| "step": 11020 |
| }, |
| { |
| "epoch": 2.654068367838228, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.9613928888014915e-05, |
| "loss": 0.4151, |
| "step": 11025 |
| }, |
| { |
| "epoch": 2.6552720269619643, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.958351678979496e-05, |
| "loss": 0.3746, |
| "step": 11030 |
| }, |
| { |
| "epoch": 2.6564756860857006, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.955320595295347e-05, |
| "loss": 0.3983, |
| "step": 11035 |
| }, |
| { |
| "epoch": 2.657679345209437, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.952299642591937e-05, |
| "loss": 0.3974, |
| "step": 11040 |
| }, |
| { |
| "epoch": 2.658883004333173, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.949288825695975e-05, |
| "loss": 0.3775, |
| "step": 11045 |
| }, |
| { |
| "epoch": 2.660086663456909, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.946288149417977e-05, |
| "loss": 0.3934, |
| "step": 11050 |
| }, |
| { |
| "epoch": 2.661290322580645, |
| "grad_norm": 1.125, |
| "learning_rate": 4.943297618552253e-05, |
| "loss": 0.3777, |
| "step": 11055 |
| }, |
| { |
| "epoch": 2.6624939817043813, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.9403172378769016e-05, |
| "loss": 0.4239, |
| "step": 11060 |
| }, |
| { |
| "epoch": 2.6636976408281177, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.9373470121538103e-05, |
| "loss": 0.3757, |
| "step": 11065 |
| }, |
| { |
| "epoch": 2.6649012999518535, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.934386946128637e-05, |
| "loss": 0.4289, |
| "step": 11070 |
| }, |
| { |
| "epoch": 2.66610495907559, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.9314370445308105e-05, |
| "loss": 0.3909, |
| "step": 11075 |
| }, |
| { |
| "epoch": 2.6673086181993257, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.928497312073515e-05, |
| "loss": 0.4065, |
| "step": 11080 |
| }, |
| { |
| "epoch": 2.668512277323062, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.9255677534536906e-05, |
| "loss": 0.4246, |
| "step": 11085 |
| }, |
| { |
| "epoch": 2.6697159364467984, |
| "grad_norm": 1.2890625, |
| "learning_rate": 4.92264837335202e-05, |
| "loss": 0.4447, |
| "step": 11090 |
| }, |
| { |
| "epoch": 2.6709195955705343, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.9197391764329246e-05, |
| "loss": 0.3842, |
| "step": 11095 |
| }, |
| { |
| "epoch": 2.6721232546942706, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.916840167344557e-05, |
| "loss": 0.3733, |
| "step": 11100 |
| }, |
| { |
| "epoch": 2.6733269138180065, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.9139513507187894e-05, |
| "loss": 0.384, |
| "step": 11105 |
| }, |
| { |
| "epoch": 2.674530572941743, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.9110727311712086e-05, |
| "loss": 0.3897, |
| "step": 11110 |
| }, |
| { |
| "epoch": 2.675734232065479, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.9082043133011115e-05, |
| "loss": 0.4143, |
| "step": 11115 |
| }, |
| { |
| "epoch": 2.6769378911892154, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.9053461016914944e-05, |
| "loss": 0.4066, |
| "step": 11120 |
| }, |
| { |
| "epoch": 2.6781415503129513, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.9024981009090494e-05, |
| "loss": 0.3885, |
| "step": 11125 |
| }, |
| { |
| "epoch": 2.6793452094366876, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.899660315504149e-05, |
| "loss": 0.3929, |
| "step": 11130 |
| }, |
| { |
| "epoch": 2.6805488685604235, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.8968327500108476e-05, |
| "loss": 0.364, |
| "step": 11135 |
| }, |
| { |
| "epoch": 2.68175252768416, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.894015408946869e-05, |
| "loss": 0.3784, |
| "step": 11140 |
| }, |
| { |
| "epoch": 2.682956186807896, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.891208296813603e-05, |
| "loss": 0.3999, |
| "step": 11145 |
| }, |
| { |
| "epoch": 2.684159845931632, |
| "grad_norm": 3.09375, |
| "learning_rate": 4.8884114180960946e-05, |
| "loss": 0.3888, |
| "step": 11150 |
| }, |
| { |
| "epoch": 2.6853635050553684, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.885624777263042e-05, |
| "loss": 0.3868, |
| "step": 11155 |
| }, |
| { |
| "epoch": 2.6865671641791042, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.882848378766777e-05, |
| "loss": 0.3863, |
| "step": 11160 |
| }, |
| { |
| "epoch": 2.6877708233028406, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.880082227043278e-05, |
| "loss": 0.3974, |
| "step": 11165 |
| }, |
| { |
| "epoch": 2.688974482426577, |
| "grad_norm": 1.0234375, |
| "learning_rate": 4.8773263265121436e-05, |
| "loss": 0.41, |
| "step": 11170 |
| }, |
| { |
| "epoch": 2.690178141550313, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.874580681576597e-05, |
| "loss": 0.3997, |
| "step": 11175 |
| }, |
| { |
| "epoch": 2.691381800674049, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.871845296623475e-05, |
| "loss": 0.3755, |
| "step": 11180 |
| }, |
| { |
| "epoch": 2.6925854597977854, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.869120176023221e-05, |
| "loss": 0.3829, |
| "step": 11185 |
| }, |
| { |
| "epoch": 2.6937891189215213, |
| "grad_norm": 1.0234375, |
| "learning_rate": 4.866405324129881e-05, |
| "loss": 0.3873, |
| "step": 11190 |
| }, |
| { |
| "epoch": 2.6949927780452576, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.863700745281089e-05, |
| "loss": 0.4194, |
| "step": 11195 |
| }, |
| { |
| "epoch": 2.696196437168994, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.861006443798074e-05, |
| "loss": 0.3864, |
| "step": 11200 |
| }, |
| { |
| "epoch": 2.69740009629273, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.858322423985633e-05, |
| "loss": 0.3747, |
| "step": 11205 |
| }, |
| { |
| "epoch": 2.698603755416466, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.8556486901321454e-05, |
| "loss": 0.3988, |
| "step": 11210 |
| }, |
| { |
| "epoch": 2.699807414540202, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.852985246509552e-05, |
| "loss": 0.3403, |
| "step": 11215 |
| }, |
| { |
| "epoch": 2.7010110736639383, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.850332097373351e-05, |
| "loss": 0.3799, |
| "step": 11220 |
| }, |
| { |
| "epoch": 2.7022147327876747, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.8476892469625974e-05, |
| "loss": 0.358, |
| "step": 11225 |
| }, |
| { |
| "epoch": 2.7034183919114105, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.845056699499886e-05, |
| "loss": 0.3873, |
| "step": 11230 |
| }, |
| { |
| "epoch": 2.704622051035147, |
| "grad_norm": 1.125, |
| "learning_rate": 4.842434459191356e-05, |
| "loss": 0.3851, |
| "step": 11235 |
| }, |
| { |
| "epoch": 2.7058257101588827, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.839822530226671e-05, |
| "loss": 0.411, |
| "step": 11240 |
| }, |
| { |
| "epoch": 2.707029369282619, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.8372209167790256e-05, |
| "loss": 0.388, |
| "step": 11245 |
| }, |
| { |
| "epoch": 2.7082330284063554, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.834629623005135e-05, |
| "loss": 0.4201, |
| "step": 11250 |
| }, |
| { |
| "epoch": 2.7094366875300917, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.8320486530452155e-05, |
| "loss": 0.3806, |
| "step": 11255 |
| }, |
| { |
| "epoch": 2.7106403466538276, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.8294780110230024e-05, |
| "loss": 0.3789, |
| "step": 11260 |
| }, |
| { |
| "epoch": 2.711844005777564, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.826917701045717e-05, |
| "loss": 0.38, |
| "step": 11265 |
| }, |
| { |
| "epoch": 2.7130476649013, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.8243677272040824e-05, |
| "loss": 0.3812, |
| "step": 11270 |
| }, |
| { |
| "epoch": 2.714251324025036, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.8218280935723e-05, |
| "loss": 0.4132, |
| "step": 11275 |
| }, |
| { |
| "epoch": 2.7154549831487724, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.8192988042080534e-05, |
| "loss": 0.3824, |
| "step": 11280 |
| }, |
| { |
| "epoch": 2.7166586422725083, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.8167798631525015e-05, |
| "loss": 0.383, |
| "step": 11285 |
| }, |
| { |
| "epoch": 2.7178623013962446, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.814271274430264e-05, |
| "loss": 0.3768, |
| "step": 11290 |
| }, |
| { |
| "epoch": 2.7190659605199805, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.811773042049425e-05, |
| "loss": 0.4089, |
| "step": 11295 |
| }, |
| { |
| "epoch": 2.720269619643717, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.809285170001515e-05, |
| "loss": 0.3842, |
| "step": 11300 |
| }, |
| { |
| "epoch": 2.721473278767453, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.80680766226152e-05, |
| "loss": 0.4126, |
| "step": 11305 |
| }, |
| { |
| "epoch": 2.7226769378911895, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.8043405227878604e-05, |
| "loss": 0.3973, |
| "step": 11310 |
| }, |
| { |
| "epoch": 2.7238805970149254, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.8018837555223934e-05, |
| "loss": 0.3732, |
| "step": 11315 |
| }, |
| { |
| "epoch": 2.7250842561386617, |
| "grad_norm": 1.3125, |
| "learning_rate": 4.7994373643904027e-05, |
| "loss": 0.3947, |
| "step": 11320 |
| }, |
| { |
| "epoch": 2.7262879152623976, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.797001353300594e-05, |
| "loss": 0.3856, |
| "step": 11325 |
| }, |
| { |
| "epoch": 2.727491574386134, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.794575726145088e-05, |
| "loss": 0.3788, |
| "step": 11330 |
| }, |
| { |
| "epoch": 2.72869523350987, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.7921604867994177e-05, |
| "loss": 0.4013, |
| "step": 11335 |
| }, |
| { |
| "epoch": 2.729898892633606, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.789755639122513e-05, |
| "loss": 0.3841, |
| "step": 11340 |
| }, |
| { |
| "epoch": 2.7311025517573424, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.787361186956707e-05, |
| "loss": 0.3777, |
| "step": 11345 |
| }, |
| { |
| "epoch": 2.7323062108810783, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.784977134127719e-05, |
| "loss": 0.3689, |
| "step": 11350 |
| }, |
| { |
| "epoch": 2.7335098700048146, |
| "grad_norm": 1.125, |
| "learning_rate": 4.782603484444653e-05, |
| "loss": 0.3712, |
| "step": 11355 |
| }, |
| { |
| "epoch": 2.734713529128551, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.780240241699996e-05, |
| "loss": 0.3911, |
| "step": 11360 |
| }, |
| { |
| "epoch": 2.735917188252287, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.777887409669601e-05, |
| "loss": 0.4111, |
| "step": 11365 |
| }, |
| { |
| "epoch": 2.737120847376023, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.775544992112692e-05, |
| "loss": 0.3969, |
| "step": 11370 |
| }, |
| { |
| "epoch": 2.738324506499759, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.773212992771851e-05, |
| "loss": 0.4193, |
| "step": 11375 |
| }, |
| { |
| "epoch": 2.7395281656234953, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.7708914153730156e-05, |
| "loss": 0.3762, |
| "step": 11380 |
| }, |
| { |
| "epoch": 2.7407318247472316, |
| "grad_norm": 1.125, |
| "learning_rate": 4.768580263625472e-05, |
| "loss": 0.3743, |
| "step": 11385 |
| }, |
| { |
| "epoch": 2.741935483870968, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.7662795412218464e-05, |
| "loss": 0.4056, |
| "step": 11390 |
| }, |
| { |
| "epoch": 2.743139142994704, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.7639892518381056e-05, |
| "loss": 0.3928, |
| "step": 11395 |
| }, |
| { |
| "epoch": 2.74434280211844, |
| "grad_norm": 1.125, |
| "learning_rate": 4.761709399133542e-05, |
| "loss": 0.3929, |
| "step": 11400 |
| }, |
| { |
| "epoch": 2.745546461242176, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.759439986750778e-05, |
| "loss": 0.4081, |
| "step": 11405 |
| }, |
| { |
| "epoch": 2.7467501203659124, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.757181018315753e-05, |
| "loss": 0.4217, |
| "step": 11410 |
| }, |
| { |
| "epoch": 2.7479537794896487, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.754932497437718e-05, |
| "loss": 0.3866, |
| "step": 11415 |
| }, |
| { |
| "epoch": 2.7491574386133846, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.7526944277092356e-05, |
| "loss": 0.4032, |
| "step": 11420 |
| }, |
| { |
| "epoch": 2.750361097737121, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.750466812706165e-05, |
| "loss": 0.3796, |
| "step": 11425 |
| }, |
| { |
| "epoch": 2.7515647568608568, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.7482496559876636e-05, |
| "loss": 0.4032, |
| "step": 11430 |
| }, |
| { |
| "epoch": 2.752768415984593, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.746042961096181e-05, |
| "loss": 0.4044, |
| "step": 11435 |
| }, |
| { |
| "epoch": 2.7539720751083294, |
| "grad_norm": 1.125, |
| "learning_rate": 4.743846731557449e-05, |
| "loss": 0.3798, |
| "step": 11440 |
| }, |
| { |
| "epoch": 2.7551757342320657, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.7416609708804815e-05, |
| "loss": 0.393, |
| "step": 11445 |
| }, |
| { |
| "epoch": 2.7563793933558016, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.7394856825575606e-05, |
| "loss": 0.4015, |
| "step": 11450 |
| }, |
| { |
| "epoch": 2.757583052479538, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.7373208700642395e-05, |
| "loss": 0.3922, |
| "step": 11455 |
| }, |
| { |
| "epoch": 2.758786711603274, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.7351665368593374e-05, |
| "loss": 0.3676, |
| "step": 11460 |
| }, |
| { |
| "epoch": 2.75999037072701, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.733022686384922e-05, |
| "loss": 0.3733, |
| "step": 11465 |
| }, |
| { |
| "epoch": 2.7611940298507465, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.730889322066321e-05, |
| "loss": 0.3956, |
| "step": 11470 |
| }, |
| { |
| "epoch": 2.7623976889744823, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.7287664473121e-05, |
| "loss": 0.3956, |
| "step": 11475 |
| }, |
| { |
| "epoch": 2.7636013480982187, |
| "grad_norm": 1.0390625, |
| "learning_rate": 4.726654065514071e-05, |
| "loss": 0.3852, |
| "step": 11480 |
| }, |
| { |
| "epoch": 2.7648050072219545, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.7245521800472784e-05, |
| "loss": 0.3916, |
| "step": 11485 |
| }, |
| { |
| "epoch": 2.766008666345691, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.722460794269993e-05, |
| "loss": 0.4224, |
| "step": 11490 |
| }, |
| { |
| "epoch": 2.767212325469427, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.720379911523719e-05, |
| "loss": 0.376, |
| "step": 11495 |
| }, |
| { |
| "epoch": 2.768415984593163, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.718309535133169e-05, |
| "loss": 0.3794, |
| "step": 11500 |
| }, |
| { |
| "epoch": 2.768415984593163, |
| "eval_loss": 0.3630373477935791, |
| "eval_runtime": 2.3435, |
| "eval_samples_per_second": 85.344, |
| "eval_steps_per_second": 85.344, |
| "step": 11500 |
| }, |
| { |
| "epoch": 2.7696196437168994, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.716249668406274e-05, |
| "loss": 0.3653, |
| "step": 11505 |
| }, |
| { |
| "epoch": 2.7708233028406353, |
| "grad_norm": 1.125, |
| "learning_rate": 4.714200314634176e-05, |
| "loss": 0.3893, |
| "step": 11510 |
| }, |
| { |
| "epoch": 2.7720269619643716, |
| "grad_norm": 1.28125, |
| "learning_rate": 4.7121614770912134e-05, |
| "loss": 0.3769, |
| "step": 11515 |
| }, |
| { |
| "epoch": 2.773230621088108, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.7101331590349296e-05, |
| "loss": 0.3879, |
| "step": 11520 |
| }, |
| { |
| "epoch": 2.7744342802118442, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.708115363706054e-05, |
| "loss": 0.3831, |
| "step": 11525 |
| }, |
| { |
| "epoch": 2.77563793933558, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.706108094328506e-05, |
| "loss": 0.3808, |
| "step": 11530 |
| }, |
| { |
| "epoch": 2.7768415984593164, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.704111354109391e-05, |
| "loss": 0.3977, |
| "step": 11535 |
| }, |
| { |
| "epoch": 2.7780452575830523, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.702125146238985e-05, |
| "loss": 0.4004, |
| "step": 11540 |
| }, |
| { |
| "epoch": 2.7792489167067886, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.700149473890739e-05, |
| "loss": 0.3839, |
| "step": 11545 |
| }, |
| { |
| "epoch": 2.780452575830525, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.6981843402212716e-05, |
| "loss": 0.407, |
| "step": 11550 |
| }, |
| { |
| "epoch": 2.781656234954261, |
| "grad_norm": 1.28125, |
| "learning_rate": 4.6962297483703634e-05, |
| "loss": 0.3937, |
| "step": 11555 |
| }, |
| { |
| "epoch": 2.782859894077997, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.69428570146095e-05, |
| "loss": 0.4046, |
| "step": 11560 |
| }, |
| { |
| "epoch": 2.784063553201733, |
| "grad_norm": 0.99609375, |
| "learning_rate": 4.69235220259912e-05, |
| "loss": 0.3875, |
| "step": 11565 |
| }, |
| { |
| "epoch": 2.7852672123254694, |
| "grad_norm": 1.125, |
| "learning_rate": 4.690429254874111e-05, |
| "loss": 0.3818, |
| "step": 11570 |
| }, |
| { |
| "epoch": 2.7864708714492057, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.6885168613582965e-05, |
| "loss": 0.3988, |
| "step": 11575 |
| }, |
| { |
| "epoch": 2.787674530572942, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.6866150251071955e-05, |
| "loss": 0.3745, |
| "step": 11580 |
| }, |
| { |
| "epoch": 2.788878189696678, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.684723749159452e-05, |
| "loss": 0.4021, |
| "step": 11585 |
| }, |
| { |
| "epoch": 2.790081848820414, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.68284303653684e-05, |
| "loss": 0.3919, |
| "step": 11590 |
| }, |
| { |
| "epoch": 2.79128550794415, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.680972890244258e-05, |
| "loss": 0.4028, |
| "step": 11595 |
| }, |
| { |
| "epoch": 2.7924891670678864, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.679113313269719e-05, |
| "loss": 0.3992, |
| "step": 11600 |
| }, |
| { |
| "epoch": 2.7936928261916227, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.677264308584351e-05, |
| "loss": 0.3931, |
| "step": 11605 |
| }, |
| { |
| "epoch": 2.7948964853153586, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.675425879142389e-05, |
| "loss": 0.3863, |
| "step": 11610 |
| }, |
| { |
| "epoch": 2.796100144439095, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.6735980278811716e-05, |
| "loss": 0.3739, |
| "step": 11615 |
| }, |
| { |
| "epoch": 2.797303803562831, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.671780757721139e-05, |
| "loss": 0.4002, |
| "step": 11620 |
| }, |
| { |
| "epoch": 2.798507462686567, |
| "grad_norm": 1.25, |
| "learning_rate": 4.6699740715658183e-05, |
| "loss": 0.4051, |
| "step": 11625 |
| }, |
| { |
| "epoch": 2.7997111218103035, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.668177972301835e-05, |
| "loss": 0.3831, |
| "step": 11630 |
| }, |
| { |
| "epoch": 2.8009147809340393, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.6663924627988944e-05, |
| "loss": 0.379, |
| "step": 11635 |
| }, |
| { |
| "epoch": 2.8021184400577757, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.664617545909782e-05, |
| "loss": 0.3973, |
| "step": 11640 |
| }, |
| { |
| "epoch": 2.8033220991815115, |
| "grad_norm": 1.125, |
| "learning_rate": 4.6628532244703614e-05, |
| "loss": 0.3647, |
| "step": 11645 |
| }, |
| { |
| "epoch": 2.804525758305248, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.661099501299563e-05, |
| "loss": 0.3971, |
| "step": 11650 |
| }, |
| { |
| "epoch": 2.805729417428984, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.6593563791993904e-05, |
| "loss": 0.395, |
| "step": 11655 |
| }, |
| { |
| "epoch": 2.8069330765527205, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.657623860954904e-05, |
| "loss": 0.3814, |
| "step": 11660 |
| }, |
| { |
| "epoch": 2.8081367356764564, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.655901949334225e-05, |
| "loss": 0.3839, |
| "step": 11665 |
| }, |
| { |
| "epoch": 2.8093403948001927, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.6541906470885245e-05, |
| "loss": 0.4111, |
| "step": 11670 |
| }, |
| { |
| "epoch": 2.8105440539239286, |
| "grad_norm": 1.125, |
| "learning_rate": 4.652489956952027e-05, |
| "loss": 0.4018, |
| "step": 11675 |
| }, |
| { |
| "epoch": 2.811747713047665, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.650799881641999e-05, |
| "loss": 0.3954, |
| "step": 11680 |
| }, |
| { |
| "epoch": 2.8129513721714012, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.6491204238587454e-05, |
| "loss": 0.4026, |
| "step": 11685 |
| }, |
| { |
| "epoch": 2.814155031295137, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.6474515862856124e-05, |
| "loss": 0.4223, |
| "step": 11690 |
| }, |
| { |
| "epoch": 2.8153586904188734, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.645793371588975e-05, |
| "loss": 0.3726, |
| "step": 11695 |
| }, |
| { |
| "epoch": 2.8165623495426093, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.6441457824182304e-05, |
| "loss": 0.4033, |
| "step": 11700 |
| }, |
| { |
| "epoch": 2.8177660086663456, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.6425088214058085e-05, |
| "loss": 0.3833, |
| "step": 11705 |
| }, |
| { |
| "epoch": 2.818969667790082, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.64088249116715e-05, |
| "loss": 0.4062, |
| "step": 11710 |
| }, |
| { |
| "epoch": 2.8201733269138183, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.639266794300718e-05, |
| "loss": 0.3728, |
| "step": 11715 |
| }, |
| { |
| "epoch": 2.821376986037554, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.6376617333879767e-05, |
| "loss": 0.3784, |
| "step": 11720 |
| }, |
| { |
| "epoch": 2.8225806451612905, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.6360673109934046e-05, |
| "loss": 0.3902, |
| "step": 11725 |
| }, |
| { |
| "epoch": 2.8237843042850264, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.6344835296644816e-05, |
| "loss": 0.3874, |
| "step": 11730 |
| }, |
| { |
| "epoch": 2.8249879634087627, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.632910391931681e-05, |
| "loss": 0.411, |
| "step": 11735 |
| }, |
| { |
| "epoch": 2.826191622532499, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.6313479003084764e-05, |
| "loss": 0.3941, |
| "step": 11740 |
| }, |
| { |
| "epoch": 2.827395281656235, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.629796057291328e-05, |
| "loss": 0.3811, |
| "step": 11745 |
| }, |
| { |
| "epoch": 2.828598940779971, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.62825486535968e-05, |
| "loss": 0.3855, |
| "step": 11750 |
| }, |
| { |
| "epoch": 2.829802599903707, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.626724326975967e-05, |
| "loss": 0.3715, |
| "step": 11755 |
| }, |
| { |
| "epoch": 2.8310062590274434, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.625204444585594e-05, |
| "loss": 0.3914, |
| "step": 11760 |
| }, |
| { |
| "epoch": 2.8322099181511797, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.6236952206169446e-05, |
| "loss": 0.404, |
| "step": 11765 |
| }, |
| { |
| "epoch": 2.8334135772749156, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.622196657481371e-05, |
| "loss": 0.4047, |
| "step": 11770 |
| }, |
| { |
| "epoch": 2.834617236398652, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.620708757573193e-05, |
| "loss": 0.388, |
| "step": 11775 |
| }, |
| { |
| "epoch": 2.835820895522388, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.619231523269695e-05, |
| "loss": 0.3735, |
| "step": 11780 |
| }, |
| { |
| "epoch": 2.837024554646124, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.617764956931115e-05, |
| "loss": 0.3948, |
| "step": 11785 |
| }, |
| { |
| "epoch": 2.8382282137698605, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.6163090609006536e-05, |
| "loss": 0.3964, |
| "step": 11790 |
| }, |
| { |
| "epoch": 2.8394318728935968, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.6148638375044584e-05, |
| "loss": 0.408, |
| "step": 11795 |
| }, |
| { |
| "epoch": 2.8406355320173327, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.613429289051625e-05, |
| "loss": 0.3863, |
| "step": 11800 |
| }, |
| { |
| "epoch": 2.841839191141069, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.6120054178341954e-05, |
| "loss": 0.3856, |
| "step": 11805 |
| }, |
| { |
| "epoch": 2.843042850264805, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.610592226127149e-05, |
| "loss": 0.3929, |
| "step": 11810 |
| }, |
| { |
| "epoch": 2.844246509388541, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.609189716188406e-05, |
| "loss": 0.3822, |
| "step": 11815 |
| }, |
| { |
| "epoch": 2.8454501685122775, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.607797890258817e-05, |
| "loss": 0.408, |
| "step": 11820 |
| }, |
| { |
| "epoch": 2.8466538276360134, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.6064167505621615e-05, |
| "loss": 0.449, |
| "step": 11825 |
| }, |
| { |
| "epoch": 2.8478574867597497, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.605046299305149e-05, |
| "loss": 0.3883, |
| "step": 11830 |
| }, |
| { |
| "epoch": 2.8490611458834856, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.603686538677408e-05, |
| "loss": 0.3782, |
| "step": 11835 |
| }, |
| { |
| "epoch": 2.850264805007222, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.602337470851487e-05, |
| "loss": 0.3858, |
| "step": 11840 |
| }, |
| { |
| "epoch": 2.8514684641309582, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.600999097982851e-05, |
| "loss": 0.3777, |
| "step": 11845 |
| }, |
| { |
| "epoch": 2.8526721232546945, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.5996714222098786e-05, |
| "loss": 0.3919, |
| "step": 11850 |
| }, |
| { |
| "epoch": 2.8538757823784304, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.598354445653853e-05, |
| "loss": 0.4115, |
| "step": 11855 |
| }, |
| { |
| "epoch": 2.8550794415021667, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.5970481704189654e-05, |
| "loss": 0.3761, |
| "step": 11860 |
| }, |
| { |
| "epoch": 2.8562831006259026, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.595752598592309e-05, |
| "loss": 0.3877, |
| "step": 11865 |
| }, |
| { |
| "epoch": 2.857486759749639, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.594467732243876e-05, |
| "loss": 0.3993, |
| "step": 11870 |
| }, |
| { |
| "epoch": 2.8586904188733753, |
| "grad_norm": 1.125, |
| "learning_rate": 4.593193573426552e-05, |
| "loss": 0.3994, |
| "step": 11875 |
| }, |
| { |
| "epoch": 2.859894077997111, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.591930124176118e-05, |
| "loss": 0.3997, |
| "step": 11880 |
| }, |
| { |
| "epoch": 2.8610977371208475, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.590677386511242e-05, |
| "loss": 0.4357, |
| "step": 11885 |
| }, |
| { |
| "epoch": 2.8623013962445834, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.589435362433476e-05, |
| "loss": 0.4057, |
| "step": 11890 |
| }, |
| { |
| "epoch": 2.8635050553683197, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.5882040539272574e-05, |
| "loss": 0.3927, |
| "step": 11895 |
| }, |
| { |
| "epoch": 2.864708714492056, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.5869834629599006e-05, |
| "loss": 0.4123, |
| "step": 11900 |
| }, |
| { |
| "epoch": 2.865912373615792, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.585773591481599e-05, |
| "loss": 0.3836, |
| "step": 11905 |
| }, |
| { |
| "epoch": 2.867116032739528, |
| "grad_norm": 1.09375, |
| "learning_rate": 4.5845744414254135e-05, |
| "loss": 0.3762, |
| "step": 11910 |
| }, |
| { |
| "epoch": 2.868319691863264, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.583386014707282e-05, |
| "loss": 0.3659, |
| "step": 11915 |
| }, |
| { |
| "epoch": 2.8695233509870004, |
| "grad_norm": 1.2578125, |
| "learning_rate": 4.582208313226003e-05, |
| "loss": 0.3944, |
| "step": 11920 |
| }, |
| { |
| "epoch": 2.8707270101107367, |
| "grad_norm": 1.125, |
| "learning_rate": 4.581041338863245e-05, |
| "loss": 0.3668, |
| "step": 11925 |
| }, |
| { |
| "epoch": 2.871930669234473, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.579885093483531e-05, |
| "loss": 0.3869, |
| "step": 11930 |
| }, |
| { |
| "epoch": 2.873134328358209, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.5787395789342436e-05, |
| "loss": 0.3973, |
| "step": 11935 |
| }, |
| { |
| "epoch": 2.8743379874819452, |
| "grad_norm": 1.0546875, |
| "learning_rate": 4.5776047970456265e-05, |
| "loss": 0.3618, |
| "step": 11940 |
| }, |
| { |
| "epoch": 2.875541646605681, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.576480749630765e-05, |
| "loss": 0.3859, |
| "step": 11945 |
| }, |
| { |
| "epoch": 2.8767453057294174, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.575367438485602e-05, |
| "loss": 0.3745, |
| "step": 11950 |
| }, |
| { |
| "epoch": 2.8779489648531538, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.5742648653889204e-05, |
| "loss": 0.356, |
| "step": 11955 |
| }, |
| { |
| "epoch": 2.8791526239768896, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.573173032102351e-05, |
| "loss": 0.4155, |
| "step": 11960 |
| }, |
| { |
| "epoch": 2.880356283100626, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.572091940370366e-05, |
| "loss": 0.3779, |
| "step": 11965 |
| }, |
| { |
| "epoch": 2.881559942224362, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.571021591920267e-05, |
| "loss": 0.4197, |
| "step": 11970 |
| }, |
| { |
| "epoch": 2.882763601348098, |
| "grad_norm": 1.2421875, |
| "learning_rate": 4.569961988462203e-05, |
| "loss": 0.3803, |
| "step": 11975 |
| }, |
| { |
| "epoch": 2.8839672604718345, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.568913131689144e-05, |
| "loss": 0.4214, |
| "step": 11980 |
| }, |
| { |
| "epoch": 2.8851709195955704, |
| "grad_norm": 1.015625, |
| "learning_rate": 4.5678750232768954e-05, |
| "loss": 0.3551, |
| "step": 11985 |
| }, |
| { |
| "epoch": 2.8863745787193067, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.56684766488409e-05, |
| "loss": 0.3932, |
| "step": 11990 |
| }, |
| { |
| "epoch": 2.887578237843043, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.565831058152179e-05, |
| "loss": 0.3653, |
| "step": 11995 |
| }, |
| { |
| "epoch": 2.888781896966779, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.564825204705445e-05, |
| "loss": 0.3705, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.888781896966779, |
| "eval_loss": 0.36297592520713806, |
| "eval_runtime": 2.3567, |
| "eval_samples_per_second": 84.863, |
| "eval_steps_per_second": 84.863, |
| "step": 12000 |
| }, |
| { |
| "epoch": 2.889985556090515, |
| "grad_norm": 1.046875, |
| "learning_rate": 4.563830106150981e-05, |
| "loss": 0.3619, |
| "step": 12005 |
| }, |
| { |
| "epoch": 2.8911892152142515, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.562845764078699e-05, |
| "loss": 0.4069, |
| "step": 12010 |
| }, |
| { |
| "epoch": 2.8923928743379874, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.561872180061326e-05, |
| "loss": 0.3982, |
| "step": 12015 |
| }, |
| { |
| "epoch": 2.8935965334617237, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.5609093556543985e-05, |
| "loss": 0.376, |
| "step": 12020 |
| }, |
| { |
| "epoch": 2.8948001925854596, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.559957292396265e-05, |
| "loss": 0.386, |
| "step": 12025 |
| }, |
| { |
| "epoch": 2.896003851709196, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.559015991808078e-05, |
| "loss": 0.3997, |
| "step": 12030 |
| }, |
| { |
| "epoch": 2.8972075108329323, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.5580854553937935e-05, |
| "loss": 0.3895, |
| "step": 12035 |
| }, |
| { |
| "epoch": 2.898411169956668, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.557165684640171e-05, |
| "loss": 0.3929, |
| "step": 12040 |
| }, |
| { |
| "epoch": 2.8996148290804045, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.556256681016768e-05, |
| "loss": 0.3843, |
| "step": 12045 |
| }, |
| { |
| "epoch": 2.9008184882041403, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.555358445975939e-05, |
| "loss": 0.4044, |
| "step": 12050 |
| }, |
| { |
| "epoch": 2.9020221473278767, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.5544709809528343e-05, |
| "loss": 0.4108, |
| "step": 12055 |
| }, |
| { |
| "epoch": 2.903225806451613, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.553594287365394e-05, |
| "loss": 0.3778, |
| "step": 12060 |
| }, |
| { |
| "epoch": 2.9044294655753493, |
| "grad_norm": 1.265625, |
| "learning_rate": 4.552728366614352e-05, |
| "loss": 0.3852, |
| "step": 12065 |
| }, |
| { |
| "epoch": 2.905633124699085, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.551873220083226e-05, |
| "loss": 0.3932, |
| "step": 12070 |
| }, |
| { |
| "epoch": 2.9068367838228215, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.551028849138321e-05, |
| "loss": 0.3745, |
| "step": 12075 |
| }, |
| { |
| "epoch": 2.9080404429465574, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.5501952551287255e-05, |
| "loss": 0.3862, |
| "step": 12080 |
| }, |
| { |
| "epoch": 2.9092441020702937, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.5493724393863104e-05, |
| "loss": 0.3694, |
| "step": 12085 |
| }, |
| { |
| "epoch": 2.91044776119403, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.548560403225722e-05, |
| "loss": 0.4124, |
| "step": 12090 |
| }, |
| { |
| "epoch": 2.911651420317766, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.547759147944386e-05, |
| "loss": 0.3763, |
| "step": 12095 |
| }, |
| { |
| "epoch": 2.9128550794415022, |
| "grad_norm": 1.234375, |
| "learning_rate": 4.5469686748225054e-05, |
| "loss": 0.3835, |
| "step": 12100 |
| }, |
| { |
| "epoch": 2.914058738565238, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.546188985123051e-05, |
| "loss": 0.4051, |
| "step": 12105 |
| }, |
| { |
| "epoch": 2.9152623976889744, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.545420080091769e-05, |
| "loss": 0.3837, |
| "step": 12110 |
| }, |
| { |
| "epoch": 2.9164660568127108, |
| "grad_norm": 1.125, |
| "learning_rate": 4.5446619609571724e-05, |
| "loss": 0.3746, |
| "step": 12115 |
| }, |
| { |
| "epoch": 2.9176697159364466, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.543914628930539e-05, |
| "loss": 0.4243, |
| "step": 12120 |
| }, |
| { |
| "epoch": 2.918873375060183, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.5431780852059166e-05, |
| "loss": 0.3876, |
| "step": 12125 |
| }, |
| { |
| "epoch": 2.920077034183919, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.5424523309601115e-05, |
| "loss": 0.4298, |
| "step": 12130 |
| }, |
| { |
| "epoch": 2.921280693307655, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.541737367352694e-05, |
| "loss": 0.3514, |
| "step": 12135 |
| }, |
| { |
| "epoch": 2.9224843524313915, |
| "grad_norm": 1.265625, |
| "learning_rate": 4.541033195525991e-05, |
| "loss": 0.4289, |
| "step": 12140 |
| }, |
| { |
| "epoch": 2.923688011555128, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.540339816605091e-05, |
| "loss": 0.3888, |
| "step": 12145 |
| }, |
| { |
| "epoch": 2.9248916706788637, |
| "grad_norm": 1.3046875, |
| "learning_rate": 4.5396572316978325e-05, |
| "loss": 0.4116, |
| "step": 12150 |
| }, |
| { |
| "epoch": 2.9260953298026, |
| "grad_norm": 1.1015625, |
| "learning_rate": 4.538985441894814e-05, |
| "loss": 0.3848, |
| "step": 12155 |
| }, |
| { |
| "epoch": 2.927298988926336, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.538324448269383e-05, |
| "loss": 0.3879, |
| "step": 12160 |
| }, |
| { |
| "epoch": 2.928502648050072, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.5376742518776384e-05, |
| "loss": 0.3769, |
| "step": 12165 |
| }, |
| { |
| "epoch": 2.9297063071738085, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.5370348537584245e-05, |
| "loss": 0.3912, |
| "step": 12170 |
| }, |
| { |
| "epoch": 2.9309099662975444, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.5364062549333376e-05, |
| "loss": 0.4036, |
| "step": 12175 |
| }, |
| { |
| "epoch": 2.9321136254212807, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.535788456406718e-05, |
| "loss": 0.4089, |
| "step": 12180 |
| }, |
| { |
| "epoch": 2.9333172845450166, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.535181459165649e-05, |
| "loss": 0.395, |
| "step": 12185 |
| }, |
| { |
| "epoch": 2.934520943668753, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.534585264179955e-05, |
| "loss": 0.4056, |
| "step": 12190 |
| }, |
| { |
| "epoch": 2.9357246027924893, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.533999872402204e-05, |
| "loss": 0.3726, |
| "step": 12195 |
| }, |
| { |
| "epoch": 2.9369282619162256, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.533425284767704e-05, |
| "loss": 0.3937, |
| "step": 12200 |
| }, |
| { |
| "epoch": 2.9381319210399615, |
| "grad_norm": 1.3046875, |
| "learning_rate": 4.532861502194495e-05, |
| "loss": 0.3941, |
| "step": 12205 |
| }, |
| { |
| "epoch": 2.939335580163698, |
| "grad_norm": 1.25, |
| "learning_rate": 4.5323085255833595e-05, |
| "loss": 0.4077, |
| "step": 12210 |
| }, |
| { |
| "epoch": 2.9405392392874337, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.531766355817812e-05, |
| "loss": 0.3771, |
| "step": 12215 |
| }, |
| { |
| "epoch": 2.94174289841117, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.5312349937640994e-05, |
| "loss": 0.3888, |
| "step": 12220 |
| }, |
| { |
| "epoch": 2.9429465575349063, |
| "grad_norm": 1.125, |
| "learning_rate": 4.530714440271203e-05, |
| "loss": 0.4036, |
| "step": 12225 |
| }, |
| { |
| "epoch": 2.944150216658642, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.530204696170835e-05, |
| "loss": 0.3822, |
| "step": 12230 |
| }, |
| { |
| "epoch": 2.9453538757823785, |
| "grad_norm": 1.0859375, |
| "learning_rate": 4.529705762277434e-05, |
| "loss": 0.4212, |
| "step": 12235 |
| }, |
| { |
| "epoch": 2.9465575349061144, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.529217639388169e-05, |
| "loss": 0.3818, |
| "step": 12240 |
| }, |
| { |
| "epoch": 2.9477611940298507, |
| "grad_norm": 1.125, |
| "learning_rate": 4.528740328282936e-05, |
| "loss": 0.362, |
| "step": 12245 |
| }, |
| { |
| "epoch": 2.948964853153587, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.5282738297243556e-05, |
| "loss": 0.3889, |
| "step": 12250 |
| }, |
| { |
| "epoch": 2.950168512277323, |
| "grad_norm": 1.0625, |
| "learning_rate": 4.527818144457772e-05, |
| "loss": 0.3724, |
| "step": 12255 |
| }, |
| { |
| "epoch": 2.9513721714010592, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.527373273211254e-05, |
| "loss": 0.4102, |
| "step": 12260 |
| }, |
| { |
| "epoch": 2.952575830524795, |
| "grad_norm": 1.125, |
| "learning_rate": 4.5269392166955915e-05, |
| "loss": 0.3812, |
| "step": 12265 |
| }, |
| { |
| "epoch": 2.9537794896485314, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.5265159756042956e-05, |
| "loss": 0.3913, |
| "step": 12270 |
| }, |
| { |
| "epoch": 2.9549831487722678, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.526103550613598e-05, |
| "loss": 0.3818, |
| "step": 12275 |
| }, |
| { |
| "epoch": 2.956186807896004, |
| "grad_norm": 1.1796875, |
| "learning_rate": 4.525701942382447e-05, |
| "loss": 0.4098, |
| "step": 12280 |
| }, |
| { |
| "epoch": 2.95739046701974, |
| "grad_norm": 1.2109375, |
| "learning_rate": 4.5253111515525094e-05, |
| "loss": 0.394, |
| "step": 12285 |
| }, |
| { |
| "epoch": 2.9585941261434763, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.5249311787481704e-05, |
| "loss": 0.364, |
| "step": 12290 |
| }, |
| { |
| "epoch": 2.959797785267212, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.5245620245765275e-05, |
| "loss": 0.385, |
| "step": 12295 |
| }, |
| { |
| "epoch": 2.9610014443909485, |
| "grad_norm": 1.125, |
| "learning_rate": 4.5242036896273964e-05, |
| "loss": 0.3821, |
| "step": 12300 |
| }, |
| { |
| "epoch": 2.962205103514685, |
| "grad_norm": 1.21875, |
| "learning_rate": 4.5238561744733015e-05, |
| "loss": 0.3661, |
| "step": 12305 |
| }, |
| { |
| "epoch": 2.9634087626384207, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.523519479669485e-05, |
| "loss": 0.3844, |
| "step": 12310 |
| }, |
| { |
| "epoch": 2.964612421762157, |
| "grad_norm": 1.1484375, |
| "learning_rate": 4.5231936057538984e-05, |
| "loss": 0.3873, |
| "step": 12315 |
| }, |
| { |
| "epoch": 2.965816080885893, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.5228785532472045e-05, |
| "loss": 0.4045, |
| "step": 12320 |
| }, |
| { |
| "epoch": 2.967019740009629, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.522574322652777e-05, |
| "loss": 0.3682, |
| "step": 12325 |
| }, |
| { |
| "epoch": 2.9682233991333655, |
| "grad_norm": 1.15625, |
| "learning_rate": 4.5222809144566984e-05, |
| "loss": 0.3824, |
| "step": 12330 |
| }, |
| { |
| "epoch": 2.969427058257102, |
| "grad_norm": 0.953125, |
| "learning_rate": 4.521998329127758e-05, |
| "loss": 0.4006, |
| "step": 12335 |
| }, |
| { |
| "epoch": 2.9706307173808377, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.521726567117457e-05, |
| "loss": 0.4114, |
| "step": 12340 |
| }, |
| { |
| "epoch": 2.971834376504574, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.5214656288600014e-05, |
| "loss": 0.3867, |
| "step": 12345 |
| }, |
| { |
| "epoch": 2.97303803562831, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.521215514772301e-05, |
| "loss": 0.3931, |
| "step": 12350 |
| }, |
| { |
| "epoch": 2.9742416947520463, |
| "grad_norm": 1.125, |
| "learning_rate": 4.5209762252539775e-05, |
| "loss": 0.3825, |
| "step": 12355 |
| }, |
| { |
| "epoch": 2.9754453538757826, |
| "grad_norm": 1.140625, |
| "learning_rate": 4.5207477606873514e-05, |
| "loss": 0.405, |
| "step": 12360 |
| }, |
| { |
| "epoch": 2.9766490129995185, |
| "grad_norm": 1.1953125, |
| "learning_rate": 4.520530121437452e-05, |
| "loss": 0.387, |
| "step": 12365 |
| }, |
| { |
| "epoch": 2.9778526721232548, |
| "grad_norm": 1.0703125, |
| "learning_rate": 4.5203233078520115e-05, |
| "loss": 0.3653, |
| "step": 12370 |
| }, |
| { |
| "epoch": 2.9790563312469907, |
| "grad_norm": 1.296875, |
| "learning_rate": 4.520127320261463e-05, |
| "loss": 0.3806, |
| "step": 12375 |
| }, |
| { |
| "epoch": 2.980259990370727, |
| "grad_norm": 1.2734375, |
| "learning_rate": 4.519942158978947e-05, |
| "loss": 0.3833, |
| "step": 12380 |
| }, |
| { |
| "epoch": 2.9814636494944633, |
| "grad_norm": 1.171875, |
| "learning_rate": 4.519767824300301e-05, |
| "loss": 0.4026, |
| "step": 12385 |
| }, |
| { |
| "epoch": 2.982667308618199, |
| "grad_norm": 1.0, |
| "learning_rate": 4.519604316504069e-05, |
| "loss": 0.3843, |
| "step": 12390 |
| }, |
| { |
| "epoch": 2.9838709677419355, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.519451635851493e-05, |
| "loss": 0.4132, |
| "step": 12395 |
| }, |
| { |
| "epoch": 2.9850746268656714, |
| "grad_norm": 1.1328125, |
| "learning_rate": 4.519309782586519e-05, |
| "loss": 0.3989, |
| "step": 12400 |
| }, |
| { |
| "epoch": 2.9862782859894077, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.519178756935792e-05, |
| "loss": 0.3793, |
| "step": 12405 |
| }, |
| { |
| "epoch": 2.987481945113144, |
| "grad_norm": 1.1875, |
| "learning_rate": 4.519058559108658e-05, |
| "loss": 0.405, |
| "step": 12410 |
| }, |
| { |
| "epoch": 2.9886856042368803, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.5189491892971596e-05, |
| "loss": 0.3882, |
| "step": 12415 |
| }, |
| { |
| "epoch": 2.9898892633606162, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.518850647676045e-05, |
| "loss": 0.398, |
| "step": 12420 |
| }, |
| { |
| "epoch": 2.9910929224843525, |
| "grad_norm": 1.1640625, |
| "learning_rate": 4.518762934402757e-05, |
| "loss": 0.4008, |
| "step": 12425 |
| }, |
| { |
| "epoch": 2.9922965816080884, |
| "grad_norm": 1.109375, |
| "learning_rate": 4.5186860496174374e-05, |
| "loss": 0.4006, |
| "step": 12430 |
| }, |
| { |
| "epoch": 2.9935002407318247, |
| "grad_norm": 1.078125, |
| "learning_rate": 4.5186199934429314e-05, |
| "loss": 0.3831, |
| "step": 12435 |
| }, |
| { |
| "epoch": 2.994703899855561, |
| "grad_norm": 1.2265625, |
| "learning_rate": 4.518564765984778e-05, |
| "loss": 0.4147, |
| "step": 12440 |
| }, |
| { |
| "epoch": 2.995907558979297, |
| "grad_norm": 1.203125, |
| "learning_rate": 4.518520367331216e-05, |
| "loss": 0.3909, |
| "step": 12445 |
| }, |
| { |
| "epoch": 2.9971112181030333, |
| "grad_norm": 1.28125, |
| "learning_rate": 4.518486797553185e-05, |
| "loss": 0.4093, |
| "step": 12450 |
| }, |
| { |
| "epoch": 2.998314877226769, |
| "grad_norm": 1.1171875, |
| "learning_rate": 4.5184640567043195e-05, |
| "loss": 0.3735, |
| "step": 12455 |
| }, |
| { |
| "epoch": 2.999277804525758, |
| "eval_loss": 0.3596603274345398, |
| "eval_runtime": 2.3319, |
| "eval_samples_per_second": 85.768, |
| "eval_steps_per_second": 85.768, |
| "step": 12459 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 12462, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 3, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 3.12509450944512e+17, |
| "train_batch_size": 48, |
| "trial_name": null, |
| "trial_params": null |
| } |