| { |
| "best_global_step": null, |
| "best_metric": null, |
| "best_model_checkpoint": null, |
| "epoch": 0.9997592681752527, |
| "eval_steps": 500, |
| "global_step": 4153, |
| "is_hyper_param_search": false, |
| "is_local_process_zero": true, |
| "is_world_process_zero": true, |
| "log_history": [ |
| { |
| "epoch": 0.001203659123736158, |
| "grad_norm": 5.4375, |
| "learning_rate": 8.594285714285714e-06, |
| "loss": 1.9523, |
| "step": 5 |
| }, |
| { |
| "epoch": 0.002407318247472316, |
| "grad_norm": 3.6875, |
| "learning_rate": 1.9337142857142854e-05, |
| "loss": 1.9164, |
| "step": 10 |
| }, |
| { |
| "epoch": 0.0036109773712084737, |
| "grad_norm": 3.046875, |
| "learning_rate": 3.008e-05, |
| "loss": 1.8413, |
| "step": 15 |
| }, |
| { |
| "epoch": 0.004814636494944632, |
| "grad_norm": 3.03125, |
| "learning_rate": 4.082285714285714e-05, |
| "loss": 1.7485, |
| "step": 20 |
| }, |
| { |
| "epoch": 0.00601829561868079, |
| "grad_norm": 2.78125, |
| "learning_rate": 5.156571428571429e-05, |
| "loss": 1.7032, |
| "step": 25 |
| }, |
| { |
| "epoch": 0.007221954742416947, |
| "grad_norm": 2.71875, |
| "learning_rate": 6.230857142857143e-05, |
| "loss": 1.5993, |
| "step": 30 |
| }, |
| { |
| "epoch": 0.008425613866153106, |
| "grad_norm": 2.78125, |
| "learning_rate": 7.305142857142857e-05, |
| "loss": 1.5406, |
| "step": 35 |
| }, |
| { |
| "epoch": 0.009629272989889264, |
| "grad_norm": 2.703125, |
| "learning_rate": 7.519999190126141e-05, |
| "loss": 1.4727, |
| "step": 40 |
| }, |
| { |
| "epoch": 0.010832932113625422, |
| "grad_norm": 2.796875, |
| "learning_rate": 7.519995900014385e-05, |
| "loss": 1.452, |
| "step": 45 |
| }, |
| { |
| "epoch": 0.01203659123736158, |
| "grad_norm": 2.78125, |
| "learning_rate": 7.519990079050565e-05, |
| "loss": 1.3904, |
| "step": 50 |
| }, |
| { |
| "epoch": 0.013240250361097737, |
| "grad_norm": 2.59375, |
| "learning_rate": 7.519981727239906e-05, |
| "loss": 1.3752, |
| "step": 55 |
| }, |
| { |
| "epoch": 0.014443909484833895, |
| "grad_norm": 2.765625, |
| "learning_rate": 7.519970844589904e-05, |
| "loss": 1.3351, |
| "step": 60 |
| }, |
| { |
| "epoch": 0.015647568608570053, |
| "grad_norm": 2.59375, |
| "learning_rate": 7.519957431110327e-05, |
| "loss": 1.342, |
| "step": 65 |
| }, |
| { |
| "epoch": 0.016851227732306212, |
| "grad_norm": 2.96875, |
| "learning_rate": 7.51994148681321e-05, |
| "loss": 1.3116, |
| "step": 70 |
| }, |
| { |
| "epoch": 0.018054886856042368, |
| "grad_norm": 2.734375, |
| "learning_rate": 7.519923011712865e-05, |
| "loss": 1.3081, |
| "step": 75 |
| }, |
| { |
| "epoch": 0.019258545979778528, |
| "grad_norm": 2.640625, |
| "learning_rate": 7.519902005825872e-05, |
| "loss": 1.2885, |
| "step": 80 |
| }, |
| { |
| "epoch": 0.020462205103514684, |
| "grad_norm": 2.703125, |
| "learning_rate": 7.519878469171081e-05, |
| "loss": 1.2879, |
| "step": 85 |
| }, |
| { |
| "epoch": 0.021665864227250843, |
| "grad_norm": 2.5625, |
| "learning_rate": 7.519852401769621e-05, |
| "loss": 1.2741, |
| "step": 90 |
| }, |
| { |
| "epoch": 0.022869523350987, |
| "grad_norm": 2.609375, |
| "learning_rate": 7.519823803644881e-05, |
| "loss": 1.2429, |
| "step": 95 |
| }, |
| { |
| "epoch": 0.02407318247472316, |
| "grad_norm": 2.984375, |
| "learning_rate": 7.519792674822529e-05, |
| "loss": 1.2462, |
| "step": 100 |
| }, |
| { |
| "epoch": 0.025276841598459315, |
| "grad_norm": 2.75, |
| "learning_rate": 7.519759015330501e-05, |
| "loss": 1.217, |
| "step": 105 |
| }, |
| { |
| "epoch": 0.026480500722195474, |
| "grad_norm": 2.484375, |
| "learning_rate": 7.519722825199007e-05, |
| "loss": 1.2431, |
| "step": 110 |
| }, |
| { |
| "epoch": 0.027684159845931634, |
| "grad_norm": 2.703125, |
| "learning_rate": 7.519684104460526e-05, |
| "loss": 1.242, |
| "step": 115 |
| }, |
| { |
| "epoch": 0.02888781896966779, |
| "grad_norm": 2.578125, |
| "learning_rate": 7.519642853149806e-05, |
| "loss": 1.2239, |
| "step": 120 |
| }, |
| { |
| "epoch": 0.03009147809340395, |
| "grad_norm": 2.59375, |
| "learning_rate": 7.519599071303875e-05, |
| "loss": 1.1809, |
| "step": 125 |
| }, |
| { |
| "epoch": 0.031295137217140105, |
| "grad_norm": 2.625, |
| "learning_rate": 7.519552758962019e-05, |
| "loss": 1.2366, |
| "step": 130 |
| }, |
| { |
| "epoch": 0.03249879634087626, |
| "grad_norm": 2.5, |
| "learning_rate": 7.519503916165803e-05, |
| "loss": 1.1634, |
| "step": 135 |
| }, |
| { |
| "epoch": 0.033702455464612424, |
| "grad_norm": 2.84375, |
| "learning_rate": 7.519452542959066e-05, |
| "loss": 1.1719, |
| "step": 140 |
| }, |
| { |
| "epoch": 0.03490611458834858, |
| "grad_norm": 2.953125, |
| "learning_rate": 7.51939863938791e-05, |
| "loss": 1.1596, |
| "step": 145 |
| }, |
| { |
| "epoch": 0.036109773712084736, |
| "grad_norm": 2.390625, |
| "learning_rate": 7.519342205500712e-05, |
| "loss": 1.1627, |
| "step": 150 |
| }, |
| { |
| "epoch": 0.03731343283582089, |
| "grad_norm": 2.46875, |
| "learning_rate": 7.519283241348121e-05, |
| "loss": 1.166, |
| "step": 155 |
| }, |
| { |
| "epoch": 0.038517091959557055, |
| "grad_norm": 2.59375, |
| "learning_rate": 7.519221746983052e-05, |
| "loss": 1.1952, |
| "step": 160 |
| }, |
| { |
| "epoch": 0.03972075108329321, |
| "grad_norm": 2.703125, |
| "learning_rate": 7.5191577224607e-05, |
| "loss": 1.1565, |
| "step": 165 |
| }, |
| { |
| "epoch": 0.04092441020702937, |
| "grad_norm": 2.671875, |
| "learning_rate": 7.519091167838519e-05, |
| "loss": 1.1575, |
| "step": 170 |
| }, |
| { |
| "epoch": 0.04212806933076553, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.519022083176244e-05, |
| "loss": 1.1399, |
| "step": 175 |
| }, |
| { |
| "epoch": 0.043331728454501686, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.518950468535872e-05, |
| "loss": 1.1503, |
| "step": 180 |
| }, |
| { |
| "epoch": 0.04453538757823784, |
| "grad_norm": 2.53125, |
| "learning_rate": 7.518876323981678e-05, |
| "loss": 1.1025, |
| "step": 185 |
| }, |
| { |
| "epoch": 0.045739046701974, |
| "grad_norm": 2.765625, |
| "learning_rate": 7.518799649580204e-05, |
| "loss": 1.1512, |
| "step": 190 |
| }, |
| { |
| "epoch": 0.04694270582571016, |
| "grad_norm": 2.71875, |
| "learning_rate": 7.518720445400261e-05, |
| "loss": 1.1202, |
| "step": 195 |
| }, |
| { |
| "epoch": 0.04814636494944632, |
| "grad_norm": 2.90625, |
| "learning_rate": 7.518638711512932e-05, |
| "loss": 1.1038, |
| "step": 200 |
| }, |
| { |
| "epoch": 0.04935002407318247, |
| "grad_norm": 2.40625, |
| "learning_rate": 7.518554447991572e-05, |
| "loss": 1.1074, |
| "step": 205 |
| }, |
| { |
| "epoch": 0.05055368319691863, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.518467654911806e-05, |
| "loss": 1.1035, |
| "step": 210 |
| }, |
| { |
| "epoch": 0.05175734232065479, |
| "grad_norm": 2.703125, |
| "learning_rate": 7.518378332351524e-05, |
| "loss": 1.1083, |
| "step": 215 |
| }, |
| { |
| "epoch": 0.05296100144439095, |
| "grad_norm": 2.84375, |
| "learning_rate": 7.518286480390892e-05, |
| "loss": 1.1062, |
| "step": 220 |
| }, |
| { |
| "epoch": 0.054164660568127104, |
| "grad_norm": 2.515625, |
| "learning_rate": 7.518192099112345e-05, |
| "loss": 1.1028, |
| "step": 225 |
| }, |
| { |
| "epoch": 0.05536831969186327, |
| "grad_norm": 2.46875, |
| "learning_rate": 7.518095188600586e-05, |
| "loss": 1.1036, |
| "step": 230 |
| }, |
| { |
| "epoch": 0.05657197881559942, |
| "grad_norm": 2.59375, |
| "learning_rate": 7.517995748942589e-05, |
| "loss": 1.0876, |
| "step": 235 |
| }, |
| { |
| "epoch": 0.05777563793933558, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.517893780227597e-05, |
| "loss": 1.0686, |
| "step": 240 |
| }, |
| { |
| "epoch": 0.058979297063071735, |
| "grad_norm": 2.34375, |
| "learning_rate": 7.517789282547126e-05, |
| "loss": 1.0863, |
| "step": 245 |
| }, |
| { |
| "epoch": 0.0601829561868079, |
| "grad_norm": 2.515625, |
| "learning_rate": 7.517682255994956e-05, |
| "loss": 1.0745, |
| "step": 250 |
| }, |
| { |
| "epoch": 0.061386615310544054, |
| "grad_norm": 2.25, |
| "learning_rate": 7.517572700667141e-05, |
| "loss": 1.0997, |
| "step": 255 |
| }, |
| { |
| "epoch": 0.06259027443428021, |
| "grad_norm": 2.53125, |
| "learning_rate": 7.517460616662005e-05, |
| "loss": 1.0501, |
| "step": 260 |
| }, |
| { |
| "epoch": 0.06379393355801637, |
| "grad_norm": 2.5, |
| "learning_rate": 7.517346004080137e-05, |
| "loss": 1.0777, |
| "step": 265 |
| }, |
| { |
| "epoch": 0.06499759268175252, |
| "grad_norm": 2.546875, |
| "learning_rate": 7.5172288630244e-05, |
| "loss": 1.0623, |
| "step": 270 |
| }, |
| { |
| "epoch": 0.06620125180548869, |
| "grad_norm": 2.890625, |
| "learning_rate": 7.517109193599923e-05, |
| "loss": 1.0649, |
| "step": 275 |
| }, |
| { |
| "epoch": 0.06740491092922485, |
| "grad_norm": 2.65625, |
| "learning_rate": 7.516986995914106e-05, |
| "loss": 1.0468, |
| "step": 280 |
| }, |
| { |
| "epoch": 0.068608570052961, |
| "grad_norm": 3.0625, |
| "learning_rate": 7.516862270076615e-05, |
| "loss": 1.0485, |
| "step": 285 |
| }, |
| { |
| "epoch": 0.06981222917669716, |
| "grad_norm": 2.515625, |
| "learning_rate": 7.516735016199392e-05, |
| "loss": 1.0412, |
| "step": 290 |
| }, |
| { |
| "epoch": 0.07101588830043332, |
| "grad_norm": 2.390625, |
| "learning_rate": 7.516605234396639e-05, |
| "loss": 1.0392, |
| "step": 295 |
| }, |
| { |
| "epoch": 0.07221954742416947, |
| "grad_norm": 2.421875, |
| "learning_rate": 7.516472924784832e-05, |
| "loss": 1.0129, |
| "step": 300 |
| }, |
| { |
| "epoch": 0.07342320654790563, |
| "grad_norm": 2.546875, |
| "learning_rate": 7.516338087482715e-05, |
| "loss": 1.0365, |
| "step": 305 |
| }, |
| { |
| "epoch": 0.07462686567164178, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.5162007226113e-05, |
| "loss": 1.0767, |
| "step": 310 |
| }, |
| { |
| "epoch": 0.07583052479537795, |
| "grad_norm": 2.34375, |
| "learning_rate": 7.516060830293867e-05, |
| "loss": 1.0139, |
| "step": 315 |
| }, |
| { |
| "epoch": 0.07703418391911411, |
| "grad_norm": 2.484375, |
| "learning_rate": 7.515918410655963e-05, |
| "loss": 1.0152, |
| "step": 320 |
| }, |
| { |
| "epoch": 0.07823784304285027, |
| "grad_norm": 2.390625, |
| "learning_rate": 7.515773463825409e-05, |
| "loss": 1.0269, |
| "step": 325 |
| }, |
| { |
| "epoch": 0.07944150216658642, |
| "grad_norm": 2.640625, |
| "learning_rate": 7.515625989932286e-05, |
| "loss": 1.0453, |
| "step": 330 |
| }, |
| { |
| "epoch": 0.08064516129032258, |
| "grad_norm": 2.578125, |
| "learning_rate": 7.515475989108947e-05, |
| "loss": 1.0238, |
| "step": 335 |
| }, |
| { |
| "epoch": 0.08184882041405873, |
| "grad_norm": 2.875, |
| "learning_rate": 7.515323461490016e-05, |
| "loss": 1.022, |
| "step": 340 |
| }, |
| { |
| "epoch": 0.08305247953779489, |
| "grad_norm": 2.578125, |
| "learning_rate": 7.515168407212379e-05, |
| "loss": 1.0004, |
| "step": 345 |
| }, |
| { |
| "epoch": 0.08425613866153106, |
| "grad_norm": 2.515625, |
| "learning_rate": 7.515010826415193e-05, |
| "loss": 1.0361, |
| "step": 350 |
| }, |
| { |
| "epoch": 0.08545979778526722, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.51485071923988e-05, |
| "loss": 1.0119, |
| "step": 355 |
| }, |
| { |
| "epoch": 0.08666345690900337, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.514688085830133e-05, |
| "loss": 1.0128, |
| "step": 360 |
| }, |
| { |
| "epoch": 0.08786711603273953, |
| "grad_norm": 2.421875, |
| "learning_rate": 7.514522926331908e-05, |
| "loss": 1.0119, |
| "step": 365 |
| }, |
| { |
| "epoch": 0.08907077515647568, |
| "grad_norm": 2.390625, |
| "learning_rate": 7.51435524089343e-05, |
| "loss": 1.0205, |
| "step": 370 |
| }, |
| { |
| "epoch": 0.09027443428021184, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.514185029665195e-05, |
| "loss": 1.0289, |
| "step": 375 |
| }, |
| { |
| "epoch": 0.091478093403948, |
| "grad_norm": 2.25, |
| "learning_rate": 7.514012292799957e-05, |
| "loss": 0.9974, |
| "step": 380 |
| }, |
| { |
| "epoch": 0.09268175252768417, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.513837030452745e-05, |
| "loss": 1.0058, |
| "step": 385 |
| }, |
| { |
| "epoch": 0.09388541165142032, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.513659242780848e-05, |
| "loss": 0.9894, |
| "step": 390 |
| }, |
| { |
| "epoch": 0.09508907077515648, |
| "grad_norm": 2.390625, |
| "learning_rate": 7.513478929943828e-05, |
| "loss": 0.9879, |
| "step": 395 |
| }, |
| { |
| "epoch": 0.09629272989889263, |
| "grad_norm": 2.5625, |
| "learning_rate": 7.513296092103507e-05, |
| "loss": 1.0006, |
| "step": 400 |
| }, |
| { |
| "epoch": 0.09749638902262879, |
| "grad_norm": 2.640625, |
| "learning_rate": 7.513110729423976e-05, |
| "loss": 0.9984, |
| "step": 405 |
| }, |
| { |
| "epoch": 0.09870004814636495, |
| "grad_norm": 2.515625, |
| "learning_rate": 7.512922842071594e-05, |
| "loss": 1.0084, |
| "step": 410 |
| }, |
| { |
| "epoch": 0.0999037072701011, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.512732430214982e-05, |
| "loss": 1.0034, |
| "step": 415 |
| }, |
| { |
| "epoch": 0.10110736639383726, |
| "grad_norm": 2.546875, |
| "learning_rate": 7.512539494025027e-05, |
| "loss": 1.0019, |
| "step": 420 |
| }, |
| { |
| "epoch": 0.10231102551757343, |
| "grad_norm": 2.96875, |
| "learning_rate": 7.512344033674885e-05, |
| "loss": 0.9941, |
| "step": 425 |
| }, |
| { |
| "epoch": 0.10351468464130958, |
| "grad_norm": 2.625, |
| "learning_rate": 7.512146049339975e-05, |
| "loss": 0.9523, |
| "step": 430 |
| }, |
| { |
| "epoch": 0.10471834376504574, |
| "grad_norm": 2.4375, |
| "learning_rate": 7.51194554119798e-05, |
| "loss": 0.9821, |
| "step": 435 |
| }, |
| { |
| "epoch": 0.1059220028887819, |
| "grad_norm": 2.25, |
| "learning_rate": 7.51174250942885e-05, |
| "loss": 0.9661, |
| "step": 440 |
| }, |
| { |
| "epoch": 0.10712566201251805, |
| "grad_norm": 2.609375, |
| "learning_rate": 7.5115369542148e-05, |
| "loss": 0.9926, |
| "step": 445 |
| }, |
| { |
| "epoch": 0.10832932113625421, |
| "grad_norm": 2.484375, |
| "learning_rate": 7.511328875740308e-05, |
| "loss": 0.9999, |
| "step": 450 |
| }, |
| { |
| "epoch": 0.10953298025999036, |
| "grad_norm": 2.40625, |
| "learning_rate": 7.511118274192118e-05, |
| "loss": 1.0023, |
| "step": 455 |
| }, |
| { |
| "epoch": 0.11073663938372653, |
| "grad_norm": 2.25, |
| "learning_rate": 7.510905149759237e-05, |
| "loss": 0.9643, |
| "step": 460 |
| }, |
| { |
| "epoch": 0.11194029850746269, |
| "grad_norm": 2.375, |
| "learning_rate": 7.510689502632937e-05, |
| "loss": 0.9565, |
| "step": 465 |
| }, |
| { |
| "epoch": 0.11314395763119885, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.510471333006756e-05, |
| "loss": 0.9777, |
| "step": 470 |
| }, |
| { |
| "epoch": 0.114347616754935, |
| "grad_norm": 2.515625, |
| "learning_rate": 7.510250641076491e-05, |
| "loss": 1.0148, |
| "step": 475 |
| }, |
| { |
| "epoch": 0.11555127587867116, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.51002742704021e-05, |
| "loss": 0.9534, |
| "step": 480 |
| }, |
| { |
| "epoch": 0.11675493500240731, |
| "grad_norm": 2.421875, |
| "learning_rate": 7.509801691098234e-05, |
| "loss": 0.96, |
| "step": 485 |
| }, |
| { |
| "epoch": 0.11795859412614347, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.50957343345316e-05, |
| "loss": 0.9168, |
| "step": 490 |
| }, |
| { |
| "epoch": 0.11916225324987964, |
| "grad_norm": 2.25, |
| "learning_rate": 7.509342654309836e-05, |
| "loss": 0.9506, |
| "step": 495 |
| }, |
| { |
| "epoch": 0.1203659123736158, |
| "grad_norm": 2.515625, |
| "learning_rate": 7.509109353875383e-05, |
| "loss": 0.967, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.1203659123736158, |
| "eval_loss": 0.8579447865486145, |
| "eval_runtime": 2.4166, |
| "eval_samples_per_second": 82.761, |
| "eval_steps_per_second": 82.761, |
| "step": 500 |
| }, |
| { |
| "epoch": 0.12156957149735195, |
| "grad_norm": 2.375, |
| "learning_rate": 7.508873532359177e-05, |
| "loss": 0.9136, |
| "step": 505 |
| }, |
| { |
| "epoch": 0.12277323062108811, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.508635189972863e-05, |
| "loss": 0.9422, |
| "step": 510 |
| }, |
| { |
| "epoch": 0.12397688974482426, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.508394326930342e-05, |
| "loss": 0.9751, |
| "step": 515 |
| }, |
| { |
| "epoch": 0.12518054886856042, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.508150943447782e-05, |
| "loss": 0.9974, |
| "step": 520 |
| }, |
| { |
| "epoch": 0.12638420799229658, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.507905039743612e-05, |
| "loss": 0.9835, |
| "step": 525 |
| }, |
| { |
| "epoch": 0.12758786711603273, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.507656616038523e-05, |
| "loss": 0.9457, |
| "step": 530 |
| }, |
| { |
| "epoch": 0.1287915262397689, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.507405672555465e-05, |
| "loss": 0.9453, |
| "step": 535 |
| }, |
| { |
| "epoch": 0.12999518536350504, |
| "grad_norm": 2.5625, |
| "learning_rate": 7.507152209519653e-05, |
| "loss": 0.9403, |
| "step": 540 |
| }, |
| { |
| "epoch": 0.1311988444872412, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.506896227158561e-05, |
| "loss": 0.9566, |
| "step": 545 |
| }, |
| { |
| "epoch": 0.13240250361097738, |
| "grad_norm": 2.390625, |
| "learning_rate": 7.506637725701925e-05, |
| "loss": 0.9112, |
| "step": 550 |
| }, |
| { |
| "epoch": 0.13360616273471354, |
| "grad_norm": 2.4375, |
| "learning_rate": 7.50637670538174e-05, |
| "loss": 0.9529, |
| "step": 555 |
| }, |
| { |
| "epoch": 0.1348098218584497, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.506113166432265e-05, |
| "loss": 0.9439, |
| "step": 560 |
| }, |
| { |
| "epoch": 0.13601348098218585, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.505847109090016e-05, |
| "loss": 0.9204, |
| "step": 565 |
| }, |
| { |
| "epoch": 0.137217140105922, |
| "grad_norm": 2.25, |
| "learning_rate": 7.505578533593771e-05, |
| "loss": 0.9252, |
| "step": 570 |
| }, |
| { |
| "epoch": 0.13842079922965816, |
| "grad_norm": 2.25, |
| "learning_rate": 7.505307440184569e-05, |
| "loss": 0.8843, |
| "step": 575 |
| }, |
| { |
| "epoch": 0.13962445835339432, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.505033829105704e-05, |
| "loss": 0.9302, |
| "step": 580 |
| }, |
| { |
| "epoch": 0.14082811747713048, |
| "grad_norm": 2.484375, |
| "learning_rate": 7.504757700602735e-05, |
| "loss": 0.9238, |
| "step": 585 |
| }, |
| { |
| "epoch": 0.14203177660086663, |
| "grad_norm": 2.75, |
| "learning_rate": 7.504479054923478e-05, |
| "loss": 0.9393, |
| "step": 590 |
| }, |
| { |
| "epoch": 0.1432354357246028, |
| "grad_norm": 2.578125, |
| "learning_rate": 7.504197892318008e-05, |
| "loss": 0.9297, |
| "step": 595 |
| }, |
| { |
| "epoch": 0.14443909484833894, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.50391421303866e-05, |
| "loss": 0.9065, |
| "step": 600 |
| }, |
| { |
| "epoch": 0.1456427539720751, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.503628017340025e-05, |
| "loss": 0.9263, |
| "step": 605 |
| }, |
| { |
| "epoch": 0.14684641309581126, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.503339305478953e-05, |
| "loss": 0.9169, |
| "step": 610 |
| }, |
| { |
| "epoch": 0.1480500722195474, |
| "grad_norm": 2.34375, |
| "learning_rate": 7.503048077714556e-05, |
| "loss": 0.9369, |
| "step": 615 |
| }, |
| { |
| "epoch": 0.14925373134328357, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.5027543343082e-05, |
| "loss": 0.9541, |
| "step": 620 |
| }, |
| { |
| "epoch": 0.15045739046701975, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.502458075523511e-05, |
| "loss": 0.9273, |
| "step": 625 |
| }, |
| { |
| "epoch": 0.1516610495907559, |
| "grad_norm": 2.421875, |
| "learning_rate": 7.50215930162637e-05, |
| "loss": 0.9541, |
| "step": 630 |
| }, |
| { |
| "epoch": 0.15286470871449206, |
| "grad_norm": 2.375, |
| "learning_rate": 7.501858012884915e-05, |
| "loss": 0.9334, |
| "step": 635 |
| }, |
| { |
| "epoch": 0.15406836783822822, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.501554209569548e-05, |
| "loss": 0.9156, |
| "step": 640 |
| }, |
| { |
| "epoch": 0.15527202696196438, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.501247891952918e-05, |
| "loss": 0.9295, |
| "step": 645 |
| }, |
| { |
| "epoch": 0.15647568608570053, |
| "grad_norm": 2.25, |
| "learning_rate": 7.500939060309934e-05, |
| "loss": 0.9318, |
| "step": 650 |
| }, |
| { |
| "epoch": 0.1576793452094367, |
| "grad_norm": 2.390625, |
| "learning_rate": 7.500627714917765e-05, |
| "loss": 0.9627, |
| "step": 655 |
| }, |
| { |
| "epoch": 0.15888300433317284, |
| "grad_norm": 2.421875, |
| "learning_rate": 7.500313856055832e-05, |
| "loss": 0.9144, |
| "step": 660 |
| }, |
| { |
| "epoch": 0.160086663456909, |
| "grad_norm": 2.25, |
| "learning_rate": 7.499997484005813e-05, |
| "loss": 0.9378, |
| "step": 665 |
| }, |
| { |
| "epoch": 0.16129032258064516, |
| "grad_norm": 2.46875, |
| "learning_rate": 7.499678599051639e-05, |
| "loss": 0.9226, |
| "step": 670 |
| }, |
| { |
| "epoch": 0.1624939817043813, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.499357201479502e-05, |
| "loss": 0.8941, |
| "step": 675 |
| }, |
| { |
| "epoch": 0.16369764082811747, |
| "grad_norm": 2.53125, |
| "learning_rate": 7.499033291577844e-05, |
| "loss": 0.9054, |
| "step": 680 |
| }, |
| { |
| "epoch": 0.16490129995185362, |
| "grad_norm": 2.4375, |
| "learning_rate": 7.498706869637364e-05, |
| "loss": 0.9043, |
| "step": 685 |
| }, |
| { |
| "epoch": 0.16610495907558978, |
| "grad_norm": 2.375, |
| "learning_rate": 7.498377935951014e-05, |
| "loss": 0.907, |
| "step": 690 |
| }, |
| { |
| "epoch": 0.16730861819932596, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.498046490814001e-05, |
| "loss": 0.8948, |
| "step": 695 |
| }, |
| { |
| "epoch": 0.16851227732306212, |
| "grad_norm": 2.515625, |
| "learning_rate": 7.497712534523786e-05, |
| "loss": 0.8884, |
| "step": 700 |
| }, |
| { |
| "epoch": 0.16971593644679828, |
| "grad_norm": 2.375, |
| "learning_rate": 7.497376067380085e-05, |
| "loss": 0.9339, |
| "step": 705 |
| }, |
| { |
| "epoch": 0.17091959557053443, |
| "grad_norm": 2.546875, |
| "learning_rate": 7.497037089684863e-05, |
| "loss": 0.9214, |
| "step": 710 |
| }, |
| { |
| "epoch": 0.1721232546942706, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.496695601742344e-05, |
| "loss": 0.909, |
| "step": 715 |
| }, |
| { |
| "epoch": 0.17332691381800674, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.496351603859001e-05, |
| "loss": 0.8977, |
| "step": 720 |
| }, |
| { |
| "epoch": 0.1745305729417429, |
| "grad_norm": 2.46875, |
| "learning_rate": 7.496005096343561e-05, |
| "loss": 0.9395, |
| "step": 725 |
| }, |
| { |
| "epoch": 0.17573423206547906, |
| "grad_norm": 2.609375, |
| "learning_rate": 7.495656079507003e-05, |
| "loss": 0.902, |
| "step": 730 |
| }, |
| { |
| "epoch": 0.1769378911892152, |
| "grad_norm": 2.4375, |
| "learning_rate": 7.495304553662555e-05, |
| "loss": 0.9075, |
| "step": 735 |
| }, |
| { |
| "epoch": 0.17814155031295137, |
| "grad_norm": 2.390625, |
| "learning_rate": 7.494950519125705e-05, |
| "loss": 0.8822, |
| "step": 740 |
| }, |
| { |
| "epoch": 0.17934520943668752, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.494593976214182e-05, |
| "loss": 0.8719, |
| "step": 745 |
| }, |
| { |
| "epoch": 0.18054886856042368, |
| "grad_norm": 2.484375, |
| "learning_rate": 7.494234925247975e-05, |
| "loss": 0.8644, |
| "step": 750 |
| }, |
| { |
| "epoch": 0.18175252768415984, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.493873366549319e-05, |
| "loss": 0.8841, |
| "step": 755 |
| }, |
| { |
| "epoch": 0.182956186807896, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.4935093004427e-05, |
| "loss": 0.8557, |
| "step": 760 |
| }, |
| { |
| "epoch": 0.18415984593163215, |
| "grad_norm": 2.25, |
| "learning_rate": 7.493142727254856e-05, |
| "loss": 0.8904, |
| "step": 765 |
| }, |
| { |
| "epoch": 0.18536350505536833, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.492773647314775e-05, |
| "loss": 0.8465, |
| "step": 770 |
| }, |
| { |
| "epoch": 0.1865671641791045, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.492402060953692e-05, |
| "loss": 0.9323, |
| "step": 775 |
| }, |
| { |
| "epoch": 0.18777082330284064, |
| "grad_norm": 2.390625, |
| "learning_rate": 7.492027968505095e-05, |
| "loss": 0.8839, |
| "step": 780 |
| }, |
| { |
| "epoch": 0.1889744824265768, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.49165137030472e-05, |
| "loss": 0.9033, |
| "step": 785 |
| }, |
| { |
| "epoch": 0.19017814155031296, |
| "grad_norm": 2.4375, |
| "learning_rate": 7.491272266690549e-05, |
| "loss": 0.8841, |
| "step": 790 |
| }, |
| { |
| "epoch": 0.1913818006740491, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.490890658002814e-05, |
| "loss": 0.8432, |
| "step": 795 |
| }, |
| { |
| "epoch": 0.19258545979778527, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.490506544584e-05, |
| "loss": 0.8822, |
| "step": 800 |
| }, |
| { |
| "epoch": 0.19378911892152142, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.490119926778834e-05, |
| "loss": 0.889, |
| "step": 805 |
| }, |
| { |
| "epoch": 0.19499277804525758, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.489730804934292e-05, |
| "loss": 0.8852, |
| "step": 810 |
| }, |
| { |
| "epoch": 0.19619643716899374, |
| "grad_norm": 2.40625, |
| "learning_rate": 7.489339179399597e-05, |
| "loss": 0.8688, |
| "step": 815 |
| }, |
| { |
| "epoch": 0.1974000962927299, |
| "grad_norm": 2.40625, |
| "learning_rate": 7.488945050526224e-05, |
| "loss": 0.8844, |
| "step": 820 |
| }, |
| { |
| "epoch": 0.19860375541646605, |
| "grad_norm": 2.515625, |
| "learning_rate": 7.488548418667887e-05, |
| "loss": 0.8692, |
| "step": 825 |
| }, |
| { |
| "epoch": 0.1998074145402022, |
| "grad_norm": 2.34375, |
| "learning_rate": 7.48814928418055e-05, |
| "loss": 0.8846, |
| "step": 830 |
| }, |
| { |
| "epoch": 0.20101107366393836, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.487747647422422e-05, |
| "loss": 0.895, |
| "step": 835 |
| }, |
| { |
| "epoch": 0.20221473278767452, |
| "grad_norm": 2.40625, |
| "learning_rate": 7.48734350875396e-05, |
| "loss": 0.865, |
| "step": 840 |
| }, |
| { |
| "epoch": 0.2034183919114107, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.486936868537866e-05, |
| "loss": 0.8804, |
| "step": 845 |
| }, |
| { |
| "epoch": 0.20462205103514686, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.486527727139085e-05, |
| "loss": 0.892, |
| "step": 850 |
| }, |
| { |
| "epoch": 0.205825710158883, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.486116084924808e-05, |
| "loss": 0.9048, |
| "step": 855 |
| }, |
| { |
| "epoch": 0.20702936928261917, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.485701942264469e-05, |
| "loss": 0.8856, |
| "step": 860 |
| }, |
| { |
| "epoch": 0.20823302840635532, |
| "grad_norm": 2.125, |
| "learning_rate": 7.485285299529746e-05, |
| "loss": 0.9206, |
| "step": 865 |
| }, |
| { |
| "epoch": 0.20943668753009148, |
| "grad_norm": 2.34375, |
| "learning_rate": 7.484866157094568e-05, |
| "loss": 0.902, |
| "step": 870 |
| }, |
| { |
| "epoch": 0.21064034665382764, |
| "grad_norm": 2.5, |
| "learning_rate": 7.484444515335095e-05, |
| "loss": 0.8681, |
| "step": 875 |
| }, |
| { |
| "epoch": 0.2118440057775638, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.484020374629738e-05, |
| "loss": 0.8925, |
| "step": 880 |
| }, |
| { |
| "epoch": 0.21304766490129995, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.483593735359151e-05, |
| "loss": 0.8729, |
| "step": 885 |
| }, |
| { |
| "epoch": 0.2142513240250361, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.483164597906225e-05, |
| "loss": 0.8567, |
| "step": 890 |
| }, |
| { |
| "epoch": 0.21545498314877226, |
| "grad_norm": 2.40625, |
| "learning_rate": 7.482732962656101e-05, |
| "loss": 0.867, |
| "step": 895 |
| }, |
| { |
| "epoch": 0.21665864227250842, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.482298829996155e-05, |
| "loss": 0.8476, |
| "step": 900 |
| }, |
| { |
| "epoch": 0.21786230139624457, |
| "grad_norm": 2.4375, |
| "learning_rate": 7.481862200316005e-05, |
| "loss": 0.8878, |
| "step": 905 |
| }, |
| { |
| "epoch": 0.21906596051998073, |
| "grad_norm": 2.46875, |
| "learning_rate": 7.481423074007512e-05, |
| "loss": 0.8733, |
| "step": 910 |
| }, |
| { |
| "epoch": 0.2202696196437169, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.48098145146478e-05, |
| "loss": 0.8523, |
| "step": 915 |
| }, |
| { |
| "epoch": 0.22147327876745307, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.480537333084149e-05, |
| "loss": 0.8696, |
| "step": 920 |
| }, |
| { |
| "epoch": 0.22267693789118922, |
| "grad_norm": 2.65625, |
| "learning_rate": 7.480090719264199e-05, |
| "loss": 0.8744, |
| "step": 925 |
| }, |
| { |
| "epoch": 0.22388059701492538, |
| "grad_norm": 2.421875, |
| "learning_rate": 7.479641610405752e-05, |
| "loss": 0.8644, |
| "step": 930 |
| }, |
| { |
| "epoch": 0.22508425613866154, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.479190006911868e-05, |
| "loss": 0.8718, |
| "step": 935 |
| }, |
| { |
| "epoch": 0.2262879152623977, |
| "grad_norm": 2.34375, |
| "learning_rate": 7.478735909187847e-05, |
| "loss": 0.8723, |
| "step": 940 |
| }, |
| { |
| "epoch": 0.22749157438613385, |
| "grad_norm": 2.40625, |
| "learning_rate": 7.478279317641225e-05, |
| "loss": 0.8696, |
| "step": 945 |
| }, |
| { |
| "epoch": 0.22869523350987, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.47782023268178e-05, |
| "loss": 0.8958, |
| "step": 950 |
| }, |
| { |
| "epoch": 0.22989889263360616, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.477358654721523e-05, |
| "loss": 0.8537, |
| "step": 955 |
| }, |
| { |
| "epoch": 0.23110255175734232, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.476894584174705e-05, |
| "loss": 0.8586, |
| "step": 960 |
| }, |
| { |
| "epoch": 0.23230621088107847, |
| "grad_norm": 2.375, |
| "learning_rate": 7.476428021457815e-05, |
| "loss": 0.8727, |
| "step": 965 |
| }, |
| { |
| "epoch": 0.23350987000481463, |
| "grad_norm": 2.59375, |
| "learning_rate": 7.475958966989575e-05, |
| "loss": 0.8582, |
| "step": 970 |
| }, |
| { |
| "epoch": 0.23471352912855079, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.47548742119095e-05, |
| "loss": 0.8351, |
| "step": 975 |
| }, |
| { |
| "epoch": 0.23591718825228694, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.475013384485134e-05, |
| "loss": 0.841, |
| "step": 980 |
| }, |
| { |
| "epoch": 0.2371208473760231, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.474536857297558e-05, |
| "loss": 0.8406, |
| "step": 985 |
| }, |
| { |
| "epoch": 0.23832450649975928, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.474057840055891e-05, |
| "loss": 0.8378, |
| "step": 990 |
| }, |
| { |
| "epoch": 0.23952816562349544, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.473576333190034e-05, |
| "loss": 0.8534, |
| "step": 995 |
| }, |
| { |
| "epoch": 0.2407318247472316, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.473092337132126e-05, |
| "loss": 0.8428, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.2407318247472316, |
| "eval_loss": 0.7515629529953003, |
| "eval_runtime": 2.4162, |
| "eval_samples_per_second": 82.774, |
| "eval_steps_per_second": 82.774, |
| "step": 1000 |
| }, |
| { |
| "epoch": 0.24193548387096775, |
| "grad_norm": 2.125, |
| "learning_rate": 7.472605852316533e-05, |
| "loss": 0.8745, |
| "step": 1005 |
| }, |
| { |
| "epoch": 0.2431391429947039, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.47211687917986e-05, |
| "loss": 0.8463, |
| "step": 1010 |
| }, |
| { |
| "epoch": 0.24434280211844006, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.471625418160947e-05, |
| "loss": 0.8593, |
| "step": 1015 |
| }, |
| { |
| "epoch": 0.24554646124217622, |
| "grad_norm": 2.453125, |
| "learning_rate": 7.471131469700862e-05, |
| "loss": 0.8309, |
| "step": 1020 |
| }, |
| { |
| "epoch": 0.24675012036591237, |
| "grad_norm": 2.125, |
| "learning_rate": 7.470635034242906e-05, |
| "loss": 0.8165, |
| "step": 1025 |
| }, |
| { |
| "epoch": 0.24795377948964853, |
| "grad_norm": 2.34375, |
| "learning_rate": 7.470136112232614e-05, |
| "loss": 0.8193, |
| "step": 1030 |
| }, |
| { |
| "epoch": 0.24915743861338469, |
| "grad_norm": 2.515625, |
| "learning_rate": 7.469634704117752e-05, |
| "loss": 0.8642, |
| "step": 1035 |
| }, |
| { |
| "epoch": 0.25036109773712084, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.469130810348318e-05, |
| "loss": 0.8601, |
| "step": 1040 |
| }, |
| { |
| "epoch": 0.251564756860857, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.468624431376538e-05, |
| "loss": 0.7957, |
| "step": 1045 |
| }, |
| { |
| "epoch": 0.25276841598459315, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.468115567656872e-05, |
| "loss": 0.8385, |
| "step": 1050 |
| }, |
| { |
| "epoch": 0.2539720751083293, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.467604219646007e-05, |
| "loss": 0.7962, |
| "step": 1055 |
| }, |
| { |
| "epoch": 0.25517573423206547, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.467090387802862e-05, |
| "loss": 0.8701, |
| "step": 1060 |
| }, |
| { |
| "epoch": 0.2563793933558016, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.466574072588581e-05, |
| "loss": 0.8678, |
| "step": 1065 |
| }, |
| { |
| "epoch": 0.2575830524795378, |
| "grad_norm": 2.34375, |
| "learning_rate": 7.466055274466543e-05, |
| "loss": 0.8385, |
| "step": 1070 |
| }, |
| { |
| "epoch": 0.25878671160327393, |
| "grad_norm": 2.375, |
| "learning_rate": 7.46553399390235e-05, |
| "loss": 0.8711, |
| "step": 1075 |
| }, |
| { |
| "epoch": 0.2599903707270101, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.465010231363835e-05, |
| "loss": 0.8953, |
| "step": 1080 |
| }, |
| { |
| "epoch": 0.26119402985074625, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.464483987321056e-05, |
| "loss": 0.8106, |
| "step": 1085 |
| }, |
| { |
| "epoch": 0.2623976889744824, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.463955262246301e-05, |
| "loss": 0.8329, |
| "step": 1090 |
| }, |
| { |
| "epoch": 0.26360134809821856, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.463424056614082e-05, |
| "loss": 0.8217, |
| "step": 1095 |
| }, |
| { |
| "epoch": 0.26480500722195477, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.46289037090114e-05, |
| "loss": 0.8368, |
| "step": 1100 |
| }, |
| { |
| "epoch": 0.2660086663456909, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.462354205586437e-05, |
| "loss": 0.8145, |
| "step": 1105 |
| }, |
| { |
| "epoch": 0.2672123254694271, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.461815561151166e-05, |
| "loss": 0.7885, |
| "step": 1110 |
| }, |
| { |
| "epoch": 0.26841598459316324, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.461274438078741e-05, |
| "loss": 0.845, |
| "step": 1115 |
| }, |
| { |
| "epoch": 0.2696196437168994, |
| "grad_norm": 2.453125, |
| "learning_rate": 7.460730836854803e-05, |
| "loss": 0.7927, |
| "step": 1120 |
| }, |
| { |
| "epoch": 0.27082330284063555, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.460184757967215e-05, |
| "loss": 0.85, |
| "step": 1125 |
| }, |
| { |
| "epoch": 0.2720269619643717, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.459636201906066e-05, |
| "loss": 0.8376, |
| "step": 1130 |
| }, |
| { |
| "epoch": 0.27323062108810786, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.459085169163664e-05, |
| "loss": 0.866, |
| "step": 1135 |
| }, |
| { |
| "epoch": 0.274434280211844, |
| "grad_norm": 2.125, |
| "learning_rate": 7.458531660234546e-05, |
| "loss": 0.8382, |
| "step": 1140 |
| }, |
| { |
| "epoch": 0.2756379393355802, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.457975675615464e-05, |
| "loss": 0.8455, |
| "step": 1145 |
| }, |
| { |
| "epoch": 0.27684159845931633, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.457417215805399e-05, |
| "loss": 0.8559, |
| "step": 1150 |
| }, |
| { |
| "epoch": 0.2780452575830525, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.456856281305547e-05, |
| "loss": 0.8299, |
| "step": 1155 |
| }, |
| { |
| "epoch": 0.27924891670678864, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.45629287261933e-05, |
| "loss": 0.8586, |
| "step": 1160 |
| }, |
| { |
| "epoch": 0.2804525758305248, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.455726990252389e-05, |
| "loss": 0.7975, |
| "step": 1165 |
| }, |
| { |
| "epoch": 0.28165623495426095, |
| "grad_norm": 2.484375, |
| "learning_rate": 7.455158634712583e-05, |
| "loss": 0.8304, |
| "step": 1170 |
| }, |
| { |
| "epoch": 0.2828598940779971, |
| "grad_norm": 2.125, |
| "learning_rate": 7.454587806509992e-05, |
| "loss": 0.819, |
| "step": 1175 |
| }, |
| { |
| "epoch": 0.28406355320173327, |
| "grad_norm": 2.453125, |
| "learning_rate": 7.454014506156915e-05, |
| "loss": 0.8544, |
| "step": 1180 |
| }, |
| { |
| "epoch": 0.2852672123254694, |
| "grad_norm": 2.515625, |
| "learning_rate": 7.453438734167873e-05, |
| "loss": 0.8258, |
| "step": 1185 |
| }, |
| { |
| "epoch": 0.2864708714492056, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.452860491059598e-05, |
| "loss": 0.8564, |
| "step": 1190 |
| }, |
| { |
| "epoch": 0.28767453057294173, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.452279777351046e-05, |
| "loss": 0.8325, |
| "step": 1195 |
| }, |
| { |
| "epoch": 0.2888781896966779, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.451696593563388e-05, |
| "loss": 0.8374, |
| "step": 1200 |
| }, |
| { |
| "epoch": 0.29008184882041405, |
| "grad_norm": 2.0, |
| "learning_rate": 7.451110940220013e-05, |
| "loss": 0.7921, |
| "step": 1205 |
| }, |
| { |
| "epoch": 0.2912855079441502, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.450522817846522e-05, |
| "loss": 0.8379, |
| "step": 1210 |
| }, |
| { |
| "epoch": 0.29248916706788636, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.449932226970739e-05, |
| "loss": 0.8362, |
| "step": 1215 |
| }, |
| { |
| "epoch": 0.2936928261916225, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.449339168122696e-05, |
| "loss": 0.8319, |
| "step": 1220 |
| }, |
| { |
| "epoch": 0.29489648531535867, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.448743641834646e-05, |
| "loss": 0.8261, |
| "step": 1225 |
| }, |
| { |
| "epoch": 0.2961001444390948, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.448145648641054e-05, |
| "loss": 0.8369, |
| "step": 1230 |
| }, |
| { |
| "epoch": 0.297303803562831, |
| "grad_norm": 2.421875, |
| "learning_rate": 7.447545189078597e-05, |
| "loss": 0.8054, |
| "step": 1235 |
| }, |
| { |
| "epoch": 0.29850746268656714, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.446942263686169e-05, |
| "loss": 0.8111, |
| "step": 1240 |
| }, |
| { |
| "epoch": 0.29971112181030335, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.446336873004875e-05, |
| "loss": 0.8285, |
| "step": 1245 |
| }, |
| { |
| "epoch": 0.3009147809340395, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.445729017578033e-05, |
| "loss": 0.8248, |
| "step": 1250 |
| }, |
| { |
| "epoch": 0.30211844005777566, |
| "grad_norm": 2.25, |
| "learning_rate": 7.445118697951173e-05, |
| "loss": 0.8131, |
| "step": 1255 |
| }, |
| { |
| "epoch": 0.3033220991815118, |
| "grad_norm": 1.9765625, |
| "learning_rate": 7.444505914672035e-05, |
| "loss": 0.8288, |
| "step": 1260 |
| }, |
| { |
| "epoch": 0.304525758305248, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.443890668290574e-05, |
| "loss": 0.7962, |
| "step": 1265 |
| }, |
| { |
| "epoch": 0.30572941742898413, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.443272959358952e-05, |
| "loss": 0.8235, |
| "step": 1270 |
| }, |
| { |
| "epoch": 0.3069330765527203, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.442652788431541e-05, |
| "loss": 0.8137, |
| "step": 1275 |
| }, |
| { |
| "epoch": 0.30813673567645644, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.442030156064925e-05, |
| "loss": 0.7973, |
| "step": 1280 |
| }, |
| { |
| "epoch": 0.3093403948001926, |
| "grad_norm": 2.453125, |
| "learning_rate": 7.441405062817895e-05, |
| "loss": 0.8416, |
| "step": 1285 |
| }, |
| { |
| "epoch": 0.31054405392392875, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.440777509251453e-05, |
| "loss": 0.8208, |
| "step": 1290 |
| }, |
| { |
| "epoch": 0.3117477130476649, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.440147495928803e-05, |
| "loss": 0.8301, |
| "step": 1295 |
| }, |
| { |
| "epoch": 0.31295137217140107, |
| "grad_norm": 2.375, |
| "learning_rate": 7.439515023415366e-05, |
| "loss": 0.7933, |
| "step": 1300 |
| }, |
| { |
| "epoch": 0.3141550312951372, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.438880092278763e-05, |
| "loss": 0.7935, |
| "step": 1305 |
| }, |
| { |
| "epoch": 0.3153586904188734, |
| "grad_norm": 2.40625, |
| "learning_rate": 7.438242703088822e-05, |
| "loss": 0.8092, |
| "step": 1310 |
| }, |
| { |
| "epoch": 0.31656234954260953, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.43760285641758e-05, |
| "loss": 0.841, |
| "step": 1315 |
| }, |
| { |
| "epoch": 0.3177660086663457, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.436960552839279e-05, |
| "loss": 0.8307, |
| "step": 1320 |
| }, |
| { |
| "epoch": 0.31896966779008185, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.436315792930362e-05, |
| "loss": 0.823, |
| "step": 1325 |
| }, |
| { |
| "epoch": 0.320173326913818, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.435668577269483e-05, |
| "loss": 0.8125, |
| "step": 1330 |
| }, |
| { |
| "epoch": 0.32137698603755416, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.435018906437495e-05, |
| "loss": 0.8152, |
| "step": 1335 |
| }, |
| { |
| "epoch": 0.3225806451612903, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.434366781017453e-05, |
| "loss": 0.7877, |
| "step": 1340 |
| }, |
| { |
| "epoch": 0.32378430428502647, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.433712201594622e-05, |
| "loss": 0.7896, |
| "step": 1345 |
| }, |
| { |
| "epoch": 0.3249879634087626, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.433055168756462e-05, |
| "loss": 0.7763, |
| "step": 1350 |
| }, |
| { |
| "epoch": 0.3261916225324988, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.432395683092641e-05, |
| "loss": 0.8121, |
| "step": 1355 |
| }, |
| { |
| "epoch": 0.32739528165623494, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.431733745195025e-05, |
| "loss": 0.7965, |
| "step": 1360 |
| }, |
| { |
| "epoch": 0.3285989407799711, |
| "grad_norm": 2.390625, |
| "learning_rate": 7.431069355657676e-05, |
| "loss": 0.8458, |
| "step": 1365 |
| }, |
| { |
| "epoch": 0.32980259990370725, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.430402515076869e-05, |
| "loss": 0.7621, |
| "step": 1370 |
| }, |
| { |
| "epoch": 0.3310062590274434, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.429733224051065e-05, |
| "loss": 0.8226, |
| "step": 1375 |
| }, |
| { |
| "epoch": 0.33220991815117956, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.429061483180935e-05, |
| "loss": 0.7758, |
| "step": 1380 |
| }, |
| { |
| "epoch": 0.3334135772749157, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.428387293069341e-05, |
| "loss": 0.7796, |
| "step": 1385 |
| }, |
| { |
| "epoch": 0.33461723639865193, |
| "grad_norm": 2.578125, |
| "learning_rate": 7.427710654321345e-05, |
| "loss": 0.8098, |
| "step": 1390 |
| }, |
| { |
| "epoch": 0.3358208955223881, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.427031567544212e-05, |
| "loss": 0.8161, |
| "step": 1395 |
| }, |
| { |
| "epoch": 0.33702455464612424, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.426350033347396e-05, |
| "loss": 0.8314, |
| "step": 1400 |
| }, |
| { |
| "epoch": 0.3382282137698604, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.425666052342554e-05, |
| "loss": 0.7734, |
| "step": 1405 |
| }, |
| { |
| "epoch": 0.33943187289359655, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.424979625143531e-05, |
| "loss": 0.8005, |
| "step": 1410 |
| }, |
| { |
| "epoch": 0.3406355320173327, |
| "grad_norm": 2.375, |
| "learning_rate": 7.424290752366379e-05, |
| "loss": 0.8085, |
| "step": 1415 |
| }, |
| { |
| "epoch": 0.34183919114106887, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.423599434629334e-05, |
| "loss": 0.81, |
| "step": 1420 |
| }, |
| { |
| "epoch": 0.343042850264805, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.422905672552831e-05, |
| "loss": 0.8262, |
| "step": 1425 |
| }, |
| { |
| "epoch": 0.3442465093885412, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.4222094667595e-05, |
| "loss": 0.7969, |
| "step": 1430 |
| }, |
| { |
| "epoch": 0.34545016851227733, |
| "grad_norm": 2.125, |
| "learning_rate": 7.421510817874162e-05, |
| "loss": 0.8157, |
| "step": 1435 |
| }, |
| { |
| "epoch": 0.3466538276360135, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.42080972652383e-05, |
| "loss": 0.791, |
| "step": 1440 |
| }, |
| { |
| "epoch": 0.34785748675974965, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.42010619333771e-05, |
| "loss": 0.7623, |
| "step": 1445 |
| }, |
| { |
| "epoch": 0.3490611458834858, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.419400218947201e-05, |
| "loss": 0.7848, |
| "step": 1450 |
| }, |
| { |
| "epoch": 0.35026480500722196, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.41869180398589e-05, |
| "loss": 0.77, |
| "step": 1455 |
| }, |
| { |
| "epoch": 0.3514684641309581, |
| "grad_norm": 2.546875, |
| "learning_rate": 7.417980949089556e-05, |
| "loss": 0.7763, |
| "step": 1460 |
| }, |
| { |
| "epoch": 0.35267212325469427, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.417267654896169e-05, |
| "loss": 0.7987, |
| "step": 1465 |
| }, |
| { |
| "epoch": 0.3538757823784304, |
| "grad_norm": 2.390625, |
| "learning_rate": 7.416551922045884e-05, |
| "loss": 0.8275, |
| "step": 1470 |
| }, |
| { |
| "epoch": 0.3550794415021666, |
| "grad_norm": 2.25, |
| "learning_rate": 7.415833751181048e-05, |
| "loss": 0.811, |
| "step": 1475 |
| }, |
| { |
| "epoch": 0.35628310062590274, |
| "grad_norm": 2.125, |
| "learning_rate": 7.415113142946199e-05, |
| "loss": 0.7969, |
| "step": 1480 |
| }, |
| { |
| "epoch": 0.3574867597496389, |
| "grad_norm": 2.40625, |
| "learning_rate": 7.414390097988053e-05, |
| "loss": 0.7832, |
| "step": 1485 |
| }, |
| { |
| "epoch": 0.35869041887337505, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.413664616955524e-05, |
| "loss": 0.7666, |
| "step": 1490 |
| }, |
| { |
| "epoch": 0.3598940779971112, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.412936700499703e-05, |
| "loss": 0.7793, |
| "step": 1495 |
| }, |
| { |
| "epoch": 0.36109773712084736, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.412206349273873e-05, |
| "loss": 0.7734, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.36109773712084736, |
| "eval_loss": 0.687716543674469, |
| "eval_runtime": 2.4175, |
| "eval_samples_per_second": 82.729, |
| "eval_steps_per_second": 82.729, |
| "step": 1500 |
| }, |
| { |
| "epoch": 0.3623013962445835, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.411473563933497e-05, |
| "loss": 0.8028, |
| "step": 1505 |
| }, |
| { |
| "epoch": 0.3635050553683197, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.410738345136231e-05, |
| "loss": 0.7837, |
| "step": 1510 |
| }, |
| { |
| "epoch": 0.36470871449205583, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.410000693541903e-05, |
| "loss": 0.7968, |
| "step": 1515 |
| }, |
| { |
| "epoch": 0.365912373615792, |
| "grad_norm": 2.125, |
| "learning_rate": 7.409260609812534e-05, |
| "loss": 0.7674, |
| "step": 1520 |
| }, |
| { |
| "epoch": 0.36711603273952814, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.408518094612324e-05, |
| "loss": 0.7536, |
| "step": 1525 |
| }, |
| { |
| "epoch": 0.3683196918632643, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.407773148607656e-05, |
| "loss": 0.8126, |
| "step": 1530 |
| }, |
| { |
| "epoch": 0.36952335098700045, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.407025772467092e-05, |
| "loss": 0.8111, |
| "step": 1535 |
| }, |
| { |
| "epoch": 0.37072701011073667, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.406275966861379e-05, |
| "loss": 0.8091, |
| "step": 1540 |
| }, |
| { |
| "epoch": 0.3719306692344728, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.405523732463444e-05, |
| "loss": 0.7743, |
| "step": 1545 |
| }, |
| { |
| "epoch": 0.373134328358209, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.404769069948389e-05, |
| "loss": 0.7793, |
| "step": 1550 |
| }, |
| { |
| "epoch": 0.37433798748194513, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.404011979993499e-05, |
| "loss": 0.7935, |
| "step": 1555 |
| }, |
| { |
| "epoch": 0.3755416466056813, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.403252463278238e-05, |
| "loss": 0.7894, |
| "step": 1560 |
| }, |
| { |
| "epoch": 0.37674530572941745, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.402490520484246e-05, |
| "loss": 0.7806, |
| "step": 1565 |
| }, |
| { |
| "epoch": 0.3779489648531536, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.401726152295342e-05, |
| "loss": 0.8119, |
| "step": 1570 |
| }, |
| { |
| "epoch": 0.37915262397688976, |
| "grad_norm": 2.25, |
| "learning_rate": 7.40095935939752e-05, |
| "loss": 0.7975, |
| "step": 1575 |
| }, |
| { |
| "epoch": 0.3803562831006259, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.400190142478953e-05, |
| "loss": 0.7802, |
| "step": 1580 |
| }, |
| { |
| "epoch": 0.38155994222436207, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.399418502229986e-05, |
| "loss": 0.7909, |
| "step": 1585 |
| }, |
| { |
| "epoch": 0.3827636013480982, |
| "grad_norm": 2.375, |
| "learning_rate": 7.398644439343139e-05, |
| "loss": 0.8037, |
| "step": 1590 |
| }, |
| { |
| "epoch": 0.3839672604718344, |
| "grad_norm": 2.421875, |
| "learning_rate": 7.397867954513109e-05, |
| "loss": 0.7849, |
| "step": 1595 |
| }, |
| { |
| "epoch": 0.38517091959557054, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.397089048436767e-05, |
| "loss": 0.7871, |
| "step": 1600 |
| }, |
| { |
| "epoch": 0.3863745787193067, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.396307721813152e-05, |
| "loss": 0.7793, |
| "step": 1605 |
| }, |
| { |
| "epoch": 0.38757823784304285, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.395523975343479e-05, |
| "loss": 0.7851, |
| "step": 1610 |
| }, |
| { |
| "epoch": 0.388781896966779, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.394737809731136e-05, |
| "loss": 0.797, |
| "step": 1615 |
| }, |
| { |
| "epoch": 0.38998555609051516, |
| "grad_norm": 1.9296875, |
| "learning_rate": 7.39394922568168e-05, |
| "loss": 0.7627, |
| "step": 1620 |
| }, |
| { |
| "epoch": 0.3911892152142513, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.393158223902837e-05, |
| "loss": 0.8324, |
| "step": 1625 |
| }, |
| { |
| "epoch": 0.3923928743379875, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.392364805104507e-05, |
| "loss": 0.7787, |
| "step": 1630 |
| }, |
| { |
| "epoch": 0.39359653346172363, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.391568969998755e-05, |
| "loss": 0.7932, |
| "step": 1635 |
| }, |
| { |
| "epoch": 0.3948001925854598, |
| "grad_norm": 2.34375, |
| "learning_rate": 7.390770719299817e-05, |
| "loss": 0.801, |
| "step": 1640 |
| }, |
| { |
| "epoch": 0.39600385170919594, |
| "grad_norm": 2.25, |
| "learning_rate": 7.389970053724096e-05, |
| "loss": 0.7666, |
| "step": 1645 |
| }, |
| { |
| "epoch": 0.3972075108329321, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.389166973990165e-05, |
| "loss": 0.7781, |
| "step": 1650 |
| }, |
| { |
| "epoch": 0.39841116995666825, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.388361480818758e-05, |
| "loss": 0.7947, |
| "step": 1655 |
| }, |
| { |
| "epoch": 0.3996148290804044, |
| "grad_norm": 1.9921875, |
| "learning_rate": 7.38755357493278e-05, |
| "loss": 0.7934, |
| "step": 1660 |
| }, |
| { |
| "epoch": 0.40081848820414057, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.386743257057299e-05, |
| "loss": 0.769, |
| "step": 1665 |
| }, |
| { |
| "epoch": 0.4020221473278767, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.385930527919548e-05, |
| "loss": 0.7539, |
| "step": 1670 |
| }, |
| { |
| "epoch": 0.4032258064516129, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.385115388248925e-05, |
| "loss": 0.7754, |
| "step": 1675 |
| }, |
| { |
| "epoch": 0.40442946557534903, |
| "grad_norm": 2.125, |
| "learning_rate": 7.384297838776988e-05, |
| "loss": 0.8041, |
| "step": 1680 |
| }, |
| { |
| "epoch": 0.40563312469908525, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.383477880237465e-05, |
| "loss": 0.7606, |
| "step": 1685 |
| }, |
| { |
| "epoch": 0.4068367838228214, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.382655513366237e-05, |
| "loss": 0.7865, |
| "step": 1690 |
| }, |
| { |
| "epoch": 0.40804044294655756, |
| "grad_norm": 1.953125, |
| "learning_rate": 7.381830738901354e-05, |
| "loss": 0.7656, |
| "step": 1695 |
| }, |
| { |
| "epoch": 0.4092441020702937, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.381003557583022e-05, |
| "loss": 0.76, |
| "step": 1700 |
| }, |
| { |
| "epoch": 0.41044776119402987, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.380173970153607e-05, |
| "loss": 0.793, |
| "step": 1705 |
| }, |
| { |
| "epoch": 0.411651420317766, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.37934197735764e-05, |
| "loss": 0.756, |
| "step": 1710 |
| }, |
| { |
| "epoch": 0.4128550794415022, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.378507579941802e-05, |
| "loss": 0.7674, |
| "step": 1715 |
| }, |
| { |
| "epoch": 0.41405873856523834, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.377670778654941e-05, |
| "loss": 0.7861, |
| "step": 1720 |
| }, |
| { |
| "epoch": 0.4152623976889745, |
| "grad_norm": 2.375, |
| "learning_rate": 7.376831574248056e-05, |
| "loss": 0.7743, |
| "step": 1725 |
| }, |
| { |
| "epoch": 0.41646605681271065, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.375989967474304e-05, |
| "loss": 0.7511, |
| "step": 1730 |
| }, |
| { |
| "epoch": 0.4176697159364468, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.375145959089001e-05, |
| "loss": 0.7772, |
| "step": 1735 |
| }, |
| { |
| "epoch": 0.41887337506018296, |
| "grad_norm": 2.125, |
| "learning_rate": 7.374299549849616e-05, |
| "loss": 0.7708, |
| "step": 1740 |
| }, |
| { |
| "epoch": 0.4200770341839191, |
| "grad_norm": 2.125, |
| "learning_rate": 7.373450740515772e-05, |
| "loss": 0.7664, |
| "step": 1745 |
| }, |
| { |
| "epoch": 0.4212806933076553, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.372599531849249e-05, |
| "loss": 0.7721, |
| "step": 1750 |
| }, |
| { |
| "epoch": 0.42248435243139143, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.371745924613975e-05, |
| "loss": 0.7751, |
| "step": 1755 |
| }, |
| { |
| "epoch": 0.4236880115551276, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.370889919576037e-05, |
| "loss": 0.7575, |
| "step": 1760 |
| }, |
| { |
| "epoch": 0.42489167067886374, |
| "grad_norm": 2.0, |
| "learning_rate": 7.370031517503668e-05, |
| "loss": 0.7773, |
| "step": 1765 |
| }, |
| { |
| "epoch": 0.4260953298025999, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.36917071916726e-05, |
| "loss": 0.7559, |
| "step": 1770 |
| }, |
| { |
| "epoch": 0.42729898892633605, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.368307525339345e-05, |
| "loss": 0.7386, |
| "step": 1775 |
| }, |
| { |
| "epoch": 0.4285026480500722, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.367441936794613e-05, |
| "loss": 0.7575, |
| "step": 1780 |
| }, |
| { |
| "epoch": 0.42970630717380837, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.366573954309902e-05, |
| "loss": 0.7845, |
| "step": 1785 |
| }, |
| { |
| "epoch": 0.4309099662975445, |
| "grad_norm": 2.421875, |
| "learning_rate": 7.365703578664196e-05, |
| "loss": 0.8023, |
| "step": 1790 |
| }, |
| { |
| "epoch": 0.4321136254212807, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.364830810638628e-05, |
| "loss": 0.7781, |
| "step": 1795 |
| }, |
| { |
| "epoch": 0.43331728454501683, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.36395565101648e-05, |
| "loss": 0.7705, |
| "step": 1800 |
| }, |
| { |
| "epoch": 0.434520943668753, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.363078100583177e-05, |
| "loss": 0.8125, |
| "step": 1805 |
| }, |
| { |
| "epoch": 0.43572460279248915, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.36219816012629e-05, |
| "loss": 0.7666, |
| "step": 1810 |
| }, |
| { |
| "epoch": 0.4369282619162253, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.361315830435537e-05, |
| "loss": 0.7514, |
| "step": 1815 |
| }, |
| { |
| "epoch": 0.43813192103996146, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.360431112302781e-05, |
| "loss": 0.7494, |
| "step": 1820 |
| }, |
| { |
| "epoch": 0.4393355801636976, |
| "grad_norm": 2.25, |
| "learning_rate": 7.359544006522026e-05, |
| "loss": 0.7663, |
| "step": 1825 |
| }, |
| { |
| "epoch": 0.4405392392874338, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.358654513889417e-05, |
| "loss": 0.7493, |
| "step": 1830 |
| }, |
| { |
| "epoch": 0.44174289841117, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.357762635203247e-05, |
| "loss": 0.7722, |
| "step": 1835 |
| }, |
| { |
| "epoch": 0.44294655753490614, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.35686837126395e-05, |
| "loss": 0.7896, |
| "step": 1840 |
| }, |
| { |
| "epoch": 0.4441502166586423, |
| "grad_norm": 1.8984375, |
| "learning_rate": 7.355971722874091e-05, |
| "loss": 0.7486, |
| "step": 1845 |
| }, |
| { |
| "epoch": 0.44535387578237845, |
| "grad_norm": 2.125, |
| "learning_rate": 7.355072690838387e-05, |
| "loss": 0.7846, |
| "step": 1850 |
| }, |
| { |
| "epoch": 0.4465575349061146, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.354171275963688e-05, |
| "loss": 0.7665, |
| "step": 1855 |
| }, |
| { |
| "epoch": 0.44776119402985076, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.353267479058982e-05, |
| "loss": 0.7758, |
| "step": 1860 |
| }, |
| { |
| "epoch": 0.4489648531535869, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.3523613009354e-05, |
| "loss": 0.723, |
| "step": 1865 |
| }, |
| { |
| "epoch": 0.4501685122773231, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.351452742406204e-05, |
| "loss": 0.7733, |
| "step": 1870 |
| }, |
| { |
| "epoch": 0.45137217140105923, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.350541804286795e-05, |
| "loss": 0.7683, |
| "step": 1875 |
| }, |
| { |
| "epoch": 0.4525758305247954, |
| "grad_norm": 1.9609375, |
| "learning_rate": 7.34962848739471e-05, |
| "loss": 0.7656, |
| "step": 1880 |
| }, |
| { |
| "epoch": 0.45377948964853154, |
| "grad_norm": 2.25, |
| "learning_rate": 7.348712792549623e-05, |
| "loss": 0.7732, |
| "step": 1885 |
| }, |
| { |
| "epoch": 0.4549831487722677, |
| "grad_norm": 2.578125, |
| "learning_rate": 7.347794720573334e-05, |
| "loss": 0.7221, |
| "step": 1890 |
| }, |
| { |
| "epoch": 0.45618680789600385, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.346874272289787e-05, |
| "loss": 0.728, |
| "step": 1895 |
| }, |
| { |
| "epoch": 0.45739046701974, |
| "grad_norm": 2.359375, |
| "learning_rate": 7.34595144852505e-05, |
| "loss": 0.8017, |
| "step": 1900 |
| }, |
| { |
| "epoch": 0.45859412614347617, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.345026250107328e-05, |
| "loss": 0.7741, |
| "step": 1905 |
| }, |
| { |
| "epoch": 0.4597977852672123, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.344098677866956e-05, |
| "loss": 0.7762, |
| "step": 1910 |
| }, |
| { |
| "epoch": 0.4610014443909485, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.343168732636399e-05, |
| "loss": 0.7609, |
| "step": 1915 |
| }, |
| { |
| "epoch": 0.46220510351468463, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.342236415250251e-05, |
| "loss": 0.7588, |
| "step": 1920 |
| }, |
| { |
| "epoch": 0.4634087626384208, |
| "grad_norm": 2.125, |
| "learning_rate": 7.341301726545236e-05, |
| "loss": 0.7907, |
| "step": 1925 |
| }, |
| { |
| "epoch": 0.46461242176215695, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.340364667360207e-05, |
| "loss": 0.7583, |
| "step": 1930 |
| }, |
| { |
| "epoch": 0.4658160808858931, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.339425238536141e-05, |
| "loss": 0.7541, |
| "step": 1935 |
| }, |
| { |
| "epoch": 0.46701974000962926, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.338483440916145e-05, |
| "loss": 0.7562, |
| "step": 1940 |
| }, |
| { |
| "epoch": 0.4682233991333654, |
| "grad_norm": 2.34375, |
| "learning_rate": 7.337539275345452e-05, |
| "loss": 0.7563, |
| "step": 1945 |
| }, |
| { |
| "epoch": 0.46942705825710157, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.336592742671419e-05, |
| "loss": 0.7385, |
| "step": 1950 |
| }, |
| { |
| "epoch": 0.4706307173808377, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.335643843743526e-05, |
| "loss": 0.7353, |
| "step": 1955 |
| }, |
| { |
| "epoch": 0.4718343765045739, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.334692579413379e-05, |
| "loss": 0.7242, |
| "step": 1960 |
| }, |
| { |
| "epoch": 0.47303803562831004, |
| "grad_norm": 2.375, |
| "learning_rate": 7.333738950534705e-05, |
| "loss": 0.7719, |
| "step": 1965 |
| }, |
| { |
| "epoch": 0.4742416947520462, |
| "grad_norm": 2.125, |
| "learning_rate": 7.332782957963356e-05, |
| "loss": 0.7788, |
| "step": 1970 |
| }, |
| { |
| "epoch": 0.4754453538757824, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.3318246025573e-05, |
| "loss": 0.7635, |
| "step": 1975 |
| }, |
| { |
| "epoch": 0.47664901299951856, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.330863885176631e-05, |
| "loss": 0.7608, |
| "step": 1980 |
| }, |
| { |
| "epoch": 0.4778526721232547, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.329900806683563e-05, |
| "loss": 0.7329, |
| "step": 1985 |
| }, |
| { |
| "epoch": 0.4790563312469909, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.328935367942422e-05, |
| "loss": 0.751, |
| "step": 1990 |
| }, |
| { |
| "epoch": 0.48025999037072703, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.32796756981966e-05, |
| "loss": 0.7366, |
| "step": 1995 |
| }, |
| { |
| "epoch": 0.4814636494944632, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.326997413183845e-05, |
| "loss": 0.7259, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.4814636494944632, |
| "eval_loss": 0.6541061997413635, |
| "eval_runtime": 2.4161, |
| "eval_samples_per_second": 82.778, |
| "eval_steps_per_second": 82.778, |
| "step": 2000 |
| }, |
| { |
| "epoch": 0.48266730861819934, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.326024898905656e-05, |
| "loss": 0.7437, |
| "step": 2005 |
| }, |
| { |
| "epoch": 0.4838709677419355, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.325050027857896e-05, |
| "loss": 0.7322, |
| "step": 2010 |
| }, |
| { |
| "epoch": 0.48507462686567165, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.324072800915476e-05, |
| "loss": 0.7525, |
| "step": 2015 |
| }, |
| { |
| "epoch": 0.4862782859894078, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.323093218955426e-05, |
| "loss": 0.7395, |
| "step": 2020 |
| }, |
| { |
| "epoch": 0.48748194511314397, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.322111282856888e-05, |
| "loss": 0.7477, |
| "step": 2025 |
| }, |
| { |
| "epoch": 0.4886856042368801, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.321126993501118e-05, |
| "loss": 0.7167, |
| "step": 2030 |
| }, |
| { |
| "epoch": 0.4898892633606163, |
| "grad_norm": 2.484375, |
| "learning_rate": 7.32014035177148e-05, |
| "loss": 0.7711, |
| "step": 2035 |
| }, |
| { |
| "epoch": 0.49109292248435243, |
| "grad_norm": 2.4375, |
| "learning_rate": 7.319151358553453e-05, |
| "loss": 0.7454, |
| "step": 2040 |
| }, |
| { |
| "epoch": 0.4922965816080886, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.318160014734628e-05, |
| "loss": 0.7272, |
| "step": 2045 |
| }, |
| { |
| "epoch": 0.49350024073182475, |
| "grad_norm": 2.25, |
| "learning_rate": 7.3171663212047e-05, |
| "loss": 0.7585, |
| "step": 2050 |
| }, |
| { |
| "epoch": 0.4947038998555609, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.316170278855475e-05, |
| "loss": 0.7301, |
| "step": 2055 |
| }, |
| { |
| "epoch": 0.49590755897929706, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.315171888580872e-05, |
| "loss": 0.7209, |
| "step": 2060 |
| }, |
| { |
| "epoch": 0.4971112181030332, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.314171151276908e-05, |
| "loss": 0.7412, |
| "step": 2065 |
| }, |
| { |
| "epoch": 0.49831487722676937, |
| "grad_norm": 2.125, |
| "learning_rate": 7.313168067841716e-05, |
| "loss": 0.7563, |
| "step": 2070 |
| }, |
| { |
| "epoch": 0.4995185363505055, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.312162639175524e-05, |
| "loss": 0.7186, |
| "step": 2075 |
| }, |
| { |
| "epoch": 0.5007221954742417, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.311154866180677e-05, |
| "loss": 0.7328, |
| "step": 2080 |
| }, |
| { |
| "epoch": 0.5019258545979779, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.310144749761613e-05, |
| "loss": 0.7683, |
| "step": 2085 |
| }, |
| { |
| "epoch": 0.503129513721714, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.30913229082488e-05, |
| "loss": 0.7706, |
| "step": 2090 |
| }, |
| { |
| "epoch": 0.5043331728454502, |
| "grad_norm": 1.9140625, |
| "learning_rate": 7.308117490279124e-05, |
| "loss": 0.7109, |
| "step": 2095 |
| }, |
| { |
| "epoch": 0.5055368319691863, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.307100349035097e-05, |
| "loss": 0.7755, |
| "step": 2100 |
| }, |
| { |
| "epoch": 0.5067404910929225, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.306080868005648e-05, |
| "loss": 0.7243, |
| "step": 2105 |
| }, |
| { |
| "epoch": 0.5079441502166586, |
| "grad_norm": 2.125, |
| "learning_rate": 7.305059048105727e-05, |
| "loss": 0.7462, |
| "step": 2110 |
| }, |
| { |
| "epoch": 0.5091478093403948, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.304034890252383e-05, |
| "loss": 0.7665, |
| "step": 2115 |
| }, |
| { |
| "epoch": 0.5103514684641309, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.303008395364765e-05, |
| "loss": 0.7395, |
| "step": 2120 |
| }, |
| { |
| "epoch": 0.5115551275878671, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.301979564364117e-05, |
| "loss": 0.7747, |
| "step": 2125 |
| }, |
| { |
| "epoch": 0.5127587867116032, |
| "grad_norm": 2.25, |
| "learning_rate": 7.300948398173779e-05, |
| "loss": 0.6931, |
| "step": 2130 |
| }, |
| { |
| "epoch": 0.5139624458353395, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.299914897719191e-05, |
| "loss": 0.723, |
| "step": 2135 |
| }, |
| { |
| "epoch": 0.5151661049590756, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.298879063927882e-05, |
| "loss": 0.7726, |
| "step": 2140 |
| }, |
| { |
| "epoch": 0.5163697640828118, |
| "grad_norm": 2.125, |
| "learning_rate": 7.297840897729481e-05, |
| "loss": 0.7356, |
| "step": 2145 |
| }, |
| { |
| "epoch": 0.5175734232065479, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.296800400055706e-05, |
| "loss": 0.7247, |
| "step": 2150 |
| }, |
| { |
| "epoch": 0.5187770823302841, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.295757571840368e-05, |
| "loss": 0.7482, |
| "step": 2155 |
| }, |
| { |
| "epoch": 0.5199807414540202, |
| "grad_norm": 2.0, |
| "learning_rate": 7.294712414019372e-05, |
| "loss": 0.7282, |
| "step": 2160 |
| }, |
| { |
| "epoch": 0.5211844005777564, |
| "grad_norm": 2.25, |
| "learning_rate": 7.293664927530712e-05, |
| "loss": 0.757, |
| "step": 2165 |
| }, |
| { |
| "epoch": 0.5223880597014925, |
| "grad_norm": 2.125, |
| "learning_rate": 7.292615113314472e-05, |
| "loss": 0.7544, |
| "step": 2170 |
| }, |
| { |
| "epoch": 0.5235917188252287, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.291562972312825e-05, |
| "loss": 0.7363, |
| "step": 2175 |
| }, |
| { |
| "epoch": 0.5247953779489648, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.290508505470032e-05, |
| "loss": 0.7396, |
| "step": 2180 |
| }, |
| { |
| "epoch": 0.525999037072701, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.289451713732443e-05, |
| "loss": 0.7563, |
| "step": 2185 |
| }, |
| { |
| "epoch": 0.5272026961964371, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.288392598048492e-05, |
| "loss": 0.7385, |
| "step": 2190 |
| }, |
| { |
| "epoch": 0.5284063553201733, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.2873311593687e-05, |
| "loss": 0.7356, |
| "step": 2195 |
| }, |
| { |
| "epoch": 0.5296100144439095, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.286267398645673e-05, |
| "loss": 0.7428, |
| "step": 2200 |
| }, |
| { |
| "epoch": 0.5308136735676456, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.285201316834101e-05, |
| "loss": 0.7507, |
| "step": 2205 |
| }, |
| { |
| "epoch": 0.5320173326913819, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.284132914890758e-05, |
| "loss": 0.7333, |
| "step": 2210 |
| }, |
| { |
| "epoch": 0.533220991815118, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.283062193774495e-05, |
| "loss": 0.7249, |
| "step": 2215 |
| }, |
| { |
| "epoch": 0.5344246509388542, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.281989154446253e-05, |
| "loss": 0.7518, |
| "step": 2220 |
| }, |
| { |
| "epoch": 0.5356283100625903, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.280913797869046e-05, |
| "loss": 0.7485, |
| "step": 2225 |
| }, |
| { |
| "epoch": 0.5368319691863265, |
| "grad_norm": 2.40625, |
| "learning_rate": 7.279836125007971e-05, |
| "loss": 0.7355, |
| "step": 2230 |
| }, |
| { |
| "epoch": 0.5380356283100626, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.278756136830206e-05, |
| "loss": 0.7594, |
| "step": 2235 |
| }, |
| { |
| "epoch": 0.5392392874337988, |
| "grad_norm": 2.25, |
| "learning_rate": 7.277673834305001e-05, |
| "loss": 0.7225, |
| "step": 2240 |
| }, |
| { |
| "epoch": 0.5404429465575349, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.276589218403688e-05, |
| "loss": 0.7132, |
| "step": 2245 |
| }, |
| { |
| "epoch": 0.5416466056812711, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.275502290099672e-05, |
| "loss": 0.7118, |
| "step": 2250 |
| }, |
| { |
| "epoch": 0.5428502648050072, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.274413050368438e-05, |
| "loss": 0.734, |
| "step": 2255 |
| }, |
| { |
| "epoch": 0.5440539239287434, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.273321500187538e-05, |
| "loss": 0.7491, |
| "step": 2260 |
| }, |
| { |
| "epoch": 0.5452575830524795, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.272227640536604e-05, |
| "loss": 0.7673, |
| "step": 2265 |
| }, |
| { |
| "epoch": 0.5464612421762157, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.271131472397339e-05, |
| "loss": 0.7483, |
| "step": 2270 |
| }, |
| { |
| "epoch": 0.5476649012999518, |
| "grad_norm": 2.34375, |
| "learning_rate": 7.270032996753517e-05, |
| "loss": 0.7284, |
| "step": 2275 |
| }, |
| { |
| "epoch": 0.548868560423688, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.268932214590982e-05, |
| "loss": 0.7643, |
| "step": 2280 |
| }, |
| { |
| "epoch": 0.5500722195474241, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.267829126897652e-05, |
| "loss": 0.7348, |
| "step": 2285 |
| }, |
| { |
| "epoch": 0.5512758786711603, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.266723734663508e-05, |
| "loss": 0.7307, |
| "step": 2290 |
| }, |
| { |
| "epoch": 0.5524795377948964, |
| "grad_norm": 1.8828125, |
| "learning_rate": 7.265616038880603e-05, |
| "loss": 0.7181, |
| "step": 2295 |
| }, |
| { |
| "epoch": 0.5536831969186327, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.26450604054306e-05, |
| "loss": 0.7386, |
| "step": 2300 |
| }, |
| { |
| "epoch": 0.5548868560423688, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.263393740647062e-05, |
| "loss": 0.7537, |
| "step": 2305 |
| }, |
| { |
| "epoch": 0.556090515166105, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.262279140190863e-05, |
| "loss": 0.7102, |
| "step": 2310 |
| }, |
| { |
| "epoch": 0.5572941742898411, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.261162240174778e-05, |
| "loss": 0.7147, |
| "step": 2315 |
| }, |
| { |
| "epoch": 0.5584978334135773, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.260043041601189e-05, |
| "loss": 0.7572, |
| "step": 2320 |
| }, |
| { |
| "epoch": 0.5597014925373134, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.258921545474539e-05, |
| "loss": 0.7161, |
| "step": 2325 |
| }, |
| { |
| "epoch": 0.5609051516610496, |
| "grad_norm": 1.9453125, |
| "learning_rate": 7.257797752801332e-05, |
| "loss": 0.7251, |
| "step": 2330 |
| }, |
| { |
| "epoch": 0.5621088107847857, |
| "grad_norm": 2.0, |
| "learning_rate": 7.256671664590136e-05, |
| "loss": 0.6989, |
| "step": 2335 |
| }, |
| { |
| "epoch": 0.5633124699085219, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.255543281851577e-05, |
| "loss": 0.753, |
| "step": 2340 |
| }, |
| { |
| "epoch": 0.5645161290322581, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.25441260559834e-05, |
| "loss": 0.7316, |
| "step": 2345 |
| }, |
| { |
| "epoch": 0.5657197881559942, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.253279636845171e-05, |
| "loss": 0.7296, |
| "step": 2350 |
| }, |
| { |
| "epoch": 0.5669234472797304, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.252144376608869e-05, |
| "loss": 0.6987, |
| "step": 2355 |
| }, |
| { |
| "epoch": 0.5681271064034665, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.251006825908295e-05, |
| "loss": 0.7098, |
| "step": 2360 |
| }, |
| { |
| "epoch": 0.5693307655272027, |
| "grad_norm": 1.9609375, |
| "learning_rate": 7.24986698576436e-05, |
| "loss": 0.6956, |
| "step": 2365 |
| }, |
| { |
| "epoch": 0.5705344246509388, |
| "grad_norm": 1.984375, |
| "learning_rate": 7.248724857200034e-05, |
| "loss": 0.6961, |
| "step": 2370 |
| }, |
| { |
| "epoch": 0.571738083774675, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.24758044124034e-05, |
| "loss": 0.7157, |
| "step": 2375 |
| }, |
| { |
| "epoch": 0.5729417428984112, |
| "grad_norm": 1.9453125, |
| "learning_rate": 7.246433738912352e-05, |
| "loss": 0.7143, |
| "step": 2380 |
| }, |
| { |
| "epoch": 0.5741454020221474, |
| "grad_norm": 2.125, |
| "learning_rate": 7.245284751245195e-05, |
| "loss": 0.726, |
| "step": 2385 |
| }, |
| { |
| "epoch": 0.5753490611458835, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.24413347927005e-05, |
| "loss": 0.7714, |
| "step": 2390 |
| }, |
| { |
| "epoch": 0.5765527202696197, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.242979924020144e-05, |
| "loss": 0.7224, |
| "step": 2395 |
| }, |
| { |
| "epoch": 0.5777563793933558, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.241824086530754e-05, |
| "loss": 0.7367, |
| "step": 2400 |
| }, |
| { |
| "epoch": 0.578960038517092, |
| "grad_norm": 2.125, |
| "learning_rate": 7.240665967839207e-05, |
| "loss": 0.7353, |
| "step": 2405 |
| }, |
| { |
| "epoch": 0.5801636976408281, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.239505568984874e-05, |
| "loss": 0.6976, |
| "step": 2410 |
| }, |
| { |
| "epoch": 0.5813673567645643, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.238342891009176e-05, |
| "loss": 0.6909, |
| "step": 2415 |
| }, |
| { |
| "epoch": 0.5825710158883004, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.237177934955575e-05, |
| "loss": 0.749, |
| "step": 2420 |
| }, |
| { |
| "epoch": 0.5837746750120366, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.236010701869583e-05, |
| "loss": 0.7254, |
| "step": 2425 |
| }, |
| { |
| "epoch": 0.5849783341357727, |
| "grad_norm": 2.828125, |
| "learning_rate": 7.23484119279875e-05, |
| "loss": 0.7448, |
| "step": 2430 |
| }, |
| { |
| "epoch": 0.5861819932595089, |
| "grad_norm": 2.265625, |
| "learning_rate": 7.233669408792673e-05, |
| "loss": 0.7108, |
| "step": 2435 |
| }, |
| { |
| "epoch": 0.587385652383245, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.232495350902989e-05, |
| "loss": 0.7044, |
| "step": 2440 |
| }, |
| { |
| "epoch": 0.5885893115069812, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.231319020183376e-05, |
| "loss": 0.7287, |
| "step": 2445 |
| }, |
| { |
| "epoch": 0.5897929706307173, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.23014041768955e-05, |
| "loss": 0.7299, |
| "step": 2450 |
| }, |
| { |
| "epoch": 0.5909966297544536, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.228959544479267e-05, |
| "loss": 0.7104, |
| "step": 2455 |
| }, |
| { |
| "epoch": 0.5922002888781897, |
| "grad_norm": 1.8671875, |
| "learning_rate": 7.227776401612323e-05, |
| "loss": 0.704, |
| "step": 2460 |
| }, |
| { |
| "epoch": 0.5934039480019259, |
| "grad_norm": 2.4375, |
| "learning_rate": 7.22659099015055e-05, |
| "loss": 0.7279, |
| "step": 2465 |
| }, |
| { |
| "epoch": 0.594607607125662, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.225403311157814e-05, |
| "loss": 0.722, |
| "step": 2470 |
| }, |
| { |
| "epoch": 0.5958112662493982, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.224213365700016e-05, |
| "loss": 0.7195, |
| "step": 2475 |
| }, |
| { |
| "epoch": 0.5970149253731343, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.223021154845092e-05, |
| "loss": 0.7581, |
| "step": 2480 |
| }, |
| { |
| "epoch": 0.5982185844968705, |
| "grad_norm": 2.0, |
| "learning_rate": 7.221826679663015e-05, |
| "loss": 0.7929, |
| "step": 2485 |
| }, |
| { |
| "epoch": 0.5994222436206067, |
| "grad_norm": 1.984375, |
| "learning_rate": 7.220629941225782e-05, |
| "loss": 0.7036, |
| "step": 2490 |
| }, |
| { |
| "epoch": 0.6006259027443428, |
| "grad_norm": 2.375, |
| "learning_rate": 7.21943094060743e-05, |
| "loss": 0.7072, |
| "step": 2495 |
| }, |
| { |
| "epoch": 0.601829561868079, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.218229678884018e-05, |
| "loss": 0.7199, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.601829561868079, |
| "eval_loss": 0.6185581088066101, |
| "eval_runtime": 2.4024, |
| "eval_samples_per_second": 83.25, |
| "eval_steps_per_second": 83.25, |
| "step": 2500 |
| }, |
| { |
| "epoch": 0.6030332209918151, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.21702615713364e-05, |
| "loss": 0.7025, |
| "step": 2505 |
| }, |
| { |
| "epoch": 0.6042368801155513, |
| "grad_norm": 1.9921875, |
| "learning_rate": 7.215820376436418e-05, |
| "loss": 0.7126, |
| "step": 2510 |
| }, |
| { |
| "epoch": 0.6054405392392874, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.214612337874497e-05, |
| "loss": 0.7045, |
| "step": 2515 |
| }, |
| { |
| "epoch": 0.6066441983630236, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.213402042532054e-05, |
| "loss": 0.7276, |
| "step": 2520 |
| }, |
| { |
| "epoch": 0.6078478574867597, |
| "grad_norm": 1.984375, |
| "learning_rate": 7.212189491495289e-05, |
| "loss": 0.7343, |
| "step": 2525 |
| }, |
| { |
| "epoch": 0.609051516610496, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.210974685852423e-05, |
| "loss": 0.7073, |
| "step": 2530 |
| }, |
| { |
| "epoch": 0.610255175734232, |
| "grad_norm": 1.984375, |
| "learning_rate": 7.209757626693704e-05, |
| "loss": 0.6977, |
| "step": 2535 |
| }, |
| { |
| "epoch": 0.6114588348579683, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.208538315111404e-05, |
| "loss": 0.6994, |
| "step": 2540 |
| }, |
| { |
| "epoch": 0.6126624939817044, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.207316752199813e-05, |
| "loss": 0.7094, |
| "step": 2545 |
| }, |
| { |
| "epoch": 0.6138661531054406, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.206092939055242e-05, |
| "loss": 0.7154, |
| "step": 2550 |
| }, |
| { |
| "epoch": 0.6150698122291767, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.204866876776024e-05, |
| "loss": 0.7031, |
| "step": 2555 |
| }, |
| { |
| "epoch": 0.6162734713529129, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.203638566462509e-05, |
| "loss": 0.6997, |
| "step": 2560 |
| }, |
| { |
| "epoch": 0.617477130476649, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.202408009217063e-05, |
| "loss": 0.7273, |
| "step": 2565 |
| }, |
| { |
| "epoch": 0.6186807896003852, |
| "grad_norm": 2.125, |
| "learning_rate": 7.201175206144072e-05, |
| "loss": 0.7183, |
| "step": 2570 |
| }, |
| { |
| "epoch": 0.6198844487241213, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.199940158349934e-05, |
| "loss": 0.6838, |
| "step": 2575 |
| }, |
| { |
| "epoch": 0.6210881078478575, |
| "grad_norm": 2.125, |
| "learning_rate": 7.198702866943061e-05, |
| "loss": 0.6794, |
| "step": 2580 |
| }, |
| { |
| "epoch": 0.6222917669715936, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.197463333033886e-05, |
| "loss": 0.7418, |
| "step": 2585 |
| }, |
| { |
| "epoch": 0.6234954260953298, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.196221557734845e-05, |
| "loss": 0.706, |
| "step": 2590 |
| }, |
| { |
| "epoch": 0.6246990852190659, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.194977542160393e-05, |
| "loss": 0.7136, |
| "step": 2595 |
| }, |
| { |
| "epoch": 0.6259027443428021, |
| "grad_norm": 1.9921875, |
| "learning_rate": 7.19373128742699e-05, |
| "loss": 0.7062, |
| "step": 2600 |
| }, |
| { |
| "epoch": 0.6271064034665382, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.192482794653109e-05, |
| "loss": 0.7187, |
| "step": 2605 |
| }, |
| { |
| "epoch": 0.6283100625902744, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.191232064959229e-05, |
| "loss": 0.7383, |
| "step": 2610 |
| }, |
| { |
| "epoch": 0.6295137217140105, |
| "grad_norm": 2.0, |
| "learning_rate": 7.18997909946784e-05, |
| "loss": 0.7232, |
| "step": 2615 |
| }, |
| { |
| "epoch": 0.6307173808377468, |
| "grad_norm": 1.9375, |
| "learning_rate": 7.188723899303436e-05, |
| "loss": 0.6968, |
| "step": 2620 |
| }, |
| { |
| "epoch": 0.6319210399614829, |
| "grad_norm": 2.125, |
| "learning_rate": 7.187466465592516e-05, |
| "loss": 0.749, |
| "step": 2625 |
| }, |
| { |
| "epoch": 0.6331246990852191, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.186206799463587e-05, |
| "loss": 0.7269, |
| "step": 2630 |
| }, |
| { |
| "epoch": 0.6343283582089553, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.184944902047154e-05, |
| "loss": 0.7076, |
| "step": 2635 |
| }, |
| { |
| "epoch": 0.6355320173326914, |
| "grad_norm": 2.125, |
| "learning_rate": 7.183680774475732e-05, |
| "loss": 0.7502, |
| "step": 2640 |
| }, |
| { |
| "epoch": 0.6367356764564276, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.182414417883831e-05, |
| "loss": 0.7216, |
| "step": 2645 |
| }, |
| { |
| "epoch": 0.6379393355801637, |
| "grad_norm": 1.9921875, |
| "learning_rate": 7.181145833407964e-05, |
| "loss": 0.7058, |
| "step": 2650 |
| }, |
| { |
| "epoch": 0.6391429947038999, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.179875022186641e-05, |
| "loss": 0.7297, |
| "step": 2655 |
| }, |
| { |
| "epoch": 0.640346653827636, |
| "grad_norm": 1.9453125, |
| "learning_rate": 7.178601985360377e-05, |
| "loss": 0.712, |
| "step": 2660 |
| }, |
| { |
| "epoch": 0.6415503129513722, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.177326724071674e-05, |
| "loss": 0.7122, |
| "step": 2665 |
| }, |
| { |
| "epoch": 0.6427539720751083, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.176049239465043e-05, |
| "loss": 0.6803, |
| "step": 2670 |
| }, |
| { |
| "epoch": 0.6439576311988445, |
| "grad_norm": 1.8984375, |
| "learning_rate": 7.174769532686981e-05, |
| "loss": 0.7044, |
| "step": 2675 |
| }, |
| { |
| "epoch": 0.6451612903225806, |
| "grad_norm": 2.46875, |
| "learning_rate": 7.17348760488598e-05, |
| "loss": 0.7183, |
| "step": 2680 |
| }, |
| { |
| "epoch": 0.6463649494463168, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.172203457212529e-05, |
| "loss": 0.7206, |
| "step": 2685 |
| }, |
| { |
| "epoch": 0.6475686085700529, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.170917090819108e-05, |
| "loss": 0.7073, |
| "step": 2690 |
| }, |
| { |
| "epoch": 0.6487722676937892, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.169628506860189e-05, |
| "loss": 0.7037, |
| "step": 2695 |
| }, |
| { |
| "epoch": 0.6499759268175253, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.16833770649223e-05, |
| "loss": 0.7078, |
| "step": 2700 |
| }, |
| { |
| "epoch": 0.6511795859412615, |
| "grad_norm": 1.921875, |
| "learning_rate": 7.167044690873683e-05, |
| "loss": 0.7619, |
| "step": 2705 |
| }, |
| { |
| "epoch": 0.6523832450649976, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.165749461164988e-05, |
| "loss": 0.6917, |
| "step": 2710 |
| }, |
| { |
| "epoch": 0.6535869041887338, |
| "grad_norm": 1.953125, |
| "learning_rate": 7.164452018528565e-05, |
| "loss": 0.7178, |
| "step": 2715 |
| }, |
| { |
| "epoch": 0.6547905633124699, |
| "grad_norm": 1.96875, |
| "learning_rate": 7.163152364128831e-05, |
| "loss": 0.7089, |
| "step": 2720 |
| }, |
| { |
| "epoch": 0.6559942224362061, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.16185049913218e-05, |
| "loss": 0.6982, |
| "step": 2725 |
| }, |
| { |
| "epoch": 0.6571978815599422, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.160546424706991e-05, |
| "loss": 0.7445, |
| "step": 2730 |
| }, |
| { |
| "epoch": 0.6584015406836784, |
| "grad_norm": 1.9765625, |
| "learning_rate": 7.15924014202363e-05, |
| "loss": 0.7561, |
| "step": 2735 |
| }, |
| { |
| "epoch": 0.6596051998074145, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.157931652254441e-05, |
| "loss": 0.6975, |
| "step": 2740 |
| }, |
| { |
| "epoch": 0.6608088589311507, |
| "grad_norm": 1.9453125, |
| "learning_rate": 7.156620956573748e-05, |
| "loss": 0.6788, |
| "step": 2745 |
| }, |
| { |
| "epoch": 0.6620125180548868, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.155308056157859e-05, |
| "loss": 0.7178, |
| "step": 2750 |
| }, |
| { |
| "epoch": 0.663216177178623, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.153992952185058e-05, |
| "loss": 0.7256, |
| "step": 2755 |
| }, |
| { |
| "epoch": 0.6644198363023591, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.152675645835607e-05, |
| "loss": 0.7036, |
| "step": 2760 |
| }, |
| { |
| "epoch": 0.6656234954260953, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.151356138291742e-05, |
| "loss": 0.7168, |
| "step": 2765 |
| }, |
| { |
| "epoch": 0.6668271545498314, |
| "grad_norm": 2.125, |
| "learning_rate": 7.150034430737679e-05, |
| "loss": 0.7073, |
| "step": 2770 |
| }, |
| { |
| "epoch": 0.6680308136735676, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.148710524359607e-05, |
| "loss": 0.6977, |
| "step": 2775 |
| }, |
| { |
| "epoch": 0.6692344727973039, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.147384420345685e-05, |
| "loss": 0.7269, |
| "step": 2780 |
| }, |
| { |
| "epoch": 0.67043813192104, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.14605611988605e-05, |
| "loss": 0.7017, |
| "step": 2785 |
| }, |
| { |
| "epoch": 0.6716417910447762, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.144725624172805e-05, |
| "loss": 0.6911, |
| "step": 2790 |
| }, |
| { |
| "epoch": 0.6728454501685123, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.143392934400028e-05, |
| "loss": 0.7137, |
| "step": 2795 |
| }, |
| { |
| "epoch": 0.6740491092922485, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.142058051763761e-05, |
| "loss": 0.7144, |
| "step": 2800 |
| }, |
| { |
| "epoch": 0.6752527684159846, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.140720977462018e-05, |
| "loss": 0.7026, |
| "step": 2805 |
| }, |
| { |
| "epoch": 0.6764564275397208, |
| "grad_norm": 2.28125, |
| "learning_rate": 7.139381712694777e-05, |
| "loss": 0.712, |
| "step": 2810 |
| }, |
| { |
| "epoch": 0.6776600866634569, |
| "grad_norm": 2.296875, |
| "learning_rate": 7.138040258663984e-05, |
| "loss": 0.7336, |
| "step": 2815 |
| }, |
| { |
| "epoch": 0.6788637457871931, |
| "grad_norm": 2.125, |
| "learning_rate": 7.13669661657355e-05, |
| "loss": 0.7178, |
| "step": 2820 |
| }, |
| { |
| "epoch": 0.6800674049109292, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.135350787629349e-05, |
| "loss": 0.6975, |
| "step": 2825 |
| }, |
| { |
| "epoch": 0.6812710640346654, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.134002773039217e-05, |
| "loss": 0.6854, |
| "step": 2830 |
| }, |
| { |
| "epoch": 0.6824747231584015, |
| "grad_norm": 2.0, |
| "learning_rate": 7.13265257401295e-05, |
| "loss": 0.7039, |
| "step": 2835 |
| }, |
| { |
| "epoch": 0.6836783822821377, |
| "grad_norm": 1.984375, |
| "learning_rate": 7.131300191762311e-05, |
| "loss": 0.7228, |
| "step": 2840 |
| }, |
| { |
| "epoch": 0.6848820414058738, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.129945627501013e-05, |
| "loss": 0.7109, |
| "step": 2845 |
| }, |
| { |
| "epoch": 0.68608570052961, |
| "grad_norm": 1.9921875, |
| "learning_rate": 7.128588882444734e-05, |
| "loss": 0.6984, |
| "step": 2850 |
| }, |
| { |
| "epoch": 0.6872893596533461, |
| "grad_norm": 2.421875, |
| "learning_rate": 7.127229957811112e-05, |
| "loss": 0.6898, |
| "step": 2855 |
| }, |
| { |
| "epoch": 0.6884930187770824, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.125868854819727e-05, |
| "loss": 0.7012, |
| "step": 2860 |
| }, |
| { |
| "epoch": 0.6896966779008185, |
| "grad_norm": 2.15625, |
| "learning_rate": 7.124505574692132e-05, |
| "loss": 0.7063, |
| "step": 2865 |
| }, |
| { |
| "epoch": 0.6909003370245547, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.123140118651819e-05, |
| "loss": 0.6994, |
| "step": 2870 |
| }, |
| { |
| "epoch": 0.6921039961482908, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.121772487924245e-05, |
| "loss": 0.6898, |
| "step": 2875 |
| }, |
| { |
| "epoch": 0.693307655272027, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.12040268373681e-05, |
| "loss": 0.7002, |
| "step": 2880 |
| }, |
| { |
| "epoch": 0.6945113143957631, |
| "grad_norm": 1.96875, |
| "learning_rate": 7.119030707318866e-05, |
| "loss": 0.7231, |
| "step": 2885 |
| }, |
| { |
| "epoch": 0.6957149735194993, |
| "grad_norm": 1.96875, |
| "learning_rate": 7.117656559901716e-05, |
| "loss": 0.7083, |
| "step": 2890 |
| }, |
| { |
| "epoch": 0.6969186326432354, |
| "grad_norm": 2.09375, |
| "learning_rate": 7.116280242718616e-05, |
| "loss": 0.7255, |
| "step": 2895 |
| }, |
| { |
| "epoch": 0.6981222917669716, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.11490175700476e-05, |
| "loss": 0.6818, |
| "step": 2900 |
| }, |
| { |
| "epoch": 0.6993259508907077, |
| "grad_norm": 1.890625, |
| "learning_rate": 7.113521103997295e-05, |
| "loss": 0.7098, |
| "step": 2905 |
| }, |
| { |
| "epoch": 0.7005296100144439, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.112138284935309e-05, |
| "loss": 0.6684, |
| "step": 2910 |
| }, |
| { |
| "epoch": 0.70173326913818, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.110753301059837e-05, |
| "loss": 0.7065, |
| "step": 2915 |
| }, |
| { |
| "epoch": 0.7029369282619162, |
| "grad_norm": 1.9296875, |
| "learning_rate": 7.109366153613856e-05, |
| "loss": 0.6378, |
| "step": 2920 |
| }, |
| { |
| "epoch": 0.7041405873856523, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.107976843842285e-05, |
| "loss": 0.717, |
| "step": 2925 |
| }, |
| { |
| "epoch": 0.7053442465093885, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.106585372991983e-05, |
| "loss": 0.6748, |
| "step": 2930 |
| }, |
| { |
| "epoch": 0.7065479056331248, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.105191742311748e-05, |
| "loss": 0.6826, |
| "step": 2935 |
| }, |
| { |
| "epoch": 0.7077515647568609, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.103795953052316e-05, |
| "loss": 0.6717, |
| "step": 2940 |
| }, |
| { |
| "epoch": 0.7089552238805971, |
| "grad_norm": 1.9375, |
| "learning_rate": 7.102398006466362e-05, |
| "loss": 0.7121, |
| "step": 2945 |
| }, |
| { |
| "epoch": 0.7101588830043332, |
| "grad_norm": 2.0, |
| "learning_rate": 7.100997903808498e-05, |
| "loss": 0.7021, |
| "step": 2950 |
| }, |
| { |
| "epoch": 0.7113625421280694, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.099595646335266e-05, |
| "loss": 0.6888, |
| "step": 2955 |
| }, |
| { |
| "epoch": 0.7125662012518055, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.098191235305148e-05, |
| "loss": 0.6547, |
| "step": 2960 |
| }, |
| { |
| "epoch": 0.7137698603755417, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.096784671978555e-05, |
| "loss": 0.6816, |
| "step": 2965 |
| }, |
| { |
| "epoch": 0.7149735194992778, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.09537595761783e-05, |
| "loss": 0.695, |
| "step": 2970 |
| }, |
| { |
| "epoch": 0.716177178623014, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.093965093487248e-05, |
| "loss": 0.6777, |
| "step": 2975 |
| }, |
| { |
| "epoch": 0.7173808377467501, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.092552080853013e-05, |
| "loss": 0.6849, |
| "step": 2980 |
| }, |
| { |
| "epoch": 0.7185844968704863, |
| "grad_norm": 1.9765625, |
| "learning_rate": 7.091136920983255e-05, |
| "loss": 0.7043, |
| "step": 2985 |
| }, |
| { |
| "epoch": 0.7197881559942224, |
| "grad_norm": 2.1875, |
| "learning_rate": 7.089719615148034e-05, |
| "loss": 0.7, |
| "step": 2990 |
| }, |
| { |
| "epoch": 0.7209918151179586, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.088300164619332e-05, |
| "loss": 0.6847, |
| "step": 2995 |
| }, |
| { |
| "epoch": 0.7221954742416947, |
| "grad_norm": 1.96875, |
| "learning_rate": 7.086878570671062e-05, |
| "loss": 0.6825, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.7221954742416947, |
| "eval_loss": 0.5935443043708801, |
| "eval_runtime": 2.4083, |
| "eval_samples_per_second": 83.047, |
| "eval_steps_per_second": 83.047, |
| "step": 3000 |
| }, |
| { |
| "epoch": 0.7233991333654309, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.085454834579054e-05, |
| "loss": 0.7262, |
| "step": 3005 |
| }, |
| { |
| "epoch": 0.724602792489167, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.084028957621066e-05, |
| "loss": 0.7577, |
| "step": 3010 |
| }, |
| { |
| "epoch": 0.7258064516129032, |
| "grad_norm": 1.953125, |
| "learning_rate": 7.082600941076773e-05, |
| "loss": 0.6923, |
| "step": 3015 |
| }, |
| { |
| "epoch": 0.7270101107366393, |
| "grad_norm": 1.9375, |
| "learning_rate": 7.081170786227776e-05, |
| "loss": 0.6833, |
| "step": 3020 |
| }, |
| { |
| "epoch": 0.7282137698603756, |
| "grad_norm": 2.125, |
| "learning_rate": 7.079738494357583e-05, |
| "loss": 0.6757, |
| "step": 3025 |
| }, |
| { |
| "epoch": 0.7294174289841117, |
| "grad_norm": 2.125, |
| "learning_rate": 7.078304066751637e-05, |
| "loss": 0.7042, |
| "step": 3030 |
| }, |
| { |
| "epoch": 0.7306210881078479, |
| "grad_norm": 1.9296875, |
| "learning_rate": 7.076867504697283e-05, |
| "loss": 0.6797, |
| "step": 3035 |
| }, |
| { |
| "epoch": 0.731824747231584, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.075428809483791e-05, |
| "loss": 0.6647, |
| "step": 3040 |
| }, |
| { |
| "epoch": 0.7330284063553202, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.07398798240234e-05, |
| "loss": 0.6718, |
| "step": 3045 |
| }, |
| { |
| "epoch": 0.7342320654790563, |
| "grad_norm": 1.9609375, |
| "learning_rate": 7.072545024746024e-05, |
| "loss": 0.7162, |
| "step": 3050 |
| }, |
| { |
| "epoch": 0.7354357246027925, |
| "grad_norm": 2.40625, |
| "learning_rate": 7.07109993780985e-05, |
| "loss": 0.661, |
| "step": 3055 |
| }, |
| { |
| "epoch": 0.7366393837265286, |
| "grad_norm": 2.109375, |
| "learning_rate": 7.069652722890736e-05, |
| "loss": 0.7114, |
| "step": 3060 |
| }, |
| { |
| "epoch": 0.7378430428502648, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.068203381287507e-05, |
| "loss": 0.6964, |
| "step": 3065 |
| }, |
| { |
| "epoch": 0.7390467019740009, |
| "grad_norm": 1.9609375, |
| "learning_rate": 7.0667519143009e-05, |
| "loss": 0.727, |
| "step": 3070 |
| }, |
| { |
| "epoch": 0.7402503610977371, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.065298323233558e-05, |
| "loss": 0.7187, |
| "step": 3075 |
| }, |
| { |
| "epoch": 0.7414540202214733, |
| "grad_norm": 1.9453125, |
| "learning_rate": 7.06384260939003e-05, |
| "loss": 0.6952, |
| "step": 3080 |
| }, |
| { |
| "epoch": 0.7426576793452094, |
| "grad_norm": 1.8828125, |
| "learning_rate": 7.06238477407677e-05, |
| "loss": 0.6252, |
| "step": 3085 |
| }, |
| { |
| "epoch": 0.7438613384689456, |
| "grad_norm": 2.3125, |
| "learning_rate": 7.060924818602138e-05, |
| "loss": 0.722, |
| "step": 3090 |
| }, |
| { |
| "epoch": 0.7450649975926817, |
| "grad_norm": 2.0, |
| "learning_rate": 7.059462744276395e-05, |
| "loss": 0.6839, |
| "step": 3095 |
| }, |
| { |
| "epoch": 0.746268656716418, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.057998552411702e-05, |
| "loss": 0.6984, |
| "step": 3100 |
| }, |
| { |
| "epoch": 0.7474723158401541, |
| "grad_norm": 2.328125, |
| "learning_rate": 7.056532244322123e-05, |
| "loss": 0.6827, |
| "step": 3105 |
| }, |
| { |
| "epoch": 0.7486759749638903, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.055063821323621e-05, |
| "loss": 0.6519, |
| "step": 3110 |
| }, |
| { |
| "epoch": 0.7498796340876264, |
| "grad_norm": 1.7890625, |
| "learning_rate": 7.053593284734058e-05, |
| "loss": 0.6937, |
| "step": 3115 |
| }, |
| { |
| "epoch": 0.7510832932113626, |
| "grad_norm": 1.7734375, |
| "learning_rate": 7.052120635873189e-05, |
| "loss": 0.6719, |
| "step": 3120 |
| }, |
| { |
| "epoch": 0.7522869523350987, |
| "grad_norm": 1.8984375, |
| "learning_rate": 7.050645876062669e-05, |
| "loss": 0.6803, |
| "step": 3125 |
| }, |
| { |
| "epoch": 0.7534906114588349, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.049169006626043e-05, |
| "loss": 0.7005, |
| "step": 3130 |
| }, |
| { |
| "epoch": 0.754694270582571, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.047690028888756e-05, |
| "loss": 0.6623, |
| "step": 3135 |
| }, |
| { |
| "epoch": 0.7558979297063072, |
| "grad_norm": 2.046875, |
| "learning_rate": 7.046208944178136e-05, |
| "loss": 0.7266, |
| "step": 3140 |
| }, |
| { |
| "epoch": 0.7571015888300433, |
| "grad_norm": 1.96875, |
| "learning_rate": 7.044725753823412e-05, |
| "loss": 0.6812, |
| "step": 3145 |
| }, |
| { |
| "epoch": 0.7583052479537795, |
| "grad_norm": 2.21875, |
| "learning_rate": 7.043240459155696e-05, |
| "loss": 0.6907, |
| "step": 3150 |
| }, |
| { |
| "epoch": 0.7595089070775156, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.041753061507987e-05, |
| "loss": 0.6656, |
| "step": 3155 |
| }, |
| { |
| "epoch": 0.7607125662012518, |
| "grad_norm": 2.0, |
| "learning_rate": 7.04026356221518e-05, |
| "loss": 0.6933, |
| "step": 3160 |
| }, |
| { |
| "epoch": 0.7619162253249879, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.038771962614047e-05, |
| "loss": 0.682, |
| "step": 3165 |
| }, |
| { |
| "epoch": 0.7631198844487241, |
| "grad_norm": 1.984375, |
| "learning_rate": 7.037278264043252e-05, |
| "loss": 0.6681, |
| "step": 3170 |
| }, |
| { |
| "epoch": 0.7643235435724602, |
| "grad_norm": 2.234375, |
| "learning_rate": 7.035782467843336e-05, |
| "loss": 0.6903, |
| "step": 3175 |
| }, |
| { |
| "epoch": 0.7655272026961965, |
| "grad_norm": 2.125, |
| "learning_rate": 7.034284575356729e-05, |
| "loss": 0.6795, |
| "step": 3180 |
| }, |
| { |
| "epoch": 0.7667308618199326, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.032784587927738e-05, |
| "loss": 0.6882, |
| "step": 3185 |
| }, |
| { |
| "epoch": 0.7679345209436688, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.031282506902551e-05, |
| "loss": 0.6924, |
| "step": 3190 |
| }, |
| { |
| "epoch": 0.7691381800674049, |
| "grad_norm": 1.96875, |
| "learning_rate": 7.029778333629238e-05, |
| "loss": 0.6932, |
| "step": 3195 |
| }, |
| { |
| "epoch": 0.7703418391911411, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.028272069457741e-05, |
| "loss": 0.7174, |
| "step": 3200 |
| }, |
| { |
| "epoch": 0.7715454983148772, |
| "grad_norm": 2.015625, |
| "learning_rate": 7.026763715739883e-05, |
| "loss": 0.6819, |
| "step": 3205 |
| }, |
| { |
| "epoch": 0.7727491574386134, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.025253273829363e-05, |
| "loss": 0.7052, |
| "step": 3210 |
| }, |
| { |
| "epoch": 0.7739528165623495, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.02374074508175e-05, |
| "loss": 0.6917, |
| "step": 3215 |
| }, |
| { |
| "epoch": 0.7751564756860857, |
| "grad_norm": 2.0625, |
| "learning_rate": 7.022226130854488e-05, |
| "loss": 0.665, |
| "step": 3220 |
| }, |
| { |
| "epoch": 0.7763601348098219, |
| "grad_norm": 2.03125, |
| "learning_rate": 7.020709432506894e-05, |
| "loss": 0.7044, |
| "step": 3225 |
| }, |
| { |
| "epoch": 0.777563793933558, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.019190651400152e-05, |
| "loss": 0.7384, |
| "step": 3230 |
| }, |
| { |
| "epoch": 0.7787674530572942, |
| "grad_norm": 1.921875, |
| "learning_rate": 7.017669788897319e-05, |
| "loss": 0.7046, |
| "step": 3235 |
| }, |
| { |
| "epoch": 0.7799711121810303, |
| "grad_norm": 2.078125, |
| "learning_rate": 7.016146846363318e-05, |
| "loss": 0.6768, |
| "step": 3240 |
| }, |
| { |
| "epoch": 0.7811747713047665, |
| "grad_norm": 2.171875, |
| "learning_rate": 7.014621825164938e-05, |
| "loss": 0.6342, |
| "step": 3245 |
| }, |
| { |
| "epoch": 0.7823784304285026, |
| "grad_norm": 1.828125, |
| "learning_rate": 7.013094726670837e-05, |
| "loss": 0.6916, |
| "step": 3250 |
| }, |
| { |
| "epoch": 0.7835820895522388, |
| "grad_norm": 2.203125, |
| "learning_rate": 7.011565552251531e-05, |
| "loss": 0.6637, |
| "step": 3255 |
| }, |
| { |
| "epoch": 0.784785748675975, |
| "grad_norm": 1.984375, |
| "learning_rate": 7.010034303279406e-05, |
| "loss": 0.6942, |
| "step": 3260 |
| }, |
| { |
| "epoch": 0.7859894077997112, |
| "grad_norm": 1.96875, |
| "learning_rate": 7.008500981128708e-05, |
| "loss": 0.6655, |
| "step": 3265 |
| }, |
| { |
| "epoch": 0.7871930669234473, |
| "grad_norm": 1.9453125, |
| "learning_rate": 7.006965587175538e-05, |
| "loss": 0.661, |
| "step": 3270 |
| }, |
| { |
| "epoch": 0.7883967260471835, |
| "grad_norm": 2.140625, |
| "learning_rate": 7.005428122797864e-05, |
| "loss": 0.706, |
| "step": 3275 |
| }, |
| { |
| "epoch": 0.7896003851709196, |
| "grad_norm": 1.84375, |
| "learning_rate": 7.003888589375508e-05, |
| "loss": 0.6508, |
| "step": 3280 |
| }, |
| { |
| "epoch": 0.7908040442946558, |
| "grad_norm": 1.90625, |
| "learning_rate": 7.002346988290149e-05, |
| "loss": 0.6981, |
| "step": 3285 |
| }, |
| { |
| "epoch": 0.7920077034183919, |
| "grad_norm": 1.984375, |
| "learning_rate": 7.000803320925323e-05, |
| "loss": 0.6719, |
| "step": 3290 |
| }, |
| { |
| "epoch": 0.7932113625421281, |
| "grad_norm": 1.984375, |
| "learning_rate": 6.999257588666419e-05, |
| "loss": 0.6823, |
| "step": 3295 |
| }, |
| { |
| "epoch": 0.7944150216658642, |
| "grad_norm": 2.359375, |
| "learning_rate": 6.997709792900683e-05, |
| "loss": 0.6894, |
| "step": 3300 |
| }, |
| { |
| "epoch": 0.7956186807896004, |
| "grad_norm": 1.9609375, |
| "learning_rate": 6.996159935017208e-05, |
| "loss": 0.6801, |
| "step": 3305 |
| }, |
| { |
| "epoch": 0.7968223399133365, |
| "grad_norm": 2.03125, |
| "learning_rate": 6.994608016406938e-05, |
| "loss": 0.6678, |
| "step": 3310 |
| }, |
| { |
| "epoch": 0.7980259990370727, |
| "grad_norm": 2.25, |
| "learning_rate": 6.993054038462671e-05, |
| "loss": 0.6815, |
| "step": 3315 |
| }, |
| { |
| "epoch": 0.7992296581608088, |
| "grad_norm": 1.9140625, |
| "learning_rate": 6.991498002579048e-05, |
| "loss": 0.6926, |
| "step": 3320 |
| }, |
| { |
| "epoch": 0.800433317284545, |
| "grad_norm": 1.9140625, |
| "learning_rate": 6.989939910152561e-05, |
| "loss": 0.6916, |
| "step": 3325 |
| }, |
| { |
| "epoch": 0.8016369764082811, |
| "grad_norm": 1.9765625, |
| "learning_rate": 6.988379762581545e-05, |
| "loss": 0.6819, |
| "step": 3330 |
| }, |
| { |
| "epoch": 0.8028406355320173, |
| "grad_norm": 2.171875, |
| "learning_rate": 6.986817561266181e-05, |
| "loss": 0.6759, |
| "step": 3335 |
| }, |
| { |
| "epoch": 0.8040442946557534, |
| "grad_norm": 2.21875, |
| "learning_rate": 6.985253307608491e-05, |
| "loss": 0.6942, |
| "step": 3340 |
| }, |
| { |
| "epoch": 0.8052479537794897, |
| "grad_norm": 2.109375, |
| "learning_rate": 6.983687003012341e-05, |
| "loss": 0.6792, |
| "step": 3345 |
| }, |
| { |
| "epoch": 0.8064516129032258, |
| "grad_norm": 1.984375, |
| "learning_rate": 6.982118648883438e-05, |
| "loss": 0.6402, |
| "step": 3350 |
| }, |
| { |
| "epoch": 0.807655272026962, |
| "grad_norm": 1.9453125, |
| "learning_rate": 6.980548246629326e-05, |
| "loss": 0.6802, |
| "step": 3355 |
| }, |
| { |
| "epoch": 0.8088589311506981, |
| "grad_norm": 1.828125, |
| "learning_rate": 6.978975797659389e-05, |
| "loss": 0.615, |
| "step": 3360 |
| }, |
| { |
| "epoch": 0.8100625902744343, |
| "grad_norm": 2.0625, |
| "learning_rate": 6.97740130338485e-05, |
| "loss": 0.6543, |
| "step": 3365 |
| }, |
| { |
| "epoch": 0.8112662493981705, |
| "grad_norm": 1.9453125, |
| "learning_rate": 6.97582476521876e-05, |
| "loss": 0.6766, |
| "step": 3370 |
| }, |
| { |
| "epoch": 0.8124699085219066, |
| "grad_norm": 1.8515625, |
| "learning_rate": 6.974246184576012e-05, |
| "loss": 0.6788, |
| "step": 3375 |
| }, |
| { |
| "epoch": 0.8136735676456428, |
| "grad_norm": 2.078125, |
| "learning_rate": 6.97266556287333e-05, |
| "loss": 0.6849, |
| "step": 3380 |
| }, |
| { |
| "epoch": 0.8148772267693789, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.971082901529267e-05, |
| "loss": 0.6419, |
| "step": 3385 |
| }, |
| { |
| "epoch": 0.8160808858931151, |
| "grad_norm": 2.046875, |
| "learning_rate": 6.969498201964212e-05, |
| "loss": 0.7203, |
| "step": 3390 |
| }, |
| { |
| "epoch": 0.8172845450168512, |
| "grad_norm": 2.203125, |
| "learning_rate": 6.967911465600376e-05, |
| "loss": 0.674, |
| "step": 3395 |
| }, |
| { |
| "epoch": 0.8184882041405874, |
| "grad_norm": 1.875, |
| "learning_rate": 6.966322693861804e-05, |
| "loss": 0.6785, |
| "step": 3400 |
| }, |
| { |
| "epoch": 0.8196918632643235, |
| "grad_norm": 2.359375, |
| "learning_rate": 6.964731888174366e-05, |
| "loss": 0.7204, |
| "step": 3405 |
| }, |
| { |
| "epoch": 0.8208955223880597, |
| "grad_norm": 2.234375, |
| "learning_rate": 6.963139049965758e-05, |
| "loss": 0.6844, |
| "step": 3410 |
| }, |
| { |
| "epoch": 0.8220991815117958, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.961544180665494e-05, |
| "loss": 0.6818, |
| "step": 3415 |
| }, |
| { |
| "epoch": 0.823302840635532, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.959947281704922e-05, |
| "loss": 0.6544, |
| "step": 3420 |
| }, |
| { |
| "epoch": 0.8245064997592682, |
| "grad_norm": 2.046875, |
| "learning_rate": 6.9583483545172e-05, |
| "loss": 0.7053, |
| "step": 3425 |
| }, |
| { |
| "epoch": 0.8257101588830044, |
| "grad_norm": 2.03125, |
| "learning_rate": 6.956747400537315e-05, |
| "loss": 0.7212, |
| "step": 3430 |
| }, |
| { |
| "epoch": 0.8269138180067405, |
| "grad_norm": 2.0, |
| "learning_rate": 6.955144421202071e-05, |
| "loss": 0.6408, |
| "step": 3435 |
| }, |
| { |
| "epoch": 0.8281174771304767, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.953539417950085e-05, |
| "loss": 0.6501, |
| "step": 3440 |
| }, |
| { |
| "epoch": 0.8293211362542128, |
| "grad_norm": 1.921875, |
| "learning_rate": 6.951932392221796e-05, |
| "loss": 0.6593, |
| "step": 3445 |
| }, |
| { |
| "epoch": 0.830524795377949, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.950323345459454e-05, |
| "loss": 0.6657, |
| "step": 3450 |
| }, |
| { |
| "epoch": 0.8317284545016851, |
| "grad_norm": 1.8828125, |
| "learning_rate": 6.948712279107125e-05, |
| "loss": 0.685, |
| "step": 3455 |
| }, |
| { |
| "epoch": 0.8329321136254213, |
| "grad_norm": 2.078125, |
| "learning_rate": 6.947099194610689e-05, |
| "loss": 0.7025, |
| "step": 3460 |
| }, |
| { |
| "epoch": 0.8341357727491574, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.945484093417835e-05, |
| "loss": 0.6594, |
| "step": 3465 |
| }, |
| { |
| "epoch": 0.8353394318728936, |
| "grad_norm": 1.953125, |
| "learning_rate": 6.94386697697806e-05, |
| "loss": 0.6699, |
| "step": 3470 |
| }, |
| { |
| "epoch": 0.8365430909966297, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.942247846742674e-05, |
| "loss": 0.6582, |
| "step": 3475 |
| }, |
| { |
| "epoch": 0.8377467501203659, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.940626704164793e-05, |
| "loss": 0.6745, |
| "step": 3480 |
| }, |
| { |
| "epoch": 0.838950409244102, |
| "grad_norm": 1.8046875, |
| "learning_rate": 6.939003550699337e-05, |
| "loss": 0.6824, |
| "step": 3485 |
| }, |
| { |
| "epoch": 0.8401540683678382, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.93737838780303e-05, |
| "loss": 0.6271, |
| "step": 3490 |
| }, |
| { |
| "epoch": 0.8413577274915743, |
| "grad_norm": 2.109375, |
| "learning_rate": 6.935751216934407e-05, |
| "loss": 0.7001, |
| "step": 3495 |
| }, |
| { |
| "epoch": 0.8425613866153105, |
| "grad_norm": 1.96875, |
| "learning_rate": 6.934122039553793e-05, |
| "loss": 0.7044, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.8425613866153105, |
| "eval_loss": 0.5733353495597839, |
| "eval_runtime": 2.4041, |
| "eval_samples_per_second": 83.193, |
| "eval_steps_per_second": 83.193, |
| "step": 3500 |
| }, |
| { |
| "epoch": 0.8437650457390466, |
| "grad_norm": 1.890625, |
| "learning_rate": 6.932490857123324e-05, |
| "loss": 0.685, |
| "step": 3505 |
| }, |
| { |
| "epoch": 0.8449687048627829, |
| "grad_norm": 2.109375, |
| "learning_rate": 6.930857671106932e-05, |
| "loss": 0.6795, |
| "step": 3510 |
| }, |
| { |
| "epoch": 0.8461723639865191, |
| "grad_norm": 2.046875, |
| "learning_rate": 6.929222482970345e-05, |
| "loss": 0.6792, |
| "step": 3515 |
| }, |
| { |
| "epoch": 0.8473760231102552, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.92758529418109e-05, |
| "loss": 0.6647, |
| "step": 3520 |
| }, |
| { |
| "epoch": 0.8485796822339914, |
| "grad_norm": 2.125, |
| "learning_rate": 6.925946106208492e-05, |
| "loss": 0.6924, |
| "step": 3525 |
| }, |
| { |
| "epoch": 0.8497833413577275, |
| "grad_norm": 2.03125, |
| "learning_rate": 6.924304920523662e-05, |
| "loss": 0.6794, |
| "step": 3530 |
| }, |
| { |
| "epoch": 0.8509870004814637, |
| "grad_norm": 2.0, |
| "learning_rate": 6.922661738599514e-05, |
| "loss": 0.7257, |
| "step": 3535 |
| }, |
| { |
| "epoch": 0.8521906596051998, |
| "grad_norm": 1.9609375, |
| "learning_rate": 6.921016561910748e-05, |
| "loss": 0.6848, |
| "step": 3540 |
| }, |
| { |
| "epoch": 0.853394318728936, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.919369391933853e-05, |
| "loss": 0.6732, |
| "step": 3545 |
| }, |
| { |
| "epoch": 0.8545979778526721, |
| "grad_norm": 1.90625, |
| "learning_rate": 6.917720230147111e-05, |
| "loss": 0.6457, |
| "step": 3550 |
| }, |
| { |
| "epoch": 0.8558016369764083, |
| "grad_norm": 2.03125, |
| "learning_rate": 6.91606907803059e-05, |
| "loss": 0.6906, |
| "step": 3555 |
| }, |
| { |
| "epoch": 0.8570052961001444, |
| "grad_norm": 1.84375, |
| "learning_rate": 6.914415937066142e-05, |
| "loss": 0.6813, |
| "step": 3560 |
| }, |
| { |
| "epoch": 0.8582089552238806, |
| "grad_norm": 1.796875, |
| "learning_rate": 6.912760808737405e-05, |
| "loss": 0.7021, |
| "step": 3565 |
| }, |
| { |
| "epoch": 0.8594126143476167, |
| "grad_norm": 2.03125, |
| "learning_rate": 6.911103694529805e-05, |
| "loss": 0.6774, |
| "step": 3570 |
| }, |
| { |
| "epoch": 0.8606162734713529, |
| "grad_norm": 1.9375, |
| "learning_rate": 6.909444595930544e-05, |
| "loss": 0.6874, |
| "step": 3575 |
| }, |
| { |
| "epoch": 0.861819932595089, |
| "grad_norm": 2.078125, |
| "learning_rate": 6.907783514428607e-05, |
| "loss": 0.6654, |
| "step": 3580 |
| }, |
| { |
| "epoch": 0.8630235917188253, |
| "grad_norm": 2.0, |
| "learning_rate": 6.906120451514761e-05, |
| "loss": 0.6499, |
| "step": 3585 |
| }, |
| { |
| "epoch": 0.8642272508425614, |
| "grad_norm": 2.078125, |
| "learning_rate": 6.90445540868155e-05, |
| "loss": 0.6703, |
| "step": 3590 |
| }, |
| { |
| "epoch": 0.8654309099662976, |
| "grad_norm": 2.1875, |
| "learning_rate": 6.902788387423292e-05, |
| "loss": 0.6915, |
| "step": 3595 |
| }, |
| { |
| "epoch": 0.8666345690900337, |
| "grad_norm": 2.28125, |
| "learning_rate": 6.901119389236082e-05, |
| "loss": 0.6694, |
| "step": 3600 |
| }, |
| { |
| "epoch": 0.8678382282137699, |
| "grad_norm": 1.953125, |
| "learning_rate": 6.899448415617794e-05, |
| "loss": 0.6693, |
| "step": 3605 |
| }, |
| { |
| "epoch": 0.869041887337506, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.897775468068067e-05, |
| "loss": 0.6575, |
| "step": 3610 |
| }, |
| { |
| "epoch": 0.8702455464612422, |
| "grad_norm": 1.9375, |
| "learning_rate": 6.896100548088318e-05, |
| "loss": 0.6947, |
| "step": 3615 |
| }, |
| { |
| "epoch": 0.8714492055849783, |
| "grad_norm": 2.03125, |
| "learning_rate": 6.894423657181731e-05, |
| "loss": 0.6578, |
| "step": 3620 |
| }, |
| { |
| "epoch": 0.8726528647087145, |
| "grad_norm": 1.96875, |
| "learning_rate": 6.89274479685326e-05, |
| "loss": 0.6838, |
| "step": 3625 |
| }, |
| { |
| "epoch": 0.8738565238324506, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.891063968609624e-05, |
| "loss": 0.6947, |
| "step": 3630 |
| }, |
| { |
| "epoch": 0.8750601829561868, |
| "grad_norm": 1.9609375, |
| "learning_rate": 6.889381173959314e-05, |
| "loss": 0.6484, |
| "step": 3635 |
| }, |
| { |
| "epoch": 0.8762638420799229, |
| "grad_norm": 1.875, |
| "learning_rate": 6.887696414412577e-05, |
| "loss": 0.7085, |
| "step": 3640 |
| }, |
| { |
| "epoch": 0.8774675012036591, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.886009691481434e-05, |
| "loss": 0.6785, |
| "step": 3645 |
| }, |
| { |
| "epoch": 0.8786711603273952, |
| "grad_norm": 1.9609375, |
| "learning_rate": 6.884321006679656e-05, |
| "loss": 0.6721, |
| "step": 3650 |
| }, |
| { |
| "epoch": 0.8798748194511314, |
| "grad_norm": 1.921875, |
| "learning_rate": 6.882630361522787e-05, |
| "loss": 0.6621, |
| "step": 3655 |
| }, |
| { |
| "epoch": 0.8810784785748677, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.880937757528123e-05, |
| "loss": 0.6415, |
| "step": 3660 |
| }, |
| { |
| "epoch": 0.8822821376986038, |
| "grad_norm": 1.984375, |
| "learning_rate": 6.879243196214718e-05, |
| "loss": 0.6314, |
| "step": 3665 |
| }, |
| { |
| "epoch": 0.88348579682234, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.877546679103384e-05, |
| "loss": 0.701, |
| "step": 3670 |
| }, |
| { |
| "epoch": 0.8846894559460761, |
| "grad_norm": 2.0, |
| "learning_rate": 6.875848207716689e-05, |
| "loss": 0.686, |
| "step": 3675 |
| }, |
| { |
| "epoch": 0.8858931150698123, |
| "grad_norm": 1.9453125, |
| "learning_rate": 6.874147783578954e-05, |
| "loss": 0.6813, |
| "step": 3680 |
| }, |
| { |
| "epoch": 0.8870967741935484, |
| "grad_norm": 1.9453125, |
| "learning_rate": 6.872445408216255e-05, |
| "loss": 0.6357, |
| "step": 3685 |
| }, |
| { |
| "epoch": 0.8883004333172846, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.870741083156415e-05, |
| "loss": 0.6627, |
| "step": 3690 |
| }, |
| { |
| "epoch": 0.8895040924410207, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.86903480992901e-05, |
| "loss": 0.6747, |
| "step": 3695 |
| }, |
| { |
| "epoch": 0.8907077515647569, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.867326590065361e-05, |
| "loss": 0.6878, |
| "step": 3700 |
| }, |
| { |
| "epoch": 0.891911410688493, |
| "grad_norm": 1.7265625, |
| "learning_rate": 6.86561642509854e-05, |
| "loss": 0.6376, |
| "step": 3705 |
| }, |
| { |
| "epoch": 0.8931150698122292, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.863904316563362e-05, |
| "loss": 0.6647, |
| "step": 3710 |
| }, |
| { |
| "epoch": 0.8943187289359653, |
| "grad_norm": 1.890625, |
| "learning_rate": 6.862190265996387e-05, |
| "loss": 0.6701, |
| "step": 3715 |
| }, |
| { |
| "epoch": 0.8955223880597015, |
| "grad_norm": 1.8046875, |
| "learning_rate": 6.86047427493592e-05, |
| "loss": 0.6583, |
| "step": 3720 |
| }, |
| { |
| "epoch": 0.8967260471834376, |
| "grad_norm": 1.9296875, |
| "learning_rate": 6.858756344922003e-05, |
| "loss": 0.6701, |
| "step": 3725 |
| }, |
| { |
| "epoch": 0.8979297063071738, |
| "grad_norm": 1.921875, |
| "learning_rate": 6.857036477496424e-05, |
| "loss": 0.6863, |
| "step": 3730 |
| }, |
| { |
| "epoch": 0.8991333654309099, |
| "grad_norm": 1.9765625, |
| "learning_rate": 6.855314674202704e-05, |
| "loss": 0.6299, |
| "step": 3735 |
| }, |
| { |
| "epoch": 0.9003370245546461, |
| "grad_norm": 1.8984375, |
| "learning_rate": 6.853590936586105e-05, |
| "loss": 0.6614, |
| "step": 3740 |
| }, |
| { |
| "epoch": 0.9015406836783822, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.851865266193622e-05, |
| "loss": 0.6342, |
| "step": 3745 |
| }, |
| { |
| "epoch": 0.9027443428021185, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.850137664573988e-05, |
| "loss": 0.6648, |
| "step": 3750 |
| }, |
| { |
| "epoch": 0.9039480019258546, |
| "grad_norm": 1.8203125, |
| "learning_rate": 6.848408133277669e-05, |
| "loss": 0.6791, |
| "step": 3755 |
| }, |
| { |
| "epoch": 0.9051516610495908, |
| "grad_norm": 1.984375, |
| "learning_rate": 6.84667667385686e-05, |
| "loss": 0.6739, |
| "step": 3760 |
| }, |
| { |
| "epoch": 0.9063553201733269, |
| "grad_norm": 2.140625, |
| "learning_rate": 6.844943287865487e-05, |
| "loss": 0.702, |
| "step": 3765 |
| }, |
| { |
| "epoch": 0.9075589792970631, |
| "grad_norm": 1.8828125, |
| "learning_rate": 6.843207976859207e-05, |
| "loss": 0.6633, |
| "step": 3770 |
| }, |
| { |
| "epoch": 0.9087626384207992, |
| "grad_norm": 2.0, |
| "learning_rate": 6.841470742395405e-05, |
| "loss": 0.6723, |
| "step": 3775 |
| }, |
| { |
| "epoch": 0.9099662975445354, |
| "grad_norm": 2.0625, |
| "learning_rate": 6.839731586033188e-05, |
| "loss": 0.6841, |
| "step": 3780 |
| }, |
| { |
| "epoch": 0.9111699566682715, |
| "grad_norm": 2.0, |
| "learning_rate": 6.837990509333393e-05, |
| "loss": 0.6754, |
| "step": 3785 |
| }, |
| { |
| "epoch": 0.9123736157920077, |
| "grad_norm": 1.984375, |
| "learning_rate": 6.836247513858579e-05, |
| "loss": 0.661, |
| "step": 3790 |
| }, |
| { |
| "epoch": 0.9135772749157438, |
| "grad_norm": 1.8984375, |
| "learning_rate": 6.834502601173023e-05, |
| "loss": 0.6476, |
| "step": 3795 |
| }, |
| { |
| "epoch": 0.91478093403948, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.832755772842727e-05, |
| "loss": 0.6827, |
| "step": 3800 |
| }, |
| { |
| "epoch": 0.9159845931632162, |
| "grad_norm": 1.8828125, |
| "learning_rate": 6.831007030435414e-05, |
| "loss": 0.6691, |
| "step": 3805 |
| }, |
| { |
| "epoch": 0.9171882522869523, |
| "grad_norm": 2.0, |
| "learning_rate": 6.829256375520516e-05, |
| "loss": 0.7024, |
| "step": 3810 |
| }, |
| { |
| "epoch": 0.9183919114106885, |
| "grad_norm": 1.9609375, |
| "learning_rate": 6.827503809669192e-05, |
| "loss": 0.6433, |
| "step": 3815 |
| }, |
| { |
| "epoch": 0.9195955705344246, |
| "grad_norm": 1.9453125, |
| "learning_rate": 6.825749334454311e-05, |
| "loss": 0.6887, |
| "step": 3820 |
| }, |
| { |
| "epoch": 0.9207992296581609, |
| "grad_norm": 1.9765625, |
| "learning_rate": 6.823992951450455e-05, |
| "loss": 0.6566, |
| "step": 3825 |
| }, |
| { |
| "epoch": 0.922002888781897, |
| "grad_norm": 1.9765625, |
| "learning_rate": 6.822234662233916e-05, |
| "loss": 0.6828, |
| "step": 3830 |
| }, |
| { |
| "epoch": 0.9232065479056332, |
| "grad_norm": 1.9609375, |
| "learning_rate": 6.820474468382704e-05, |
| "loss": 0.6761, |
| "step": 3835 |
| }, |
| { |
| "epoch": 0.9244102070293693, |
| "grad_norm": 1.9765625, |
| "learning_rate": 6.818712371476534e-05, |
| "loss": 0.626, |
| "step": 3840 |
| }, |
| { |
| "epoch": 0.9256138661531055, |
| "grad_norm": 2.140625, |
| "learning_rate": 6.816948373096826e-05, |
| "loss": 0.6551, |
| "step": 3845 |
| }, |
| { |
| "epoch": 0.9268175252768416, |
| "grad_norm": 1.859375, |
| "learning_rate": 6.815182474826712e-05, |
| "loss": 0.665, |
| "step": 3850 |
| }, |
| { |
| "epoch": 0.9280211844005778, |
| "grad_norm": 1.9609375, |
| "learning_rate": 6.813414678251028e-05, |
| "loss": 0.7109, |
| "step": 3855 |
| }, |
| { |
| "epoch": 0.9292248435243139, |
| "grad_norm": 1.9453125, |
| "learning_rate": 6.811644984956307e-05, |
| "loss": 0.6588, |
| "step": 3860 |
| }, |
| { |
| "epoch": 0.9304285026480501, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.809873396530795e-05, |
| "loss": 0.6724, |
| "step": 3865 |
| }, |
| { |
| "epoch": 0.9316321617717862, |
| "grad_norm": 2.078125, |
| "learning_rate": 6.808099914564431e-05, |
| "loss": 0.691, |
| "step": 3870 |
| }, |
| { |
| "epoch": 0.9328358208955224, |
| "grad_norm": 1.953125, |
| "learning_rate": 6.806324540648856e-05, |
| "loss": 0.6624, |
| "step": 3875 |
| }, |
| { |
| "epoch": 0.9340394800192585, |
| "grad_norm": 2.0, |
| "learning_rate": 6.80454727637741e-05, |
| "loss": 0.6777, |
| "step": 3880 |
| }, |
| { |
| "epoch": 0.9352431391429947, |
| "grad_norm": 1.9609375, |
| "learning_rate": 6.802768123345126e-05, |
| "loss": 0.6342, |
| "step": 3885 |
| }, |
| { |
| "epoch": 0.9364467982667308, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.800987083148736e-05, |
| "loss": 0.661, |
| "step": 3890 |
| }, |
| { |
| "epoch": 0.937650457390467, |
| "grad_norm": 2.0625, |
| "learning_rate": 6.799204157386665e-05, |
| "loss": 0.6604, |
| "step": 3895 |
| }, |
| { |
| "epoch": 0.9388541165142031, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.797419347659026e-05, |
| "loss": 0.6768, |
| "step": 3900 |
| }, |
| { |
| "epoch": 0.9400577756379394, |
| "grad_norm": 1.8828125, |
| "learning_rate": 6.795632655567628e-05, |
| "loss": 0.6441, |
| "step": 3905 |
| }, |
| { |
| "epoch": 0.9412614347616755, |
| "grad_norm": 2.046875, |
| "learning_rate": 6.793844082715967e-05, |
| "loss": 0.6903, |
| "step": 3910 |
| }, |
| { |
| "epoch": 0.9424650938854117, |
| "grad_norm": 2.0625, |
| "learning_rate": 6.79205363070923e-05, |
| "loss": 0.6843, |
| "step": 3915 |
| }, |
| { |
| "epoch": 0.9436687530091478, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.790261301154283e-05, |
| "loss": 0.6827, |
| "step": 3920 |
| }, |
| { |
| "epoch": 0.944872412132884, |
| "grad_norm": 1.859375, |
| "learning_rate": 6.788467095659686e-05, |
| "loss": 0.6374, |
| "step": 3925 |
| }, |
| { |
| "epoch": 0.9460760712566201, |
| "grad_norm": 1.9765625, |
| "learning_rate": 6.786671015835677e-05, |
| "loss": 0.6569, |
| "step": 3930 |
| }, |
| { |
| "epoch": 0.9472797303803563, |
| "grad_norm": 2.140625, |
| "learning_rate": 6.784873063294177e-05, |
| "loss": 0.6511, |
| "step": 3935 |
| }, |
| { |
| "epoch": 0.9484833895040924, |
| "grad_norm": 1.8984375, |
| "learning_rate": 6.783073239648788e-05, |
| "loss": 0.6392, |
| "step": 3940 |
| }, |
| { |
| "epoch": 0.9496870486278286, |
| "grad_norm": 1.90625, |
| "learning_rate": 6.781271546514794e-05, |
| "loss": 0.6284, |
| "step": 3945 |
| }, |
| { |
| "epoch": 0.9508907077515648, |
| "grad_norm": 1.90625, |
| "learning_rate": 6.779467985509152e-05, |
| "loss": 0.6342, |
| "step": 3950 |
| }, |
| { |
| "epoch": 0.9520943668753009, |
| "grad_norm": 1.9296875, |
| "learning_rate": 6.777662558250498e-05, |
| "loss": 0.63, |
| "step": 3955 |
| }, |
| { |
| "epoch": 0.9532980259990371, |
| "grad_norm": 1.9609375, |
| "learning_rate": 6.775855266359144e-05, |
| "loss": 0.6278, |
| "step": 3960 |
| }, |
| { |
| "epoch": 0.9545016851227732, |
| "grad_norm": 1.9765625, |
| "learning_rate": 6.774046111457075e-05, |
| "loss": 0.6682, |
| "step": 3965 |
| }, |
| { |
| "epoch": 0.9557053442465094, |
| "grad_norm": 2.15625, |
| "learning_rate": 6.772235095167942e-05, |
| "loss": 0.6455, |
| "step": 3970 |
| }, |
| { |
| "epoch": 0.9569090033702455, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.770422219117076e-05, |
| "loss": 0.6545, |
| "step": 3975 |
| }, |
| { |
| "epoch": 0.9581126624939817, |
| "grad_norm": 2.171875, |
| "learning_rate": 6.76860748493147e-05, |
| "loss": 0.6731, |
| "step": 3980 |
| }, |
| { |
| "epoch": 0.9593163216177178, |
| "grad_norm": 2.25, |
| "learning_rate": 6.766790894239793e-05, |
| "loss": 0.6858, |
| "step": 3985 |
| }, |
| { |
| "epoch": 0.9605199807414541, |
| "grad_norm": 2.0, |
| "learning_rate": 6.764972448672365e-05, |
| "loss": 0.6308, |
| "step": 3990 |
| }, |
| { |
| "epoch": 0.9617236398651902, |
| "grad_norm": 1.9375, |
| "learning_rate": 6.763152149861189e-05, |
| "loss": 0.6771, |
| "step": 3995 |
| }, |
| { |
| "epoch": 0.9629272989889264, |
| "grad_norm": 2.4375, |
| "learning_rate": 6.761329999439916e-05, |
| "loss": 0.6341, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.9629272989889264, |
| "eval_loss": 0.5589016675949097, |
| "eval_runtime": 2.406, |
| "eval_samples_per_second": 83.126, |
| "eval_steps_per_second": 83.126, |
| "step": 4000 |
| }, |
| { |
| "epoch": 0.9641309581126625, |
| "grad_norm": 1.8203125, |
| "learning_rate": 6.759505999043869e-05, |
| "loss": 0.7023, |
| "step": 4005 |
| }, |
| { |
| "epoch": 0.9653346172363987, |
| "grad_norm": 2.078125, |
| "learning_rate": 6.757680150310026e-05, |
| "loss": 0.66, |
| "step": 4010 |
| }, |
| { |
| "epoch": 0.9665382763601348, |
| "grad_norm": 2.0625, |
| "learning_rate": 6.755852454877027e-05, |
| "loss": 0.6577, |
| "step": 4015 |
| }, |
| { |
| "epoch": 0.967741935483871, |
| "grad_norm": 1.96875, |
| "learning_rate": 6.754022914385163e-05, |
| "loss": 0.6657, |
| "step": 4020 |
| }, |
| { |
| "epoch": 0.9689455946076071, |
| "grad_norm": 2.078125, |
| "learning_rate": 6.75219153047639e-05, |
| "loss": 0.6462, |
| "step": 4025 |
| }, |
| { |
| "epoch": 0.9701492537313433, |
| "grad_norm": 1.8828125, |
| "learning_rate": 6.750358304794312e-05, |
| "loss": 0.6606, |
| "step": 4030 |
| }, |
| { |
| "epoch": 0.9713529128550794, |
| "grad_norm": 1.8125, |
| "learning_rate": 6.748523238984188e-05, |
| "loss": 0.6602, |
| "step": 4035 |
| }, |
| { |
| "epoch": 0.9725565719788156, |
| "grad_norm": 1.9453125, |
| "learning_rate": 6.746686334692929e-05, |
| "loss": 0.6587, |
| "step": 4040 |
| }, |
| { |
| "epoch": 0.9737602311025517, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.744847593569092e-05, |
| "loss": 0.6497, |
| "step": 4045 |
| }, |
| { |
| "epoch": 0.9749638902262879, |
| "grad_norm": 1.9375, |
| "learning_rate": 6.74300701726289e-05, |
| "loss": 0.6741, |
| "step": 4050 |
| }, |
| { |
| "epoch": 0.976167549350024, |
| "grad_norm": 2.1875, |
| "learning_rate": 6.741164607426177e-05, |
| "loss": 0.6446, |
| "step": 4055 |
| }, |
| { |
| "epoch": 0.9773712084737602, |
| "grad_norm": 2.03125, |
| "learning_rate": 6.739320365712451e-05, |
| "loss": 0.6547, |
| "step": 4060 |
| }, |
| { |
| "epoch": 0.9785748675974963, |
| "grad_norm": 2.125, |
| "learning_rate": 6.737474293776865e-05, |
| "loss": 0.6354, |
| "step": 4065 |
| }, |
| { |
| "epoch": 0.9797785267212326, |
| "grad_norm": 1.9453125, |
| "learning_rate": 6.7356263932762e-05, |
| "loss": 0.6489, |
| "step": 4070 |
| }, |
| { |
| "epoch": 0.9809821858449687, |
| "grad_norm": 2.203125, |
| "learning_rate": 6.733776665868885e-05, |
| "loss": 0.7068, |
| "step": 4075 |
| }, |
| { |
| "epoch": 0.9821858449687049, |
| "grad_norm": 1.90625, |
| "learning_rate": 6.731925113214994e-05, |
| "loss": 0.6695, |
| "step": 4080 |
| }, |
| { |
| "epoch": 0.983389504092441, |
| "grad_norm": 2.046875, |
| "learning_rate": 6.730071736976229e-05, |
| "loss": 0.6576, |
| "step": 4085 |
| }, |
| { |
| "epoch": 0.9845931632161772, |
| "grad_norm": 1.8515625, |
| "learning_rate": 6.728216538815934e-05, |
| "loss": 0.6666, |
| "step": 4090 |
| }, |
| { |
| "epoch": 0.9857968223399133, |
| "grad_norm": 2.0625, |
| "learning_rate": 6.726359520399088e-05, |
| "loss": 0.6542, |
| "step": 4095 |
| }, |
| { |
| "epoch": 0.9870004814636495, |
| "grad_norm": 2.09375, |
| "learning_rate": 6.724500683392303e-05, |
| "loss": 0.6726, |
| "step": 4100 |
| }, |
| { |
| "epoch": 0.9882041405873857, |
| "grad_norm": 2.015625, |
| "learning_rate": 6.722640029463823e-05, |
| "loss": 0.6588, |
| "step": 4105 |
| }, |
| { |
| "epoch": 0.9894077997111218, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.720777560283523e-05, |
| "loss": 0.6522, |
| "step": 4110 |
| }, |
| { |
| "epoch": 0.990611458834858, |
| "grad_norm": 1.8359375, |
| "learning_rate": 6.718913277522905e-05, |
| "loss": 0.6492, |
| "step": 4115 |
| }, |
| { |
| "epoch": 0.9918151179585941, |
| "grad_norm": 2.0, |
| "learning_rate": 6.717047182855104e-05, |
| "loss": 0.6672, |
| "step": 4120 |
| }, |
| { |
| "epoch": 0.9930187770823303, |
| "grad_norm": 1.953125, |
| "learning_rate": 6.715179277954874e-05, |
| "loss": 0.6509, |
| "step": 4125 |
| }, |
| { |
| "epoch": 0.9942224362060664, |
| "grad_norm": 1.9921875, |
| "learning_rate": 6.713309564498599e-05, |
| "loss": 0.6461, |
| "step": 4130 |
| }, |
| { |
| "epoch": 0.9954260953298026, |
| "grad_norm": 1.9375, |
| "learning_rate": 6.711438044164282e-05, |
| "loss": 0.6566, |
| "step": 4135 |
| }, |
| { |
| "epoch": 0.9966297544535387, |
| "grad_norm": 2.046875, |
| "learning_rate": 6.709564718631556e-05, |
| "loss": 0.6447, |
| "step": 4140 |
| }, |
| { |
| "epoch": 0.997833413577275, |
| "grad_norm": 1.9609375, |
| "learning_rate": 6.707689589581662e-05, |
| "loss": 0.6736, |
| "step": 4145 |
| }, |
| { |
| "epoch": 0.999037072701011, |
| "grad_norm": 1.9140625, |
| "learning_rate": 6.705812658697467e-05, |
| "loss": 0.6542, |
| "step": 4150 |
| }, |
| { |
| "epoch": 0.9997592681752527, |
| "eval_loss": 0.5545368194580078, |
| "eval_runtime": 2.4068, |
| "eval_samples_per_second": 83.099, |
| "eval_steps_per_second": 83.099, |
| "step": 4153 |
| } |
| ], |
| "logging_steps": 5, |
| "max_steps": 16616, |
| "num_input_tokens_seen": 0, |
| "num_train_epochs": 4, |
| "save_steps": 500, |
| "stateful_callbacks": { |
| "TrainerControl": { |
| "args": { |
| "should_epoch_stop": false, |
| "should_evaluate": false, |
| "should_log": false, |
| "should_save": true, |
| "should_training_stop": false |
| }, |
| "attributes": {} |
| } |
| }, |
| "total_flos": 1.04173997654016e+17, |
| "train_batch_size": 48, |
| "trial_name": null, |
| "trial_params": null |
| } |
|
|