{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 0.9997592681752527, "eval_steps": 500, "global_step": 4153, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001203659123736158, "grad_norm": 5.4375, "learning_rate": 8.594285714285714e-06, "loss": 1.9523, "step": 5 }, { "epoch": 0.002407318247472316, "grad_norm": 3.6875, "learning_rate": 1.9337142857142854e-05, "loss": 1.9164, "step": 10 }, { "epoch": 0.0036109773712084737, "grad_norm": 3.046875, "learning_rate": 3.008e-05, "loss": 1.8413, "step": 15 }, { "epoch": 0.004814636494944632, "grad_norm": 3.03125, "learning_rate": 4.082285714285714e-05, "loss": 1.7485, "step": 20 }, { "epoch": 0.00601829561868079, "grad_norm": 2.78125, "learning_rate": 5.156571428571429e-05, "loss": 1.7032, "step": 25 }, { "epoch": 0.007221954742416947, "grad_norm": 2.71875, "learning_rate": 6.230857142857143e-05, "loss": 1.5993, "step": 30 }, { "epoch": 0.008425613866153106, "grad_norm": 2.78125, "learning_rate": 7.305142857142857e-05, "loss": 1.5406, "step": 35 }, { "epoch": 0.009629272989889264, "grad_norm": 2.703125, "learning_rate": 7.519999190126141e-05, "loss": 1.4727, "step": 40 }, { "epoch": 0.010832932113625422, "grad_norm": 2.796875, "learning_rate": 7.519995900014385e-05, "loss": 1.452, "step": 45 }, { "epoch": 0.01203659123736158, "grad_norm": 2.78125, "learning_rate": 7.519990079050565e-05, "loss": 1.3904, "step": 50 }, { "epoch": 0.013240250361097737, "grad_norm": 2.59375, "learning_rate": 7.519981727239906e-05, "loss": 1.3752, "step": 55 }, { "epoch": 0.014443909484833895, "grad_norm": 2.765625, "learning_rate": 7.519970844589904e-05, "loss": 1.3351, "step": 60 }, { "epoch": 0.015647568608570053, "grad_norm": 2.59375, "learning_rate": 7.519957431110327e-05, "loss": 1.342, "step": 65 }, { "epoch": 0.016851227732306212, "grad_norm": 2.96875, "learning_rate": 7.51994148681321e-05, "loss": 1.3116, "step": 70 }, { "epoch": 0.018054886856042368, "grad_norm": 2.734375, "learning_rate": 7.519923011712865e-05, "loss": 1.3081, "step": 75 }, { "epoch": 0.019258545979778528, "grad_norm": 2.640625, "learning_rate": 7.519902005825872e-05, "loss": 1.2885, "step": 80 }, { "epoch": 0.020462205103514684, "grad_norm": 2.703125, "learning_rate": 7.519878469171081e-05, "loss": 1.2879, "step": 85 }, { "epoch": 0.021665864227250843, "grad_norm": 2.5625, "learning_rate": 7.519852401769621e-05, "loss": 1.2741, "step": 90 }, { "epoch": 0.022869523350987, "grad_norm": 2.609375, "learning_rate": 7.519823803644881e-05, "loss": 1.2429, "step": 95 }, { "epoch": 0.02407318247472316, "grad_norm": 2.984375, "learning_rate": 7.519792674822529e-05, "loss": 1.2462, "step": 100 }, { "epoch": 0.025276841598459315, "grad_norm": 2.75, "learning_rate": 7.519759015330501e-05, "loss": 1.217, "step": 105 }, { "epoch": 0.026480500722195474, "grad_norm": 2.484375, "learning_rate": 7.519722825199007e-05, "loss": 1.2431, "step": 110 }, { "epoch": 0.027684159845931634, "grad_norm": 2.703125, "learning_rate": 7.519684104460526e-05, "loss": 1.242, "step": 115 }, { "epoch": 0.02888781896966779, "grad_norm": 2.578125, "learning_rate": 7.519642853149806e-05, "loss": 1.2239, "step": 120 }, { "epoch": 0.03009147809340395, "grad_norm": 2.59375, "learning_rate": 7.519599071303875e-05, "loss": 1.1809, "step": 125 }, { "epoch": 0.031295137217140105, "grad_norm": 2.625, "learning_rate": 7.519552758962019e-05, "loss": 1.2366, "step": 130 }, { "epoch": 0.03249879634087626, "grad_norm": 2.5, "learning_rate": 7.519503916165803e-05, "loss": 1.1634, "step": 135 }, { "epoch": 0.033702455464612424, "grad_norm": 2.84375, "learning_rate": 7.519452542959066e-05, "loss": 1.1719, "step": 140 }, { "epoch": 0.03490611458834858, "grad_norm": 2.953125, "learning_rate": 7.51939863938791e-05, "loss": 1.1596, "step": 145 }, { "epoch": 0.036109773712084736, "grad_norm": 2.390625, "learning_rate": 7.519342205500712e-05, "loss": 1.1627, "step": 150 }, { "epoch": 0.03731343283582089, "grad_norm": 2.46875, "learning_rate": 7.519283241348121e-05, "loss": 1.166, "step": 155 }, { "epoch": 0.038517091959557055, "grad_norm": 2.59375, "learning_rate": 7.519221746983052e-05, "loss": 1.1952, "step": 160 }, { "epoch": 0.03972075108329321, "grad_norm": 2.703125, "learning_rate": 7.5191577224607e-05, "loss": 1.1565, "step": 165 }, { "epoch": 0.04092441020702937, "grad_norm": 2.671875, "learning_rate": 7.519091167838519e-05, "loss": 1.1575, "step": 170 }, { "epoch": 0.04212806933076553, "grad_norm": 2.21875, "learning_rate": 7.519022083176244e-05, "loss": 1.1399, "step": 175 }, { "epoch": 0.043331728454501686, "grad_norm": 2.3125, "learning_rate": 7.518950468535872e-05, "loss": 1.1503, "step": 180 }, { "epoch": 0.04453538757823784, "grad_norm": 2.53125, "learning_rate": 7.518876323981678e-05, "loss": 1.1025, "step": 185 }, { "epoch": 0.045739046701974, "grad_norm": 2.765625, "learning_rate": 7.518799649580204e-05, "loss": 1.1512, "step": 190 }, { "epoch": 0.04694270582571016, "grad_norm": 2.71875, "learning_rate": 7.518720445400261e-05, "loss": 1.1202, "step": 195 }, { "epoch": 0.04814636494944632, "grad_norm": 2.90625, "learning_rate": 7.518638711512932e-05, "loss": 1.1038, "step": 200 }, { "epoch": 0.04935002407318247, "grad_norm": 2.40625, "learning_rate": 7.518554447991572e-05, "loss": 1.1074, "step": 205 }, { "epoch": 0.05055368319691863, "grad_norm": 2.359375, "learning_rate": 7.518467654911806e-05, "loss": 1.1035, "step": 210 }, { "epoch": 0.05175734232065479, "grad_norm": 2.703125, "learning_rate": 7.518378332351524e-05, "loss": 1.1083, "step": 215 }, { "epoch": 0.05296100144439095, "grad_norm": 2.84375, "learning_rate": 7.518286480390892e-05, "loss": 1.1062, "step": 220 }, { "epoch": 0.054164660568127104, "grad_norm": 2.515625, "learning_rate": 7.518192099112345e-05, "loss": 1.1028, "step": 225 }, { "epoch": 0.05536831969186327, "grad_norm": 2.46875, "learning_rate": 7.518095188600586e-05, "loss": 1.1036, "step": 230 }, { "epoch": 0.05657197881559942, "grad_norm": 2.59375, "learning_rate": 7.517995748942589e-05, "loss": 1.0876, "step": 235 }, { "epoch": 0.05777563793933558, "grad_norm": 2.359375, "learning_rate": 7.517893780227597e-05, "loss": 1.0686, "step": 240 }, { "epoch": 0.058979297063071735, "grad_norm": 2.34375, "learning_rate": 7.517789282547126e-05, "loss": 1.0863, "step": 245 }, { "epoch": 0.0601829561868079, "grad_norm": 2.515625, "learning_rate": 7.517682255994956e-05, "loss": 1.0745, "step": 250 }, { "epoch": 0.061386615310544054, "grad_norm": 2.25, "learning_rate": 7.517572700667141e-05, "loss": 1.0997, "step": 255 }, { "epoch": 0.06259027443428021, "grad_norm": 2.53125, "learning_rate": 7.517460616662005e-05, "loss": 1.0501, "step": 260 }, { "epoch": 0.06379393355801637, "grad_norm": 2.5, "learning_rate": 7.517346004080137e-05, "loss": 1.0777, "step": 265 }, { "epoch": 0.06499759268175252, "grad_norm": 2.546875, "learning_rate": 7.5172288630244e-05, "loss": 1.0623, "step": 270 }, { "epoch": 0.06620125180548869, "grad_norm": 2.890625, "learning_rate": 7.517109193599923e-05, "loss": 1.0649, "step": 275 }, { "epoch": 0.06740491092922485, "grad_norm": 2.65625, "learning_rate": 7.516986995914106e-05, "loss": 1.0468, "step": 280 }, { "epoch": 0.068608570052961, "grad_norm": 3.0625, "learning_rate": 7.516862270076615e-05, "loss": 1.0485, "step": 285 }, { "epoch": 0.06981222917669716, "grad_norm": 2.515625, "learning_rate": 7.516735016199392e-05, "loss": 1.0412, "step": 290 }, { "epoch": 0.07101588830043332, "grad_norm": 2.390625, "learning_rate": 7.516605234396639e-05, "loss": 1.0392, "step": 295 }, { "epoch": 0.07221954742416947, "grad_norm": 2.421875, "learning_rate": 7.516472924784832e-05, "loss": 1.0129, "step": 300 }, { "epoch": 0.07342320654790563, "grad_norm": 2.546875, "learning_rate": 7.516338087482715e-05, "loss": 1.0365, "step": 305 }, { "epoch": 0.07462686567164178, "grad_norm": 2.234375, "learning_rate": 7.5162007226113e-05, "loss": 1.0767, "step": 310 }, { "epoch": 0.07583052479537795, "grad_norm": 2.34375, "learning_rate": 7.516060830293867e-05, "loss": 1.0139, "step": 315 }, { "epoch": 0.07703418391911411, "grad_norm": 2.484375, "learning_rate": 7.515918410655963e-05, "loss": 1.0152, "step": 320 }, { "epoch": 0.07823784304285027, "grad_norm": 2.390625, "learning_rate": 7.515773463825409e-05, "loss": 1.0269, "step": 325 }, { "epoch": 0.07944150216658642, "grad_norm": 2.640625, "learning_rate": 7.515625989932286e-05, "loss": 1.0453, "step": 330 }, { "epoch": 0.08064516129032258, "grad_norm": 2.578125, "learning_rate": 7.515475989108947e-05, "loss": 1.0238, "step": 335 }, { "epoch": 0.08184882041405873, "grad_norm": 2.875, "learning_rate": 7.515323461490016e-05, "loss": 1.022, "step": 340 }, { "epoch": 0.08305247953779489, "grad_norm": 2.578125, "learning_rate": 7.515168407212379e-05, "loss": 1.0004, "step": 345 }, { "epoch": 0.08425613866153106, "grad_norm": 2.515625, "learning_rate": 7.515010826415193e-05, "loss": 1.0361, "step": 350 }, { "epoch": 0.08545979778526722, "grad_norm": 2.3125, "learning_rate": 7.51485071923988e-05, "loss": 1.0119, "step": 355 }, { "epoch": 0.08666345690900337, "grad_norm": 2.3125, "learning_rate": 7.514688085830133e-05, "loss": 1.0128, "step": 360 }, { "epoch": 0.08786711603273953, "grad_norm": 2.421875, "learning_rate": 7.514522926331908e-05, "loss": 1.0119, "step": 365 }, { "epoch": 0.08907077515647568, "grad_norm": 2.390625, "learning_rate": 7.51435524089343e-05, "loss": 1.0205, "step": 370 }, { "epoch": 0.09027443428021184, "grad_norm": 2.328125, "learning_rate": 7.514185029665195e-05, "loss": 1.0289, "step": 375 }, { "epoch": 0.091478093403948, "grad_norm": 2.25, "learning_rate": 7.514012292799957e-05, "loss": 0.9974, "step": 380 }, { "epoch": 0.09268175252768417, "grad_norm": 2.3125, "learning_rate": 7.513837030452745e-05, "loss": 1.0058, "step": 385 }, { "epoch": 0.09388541165142032, "grad_norm": 2.328125, "learning_rate": 7.513659242780848e-05, "loss": 0.9894, "step": 390 }, { "epoch": 0.09508907077515648, "grad_norm": 2.390625, "learning_rate": 7.513478929943828e-05, "loss": 0.9879, "step": 395 }, { "epoch": 0.09629272989889263, "grad_norm": 2.5625, "learning_rate": 7.513296092103507e-05, "loss": 1.0006, "step": 400 }, { "epoch": 0.09749638902262879, "grad_norm": 2.640625, "learning_rate": 7.513110729423976e-05, "loss": 0.9984, "step": 405 }, { "epoch": 0.09870004814636495, "grad_norm": 2.515625, "learning_rate": 7.512922842071594e-05, "loss": 1.0084, "step": 410 }, { "epoch": 0.0999037072701011, "grad_norm": 2.3125, "learning_rate": 7.512732430214982e-05, "loss": 1.0034, "step": 415 }, { "epoch": 0.10110736639383726, "grad_norm": 2.546875, "learning_rate": 7.512539494025027e-05, "loss": 1.0019, "step": 420 }, { "epoch": 0.10231102551757343, "grad_norm": 2.96875, "learning_rate": 7.512344033674885e-05, "loss": 0.9941, "step": 425 }, { "epoch": 0.10351468464130958, "grad_norm": 2.625, "learning_rate": 7.512146049339975e-05, "loss": 0.9523, "step": 430 }, { "epoch": 0.10471834376504574, "grad_norm": 2.4375, "learning_rate": 7.51194554119798e-05, "loss": 0.9821, "step": 435 }, { "epoch": 0.1059220028887819, "grad_norm": 2.25, "learning_rate": 7.51174250942885e-05, "loss": 0.9661, "step": 440 }, { "epoch": 0.10712566201251805, "grad_norm": 2.609375, "learning_rate": 7.5115369542148e-05, "loss": 0.9926, "step": 445 }, { "epoch": 0.10832932113625421, "grad_norm": 2.484375, "learning_rate": 7.511328875740308e-05, "loss": 0.9999, "step": 450 }, { "epoch": 0.10953298025999036, "grad_norm": 2.40625, "learning_rate": 7.511118274192118e-05, "loss": 1.0023, "step": 455 }, { "epoch": 0.11073663938372653, "grad_norm": 2.25, "learning_rate": 7.510905149759237e-05, "loss": 0.9643, "step": 460 }, { "epoch": 0.11194029850746269, "grad_norm": 2.375, "learning_rate": 7.510689502632937e-05, "loss": 0.9565, "step": 465 }, { "epoch": 0.11314395763119885, "grad_norm": 2.265625, "learning_rate": 7.510471333006756e-05, "loss": 0.9777, "step": 470 }, { "epoch": 0.114347616754935, "grad_norm": 2.515625, "learning_rate": 7.510250641076491e-05, "loss": 1.0148, "step": 475 }, { "epoch": 0.11555127587867116, "grad_norm": 2.265625, "learning_rate": 7.51002742704021e-05, "loss": 0.9534, "step": 480 }, { "epoch": 0.11675493500240731, "grad_norm": 2.421875, "learning_rate": 7.509801691098234e-05, "loss": 0.96, "step": 485 }, { "epoch": 0.11795859412614347, "grad_norm": 2.359375, "learning_rate": 7.50957343345316e-05, "loss": 0.9168, "step": 490 }, { "epoch": 0.11916225324987964, "grad_norm": 2.25, "learning_rate": 7.509342654309836e-05, "loss": 0.9506, "step": 495 }, { "epoch": 0.1203659123736158, "grad_norm": 2.515625, "learning_rate": 7.509109353875383e-05, "loss": 0.967, "step": 500 }, { "epoch": 0.1203659123736158, "eval_loss": 0.8579447865486145, "eval_runtime": 2.4166, "eval_samples_per_second": 82.761, "eval_steps_per_second": 82.761, "step": 500 }, { "epoch": 0.12156957149735195, "grad_norm": 2.375, "learning_rate": 7.508873532359177e-05, "loss": 0.9136, "step": 505 }, { "epoch": 0.12277323062108811, "grad_norm": 2.328125, "learning_rate": 7.508635189972863e-05, "loss": 0.9422, "step": 510 }, { "epoch": 0.12397688974482426, "grad_norm": 2.203125, "learning_rate": 7.508394326930342e-05, "loss": 0.9751, "step": 515 }, { "epoch": 0.12518054886856042, "grad_norm": 2.359375, "learning_rate": 7.508150943447782e-05, "loss": 0.9974, "step": 520 }, { "epoch": 0.12638420799229658, "grad_norm": 2.265625, "learning_rate": 7.507905039743612e-05, "loss": 0.9835, "step": 525 }, { "epoch": 0.12758786711603273, "grad_norm": 2.328125, "learning_rate": 7.507656616038523e-05, "loss": 0.9457, "step": 530 }, { "epoch": 0.1287915262397689, "grad_norm": 2.328125, "learning_rate": 7.507405672555465e-05, "loss": 0.9453, "step": 535 }, { "epoch": 0.12999518536350504, "grad_norm": 2.5625, "learning_rate": 7.507152209519653e-05, "loss": 0.9403, "step": 540 }, { "epoch": 0.1311988444872412, "grad_norm": 2.296875, "learning_rate": 7.506896227158561e-05, "loss": 0.9566, "step": 545 }, { "epoch": 0.13240250361097738, "grad_norm": 2.390625, "learning_rate": 7.506637725701925e-05, "loss": 0.9112, "step": 550 }, { "epoch": 0.13360616273471354, "grad_norm": 2.4375, "learning_rate": 7.50637670538174e-05, "loss": 0.9529, "step": 555 }, { "epoch": 0.1348098218584497, "grad_norm": 2.28125, "learning_rate": 7.506113166432265e-05, "loss": 0.9439, "step": 560 }, { "epoch": 0.13601348098218585, "grad_norm": 2.296875, "learning_rate": 7.505847109090016e-05, "loss": 0.9204, "step": 565 }, { "epoch": 0.137217140105922, "grad_norm": 2.25, "learning_rate": 7.505578533593771e-05, "loss": 0.9252, "step": 570 }, { "epoch": 0.13842079922965816, "grad_norm": 2.25, "learning_rate": 7.505307440184569e-05, "loss": 0.8843, "step": 575 }, { "epoch": 0.13962445835339432, "grad_norm": 2.21875, "learning_rate": 7.505033829105704e-05, "loss": 0.9302, "step": 580 }, { "epoch": 0.14082811747713048, "grad_norm": 2.484375, "learning_rate": 7.504757700602735e-05, "loss": 0.9238, "step": 585 }, { "epoch": 0.14203177660086663, "grad_norm": 2.75, "learning_rate": 7.504479054923478e-05, "loss": 0.9393, "step": 590 }, { "epoch": 0.1432354357246028, "grad_norm": 2.578125, "learning_rate": 7.504197892318008e-05, "loss": 0.9297, "step": 595 }, { "epoch": 0.14443909484833894, "grad_norm": 2.359375, "learning_rate": 7.50391421303866e-05, "loss": 0.9065, "step": 600 }, { "epoch": 0.1456427539720751, "grad_norm": 2.234375, "learning_rate": 7.503628017340025e-05, "loss": 0.9263, "step": 605 }, { "epoch": 0.14684641309581126, "grad_norm": 2.109375, "learning_rate": 7.503339305478953e-05, "loss": 0.9169, "step": 610 }, { "epoch": 0.1480500722195474, "grad_norm": 2.34375, "learning_rate": 7.503048077714556e-05, "loss": 0.9369, "step": 615 }, { "epoch": 0.14925373134328357, "grad_norm": 2.171875, "learning_rate": 7.5027543343082e-05, "loss": 0.9541, "step": 620 }, { "epoch": 0.15045739046701975, "grad_norm": 2.359375, "learning_rate": 7.502458075523511e-05, "loss": 0.9273, "step": 625 }, { "epoch": 0.1516610495907559, "grad_norm": 2.421875, "learning_rate": 7.50215930162637e-05, "loss": 0.9541, "step": 630 }, { "epoch": 0.15286470871449206, "grad_norm": 2.375, "learning_rate": 7.501858012884915e-05, "loss": 0.9334, "step": 635 }, { "epoch": 0.15406836783822822, "grad_norm": 2.15625, "learning_rate": 7.501554209569548e-05, "loss": 0.9156, "step": 640 }, { "epoch": 0.15527202696196438, "grad_norm": 2.3125, "learning_rate": 7.501247891952918e-05, "loss": 0.9295, "step": 645 }, { "epoch": 0.15647568608570053, "grad_norm": 2.25, "learning_rate": 7.500939060309934e-05, "loss": 0.9318, "step": 650 }, { "epoch": 0.1576793452094367, "grad_norm": 2.390625, "learning_rate": 7.500627714917765e-05, "loss": 0.9627, "step": 655 }, { "epoch": 0.15888300433317284, "grad_norm": 2.421875, "learning_rate": 7.500313856055832e-05, "loss": 0.9144, "step": 660 }, { "epoch": 0.160086663456909, "grad_norm": 2.25, "learning_rate": 7.499997484005813e-05, "loss": 0.9378, "step": 665 }, { "epoch": 0.16129032258064516, "grad_norm": 2.46875, "learning_rate": 7.499678599051639e-05, "loss": 0.9226, "step": 670 }, { "epoch": 0.1624939817043813, "grad_norm": 2.21875, "learning_rate": 7.499357201479502e-05, "loss": 0.8941, "step": 675 }, { "epoch": 0.16369764082811747, "grad_norm": 2.53125, "learning_rate": 7.499033291577844e-05, "loss": 0.9054, "step": 680 }, { "epoch": 0.16490129995185362, "grad_norm": 2.4375, "learning_rate": 7.498706869637364e-05, "loss": 0.9043, "step": 685 }, { "epoch": 0.16610495907558978, "grad_norm": 2.375, "learning_rate": 7.498377935951014e-05, "loss": 0.907, "step": 690 }, { "epoch": 0.16730861819932596, "grad_norm": 2.1875, "learning_rate": 7.498046490814001e-05, "loss": 0.8948, "step": 695 }, { "epoch": 0.16851227732306212, "grad_norm": 2.515625, "learning_rate": 7.497712534523786e-05, "loss": 0.8884, "step": 700 }, { "epoch": 0.16971593644679828, "grad_norm": 2.375, "learning_rate": 7.497376067380085e-05, "loss": 0.9339, "step": 705 }, { "epoch": 0.17091959557053443, "grad_norm": 2.546875, "learning_rate": 7.497037089684863e-05, "loss": 0.9214, "step": 710 }, { "epoch": 0.1721232546942706, "grad_norm": 2.359375, "learning_rate": 7.496695601742344e-05, "loss": 0.909, "step": 715 }, { "epoch": 0.17332691381800674, "grad_norm": 2.171875, "learning_rate": 7.496351603859001e-05, "loss": 0.8977, "step": 720 }, { "epoch": 0.1745305729417429, "grad_norm": 2.46875, "learning_rate": 7.496005096343561e-05, "loss": 0.9395, "step": 725 }, { "epoch": 0.17573423206547906, "grad_norm": 2.609375, "learning_rate": 7.495656079507003e-05, "loss": 0.902, "step": 730 }, { "epoch": 0.1769378911892152, "grad_norm": 2.4375, "learning_rate": 7.495304553662555e-05, "loss": 0.9075, "step": 735 }, { "epoch": 0.17814155031295137, "grad_norm": 2.390625, "learning_rate": 7.494950519125705e-05, "loss": 0.8822, "step": 740 }, { "epoch": 0.17934520943668752, "grad_norm": 2.203125, "learning_rate": 7.494593976214182e-05, "loss": 0.8719, "step": 745 }, { "epoch": 0.18054886856042368, "grad_norm": 2.484375, "learning_rate": 7.494234925247975e-05, "loss": 0.8644, "step": 750 }, { "epoch": 0.18175252768415984, "grad_norm": 2.203125, "learning_rate": 7.493873366549319e-05, "loss": 0.8841, "step": 755 }, { "epoch": 0.182956186807896, "grad_norm": 2.265625, "learning_rate": 7.4935093004427e-05, "loss": 0.8557, "step": 760 }, { "epoch": 0.18415984593163215, "grad_norm": 2.25, "learning_rate": 7.493142727254856e-05, "loss": 0.8904, "step": 765 }, { "epoch": 0.18536350505536833, "grad_norm": 2.203125, "learning_rate": 7.492773647314775e-05, "loss": 0.8465, "step": 770 }, { "epoch": 0.1865671641791045, "grad_norm": 2.296875, "learning_rate": 7.492402060953692e-05, "loss": 0.9323, "step": 775 }, { "epoch": 0.18777082330284064, "grad_norm": 2.390625, "learning_rate": 7.492027968505095e-05, "loss": 0.8839, "step": 780 }, { "epoch": 0.1889744824265768, "grad_norm": 2.359375, "learning_rate": 7.49165137030472e-05, "loss": 0.9033, "step": 785 }, { "epoch": 0.19017814155031296, "grad_norm": 2.4375, "learning_rate": 7.491272266690549e-05, "loss": 0.8841, "step": 790 }, { "epoch": 0.1913818006740491, "grad_norm": 2.21875, "learning_rate": 7.490890658002814e-05, "loss": 0.8432, "step": 795 }, { "epoch": 0.19258545979778527, "grad_norm": 2.3125, "learning_rate": 7.490506544584e-05, "loss": 0.8822, "step": 800 }, { "epoch": 0.19378911892152142, "grad_norm": 2.28125, "learning_rate": 7.490119926778834e-05, "loss": 0.889, "step": 805 }, { "epoch": 0.19499277804525758, "grad_norm": 2.265625, "learning_rate": 7.489730804934292e-05, "loss": 0.8852, "step": 810 }, { "epoch": 0.19619643716899374, "grad_norm": 2.40625, "learning_rate": 7.489339179399597e-05, "loss": 0.8688, "step": 815 }, { "epoch": 0.1974000962927299, "grad_norm": 2.40625, "learning_rate": 7.488945050526224e-05, "loss": 0.8844, "step": 820 }, { "epoch": 0.19860375541646605, "grad_norm": 2.515625, "learning_rate": 7.488548418667887e-05, "loss": 0.8692, "step": 825 }, { "epoch": 0.1998074145402022, "grad_norm": 2.34375, "learning_rate": 7.48814928418055e-05, "loss": 0.8846, "step": 830 }, { "epoch": 0.20101107366393836, "grad_norm": 2.296875, "learning_rate": 7.487747647422422e-05, "loss": 0.895, "step": 835 }, { "epoch": 0.20221473278767452, "grad_norm": 2.40625, "learning_rate": 7.48734350875396e-05, "loss": 0.865, "step": 840 }, { "epoch": 0.2034183919114107, "grad_norm": 2.109375, "learning_rate": 7.486936868537866e-05, "loss": 0.8804, "step": 845 }, { "epoch": 0.20462205103514686, "grad_norm": 2.296875, "learning_rate": 7.486527727139085e-05, "loss": 0.892, "step": 850 }, { "epoch": 0.205825710158883, "grad_norm": 2.171875, "learning_rate": 7.486116084924808e-05, "loss": 0.9048, "step": 855 }, { "epoch": 0.20702936928261917, "grad_norm": 2.3125, "learning_rate": 7.485701942264469e-05, "loss": 0.8856, "step": 860 }, { "epoch": 0.20823302840635532, "grad_norm": 2.125, "learning_rate": 7.485285299529746e-05, "loss": 0.9206, "step": 865 }, { "epoch": 0.20943668753009148, "grad_norm": 2.34375, "learning_rate": 7.484866157094568e-05, "loss": 0.902, "step": 870 }, { "epoch": 0.21064034665382764, "grad_norm": 2.5, "learning_rate": 7.484444515335095e-05, "loss": 0.8681, "step": 875 }, { "epoch": 0.2118440057775638, "grad_norm": 2.171875, "learning_rate": 7.484020374629738e-05, "loss": 0.8925, "step": 880 }, { "epoch": 0.21304766490129995, "grad_norm": 2.234375, "learning_rate": 7.483593735359151e-05, "loss": 0.8729, "step": 885 }, { "epoch": 0.2142513240250361, "grad_norm": 2.203125, "learning_rate": 7.483164597906225e-05, "loss": 0.8567, "step": 890 }, { "epoch": 0.21545498314877226, "grad_norm": 2.40625, "learning_rate": 7.482732962656101e-05, "loss": 0.867, "step": 895 }, { "epoch": 0.21665864227250842, "grad_norm": 2.078125, "learning_rate": 7.482298829996155e-05, "loss": 0.8476, "step": 900 }, { "epoch": 0.21786230139624457, "grad_norm": 2.4375, "learning_rate": 7.481862200316005e-05, "loss": 0.8878, "step": 905 }, { "epoch": 0.21906596051998073, "grad_norm": 2.46875, "learning_rate": 7.481423074007512e-05, "loss": 0.8733, "step": 910 }, { "epoch": 0.2202696196437169, "grad_norm": 2.28125, "learning_rate": 7.48098145146478e-05, "loss": 0.8523, "step": 915 }, { "epoch": 0.22147327876745307, "grad_norm": 2.328125, "learning_rate": 7.480537333084149e-05, "loss": 0.8696, "step": 920 }, { "epoch": 0.22267693789118922, "grad_norm": 2.65625, "learning_rate": 7.480090719264199e-05, "loss": 0.8744, "step": 925 }, { "epoch": 0.22388059701492538, "grad_norm": 2.421875, "learning_rate": 7.479641610405752e-05, "loss": 0.8644, "step": 930 }, { "epoch": 0.22508425613866154, "grad_norm": 2.3125, "learning_rate": 7.479190006911868e-05, "loss": 0.8718, "step": 935 }, { "epoch": 0.2262879152623977, "grad_norm": 2.34375, "learning_rate": 7.478735909187847e-05, "loss": 0.8723, "step": 940 }, { "epoch": 0.22749157438613385, "grad_norm": 2.40625, "learning_rate": 7.478279317641225e-05, "loss": 0.8696, "step": 945 }, { "epoch": 0.22869523350987, "grad_norm": 2.1875, "learning_rate": 7.47782023268178e-05, "loss": 0.8958, "step": 950 }, { "epoch": 0.22989889263360616, "grad_norm": 2.296875, "learning_rate": 7.477358654721523e-05, "loss": 0.8537, "step": 955 }, { "epoch": 0.23110255175734232, "grad_norm": 2.3125, "learning_rate": 7.476894584174705e-05, "loss": 0.8586, "step": 960 }, { "epoch": 0.23230621088107847, "grad_norm": 2.375, "learning_rate": 7.476428021457815e-05, "loss": 0.8727, "step": 965 }, { "epoch": 0.23350987000481463, "grad_norm": 2.59375, "learning_rate": 7.475958966989575e-05, "loss": 0.8582, "step": 970 }, { "epoch": 0.23471352912855079, "grad_norm": 2.328125, "learning_rate": 7.47548742119095e-05, "loss": 0.8351, "step": 975 }, { "epoch": 0.23591718825228694, "grad_norm": 2.3125, "learning_rate": 7.475013384485134e-05, "loss": 0.841, "step": 980 }, { "epoch": 0.2371208473760231, "grad_norm": 2.109375, "learning_rate": 7.474536857297558e-05, "loss": 0.8406, "step": 985 }, { "epoch": 0.23832450649975928, "grad_norm": 2.28125, "learning_rate": 7.474057840055891e-05, "loss": 0.8378, "step": 990 }, { "epoch": 0.23952816562349544, "grad_norm": 2.046875, "learning_rate": 7.473576333190034e-05, "loss": 0.8534, "step": 995 }, { "epoch": 0.2407318247472316, "grad_norm": 2.28125, "learning_rate": 7.473092337132126e-05, "loss": 0.8428, "step": 1000 }, { "epoch": 0.2407318247472316, "eval_loss": 0.7515629529953003, "eval_runtime": 2.4162, "eval_samples_per_second": 82.774, "eval_steps_per_second": 82.774, "step": 1000 }, { "epoch": 0.24193548387096775, "grad_norm": 2.125, "learning_rate": 7.472605852316533e-05, "loss": 0.8745, "step": 1005 }, { "epoch": 0.2431391429947039, "grad_norm": 2.171875, "learning_rate": 7.47211687917986e-05, "loss": 0.8463, "step": 1010 }, { "epoch": 0.24434280211844006, "grad_norm": 2.265625, "learning_rate": 7.471625418160947e-05, "loss": 0.8593, "step": 1015 }, { "epoch": 0.24554646124217622, "grad_norm": 2.453125, "learning_rate": 7.471131469700862e-05, "loss": 0.8309, "step": 1020 }, { "epoch": 0.24675012036591237, "grad_norm": 2.125, "learning_rate": 7.470635034242906e-05, "loss": 0.8165, "step": 1025 }, { "epoch": 0.24795377948964853, "grad_norm": 2.34375, "learning_rate": 7.470136112232614e-05, "loss": 0.8193, "step": 1030 }, { "epoch": 0.24915743861338469, "grad_norm": 2.515625, "learning_rate": 7.469634704117752e-05, "loss": 0.8642, "step": 1035 }, { "epoch": 0.25036109773712084, "grad_norm": 2.140625, "learning_rate": 7.469130810348318e-05, "loss": 0.8601, "step": 1040 }, { "epoch": 0.251564756860857, "grad_norm": 2.28125, "learning_rate": 7.468624431376538e-05, "loss": 0.7957, "step": 1045 }, { "epoch": 0.25276841598459315, "grad_norm": 2.09375, "learning_rate": 7.468115567656872e-05, "loss": 0.8385, "step": 1050 }, { "epoch": 0.2539720751083293, "grad_norm": 2.28125, "learning_rate": 7.467604219646007e-05, "loss": 0.7962, "step": 1055 }, { "epoch": 0.25517573423206547, "grad_norm": 2.140625, "learning_rate": 7.467090387802862e-05, "loss": 0.8701, "step": 1060 }, { "epoch": 0.2563793933558016, "grad_norm": 2.203125, "learning_rate": 7.466574072588581e-05, "loss": 0.8678, "step": 1065 }, { "epoch": 0.2575830524795378, "grad_norm": 2.34375, "learning_rate": 7.466055274466543e-05, "loss": 0.8385, "step": 1070 }, { "epoch": 0.25878671160327393, "grad_norm": 2.375, "learning_rate": 7.46553399390235e-05, "loss": 0.8711, "step": 1075 }, { "epoch": 0.2599903707270101, "grad_norm": 2.21875, "learning_rate": 7.465010231363835e-05, "loss": 0.8953, "step": 1080 }, { "epoch": 0.26119402985074625, "grad_norm": 2.234375, "learning_rate": 7.464483987321056e-05, "loss": 0.8106, "step": 1085 }, { "epoch": 0.2623976889744824, "grad_norm": 2.28125, "learning_rate": 7.463955262246301e-05, "loss": 0.8329, "step": 1090 }, { "epoch": 0.26360134809821856, "grad_norm": 2.21875, "learning_rate": 7.463424056614082e-05, "loss": 0.8217, "step": 1095 }, { "epoch": 0.26480500722195477, "grad_norm": 2.296875, "learning_rate": 7.46289037090114e-05, "loss": 0.8368, "step": 1100 }, { "epoch": 0.2660086663456909, "grad_norm": 2.140625, "learning_rate": 7.462354205586437e-05, "loss": 0.8145, "step": 1105 }, { "epoch": 0.2672123254694271, "grad_norm": 2.109375, "learning_rate": 7.461815561151166e-05, "loss": 0.7885, "step": 1110 }, { "epoch": 0.26841598459316324, "grad_norm": 2.28125, "learning_rate": 7.461274438078741e-05, "loss": 0.845, "step": 1115 }, { "epoch": 0.2696196437168994, "grad_norm": 2.453125, "learning_rate": 7.460730836854803e-05, "loss": 0.7927, "step": 1120 }, { "epoch": 0.27082330284063555, "grad_norm": 2.15625, "learning_rate": 7.460184757967215e-05, "loss": 0.85, "step": 1125 }, { "epoch": 0.2720269619643717, "grad_norm": 2.15625, "learning_rate": 7.459636201906066e-05, "loss": 0.8376, "step": 1130 }, { "epoch": 0.27323062108810786, "grad_norm": 2.3125, "learning_rate": 7.459085169163664e-05, "loss": 0.866, "step": 1135 }, { "epoch": 0.274434280211844, "grad_norm": 2.125, "learning_rate": 7.458531660234546e-05, "loss": 0.8382, "step": 1140 }, { "epoch": 0.2756379393355802, "grad_norm": 2.234375, "learning_rate": 7.457975675615464e-05, "loss": 0.8455, "step": 1145 }, { "epoch": 0.27684159845931633, "grad_norm": 2.15625, "learning_rate": 7.457417215805399e-05, "loss": 0.8559, "step": 1150 }, { "epoch": 0.2780452575830525, "grad_norm": 2.078125, "learning_rate": 7.456856281305547e-05, "loss": 0.8299, "step": 1155 }, { "epoch": 0.27924891670678864, "grad_norm": 2.1875, "learning_rate": 7.45629287261933e-05, "loss": 0.8586, "step": 1160 }, { "epoch": 0.2804525758305248, "grad_norm": 2.265625, "learning_rate": 7.455726990252389e-05, "loss": 0.7975, "step": 1165 }, { "epoch": 0.28165623495426095, "grad_norm": 2.484375, "learning_rate": 7.455158634712583e-05, "loss": 0.8304, "step": 1170 }, { "epoch": 0.2828598940779971, "grad_norm": 2.125, "learning_rate": 7.454587806509992e-05, "loss": 0.819, "step": 1175 }, { "epoch": 0.28406355320173327, "grad_norm": 2.453125, "learning_rate": 7.454014506156915e-05, "loss": 0.8544, "step": 1180 }, { "epoch": 0.2852672123254694, "grad_norm": 2.515625, "learning_rate": 7.453438734167873e-05, "loss": 0.8258, "step": 1185 }, { "epoch": 0.2864708714492056, "grad_norm": 2.15625, "learning_rate": 7.452860491059598e-05, "loss": 0.8564, "step": 1190 }, { "epoch": 0.28767453057294173, "grad_norm": 2.046875, "learning_rate": 7.452279777351046e-05, "loss": 0.8325, "step": 1195 }, { "epoch": 0.2888781896966779, "grad_norm": 2.28125, "learning_rate": 7.451696593563388e-05, "loss": 0.8374, "step": 1200 }, { "epoch": 0.29008184882041405, "grad_norm": 2.0, "learning_rate": 7.451110940220013e-05, "loss": 0.7921, "step": 1205 }, { "epoch": 0.2912855079441502, "grad_norm": 2.296875, "learning_rate": 7.450522817846522e-05, "loss": 0.8379, "step": 1210 }, { "epoch": 0.29248916706788636, "grad_norm": 2.359375, "learning_rate": 7.449932226970739e-05, "loss": 0.8362, "step": 1215 }, { "epoch": 0.2936928261916225, "grad_norm": 2.234375, "learning_rate": 7.449339168122696e-05, "loss": 0.8319, "step": 1220 }, { "epoch": 0.29489648531535867, "grad_norm": 2.296875, "learning_rate": 7.448743641834646e-05, "loss": 0.8261, "step": 1225 }, { "epoch": 0.2961001444390948, "grad_norm": 2.203125, "learning_rate": 7.448145648641054e-05, "loss": 0.8369, "step": 1230 }, { "epoch": 0.297303803562831, "grad_norm": 2.421875, "learning_rate": 7.447545189078597e-05, "loss": 0.8054, "step": 1235 }, { "epoch": 0.29850746268656714, "grad_norm": 2.203125, "learning_rate": 7.446942263686169e-05, "loss": 0.8111, "step": 1240 }, { "epoch": 0.29971112181030335, "grad_norm": 2.15625, "learning_rate": 7.446336873004875e-05, "loss": 0.8285, "step": 1245 }, { "epoch": 0.3009147809340395, "grad_norm": 2.3125, "learning_rate": 7.445729017578033e-05, "loss": 0.8248, "step": 1250 }, { "epoch": 0.30211844005777566, "grad_norm": 2.25, "learning_rate": 7.445118697951173e-05, "loss": 0.8131, "step": 1255 }, { "epoch": 0.3033220991815118, "grad_norm": 1.9765625, "learning_rate": 7.444505914672035e-05, "loss": 0.8288, "step": 1260 }, { "epoch": 0.304525758305248, "grad_norm": 2.1875, "learning_rate": 7.443890668290574e-05, "loss": 0.7962, "step": 1265 }, { "epoch": 0.30572941742898413, "grad_norm": 2.15625, "learning_rate": 7.443272959358952e-05, "loss": 0.8235, "step": 1270 }, { "epoch": 0.3069330765527203, "grad_norm": 2.109375, "learning_rate": 7.442652788431541e-05, "loss": 0.8137, "step": 1275 }, { "epoch": 0.30813673567645644, "grad_norm": 2.03125, "learning_rate": 7.442030156064925e-05, "loss": 0.7973, "step": 1280 }, { "epoch": 0.3093403948001926, "grad_norm": 2.453125, "learning_rate": 7.441405062817895e-05, "loss": 0.8416, "step": 1285 }, { "epoch": 0.31054405392392875, "grad_norm": 2.359375, "learning_rate": 7.440777509251453e-05, "loss": 0.8208, "step": 1290 }, { "epoch": 0.3117477130476649, "grad_norm": 2.203125, "learning_rate": 7.440147495928803e-05, "loss": 0.8301, "step": 1295 }, { "epoch": 0.31295137217140107, "grad_norm": 2.375, "learning_rate": 7.439515023415366e-05, "loss": 0.7933, "step": 1300 }, { "epoch": 0.3141550312951372, "grad_norm": 2.28125, "learning_rate": 7.438880092278763e-05, "loss": 0.7935, "step": 1305 }, { "epoch": 0.3153586904188734, "grad_norm": 2.40625, "learning_rate": 7.438242703088822e-05, "loss": 0.8092, "step": 1310 }, { "epoch": 0.31656234954260953, "grad_norm": 2.1875, "learning_rate": 7.43760285641758e-05, "loss": 0.841, "step": 1315 }, { "epoch": 0.3177660086663457, "grad_norm": 2.109375, "learning_rate": 7.436960552839279e-05, "loss": 0.8307, "step": 1320 }, { "epoch": 0.31896966779008185, "grad_norm": 2.1875, "learning_rate": 7.436315792930362e-05, "loss": 0.823, "step": 1325 }, { "epoch": 0.320173326913818, "grad_norm": 2.28125, "learning_rate": 7.435668577269483e-05, "loss": 0.8125, "step": 1330 }, { "epoch": 0.32137698603755416, "grad_norm": 2.1875, "learning_rate": 7.435018906437495e-05, "loss": 0.8152, "step": 1335 }, { "epoch": 0.3225806451612903, "grad_norm": 2.15625, "learning_rate": 7.434366781017453e-05, "loss": 0.7877, "step": 1340 }, { "epoch": 0.32378430428502647, "grad_norm": 2.359375, "learning_rate": 7.433712201594622e-05, "loss": 0.7896, "step": 1345 }, { "epoch": 0.3249879634087626, "grad_norm": 2.1875, "learning_rate": 7.433055168756462e-05, "loss": 0.7763, "step": 1350 }, { "epoch": 0.3261916225324988, "grad_norm": 2.171875, "learning_rate": 7.432395683092641e-05, "loss": 0.8121, "step": 1355 }, { "epoch": 0.32739528165623494, "grad_norm": 2.21875, "learning_rate": 7.431733745195025e-05, "loss": 0.7965, "step": 1360 }, { "epoch": 0.3285989407799711, "grad_norm": 2.390625, "learning_rate": 7.431069355657676e-05, "loss": 0.8458, "step": 1365 }, { "epoch": 0.32980259990370725, "grad_norm": 2.09375, "learning_rate": 7.430402515076869e-05, "loss": 0.7621, "step": 1370 }, { "epoch": 0.3310062590274434, "grad_norm": 2.109375, "learning_rate": 7.429733224051065e-05, "loss": 0.8226, "step": 1375 }, { "epoch": 0.33220991815117956, "grad_norm": 2.1875, "learning_rate": 7.429061483180935e-05, "loss": 0.7758, "step": 1380 }, { "epoch": 0.3334135772749157, "grad_norm": 2.265625, "learning_rate": 7.428387293069341e-05, "loss": 0.7796, "step": 1385 }, { "epoch": 0.33461723639865193, "grad_norm": 2.578125, "learning_rate": 7.427710654321345e-05, "loss": 0.8098, "step": 1390 }, { "epoch": 0.3358208955223881, "grad_norm": 2.15625, "learning_rate": 7.427031567544212e-05, "loss": 0.8161, "step": 1395 }, { "epoch": 0.33702455464612424, "grad_norm": 2.28125, "learning_rate": 7.426350033347396e-05, "loss": 0.8314, "step": 1400 }, { "epoch": 0.3382282137698604, "grad_norm": 2.203125, "learning_rate": 7.425666052342554e-05, "loss": 0.7734, "step": 1405 }, { "epoch": 0.33943187289359655, "grad_norm": 2.078125, "learning_rate": 7.424979625143531e-05, "loss": 0.8005, "step": 1410 }, { "epoch": 0.3406355320173327, "grad_norm": 2.375, "learning_rate": 7.424290752366379e-05, "loss": 0.8085, "step": 1415 }, { "epoch": 0.34183919114106887, "grad_norm": 2.1875, "learning_rate": 7.423599434629334e-05, "loss": 0.81, "step": 1420 }, { "epoch": 0.343042850264805, "grad_norm": 2.328125, "learning_rate": 7.422905672552831e-05, "loss": 0.8262, "step": 1425 }, { "epoch": 0.3442465093885412, "grad_norm": 2.203125, "learning_rate": 7.4222094667595e-05, "loss": 0.7969, "step": 1430 }, { "epoch": 0.34545016851227733, "grad_norm": 2.125, "learning_rate": 7.421510817874162e-05, "loss": 0.8157, "step": 1435 }, { "epoch": 0.3466538276360135, "grad_norm": 2.15625, "learning_rate": 7.42080972652383e-05, "loss": 0.791, "step": 1440 }, { "epoch": 0.34785748675974965, "grad_norm": 2.109375, "learning_rate": 7.42010619333771e-05, "loss": 0.7623, "step": 1445 }, { "epoch": 0.3490611458834858, "grad_norm": 2.203125, "learning_rate": 7.419400218947201e-05, "loss": 0.7848, "step": 1450 }, { "epoch": 0.35026480500722196, "grad_norm": 2.171875, "learning_rate": 7.41869180398589e-05, "loss": 0.77, "step": 1455 }, { "epoch": 0.3514684641309581, "grad_norm": 2.546875, "learning_rate": 7.417980949089556e-05, "loss": 0.7763, "step": 1460 }, { "epoch": 0.35267212325469427, "grad_norm": 2.28125, "learning_rate": 7.417267654896169e-05, "loss": 0.7987, "step": 1465 }, { "epoch": 0.3538757823784304, "grad_norm": 2.390625, "learning_rate": 7.416551922045884e-05, "loss": 0.8275, "step": 1470 }, { "epoch": 0.3550794415021666, "grad_norm": 2.25, "learning_rate": 7.415833751181048e-05, "loss": 0.811, "step": 1475 }, { "epoch": 0.35628310062590274, "grad_norm": 2.125, "learning_rate": 7.415113142946199e-05, "loss": 0.7969, "step": 1480 }, { "epoch": 0.3574867597496389, "grad_norm": 2.40625, "learning_rate": 7.414390097988053e-05, "loss": 0.7832, "step": 1485 }, { "epoch": 0.35869041887337505, "grad_norm": 2.171875, "learning_rate": 7.413664616955524e-05, "loss": 0.7666, "step": 1490 }, { "epoch": 0.3598940779971112, "grad_norm": 2.1875, "learning_rate": 7.412936700499703e-05, "loss": 0.7793, "step": 1495 }, { "epoch": 0.36109773712084736, "grad_norm": 2.234375, "learning_rate": 7.412206349273873e-05, "loss": 0.7734, "step": 1500 }, { "epoch": 0.36109773712084736, "eval_loss": 0.687716543674469, "eval_runtime": 2.4175, "eval_samples_per_second": 82.729, "eval_steps_per_second": 82.729, "step": 1500 }, { "epoch": 0.3623013962445835, "grad_norm": 2.28125, "learning_rate": 7.411473563933497e-05, "loss": 0.8028, "step": 1505 }, { "epoch": 0.3635050553683197, "grad_norm": 2.203125, "learning_rate": 7.410738345136231e-05, "loss": 0.7837, "step": 1510 }, { "epoch": 0.36470871449205583, "grad_norm": 2.203125, "learning_rate": 7.410000693541903e-05, "loss": 0.7968, "step": 1515 }, { "epoch": 0.365912373615792, "grad_norm": 2.125, "learning_rate": 7.409260609812534e-05, "loss": 0.7674, "step": 1520 }, { "epoch": 0.36711603273952814, "grad_norm": 2.203125, "learning_rate": 7.408518094612324e-05, "loss": 0.7536, "step": 1525 }, { "epoch": 0.3683196918632643, "grad_norm": 2.1875, "learning_rate": 7.407773148607656e-05, "loss": 0.8126, "step": 1530 }, { "epoch": 0.36952335098700045, "grad_norm": 2.359375, "learning_rate": 7.407025772467092e-05, "loss": 0.8111, "step": 1535 }, { "epoch": 0.37072701011073667, "grad_norm": 2.265625, "learning_rate": 7.406275966861379e-05, "loss": 0.8091, "step": 1540 }, { "epoch": 0.3719306692344728, "grad_norm": 2.203125, "learning_rate": 7.405523732463444e-05, "loss": 0.7743, "step": 1545 }, { "epoch": 0.373134328358209, "grad_norm": 2.328125, "learning_rate": 7.404769069948389e-05, "loss": 0.7793, "step": 1550 }, { "epoch": 0.37433798748194513, "grad_norm": 2.3125, "learning_rate": 7.404011979993499e-05, "loss": 0.7935, "step": 1555 }, { "epoch": 0.3755416466056813, "grad_norm": 2.140625, "learning_rate": 7.403252463278238e-05, "loss": 0.7894, "step": 1560 }, { "epoch": 0.37674530572941745, "grad_norm": 2.265625, "learning_rate": 7.402490520484246e-05, "loss": 0.7806, "step": 1565 }, { "epoch": 0.3779489648531536, "grad_norm": 2.15625, "learning_rate": 7.401726152295342e-05, "loss": 0.8119, "step": 1570 }, { "epoch": 0.37915262397688976, "grad_norm": 2.25, "learning_rate": 7.40095935939752e-05, "loss": 0.7975, "step": 1575 }, { "epoch": 0.3803562831006259, "grad_norm": 2.28125, "learning_rate": 7.400190142478953e-05, "loss": 0.7802, "step": 1580 }, { "epoch": 0.38155994222436207, "grad_norm": 2.203125, "learning_rate": 7.399418502229986e-05, "loss": 0.7909, "step": 1585 }, { "epoch": 0.3827636013480982, "grad_norm": 2.375, "learning_rate": 7.398644439343139e-05, "loss": 0.8037, "step": 1590 }, { "epoch": 0.3839672604718344, "grad_norm": 2.421875, "learning_rate": 7.397867954513109e-05, "loss": 0.7849, "step": 1595 }, { "epoch": 0.38517091959557054, "grad_norm": 2.140625, "learning_rate": 7.397089048436767e-05, "loss": 0.7871, "step": 1600 }, { "epoch": 0.3863745787193067, "grad_norm": 2.09375, "learning_rate": 7.396307721813152e-05, "loss": 0.7793, "step": 1605 }, { "epoch": 0.38757823784304285, "grad_norm": 2.234375, "learning_rate": 7.395523975343479e-05, "loss": 0.7851, "step": 1610 }, { "epoch": 0.388781896966779, "grad_norm": 2.21875, "learning_rate": 7.394737809731136e-05, "loss": 0.797, "step": 1615 }, { "epoch": 0.38998555609051516, "grad_norm": 1.9296875, "learning_rate": 7.39394922568168e-05, "loss": 0.7627, "step": 1620 }, { "epoch": 0.3911892152142513, "grad_norm": 2.078125, "learning_rate": 7.393158223902837e-05, "loss": 0.8324, "step": 1625 }, { "epoch": 0.3923928743379875, "grad_norm": 2.109375, "learning_rate": 7.392364805104507e-05, "loss": 0.7787, "step": 1630 }, { "epoch": 0.39359653346172363, "grad_norm": 2.265625, "learning_rate": 7.391568969998755e-05, "loss": 0.7932, "step": 1635 }, { "epoch": 0.3948001925854598, "grad_norm": 2.34375, "learning_rate": 7.390770719299817e-05, "loss": 0.801, "step": 1640 }, { "epoch": 0.39600385170919594, "grad_norm": 2.25, "learning_rate": 7.389970053724096e-05, "loss": 0.7666, "step": 1645 }, { "epoch": 0.3972075108329321, "grad_norm": 2.296875, "learning_rate": 7.389166973990165e-05, "loss": 0.7781, "step": 1650 }, { "epoch": 0.39841116995666825, "grad_norm": 2.15625, "learning_rate": 7.388361480818758e-05, "loss": 0.7947, "step": 1655 }, { "epoch": 0.3996148290804044, "grad_norm": 1.9921875, "learning_rate": 7.38755357493278e-05, "loss": 0.7934, "step": 1660 }, { "epoch": 0.40081848820414057, "grad_norm": 2.203125, "learning_rate": 7.386743257057299e-05, "loss": 0.769, "step": 1665 }, { "epoch": 0.4020221473278767, "grad_norm": 2.203125, "learning_rate": 7.385930527919548e-05, "loss": 0.7539, "step": 1670 }, { "epoch": 0.4032258064516129, "grad_norm": 2.234375, "learning_rate": 7.385115388248925e-05, "loss": 0.7754, "step": 1675 }, { "epoch": 0.40442946557534903, "grad_norm": 2.125, "learning_rate": 7.384297838776988e-05, "loss": 0.8041, "step": 1680 }, { "epoch": 0.40563312469908525, "grad_norm": 2.265625, "learning_rate": 7.383477880237465e-05, "loss": 0.7606, "step": 1685 }, { "epoch": 0.4068367838228214, "grad_norm": 2.234375, "learning_rate": 7.382655513366237e-05, "loss": 0.7865, "step": 1690 }, { "epoch": 0.40804044294655756, "grad_norm": 1.953125, "learning_rate": 7.381830738901354e-05, "loss": 0.7656, "step": 1695 }, { "epoch": 0.4092441020702937, "grad_norm": 2.15625, "learning_rate": 7.381003557583022e-05, "loss": 0.76, "step": 1700 }, { "epoch": 0.41044776119402987, "grad_norm": 2.234375, "learning_rate": 7.380173970153607e-05, "loss": 0.793, "step": 1705 }, { "epoch": 0.411651420317766, "grad_norm": 2.140625, "learning_rate": 7.37934197735764e-05, "loss": 0.756, "step": 1710 }, { "epoch": 0.4128550794415022, "grad_norm": 2.15625, "learning_rate": 7.378507579941802e-05, "loss": 0.7674, "step": 1715 }, { "epoch": 0.41405873856523834, "grad_norm": 2.171875, "learning_rate": 7.377670778654941e-05, "loss": 0.7861, "step": 1720 }, { "epoch": 0.4152623976889745, "grad_norm": 2.375, "learning_rate": 7.376831574248056e-05, "loss": 0.7743, "step": 1725 }, { "epoch": 0.41646605681271065, "grad_norm": 2.03125, "learning_rate": 7.375989967474304e-05, "loss": 0.7511, "step": 1730 }, { "epoch": 0.4176697159364468, "grad_norm": 2.296875, "learning_rate": 7.375145959089001e-05, "loss": 0.7772, "step": 1735 }, { "epoch": 0.41887337506018296, "grad_norm": 2.125, "learning_rate": 7.374299549849616e-05, "loss": 0.7708, "step": 1740 }, { "epoch": 0.4200770341839191, "grad_norm": 2.125, "learning_rate": 7.373450740515772e-05, "loss": 0.7664, "step": 1745 }, { "epoch": 0.4212806933076553, "grad_norm": 2.171875, "learning_rate": 7.372599531849249e-05, "loss": 0.7721, "step": 1750 }, { "epoch": 0.42248435243139143, "grad_norm": 2.09375, "learning_rate": 7.371745924613975e-05, "loss": 0.7751, "step": 1755 }, { "epoch": 0.4236880115551276, "grad_norm": 2.1875, "learning_rate": 7.370889919576037e-05, "loss": 0.7575, "step": 1760 }, { "epoch": 0.42489167067886374, "grad_norm": 2.0, "learning_rate": 7.370031517503668e-05, "loss": 0.7773, "step": 1765 }, { "epoch": 0.4260953298025999, "grad_norm": 2.234375, "learning_rate": 7.36917071916726e-05, "loss": 0.7559, "step": 1770 }, { "epoch": 0.42729898892633605, "grad_norm": 2.15625, "learning_rate": 7.368307525339345e-05, "loss": 0.7386, "step": 1775 }, { "epoch": 0.4285026480500722, "grad_norm": 2.328125, "learning_rate": 7.367441936794613e-05, "loss": 0.7575, "step": 1780 }, { "epoch": 0.42970630717380837, "grad_norm": 2.1875, "learning_rate": 7.366573954309902e-05, "loss": 0.7845, "step": 1785 }, { "epoch": 0.4309099662975445, "grad_norm": 2.421875, "learning_rate": 7.365703578664196e-05, "loss": 0.8023, "step": 1790 }, { "epoch": 0.4321136254212807, "grad_norm": 2.046875, "learning_rate": 7.364830810638628e-05, "loss": 0.7781, "step": 1795 }, { "epoch": 0.43331728454501683, "grad_norm": 2.046875, "learning_rate": 7.36395565101648e-05, "loss": 0.7705, "step": 1800 }, { "epoch": 0.434520943668753, "grad_norm": 2.203125, "learning_rate": 7.363078100583177e-05, "loss": 0.8125, "step": 1805 }, { "epoch": 0.43572460279248915, "grad_norm": 2.09375, "learning_rate": 7.36219816012629e-05, "loss": 0.7666, "step": 1810 }, { "epoch": 0.4369282619162253, "grad_norm": 2.21875, "learning_rate": 7.361315830435537e-05, "loss": 0.7514, "step": 1815 }, { "epoch": 0.43813192103996146, "grad_norm": 2.296875, "learning_rate": 7.360431112302781e-05, "loss": 0.7494, "step": 1820 }, { "epoch": 0.4393355801636976, "grad_norm": 2.25, "learning_rate": 7.359544006522026e-05, "loss": 0.7663, "step": 1825 }, { "epoch": 0.4405392392874338, "grad_norm": 2.15625, "learning_rate": 7.358654513889417e-05, "loss": 0.7493, "step": 1830 }, { "epoch": 0.44174289841117, "grad_norm": 2.1875, "learning_rate": 7.357762635203247e-05, "loss": 0.7722, "step": 1835 }, { "epoch": 0.44294655753490614, "grad_norm": 2.15625, "learning_rate": 7.35686837126395e-05, "loss": 0.7896, "step": 1840 }, { "epoch": 0.4441502166586423, "grad_norm": 1.8984375, "learning_rate": 7.355971722874091e-05, "loss": 0.7486, "step": 1845 }, { "epoch": 0.44535387578237845, "grad_norm": 2.125, "learning_rate": 7.355072690838387e-05, "loss": 0.7846, "step": 1850 }, { "epoch": 0.4465575349061146, "grad_norm": 2.21875, "learning_rate": 7.354171275963688e-05, "loss": 0.7665, "step": 1855 }, { "epoch": 0.44776119402985076, "grad_norm": 2.171875, "learning_rate": 7.353267479058982e-05, "loss": 0.7758, "step": 1860 }, { "epoch": 0.4489648531535869, "grad_norm": 2.15625, "learning_rate": 7.3523613009354e-05, "loss": 0.723, "step": 1865 }, { "epoch": 0.4501685122773231, "grad_norm": 2.359375, "learning_rate": 7.351452742406204e-05, "loss": 0.7733, "step": 1870 }, { "epoch": 0.45137217140105923, "grad_norm": 2.296875, "learning_rate": 7.350541804286795e-05, "loss": 0.7683, "step": 1875 }, { "epoch": 0.4525758305247954, "grad_norm": 1.9609375, "learning_rate": 7.34962848739471e-05, "loss": 0.7656, "step": 1880 }, { "epoch": 0.45377948964853154, "grad_norm": 2.25, "learning_rate": 7.348712792549623e-05, "loss": 0.7732, "step": 1885 }, { "epoch": 0.4549831487722677, "grad_norm": 2.578125, "learning_rate": 7.347794720573334e-05, "loss": 0.7221, "step": 1890 }, { "epoch": 0.45618680789600385, "grad_norm": 2.171875, "learning_rate": 7.346874272289787e-05, "loss": 0.728, "step": 1895 }, { "epoch": 0.45739046701974, "grad_norm": 2.359375, "learning_rate": 7.34595144852505e-05, "loss": 0.8017, "step": 1900 }, { "epoch": 0.45859412614347617, "grad_norm": 2.3125, "learning_rate": 7.345026250107328e-05, "loss": 0.7741, "step": 1905 }, { "epoch": 0.4597977852672123, "grad_norm": 2.234375, "learning_rate": 7.344098677866956e-05, "loss": 0.7762, "step": 1910 }, { "epoch": 0.4610014443909485, "grad_norm": 2.28125, "learning_rate": 7.343168732636399e-05, "loss": 0.7609, "step": 1915 }, { "epoch": 0.46220510351468463, "grad_norm": 2.28125, "learning_rate": 7.342236415250251e-05, "loss": 0.7588, "step": 1920 }, { "epoch": 0.4634087626384208, "grad_norm": 2.125, "learning_rate": 7.341301726545236e-05, "loss": 0.7907, "step": 1925 }, { "epoch": 0.46461242176215695, "grad_norm": 2.1875, "learning_rate": 7.340364667360207e-05, "loss": 0.7583, "step": 1930 }, { "epoch": 0.4658160808858931, "grad_norm": 2.109375, "learning_rate": 7.339425238536141e-05, "loss": 0.7541, "step": 1935 }, { "epoch": 0.46701974000962926, "grad_norm": 2.09375, "learning_rate": 7.338483440916145e-05, "loss": 0.7562, "step": 1940 }, { "epoch": 0.4682233991333654, "grad_norm": 2.34375, "learning_rate": 7.337539275345452e-05, "loss": 0.7563, "step": 1945 }, { "epoch": 0.46942705825710157, "grad_norm": 2.171875, "learning_rate": 7.336592742671419e-05, "loss": 0.7385, "step": 1950 }, { "epoch": 0.4706307173808377, "grad_norm": 2.0625, "learning_rate": 7.335643843743526e-05, "loss": 0.7353, "step": 1955 }, { "epoch": 0.4718343765045739, "grad_norm": 2.1875, "learning_rate": 7.334692579413379e-05, "loss": 0.7242, "step": 1960 }, { "epoch": 0.47303803562831004, "grad_norm": 2.375, "learning_rate": 7.333738950534705e-05, "loss": 0.7719, "step": 1965 }, { "epoch": 0.4742416947520462, "grad_norm": 2.125, "learning_rate": 7.332782957963356e-05, "loss": 0.7788, "step": 1970 }, { "epoch": 0.4754453538757824, "grad_norm": 2.171875, "learning_rate": 7.3318246025573e-05, "loss": 0.7635, "step": 1975 }, { "epoch": 0.47664901299951856, "grad_norm": 2.234375, "learning_rate": 7.330863885176631e-05, "loss": 0.7608, "step": 1980 }, { "epoch": 0.4778526721232547, "grad_norm": 2.1875, "learning_rate": 7.329900806683563e-05, "loss": 0.7329, "step": 1985 }, { "epoch": 0.4790563312469909, "grad_norm": 2.09375, "learning_rate": 7.328935367942422e-05, "loss": 0.751, "step": 1990 }, { "epoch": 0.48025999037072703, "grad_norm": 2.171875, "learning_rate": 7.32796756981966e-05, "loss": 0.7366, "step": 1995 }, { "epoch": 0.4814636494944632, "grad_norm": 2.078125, "learning_rate": 7.326997413183845e-05, "loss": 0.7259, "step": 2000 }, { "epoch": 0.4814636494944632, "eval_loss": 0.6541061997413635, "eval_runtime": 2.4161, "eval_samples_per_second": 82.778, "eval_steps_per_second": 82.778, "step": 2000 }, { "epoch": 0.48266730861819934, "grad_norm": 2.21875, "learning_rate": 7.326024898905656e-05, "loss": 0.7437, "step": 2005 }, { "epoch": 0.4838709677419355, "grad_norm": 2.15625, "learning_rate": 7.325050027857896e-05, "loss": 0.7322, "step": 2010 }, { "epoch": 0.48507462686567165, "grad_norm": 2.09375, "learning_rate": 7.324072800915476e-05, "loss": 0.7525, "step": 2015 }, { "epoch": 0.4862782859894078, "grad_norm": 2.203125, "learning_rate": 7.323093218955426e-05, "loss": 0.7395, "step": 2020 }, { "epoch": 0.48748194511314397, "grad_norm": 2.140625, "learning_rate": 7.322111282856888e-05, "loss": 0.7477, "step": 2025 }, { "epoch": 0.4886856042368801, "grad_norm": 2.234375, "learning_rate": 7.321126993501118e-05, "loss": 0.7167, "step": 2030 }, { "epoch": 0.4898892633606163, "grad_norm": 2.484375, "learning_rate": 7.32014035177148e-05, "loss": 0.7711, "step": 2035 }, { "epoch": 0.49109292248435243, "grad_norm": 2.4375, "learning_rate": 7.319151358553453e-05, "loss": 0.7454, "step": 2040 }, { "epoch": 0.4922965816080886, "grad_norm": 2.328125, "learning_rate": 7.318160014734628e-05, "loss": 0.7272, "step": 2045 }, { "epoch": 0.49350024073182475, "grad_norm": 2.25, "learning_rate": 7.3171663212047e-05, "loss": 0.7585, "step": 2050 }, { "epoch": 0.4947038998555609, "grad_norm": 2.140625, "learning_rate": 7.316170278855475e-05, "loss": 0.7301, "step": 2055 }, { "epoch": 0.49590755897929706, "grad_norm": 2.203125, "learning_rate": 7.315171888580872e-05, "loss": 0.7209, "step": 2060 }, { "epoch": 0.4971112181030332, "grad_norm": 2.21875, "learning_rate": 7.314171151276908e-05, "loss": 0.7412, "step": 2065 }, { "epoch": 0.49831487722676937, "grad_norm": 2.125, "learning_rate": 7.313168067841716e-05, "loss": 0.7563, "step": 2070 }, { "epoch": 0.4995185363505055, "grad_norm": 2.0625, "learning_rate": 7.312162639175524e-05, "loss": 0.7186, "step": 2075 }, { "epoch": 0.5007221954742417, "grad_norm": 2.21875, "learning_rate": 7.311154866180677e-05, "loss": 0.7328, "step": 2080 }, { "epoch": 0.5019258545979779, "grad_norm": 2.046875, "learning_rate": 7.310144749761613e-05, "loss": 0.7683, "step": 2085 }, { "epoch": 0.503129513721714, "grad_norm": 2.09375, "learning_rate": 7.30913229082488e-05, "loss": 0.7706, "step": 2090 }, { "epoch": 0.5043331728454502, "grad_norm": 1.9140625, "learning_rate": 7.308117490279124e-05, "loss": 0.7109, "step": 2095 }, { "epoch": 0.5055368319691863, "grad_norm": 2.046875, "learning_rate": 7.307100349035097e-05, "loss": 0.7755, "step": 2100 }, { "epoch": 0.5067404910929225, "grad_norm": 2.046875, "learning_rate": 7.306080868005648e-05, "loss": 0.7243, "step": 2105 }, { "epoch": 0.5079441502166586, "grad_norm": 2.125, "learning_rate": 7.305059048105727e-05, "loss": 0.7462, "step": 2110 }, { "epoch": 0.5091478093403948, "grad_norm": 2.1875, "learning_rate": 7.304034890252383e-05, "loss": 0.7665, "step": 2115 }, { "epoch": 0.5103514684641309, "grad_norm": 2.265625, "learning_rate": 7.303008395364765e-05, "loss": 0.7395, "step": 2120 }, { "epoch": 0.5115551275878671, "grad_norm": 2.109375, "learning_rate": 7.301979564364117e-05, "loss": 0.7747, "step": 2125 }, { "epoch": 0.5127587867116032, "grad_norm": 2.25, "learning_rate": 7.300948398173779e-05, "loss": 0.6931, "step": 2130 }, { "epoch": 0.5139624458353395, "grad_norm": 2.203125, "learning_rate": 7.299914897719191e-05, "loss": 0.723, "step": 2135 }, { "epoch": 0.5151661049590756, "grad_norm": 2.03125, "learning_rate": 7.298879063927882e-05, "loss": 0.7726, "step": 2140 }, { "epoch": 0.5163697640828118, "grad_norm": 2.125, "learning_rate": 7.297840897729481e-05, "loss": 0.7356, "step": 2145 }, { "epoch": 0.5175734232065479, "grad_norm": 2.109375, "learning_rate": 7.296800400055706e-05, "loss": 0.7247, "step": 2150 }, { "epoch": 0.5187770823302841, "grad_norm": 2.15625, "learning_rate": 7.295757571840368e-05, "loss": 0.7482, "step": 2155 }, { "epoch": 0.5199807414540202, "grad_norm": 2.0, "learning_rate": 7.294712414019372e-05, "loss": 0.7282, "step": 2160 }, { "epoch": 0.5211844005777564, "grad_norm": 2.25, "learning_rate": 7.293664927530712e-05, "loss": 0.757, "step": 2165 }, { "epoch": 0.5223880597014925, "grad_norm": 2.125, "learning_rate": 7.292615113314472e-05, "loss": 0.7544, "step": 2170 }, { "epoch": 0.5235917188252287, "grad_norm": 2.078125, "learning_rate": 7.291562972312825e-05, "loss": 0.7363, "step": 2175 }, { "epoch": 0.5247953779489648, "grad_norm": 2.09375, "learning_rate": 7.290508505470032e-05, "loss": 0.7396, "step": 2180 }, { "epoch": 0.525999037072701, "grad_norm": 2.03125, "learning_rate": 7.289451713732443e-05, "loss": 0.7563, "step": 2185 }, { "epoch": 0.5272026961964371, "grad_norm": 2.046875, "learning_rate": 7.288392598048492e-05, "loss": 0.7385, "step": 2190 }, { "epoch": 0.5284063553201733, "grad_norm": 2.0625, "learning_rate": 7.2873311593687e-05, "loss": 0.7356, "step": 2195 }, { "epoch": 0.5296100144439095, "grad_norm": 2.171875, "learning_rate": 7.286267398645673e-05, "loss": 0.7428, "step": 2200 }, { "epoch": 0.5308136735676456, "grad_norm": 2.265625, "learning_rate": 7.285201316834101e-05, "loss": 0.7507, "step": 2205 }, { "epoch": 0.5320173326913819, "grad_norm": 2.109375, "learning_rate": 7.284132914890758e-05, "loss": 0.7333, "step": 2210 }, { "epoch": 0.533220991815118, "grad_norm": 2.140625, "learning_rate": 7.283062193774495e-05, "loss": 0.7249, "step": 2215 }, { "epoch": 0.5344246509388542, "grad_norm": 2.21875, "learning_rate": 7.281989154446253e-05, "loss": 0.7518, "step": 2220 }, { "epoch": 0.5356283100625903, "grad_norm": 2.203125, "learning_rate": 7.280913797869046e-05, "loss": 0.7485, "step": 2225 }, { "epoch": 0.5368319691863265, "grad_norm": 2.40625, "learning_rate": 7.279836125007971e-05, "loss": 0.7355, "step": 2230 }, { "epoch": 0.5380356283100626, "grad_norm": 2.03125, "learning_rate": 7.278756136830206e-05, "loss": 0.7594, "step": 2235 }, { "epoch": 0.5392392874337988, "grad_norm": 2.25, "learning_rate": 7.277673834305001e-05, "loss": 0.7225, "step": 2240 }, { "epoch": 0.5404429465575349, "grad_norm": 2.0625, "learning_rate": 7.276589218403688e-05, "loss": 0.7132, "step": 2245 }, { "epoch": 0.5416466056812711, "grad_norm": 2.015625, "learning_rate": 7.275502290099672e-05, "loss": 0.7118, "step": 2250 }, { "epoch": 0.5428502648050072, "grad_norm": 2.046875, "learning_rate": 7.274413050368438e-05, "loss": 0.734, "step": 2255 }, { "epoch": 0.5440539239287434, "grad_norm": 2.078125, "learning_rate": 7.273321500187538e-05, "loss": 0.7491, "step": 2260 }, { "epoch": 0.5452575830524795, "grad_norm": 2.15625, "learning_rate": 7.272227640536604e-05, "loss": 0.7673, "step": 2265 }, { "epoch": 0.5464612421762157, "grad_norm": 2.03125, "learning_rate": 7.271131472397339e-05, "loss": 0.7483, "step": 2270 }, { "epoch": 0.5476649012999518, "grad_norm": 2.34375, "learning_rate": 7.270032996753517e-05, "loss": 0.7284, "step": 2275 }, { "epoch": 0.548868560423688, "grad_norm": 2.203125, "learning_rate": 7.268932214590982e-05, "loss": 0.7643, "step": 2280 }, { "epoch": 0.5500722195474241, "grad_norm": 2.0625, "learning_rate": 7.267829126897652e-05, "loss": 0.7348, "step": 2285 }, { "epoch": 0.5512758786711603, "grad_norm": 2.078125, "learning_rate": 7.266723734663508e-05, "loss": 0.7307, "step": 2290 }, { "epoch": 0.5524795377948964, "grad_norm": 1.8828125, "learning_rate": 7.265616038880603e-05, "loss": 0.7181, "step": 2295 }, { "epoch": 0.5536831969186327, "grad_norm": 2.09375, "learning_rate": 7.26450604054306e-05, "loss": 0.7386, "step": 2300 }, { "epoch": 0.5548868560423688, "grad_norm": 2.3125, "learning_rate": 7.263393740647062e-05, "loss": 0.7537, "step": 2305 }, { "epoch": 0.556090515166105, "grad_norm": 2.28125, "learning_rate": 7.262279140190863e-05, "loss": 0.7102, "step": 2310 }, { "epoch": 0.5572941742898411, "grad_norm": 2.1875, "learning_rate": 7.261162240174778e-05, "loss": 0.7147, "step": 2315 }, { "epoch": 0.5584978334135773, "grad_norm": 2.1875, "learning_rate": 7.260043041601189e-05, "loss": 0.7572, "step": 2320 }, { "epoch": 0.5597014925373134, "grad_norm": 2.046875, "learning_rate": 7.258921545474539e-05, "loss": 0.7161, "step": 2325 }, { "epoch": 0.5609051516610496, "grad_norm": 1.9453125, "learning_rate": 7.257797752801332e-05, "loss": 0.7251, "step": 2330 }, { "epoch": 0.5621088107847857, "grad_norm": 2.0, "learning_rate": 7.256671664590136e-05, "loss": 0.6989, "step": 2335 }, { "epoch": 0.5633124699085219, "grad_norm": 2.203125, "learning_rate": 7.255543281851577e-05, "loss": 0.753, "step": 2340 }, { "epoch": 0.5645161290322581, "grad_norm": 2.046875, "learning_rate": 7.25441260559834e-05, "loss": 0.7316, "step": 2345 }, { "epoch": 0.5657197881559942, "grad_norm": 2.296875, "learning_rate": 7.253279636845171e-05, "loss": 0.7296, "step": 2350 }, { "epoch": 0.5669234472797304, "grad_norm": 2.15625, "learning_rate": 7.252144376608869e-05, "loss": 0.6987, "step": 2355 }, { "epoch": 0.5681271064034665, "grad_norm": 2.1875, "learning_rate": 7.251006825908295e-05, "loss": 0.7098, "step": 2360 }, { "epoch": 0.5693307655272027, "grad_norm": 1.9609375, "learning_rate": 7.24986698576436e-05, "loss": 0.6956, "step": 2365 }, { "epoch": 0.5705344246509388, "grad_norm": 1.984375, "learning_rate": 7.248724857200034e-05, "loss": 0.6961, "step": 2370 }, { "epoch": 0.571738083774675, "grad_norm": 2.1875, "learning_rate": 7.24758044124034e-05, "loss": 0.7157, "step": 2375 }, { "epoch": 0.5729417428984112, "grad_norm": 1.9453125, "learning_rate": 7.246433738912352e-05, "loss": 0.7143, "step": 2380 }, { "epoch": 0.5741454020221474, "grad_norm": 2.125, "learning_rate": 7.245284751245195e-05, "loss": 0.726, "step": 2385 }, { "epoch": 0.5753490611458835, "grad_norm": 2.046875, "learning_rate": 7.24413347927005e-05, "loss": 0.7714, "step": 2390 }, { "epoch": 0.5765527202696197, "grad_norm": 2.171875, "learning_rate": 7.242979924020144e-05, "loss": 0.7224, "step": 2395 }, { "epoch": 0.5777563793933558, "grad_norm": 2.203125, "learning_rate": 7.241824086530754e-05, "loss": 0.7367, "step": 2400 }, { "epoch": 0.578960038517092, "grad_norm": 2.125, "learning_rate": 7.240665967839207e-05, "loss": 0.7353, "step": 2405 }, { "epoch": 0.5801636976408281, "grad_norm": 2.03125, "learning_rate": 7.239505568984874e-05, "loss": 0.6976, "step": 2410 }, { "epoch": 0.5813673567645643, "grad_norm": 2.0625, "learning_rate": 7.238342891009176e-05, "loss": 0.6909, "step": 2415 }, { "epoch": 0.5825710158883004, "grad_norm": 2.140625, "learning_rate": 7.237177934955575e-05, "loss": 0.749, "step": 2420 }, { "epoch": 0.5837746750120366, "grad_norm": 2.03125, "learning_rate": 7.236010701869583e-05, "loss": 0.7254, "step": 2425 }, { "epoch": 0.5849783341357727, "grad_norm": 2.828125, "learning_rate": 7.23484119279875e-05, "loss": 0.7448, "step": 2430 }, { "epoch": 0.5861819932595089, "grad_norm": 2.265625, "learning_rate": 7.233669408792673e-05, "loss": 0.7108, "step": 2435 }, { "epoch": 0.587385652383245, "grad_norm": 2.15625, "learning_rate": 7.232495350902989e-05, "loss": 0.7044, "step": 2440 }, { "epoch": 0.5885893115069812, "grad_norm": 2.171875, "learning_rate": 7.231319020183376e-05, "loss": 0.7287, "step": 2445 }, { "epoch": 0.5897929706307173, "grad_norm": 2.171875, "learning_rate": 7.23014041768955e-05, "loss": 0.7299, "step": 2450 }, { "epoch": 0.5909966297544536, "grad_norm": 2.296875, "learning_rate": 7.228959544479267e-05, "loss": 0.7104, "step": 2455 }, { "epoch": 0.5922002888781897, "grad_norm": 1.8671875, "learning_rate": 7.227776401612323e-05, "loss": 0.704, "step": 2460 }, { "epoch": 0.5934039480019259, "grad_norm": 2.4375, "learning_rate": 7.22659099015055e-05, "loss": 0.7279, "step": 2465 }, { "epoch": 0.594607607125662, "grad_norm": 2.140625, "learning_rate": 7.225403311157814e-05, "loss": 0.722, "step": 2470 }, { "epoch": 0.5958112662493982, "grad_norm": 2.078125, "learning_rate": 7.224213365700016e-05, "loss": 0.7195, "step": 2475 }, { "epoch": 0.5970149253731343, "grad_norm": 2.078125, "learning_rate": 7.223021154845092e-05, "loss": 0.7581, "step": 2480 }, { "epoch": 0.5982185844968705, "grad_norm": 2.0, "learning_rate": 7.221826679663015e-05, "loss": 0.7929, "step": 2485 }, { "epoch": 0.5994222436206067, "grad_norm": 1.984375, "learning_rate": 7.220629941225782e-05, "loss": 0.7036, "step": 2490 }, { "epoch": 0.6006259027443428, "grad_norm": 2.375, "learning_rate": 7.21943094060743e-05, "loss": 0.7072, "step": 2495 }, { "epoch": 0.601829561868079, "grad_norm": 2.109375, "learning_rate": 7.218229678884018e-05, "loss": 0.7199, "step": 2500 }, { "epoch": 0.601829561868079, "eval_loss": 0.6185581088066101, "eval_runtime": 2.4024, "eval_samples_per_second": 83.25, "eval_steps_per_second": 83.25, "step": 2500 }, { "epoch": 0.6030332209918151, "grad_norm": 2.203125, "learning_rate": 7.21702615713364e-05, "loss": 0.7025, "step": 2505 }, { "epoch": 0.6042368801155513, "grad_norm": 1.9921875, "learning_rate": 7.215820376436418e-05, "loss": 0.7126, "step": 2510 }, { "epoch": 0.6054405392392874, "grad_norm": 2.09375, "learning_rate": 7.214612337874497e-05, "loss": 0.7045, "step": 2515 }, { "epoch": 0.6066441983630236, "grad_norm": 2.078125, "learning_rate": 7.213402042532054e-05, "loss": 0.7276, "step": 2520 }, { "epoch": 0.6078478574867597, "grad_norm": 1.984375, "learning_rate": 7.212189491495289e-05, "loss": 0.7343, "step": 2525 }, { "epoch": 0.609051516610496, "grad_norm": 2.203125, "learning_rate": 7.210974685852423e-05, "loss": 0.7073, "step": 2530 }, { "epoch": 0.610255175734232, "grad_norm": 1.984375, "learning_rate": 7.209757626693704e-05, "loss": 0.6977, "step": 2535 }, { "epoch": 0.6114588348579683, "grad_norm": 2.03125, "learning_rate": 7.208538315111404e-05, "loss": 0.6994, "step": 2540 }, { "epoch": 0.6126624939817044, "grad_norm": 2.15625, "learning_rate": 7.207316752199813e-05, "loss": 0.7094, "step": 2545 }, { "epoch": 0.6138661531054406, "grad_norm": 2.046875, "learning_rate": 7.206092939055242e-05, "loss": 0.7154, "step": 2550 }, { "epoch": 0.6150698122291767, "grad_norm": 2.078125, "learning_rate": 7.204866876776024e-05, "loss": 0.7031, "step": 2555 }, { "epoch": 0.6162734713529129, "grad_norm": 2.203125, "learning_rate": 7.203638566462509e-05, "loss": 0.6997, "step": 2560 }, { "epoch": 0.617477130476649, "grad_norm": 2.0625, "learning_rate": 7.202408009217063e-05, "loss": 0.7273, "step": 2565 }, { "epoch": 0.6186807896003852, "grad_norm": 2.125, "learning_rate": 7.201175206144072e-05, "loss": 0.7183, "step": 2570 }, { "epoch": 0.6198844487241213, "grad_norm": 2.03125, "learning_rate": 7.199940158349934e-05, "loss": 0.6838, "step": 2575 }, { "epoch": 0.6210881078478575, "grad_norm": 2.125, "learning_rate": 7.198702866943061e-05, "loss": 0.6794, "step": 2580 }, { "epoch": 0.6222917669715936, "grad_norm": 2.046875, "learning_rate": 7.197463333033886e-05, "loss": 0.7418, "step": 2585 }, { "epoch": 0.6234954260953298, "grad_norm": 2.171875, "learning_rate": 7.196221557734845e-05, "loss": 0.706, "step": 2590 }, { "epoch": 0.6246990852190659, "grad_norm": 2.140625, "learning_rate": 7.194977542160393e-05, "loss": 0.7136, "step": 2595 }, { "epoch": 0.6259027443428021, "grad_norm": 1.9921875, "learning_rate": 7.19373128742699e-05, "loss": 0.7062, "step": 2600 }, { "epoch": 0.6271064034665382, "grad_norm": 2.203125, "learning_rate": 7.192482794653109e-05, "loss": 0.7187, "step": 2605 }, { "epoch": 0.6283100625902744, "grad_norm": 2.078125, "learning_rate": 7.191232064959229e-05, "loss": 0.7383, "step": 2610 }, { "epoch": 0.6295137217140105, "grad_norm": 2.0, "learning_rate": 7.18997909946784e-05, "loss": 0.7232, "step": 2615 }, { "epoch": 0.6307173808377468, "grad_norm": 1.9375, "learning_rate": 7.188723899303436e-05, "loss": 0.6968, "step": 2620 }, { "epoch": 0.6319210399614829, "grad_norm": 2.125, "learning_rate": 7.187466465592516e-05, "loss": 0.749, "step": 2625 }, { "epoch": 0.6331246990852191, "grad_norm": 2.015625, "learning_rate": 7.186206799463587e-05, "loss": 0.7269, "step": 2630 }, { "epoch": 0.6343283582089553, "grad_norm": 2.171875, "learning_rate": 7.184944902047154e-05, "loss": 0.7076, "step": 2635 }, { "epoch": 0.6355320173326914, "grad_norm": 2.125, "learning_rate": 7.183680774475732e-05, "loss": 0.7502, "step": 2640 }, { "epoch": 0.6367356764564276, "grad_norm": 2.109375, "learning_rate": 7.182414417883831e-05, "loss": 0.7216, "step": 2645 }, { "epoch": 0.6379393355801637, "grad_norm": 1.9921875, "learning_rate": 7.181145833407964e-05, "loss": 0.7058, "step": 2650 }, { "epoch": 0.6391429947038999, "grad_norm": 2.296875, "learning_rate": 7.179875022186641e-05, "loss": 0.7297, "step": 2655 }, { "epoch": 0.640346653827636, "grad_norm": 1.9453125, "learning_rate": 7.178601985360377e-05, "loss": 0.712, "step": 2660 }, { "epoch": 0.6415503129513722, "grad_norm": 2.109375, "learning_rate": 7.177326724071674e-05, "loss": 0.7122, "step": 2665 }, { "epoch": 0.6427539720751083, "grad_norm": 2.015625, "learning_rate": 7.176049239465043e-05, "loss": 0.6803, "step": 2670 }, { "epoch": 0.6439576311988445, "grad_norm": 1.8984375, "learning_rate": 7.174769532686981e-05, "loss": 0.7044, "step": 2675 }, { "epoch": 0.6451612903225806, "grad_norm": 2.46875, "learning_rate": 7.17348760488598e-05, "loss": 0.7183, "step": 2680 }, { "epoch": 0.6463649494463168, "grad_norm": 2.078125, "learning_rate": 7.172203457212529e-05, "loss": 0.7206, "step": 2685 }, { "epoch": 0.6475686085700529, "grad_norm": 2.140625, "learning_rate": 7.170917090819108e-05, "loss": 0.7073, "step": 2690 }, { "epoch": 0.6487722676937892, "grad_norm": 2.078125, "learning_rate": 7.169628506860189e-05, "loss": 0.7037, "step": 2695 }, { "epoch": 0.6499759268175253, "grad_norm": 2.0625, "learning_rate": 7.16833770649223e-05, "loss": 0.7078, "step": 2700 }, { "epoch": 0.6511795859412615, "grad_norm": 1.921875, "learning_rate": 7.167044690873683e-05, "loss": 0.7619, "step": 2705 }, { "epoch": 0.6523832450649976, "grad_norm": 2.0625, "learning_rate": 7.165749461164988e-05, "loss": 0.6917, "step": 2710 }, { "epoch": 0.6535869041887338, "grad_norm": 1.953125, "learning_rate": 7.164452018528565e-05, "loss": 0.7178, "step": 2715 }, { "epoch": 0.6547905633124699, "grad_norm": 1.96875, "learning_rate": 7.163152364128831e-05, "loss": 0.7089, "step": 2720 }, { "epoch": 0.6559942224362061, "grad_norm": 2.203125, "learning_rate": 7.16185049913218e-05, "loss": 0.6982, "step": 2725 }, { "epoch": 0.6571978815599422, "grad_norm": 2.234375, "learning_rate": 7.160546424706991e-05, "loss": 0.7445, "step": 2730 }, { "epoch": 0.6584015406836784, "grad_norm": 1.9765625, "learning_rate": 7.15924014202363e-05, "loss": 0.7561, "step": 2735 }, { "epoch": 0.6596051998074145, "grad_norm": 2.03125, "learning_rate": 7.157931652254441e-05, "loss": 0.6975, "step": 2740 }, { "epoch": 0.6608088589311507, "grad_norm": 1.9453125, "learning_rate": 7.156620956573748e-05, "loss": 0.6788, "step": 2745 }, { "epoch": 0.6620125180548868, "grad_norm": 2.03125, "learning_rate": 7.155308056157859e-05, "loss": 0.7178, "step": 2750 }, { "epoch": 0.663216177178623, "grad_norm": 2.03125, "learning_rate": 7.153992952185058e-05, "loss": 0.7256, "step": 2755 }, { "epoch": 0.6644198363023591, "grad_norm": 2.03125, "learning_rate": 7.152675645835607e-05, "loss": 0.7036, "step": 2760 }, { "epoch": 0.6656234954260953, "grad_norm": 2.171875, "learning_rate": 7.151356138291742e-05, "loss": 0.7168, "step": 2765 }, { "epoch": 0.6668271545498314, "grad_norm": 2.125, "learning_rate": 7.150034430737679e-05, "loss": 0.7073, "step": 2770 }, { "epoch": 0.6680308136735676, "grad_norm": 2.078125, "learning_rate": 7.148710524359607e-05, "loss": 0.6977, "step": 2775 }, { "epoch": 0.6692344727973039, "grad_norm": 2.03125, "learning_rate": 7.147384420345685e-05, "loss": 0.7269, "step": 2780 }, { "epoch": 0.67043813192104, "grad_norm": 2.0625, "learning_rate": 7.14605611988605e-05, "loss": 0.7017, "step": 2785 }, { "epoch": 0.6716417910447762, "grad_norm": 2.1875, "learning_rate": 7.144725624172805e-05, "loss": 0.6911, "step": 2790 }, { "epoch": 0.6728454501685123, "grad_norm": 2.0625, "learning_rate": 7.143392934400028e-05, "loss": 0.7137, "step": 2795 }, { "epoch": 0.6740491092922485, "grad_norm": 2.046875, "learning_rate": 7.142058051763761e-05, "loss": 0.7144, "step": 2800 }, { "epoch": 0.6752527684159846, "grad_norm": 2.015625, "learning_rate": 7.140720977462018e-05, "loss": 0.7026, "step": 2805 }, { "epoch": 0.6764564275397208, "grad_norm": 2.28125, "learning_rate": 7.139381712694777e-05, "loss": 0.712, "step": 2810 }, { "epoch": 0.6776600866634569, "grad_norm": 2.296875, "learning_rate": 7.138040258663984e-05, "loss": 0.7336, "step": 2815 }, { "epoch": 0.6788637457871931, "grad_norm": 2.125, "learning_rate": 7.13669661657355e-05, "loss": 0.7178, "step": 2820 }, { "epoch": 0.6800674049109292, "grad_norm": 2.03125, "learning_rate": 7.135350787629349e-05, "loss": 0.6975, "step": 2825 }, { "epoch": 0.6812710640346654, "grad_norm": 2.015625, "learning_rate": 7.134002773039217e-05, "loss": 0.6854, "step": 2830 }, { "epoch": 0.6824747231584015, "grad_norm": 2.0, "learning_rate": 7.13265257401295e-05, "loss": 0.7039, "step": 2835 }, { "epoch": 0.6836783822821377, "grad_norm": 1.984375, "learning_rate": 7.131300191762311e-05, "loss": 0.7228, "step": 2840 }, { "epoch": 0.6848820414058738, "grad_norm": 2.03125, "learning_rate": 7.129945627501013e-05, "loss": 0.7109, "step": 2845 }, { "epoch": 0.68608570052961, "grad_norm": 1.9921875, "learning_rate": 7.128588882444734e-05, "loss": 0.6984, "step": 2850 }, { "epoch": 0.6872893596533461, "grad_norm": 2.421875, "learning_rate": 7.127229957811112e-05, "loss": 0.6898, "step": 2855 }, { "epoch": 0.6884930187770824, "grad_norm": 2.078125, "learning_rate": 7.125868854819727e-05, "loss": 0.7012, "step": 2860 }, { "epoch": 0.6896966779008185, "grad_norm": 2.15625, "learning_rate": 7.124505574692132e-05, "loss": 0.7063, "step": 2865 }, { "epoch": 0.6909003370245547, "grad_norm": 2.0625, "learning_rate": 7.123140118651819e-05, "loss": 0.6994, "step": 2870 }, { "epoch": 0.6921039961482908, "grad_norm": 1.90625, "learning_rate": 7.121772487924245e-05, "loss": 0.6898, "step": 2875 }, { "epoch": 0.693307655272027, "grad_norm": 2.015625, "learning_rate": 7.12040268373681e-05, "loss": 0.7002, "step": 2880 }, { "epoch": 0.6945113143957631, "grad_norm": 1.96875, "learning_rate": 7.119030707318866e-05, "loss": 0.7231, "step": 2885 }, { "epoch": 0.6957149735194993, "grad_norm": 1.96875, "learning_rate": 7.117656559901716e-05, "loss": 0.7083, "step": 2890 }, { "epoch": 0.6969186326432354, "grad_norm": 2.09375, "learning_rate": 7.116280242718616e-05, "loss": 0.7255, "step": 2895 }, { "epoch": 0.6981222917669716, "grad_norm": 2.1875, "learning_rate": 7.11490175700476e-05, "loss": 0.6818, "step": 2900 }, { "epoch": 0.6993259508907077, "grad_norm": 1.890625, "learning_rate": 7.113521103997295e-05, "loss": 0.7098, "step": 2905 }, { "epoch": 0.7005296100144439, "grad_norm": 2.140625, "learning_rate": 7.112138284935309e-05, "loss": 0.6684, "step": 2910 }, { "epoch": 0.70173326913818, "grad_norm": 2.078125, "learning_rate": 7.110753301059837e-05, "loss": 0.7065, "step": 2915 }, { "epoch": 0.7029369282619162, "grad_norm": 1.9296875, "learning_rate": 7.109366153613856e-05, "loss": 0.6378, "step": 2920 }, { "epoch": 0.7041405873856523, "grad_norm": 2.046875, "learning_rate": 7.107976843842285e-05, "loss": 0.717, "step": 2925 }, { "epoch": 0.7053442465093885, "grad_norm": 2.046875, "learning_rate": 7.106585372991983e-05, "loss": 0.6748, "step": 2930 }, { "epoch": 0.7065479056331248, "grad_norm": 2.171875, "learning_rate": 7.105191742311748e-05, "loss": 0.6826, "step": 2935 }, { "epoch": 0.7077515647568609, "grad_norm": 2.0625, "learning_rate": 7.103795953052316e-05, "loss": 0.6717, "step": 2940 }, { "epoch": 0.7089552238805971, "grad_norm": 1.9375, "learning_rate": 7.102398006466362e-05, "loss": 0.7121, "step": 2945 }, { "epoch": 0.7101588830043332, "grad_norm": 2.0, "learning_rate": 7.100997903808498e-05, "loss": 0.7021, "step": 2950 }, { "epoch": 0.7113625421280694, "grad_norm": 2.078125, "learning_rate": 7.099595646335266e-05, "loss": 0.6888, "step": 2955 }, { "epoch": 0.7125662012518055, "grad_norm": 2.078125, "learning_rate": 7.098191235305148e-05, "loss": 0.6547, "step": 2960 }, { "epoch": 0.7137698603755417, "grad_norm": 2.234375, "learning_rate": 7.096784671978555e-05, "loss": 0.6816, "step": 2965 }, { "epoch": 0.7149735194992778, "grad_norm": 2.140625, "learning_rate": 7.09537595761783e-05, "loss": 0.695, "step": 2970 }, { "epoch": 0.716177178623014, "grad_norm": 2.0625, "learning_rate": 7.093965093487248e-05, "loss": 0.6777, "step": 2975 }, { "epoch": 0.7173808377467501, "grad_norm": 2.078125, "learning_rate": 7.092552080853013e-05, "loss": 0.6849, "step": 2980 }, { "epoch": 0.7185844968704863, "grad_norm": 1.9765625, "learning_rate": 7.091136920983255e-05, "loss": 0.7043, "step": 2985 }, { "epoch": 0.7197881559942224, "grad_norm": 2.1875, "learning_rate": 7.089719615148034e-05, "loss": 0.7, "step": 2990 }, { "epoch": 0.7209918151179586, "grad_norm": 2.0625, "learning_rate": 7.088300164619332e-05, "loss": 0.6847, "step": 2995 }, { "epoch": 0.7221954742416947, "grad_norm": 1.96875, "learning_rate": 7.086878570671062e-05, "loss": 0.6825, "step": 3000 }, { "epoch": 0.7221954742416947, "eval_loss": 0.5935443043708801, "eval_runtime": 2.4083, "eval_samples_per_second": 83.047, "eval_steps_per_second": 83.047, "step": 3000 }, { "epoch": 0.7233991333654309, "grad_norm": 2.171875, "learning_rate": 7.085454834579054e-05, "loss": 0.7262, "step": 3005 }, { "epoch": 0.724602792489167, "grad_norm": 2.015625, "learning_rate": 7.084028957621066e-05, "loss": 0.7577, "step": 3010 }, { "epoch": 0.7258064516129032, "grad_norm": 1.953125, "learning_rate": 7.082600941076773e-05, "loss": 0.6923, "step": 3015 }, { "epoch": 0.7270101107366393, "grad_norm": 1.9375, "learning_rate": 7.081170786227776e-05, "loss": 0.6833, "step": 3020 }, { "epoch": 0.7282137698603756, "grad_norm": 2.125, "learning_rate": 7.079738494357583e-05, "loss": 0.6757, "step": 3025 }, { "epoch": 0.7294174289841117, "grad_norm": 2.125, "learning_rate": 7.078304066751637e-05, "loss": 0.7042, "step": 3030 }, { "epoch": 0.7306210881078479, "grad_norm": 1.9296875, "learning_rate": 7.076867504697283e-05, "loss": 0.6797, "step": 3035 }, { "epoch": 0.731824747231584, "grad_norm": 2.046875, "learning_rate": 7.075428809483791e-05, "loss": 0.6647, "step": 3040 }, { "epoch": 0.7330284063553202, "grad_norm": 2.203125, "learning_rate": 7.07398798240234e-05, "loss": 0.6718, "step": 3045 }, { "epoch": 0.7342320654790563, "grad_norm": 1.9609375, "learning_rate": 7.072545024746024e-05, "loss": 0.7162, "step": 3050 }, { "epoch": 0.7354357246027925, "grad_norm": 2.40625, "learning_rate": 7.07109993780985e-05, "loss": 0.661, "step": 3055 }, { "epoch": 0.7366393837265286, "grad_norm": 2.109375, "learning_rate": 7.069652722890736e-05, "loss": 0.7114, "step": 3060 }, { "epoch": 0.7378430428502648, "grad_norm": 2.21875, "learning_rate": 7.068203381287507e-05, "loss": 0.6964, "step": 3065 }, { "epoch": 0.7390467019740009, "grad_norm": 1.9609375, "learning_rate": 7.0667519143009e-05, "loss": 0.727, "step": 3070 }, { "epoch": 0.7402503610977371, "grad_norm": 2.015625, "learning_rate": 7.065298323233558e-05, "loss": 0.7187, "step": 3075 }, { "epoch": 0.7414540202214733, "grad_norm": 1.9453125, "learning_rate": 7.06384260939003e-05, "loss": 0.6952, "step": 3080 }, { "epoch": 0.7426576793452094, "grad_norm": 1.8828125, "learning_rate": 7.06238477407677e-05, "loss": 0.6252, "step": 3085 }, { "epoch": 0.7438613384689456, "grad_norm": 2.3125, "learning_rate": 7.060924818602138e-05, "loss": 0.722, "step": 3090 }, { "epoch": 0.7450649975926817, "grad_norm": 2.0, "learning_rate": 7.059462744276395e-05, "loss": 0.6839, "step": 3095 }, { "epoch": 0.746268656716418, "grad_norm": 2.015625, "learning_rate": 7.057998552411702e-05, "loss": 0.6984, "step": 3100 }, { "epoch": 0.7474723158401541, "grad_norm": 2.328125, "learning_rate": 7.056532244322123e-05, "loss": 0.6827, "step": 3105 }, { "epoch": 0.7486759749638903, "grad_norm": 2.078125, "learning_rate": 7.055063821323621e-05, "loss": 0.6519, "step": 3110 }, { "epoch": 0.7498796340876264, "grad_norm": 1.7890625, "learning_rate": 7.053593284734058e-05, "loss": 0.6937, "step": 3115 }, { "epoch": 0.7510832932113626, "grad_norm": 1.7734375, "learning_rate": 7.052120635873189e-05, "loss": 0.6719, "step": 3120 }, { "epoch": 0.7522869523350987, "grad_norm": 1.8984375, "learning_rate": 7.050645876062669e-05, "loss": 0.6803, "step": 3125 }, { "epoch": 0.7534906114588349, "grad_norm": 2.171875, "learning_rate": 7.049169006626043e-05, "loss": 0.7005, "step": 3130 }, { "epoch": 0.754694270582571, "grad_norm": 2.21875, "learning_rate": 7.047690028888756e-05, "loss": 0.6623, "step": 3135 }, { "epoch": 0.7558979297063072, "grad_norm": 2.046875, "learning_rate": 7.046208944178136e-05, "loss": 0.7266, "step": 3140 }, { "epoch": 0.7571015888300433, "grad_norm": 1.96875, "learning_rate": 7.044725753823412e-05, "loss": 0.6812, "step": 3145 }, { "epoch": 0.7583052479537795, "grad_norm": 2.21875, "learning_rate": 7.043240459155696e-05, "loss": 0.6907, "step": 3150 }, { "epoch": 0.7595089070775156, "grad_norm": 2.015625, "learning_rate": 7.041753061507987e-05, "loss": 0.6656, "step": 3155 }, { "epoch": 0.7607125662012518, "grad_norm": 2.0, "learning_rate": 7.04026356221518e-05, "loss": 0.6933, "step": 3160 }, { "epoch": 0.7619162253249879, "grad_norm": 2.078125, "learning_rate": 7.038771962614047e-05, "loss": 0.682, "step": 3165 }, { "epoch": 0.7631198844487241, "grad_norm": 1.984375, "learning_rate": 7.037278264043252e-05, "loss": 0.6681, "step": 3170 }, { "epoch": 0.7643235435724602, "grad_norm": 2.234375, "learning_rate": 7.035782467843336e-05, "loss": 0.6903, "step": 3175 }, { "epoch": 0.7655272026961965, "grad_norm": 2.125, "learning_rate": 7.034284575356729e-05, "loss": 0.6795, "step": 3180 }, { "epoch": 0.7667308618199326, "grad_norm": 2.078125, "learning_rate": 7.032784587927738e-05, "loss": 0.6882, "step": 3185 }, { "epoch": 0.7679345209436688, "grad_norm": 2.03125, "learning_rate": 7.031282506902551e-05, "loss": 0.6924, "step": 3190 }, { "epoch": 0.7691381800674049, "grad_norm": 1.96875, "learning_rate": 7.029778333629238e-05, "loss": 0.6932, "step": 3195 }, { "epoch": 0.7703418391911411, "grad_norm": 2.015625, "learning_rate": 7.028272069457741e-05, "loss": 0.7174, "step": 3200 }, { "epoch": 0.7715454983148772, "grad_norm": 2.015625, "learning_rate": 7.026763715739883e-05, "loss": 0.6819, "step": 3205 }, { "epoch": 0.7727491574386134, "grad_norm": 2.078125, "learning_rate": 7.025253273829363e-05, "loss": 0.7052, "step": 3210 }, { "epoch": 0.7739528165623495, "grad_norm": 1.90625, "learning_rate": 7.02374074508175e-05, "loss": 0.6917, "step": 3215 }, { "epoch": 0.7751564756860857, "grad_norm": 2.0625, "learning_rate": 7.022226130854488e-05, "loss": 0.665, "step": 3220 }, { "epoch": 0.7763601348098219, "grad_norm": 2.03125, "learning_rate": 7.020709432506894e-05, "loss": 0.7044, "step": 3225 }, { "epoch": 0.777563793933558, "grad_norm": 1.90625, "learning_rate": 7.019190651400152e-05, "loss": 0.7384, "step": 3230 }, { "epoch": 0.7787674530572942, "grad_norm": 1.921875, "learning_rate": 7.017669788897319e-05, "loss": 0.7046, "step": 3235 }, { "epoch": 0.7799711121810303, "grad_norm": 2.078125, "learning_rate": 7.016146846363318e-05, "loss": 0.6768, "step": 3240 }, { "epoch": 0.7811747713047665, "grad_norm": 2.171875, "learning_rate": 7.014621825164938e-05, "loss": 0.6342, "step": 3245 }, { "epoch": 0.7823784304285026, "grad_norm": 1.828125, "learning_rate": 7.013094726670837e-05, "loss": 0.6916, "step": 3250 }, { "epoch": 0.7835820895522388, "grad_norm": 2.203125, "learning_rate": 7.011565552251531e-05, "loss": 0.6637, "step": 3255 }, { "epoch": 0.784785748675975, "grad_norm": 1.984375, "learning_rate": 7.010034303279406e-05, "loss": 0.6942, "step": 3260 }, { "epoch": 0.7859894077997112, "grad_norm": 1.96875, "learning_rate": 7.008500981128708e-05, "loss": 0.6655, "step": 3265 }, { "epoch": 0.7871930669234473, "grad_norm": 1.9453125, "learning_rate": 7.006965587175538e-05, "loss": 0.661, "step": 3270 }, { "epoch": 0.7883967260471835, "grad_norm": 2.140625, "learning_rate": 7.005428122797864e-05, "loss": 0.706, "step": 3275 }, { "epoch": 0.7896003851709196, "grad_norm": 1.84375, "learning_rate": 7.003888589375508e-05, "loss": 0.6508, "step": 3280 }, { "epoch": 0.7908040442946558, "grad_norm": 1.90625, "learning_rate": 7.002346988290149e-05, "loss": 0.6981, "step": 3285 }, { "epoch": 0.7920077034183919, "grad_norm": 1.984375, "learning_rate": 7.000803320925323e-05, "loss": 0.6719, "step": 3290 }, { "epoch": 0.7932113625421281, "grad_norm": 1.984375, "learning_rate": 6.999257588666419e-05, "loss": 0.6823, "step": 3295 }, { "epoch": 0.7944150216658642, "grad_norm": 2.359375, "learning_rate": 6.997709792900683e-05, "loss": 0.6894, "step": 3300 }, { "epoch": 0.7956186807896004, "grad_norm": 1.9609375, "learning_rate": 6.996159935017208e-05, "loss": 0.6801, "step": 3305 }, { "epoch": 0.7968223399133365, "grad_norm": 2.03125, "learning_rate": 6.994608016406938e-05, "loss": 0.6678, "step": 3310 }, { "epoch": 0.7980259990370727, "grad_norm": 2.25, "learning_rate": 6.993054038462671e-05, "loss": 0.6815, "step": 3315 }, { "epoch": 0.7992296581608088, "grad_norm": 1.9140625, "learning_rate": 6.991498002579048e-05, "loss": 0.6926, "step": 3320 }, { "epoch": 0.800433317284545, "grad_norm": 1.9140625, "learning_rate": 6.989939910152561e-05, "loss": 0.6916, "step": 3325 }, { "epoch": 0.8016369764082811, "grad_norm": 1.9765625, "learning_rate": 6.988379762581545e-05, "loss": 0.6819, "step": 3330 }, { "epoch": 0.8028406355320173, "grad_norm": 2.171875, "learning_rate": 6.986817561266181e-05, "loss": 0.6759, "step": 3335 }, { "epoch": 0.8040442946557534, "grad_norm": 2.21875, "learning_rate": 6.985253307608491e-05, "loss": 0.6942, "step": 3340 }, { "epoch": 0.8052479537794897, "grad_norm": 2.109375, "learning_rate": 6.983687003012341e-05, "loss": 0.6792, "step": 3345 }, { "epoch": 0.8064516129032258, "grad_norm": 1.984375, "learning_rate": 6.982118648883438e-05, "loss": 0.6402, "step": 3350 }, { "epoch": 0.807655272026962, "grad_norm": 1.9453125, "learning_rate": 6.980548246629326e-05, "loss": 0.6802, "step": 3355 }, { "epoch": 0.8088589311506981, "grad_norm": 1.828125, "learning_rate": 6.978975797659389e-05, "loss": 0.615, "step": 3360 }, { "epoch": 0.8100625902744343, "grad_norm": 2.0625, "learning_rate": 6.97740130338485e-05, "loss": 0.6543, "step": 3365 }, { "epoch": 0.8112662493981705, "grad_norm": 1.9453125, "learning_rate": 6.97582476521876e-05, "loss": 0.6766, "step": 3370 }, { "epoch": 0.8124699085219066, "grad_norm": 1.8515625, "learning_rate": 6.974246184576012e-05, "loss": 0.6788, "step": 3375 }, { "epoch": 0.8136735676456428, "grad_norm": 2.078125, "learning_rate": 6.97266556287333e-05, "loss": 0.6849, "step": 3380 }, { "epoch": 0.8148772267693789, "grad_norm": 2.015625, "learning_rate": 6.971082901529267e-05, "loss": 0.6419, "step": 3385 }, { "epoch": 0.8160808858931151, "grad_norm": 2.046875, "learning_rate": 6.969498201964212e-05, "loss": 0.7203, "step": 3390 }, { "epoch": 0.8172845450168512, "grad_norm": 2.203125, "learning_rate": 6.967911465600376e-05, "loss": 0.674, "step": 3395 }, { "epoch": 0.8184882041405874, "grad_norm": 1.875, "learning_rate": 6.966322693861804e-05, "loss": 0.6785, "step": 3400 }, { "epoch": 0.8196918632643235, "grad_norm": 2.359375, "learning_rate": 6.964731888174366e-05, "loss": 0.7204, "step": 3405 }, { "epoch": 0.8208955223880597, "grad_norm": 2.234375, "learning_rate": 6.963139049965758e-05, "loss": 0.6844, "step": 3410 }, { "epoch": 0.8220991815117958, "grad_norm": 2.15625, "learning_rate": 6.961544180665494e-05, "loss": 0.6818, "step": 3415 }, { "epoch": 0.823302840635532, "grad_norm": 2.015625, "learning_rate": 6.959947281704922e-05, "loss": 0.6544, "step": 3420 }, { "epoch": 0.8245064997592682, "grad_norm": 2.046875, "learning_rate": 6.9583483545172e-05, "loss": 0.7053, "step": 3425 }, { "epoch": 0.8257101588830044, "grad_norm": 2.03125, "learning_rate": 6.956747400537315e-05, "loss": 0.7212, "step": 3430 }, { "epoch": 0.8269138180067405, "grad_norm": 2.0, "learning_rate": 6.955144421202071e-05, "loss": 0.6408, "step": 3435 }, { "epoch": 0.8281174771304767, "grad_norm": 1.9921875, "learning_rate": 6.953539417950085e-05, "loss": 0.6501, "step": 3440 }, { "epoch": 0.8293211362542128, "grad_norm": 1.921875, "learning_rate": 6.951932392221796e-05, "loss": 0.6593, "step": 3445 }, { "epoch": 0.830524795377949, "grad_norm": 2.09375, "learning_rate": 6.950323345459454e-05, "loss": 0.6657, "step": 3450 }, { "epoch": 0.8317284545016851, "grad_norm": 1.8828125, "learning_rate": 6.948712279107125e-05, "loss": 0.685, "step": 3455 }, { "epoch": 0.8329321136254213, "grad_norm": 2.078125, "learning_rate": 6.947099194610689e-05, "loss": 0.7025, "step": 3460 }, { "epoch": 0.8341357727491574, "grad_norm": 1.9921875, "learning_rate": 6.945484093417835e-05, "loss": 0.6594, "step": 3465 }, { "epoch": 0.8353394318728936, "grad_norm": 1.953125, "learning_rate": 6.94386697697806e-05, "loss": 0.6699, "step": 3470 }, { "epoch": 0.8365430909966297, "grad_norm": 1.9921875, "learning_rate": 6.942247846742674e-05, "loss": 0.6582, "step": 3475 }, { "epoch": 0.8377467501203659, "grad_norm": 2.015625, "learning_rate": 6.940626704164793e-05, "loss": 0.6745, "step": 3480 }, { "epoch": 0.838950409244102, "grad_norm": 1.8046875, "learning_rate": 6.939003550699337e-05, "loss": 0.6824, "step": 3485 }, { "epoch": 0.8401540683678382, "grad_norm": 2.09375, "learning_rate": 6.93737838780303e-05, "loss": 0.6271, "step": 3490 }, { "epoch": 0.8413577274915743, "grad_norm": 2.109375, "learning_rate": 6.935751216934407e-05, "loss": 0.7001, "step": 3495 }, { "epoch": 0.8425613866153105, "grad_norm": 1.96875, "learning_rate": 6.934122039553793e-05, "loss": 0.7044, "step": 3500 }, { "epoch": 0.8425613866153105, "eval_loss": 0.5733353495597839, "eval_runtime": 2.4041, "eval_samples_per_second": 83.193, "eval_steps_per_second": 83.193, "step": 3500 }, { "epoch": 0.8437650457390466, "grad_norm": 1.890625, "learning_rate": 6.932490857123324e-05, "loss": 0.685, "step": 3505 }, { "epoch": 0.8449687048627829, "grad_norm": 2.109375, "learning_rate": 6.930857671106932e-05, "loss": 0.6795, "step": 3510 }, { "epoch": 0.8461723639865191, "grad_norm": 2.046875, "learning_rate": 6.929222482970345e-05, "loss": 0.6792, "step": 3515 }, { "epoch": 0.8473760231102552, "grad_norm": 2.09375, "learning_rate": 6.92758529418109e-05, "loss": 0.6647, "step": 3520 }, { "epoch": 0.8485796822339914, "grad_norm": 2.125, "learning_rate": 6.925946106208492e-05, "loss": 0.6924, "step": 3525 }, { "epoch": 0.8497833413577275, "grad_norm": 2.03125, "learning_rate": 6.924304920523662e-05, "loss": 0.6794, "step": 3530 }, { "epoch": 0.8509870004814637, "grad_norm": 2.0, "learning_rate": 6.922661738599514e-05, "loss": 0.7257, "step": 3535 }, { "epoch": 0.8521906596051998, "grad_norm": 1.9609375, "learning_rate": 6.921016561910748e-05, "loss": 0.6848, "step": 3540 }, { "epoch": 0.853394318728936, "grad_norm": 2.15625, "learning_rate": 6.919369391933853e-05, "loss": 0.6732, "step": 3545 }, { "epoch": 0.8545979778526721, "grad_norm": 1.90625, "learning_rate": 6.917720230147111e-05, "loss": 0.6457, "step": 3550 }, { "epoch": 0.8558016369764083, "grad_norm": 2.03125, "learning_rate": 6.91606907803059e-05, "loss": 0.6906, "step": 3555 }, { "epoch": 0.8570052961001444, "grad_norm": 1.84375, "learning_rate": 6.914415937066142e-05, "loss": 0.6813, "step": 3560 }, { "epoch": 0.8582089552238806, "grad_norm": 1.796875, "learning_rate": 6.912760808737405e-05, "loss": 0.7021, "step": 3565 }, { "epoch": 0.8594126143476167, "grad_norm": 2.03125, "learning_rate": 6.911103694529805e-05, "loss": 0.6774, "step": 3570 }, { "epoch": 0.8606162734713529, "grad_norm": 1.9375, "learning_rate": 6.909444595930544e-05, "loss": 0.6874, "step": 3575 }, { "epoch": 0.861819932595089, "grad_norm": 2.078125, "learning_rate": 6.907783514428607e-05, "loss": 0.6654, "step": 3580 }, { "epoch": 0.8630235917188253, "grad_norm": 2.0, "learning_rate": 6.906120451514761e-05, "loss": 0.6499, "step": 3585 }, { "epoch": 0.8642272508425614, "grad_norm": 2.078125, "learning_rate": 6.90445540868155e-05, "loss": 0.6703, "step": 3590 }, { "epoch": 0.8654309099662976, "grad_norm": 2.1875, "learning_rate": 6.902788387423292e-05, "loss": 0.6915, "step": 3595 }, { "epoch": 0.8666345690900337, "grad_norm": 2.28125, "learning_rate": 6.901119389236082e-05, "loss": 0.6694, "step": 3600 }, { "epoch": 0.8678382282137699, "grad_norm": 1.953125, "learning_rate": 6.899448415617794e-05, "loss": 0.6693, "step": 3605 }, { "epoch": 0.869041887337506, "grad_norm": 2.015625, "learning_rate": 6.897775468068067e-05, "loss": 0.6575, "step": 3610 }, { "epoch": 0.8702455464612422, "grad_norm": 1.9375, "learning_rate": 6.896100548088318e-05, "loss": 0.6947, "step": 3615 }, { "epoch": 0.8714492055849783, "grad_norm": 2.03125, "learning_rate": 6.894423657181731e-05, "loss": 0.6578, "step": 3620 }, { "epoch": 0.8726528647087145, "grad_norm": 1.96875, "learning_rate": 6.89274479685326e-05, "loss": 0.6838, "step": 3625 }, { "epoch": 0.8738565238324506, "grad_norm": 1.9921875, "learning_rate": 6.891063968609624e-05, "loss": 0.6947, "step": 3630 }, { "epoch": 0.8750601829561868, "grad_norm": 1.9609375, "learning_rate": 6.889381173959314e-05, "loss": 0.6484, "step": 3635 }, { "epoch": 0.8762638420799229, "grad_norm": 1.875, "learning_rate": 6.887696414412577e-05, "loss": 0.7085, "step": 3640 }, { "epoch": 0.8774675012036591, "grad_norm": 2.09375, "learning_rate": 6.886009691481434e-05, "loss": 0.6785, "step": 3645 }, { "epoch": 0.8786711603273952, "grad_norm": 1.9609375, "learning_rate": 6.884321006679656e-05, "loss": 0.6721, "step": 3650 }, { "epoch": 0.8798748194511314, "grad_norm": 1.921875, "learning_rate": 6.882630361522787e-05, "loss": 0.6621, "step": 3655 }, { "epoch": 0.8810784785748677, "grad_norm": 2.015625, "learning_rate": 6.880937757528123e-05, "loss": 0.6415, "step": 3660 }, { "epoch": 0.8822821376986038, "grad_norm": 1.984375, "learning_rate": 6.879243196214718e-05, "loss": 0.6314, "step": 3665 }, { "epoch": 0.88348579682234, "grad_norm": 2.015625, "learning_rate": 6.877546679103384e-05, "loss": 0.701, "step": 3670 }, { "epoch": 0.8846894559460761, "grad_norm": 2.0, "learning_rate": 6.875848207716689e-05, "loss": 0.686, "step": 3675 }, { "epoch": 0.8858931150698123, "grad_norm": 1.9453125, "learning_rate": 6.874147783578954e-05, "loss": 0.6813, "step": 3680 }, { "epoch": 0.8870967741935484, "grad_norm": 1.9453125, "learning_rate": 6.872445408216255e-05, "loss": 0.6357, "step": 3685 }, { "epoch": 0.8883004333172846, "grad_norm": 2.015625, "learning_rate": 6.870741083156415e-05, "loss": 0.6627, "step": 3690 }, { "epoch": 0.8895040924410207, "grad_norm": 2.15625, "learning_rate": 6.86903480992901e-05, "loss": 0.6747, "step": 3695 }, { "epoch": 0.8907077515647569, "grad_norm": 2.15625, "learning_rate": 6.867326590065361e-05, "loss": 0.6878, "step": 3700 }, { "epoch": 0.891911410688493, "grad_norm": 1.7265625, "learning_rate": 6.86561642509854e-05, "loss": 0.6376, "step": 3705 }, { "epoch": 0.8931150698122292, "grad_norm": 1.9921875, "learning_rate": 6.863904316563362e-05, "loss": 0.6647, "step": 3710 }, { "epoch": 0.8943187289359653, "grad_norm": 1.890625, "learning_rate": 6.862190265996387e-05, "loss": 0.6701, "step": 3715 }, { "epoch": 0.8955223880597015, "grad_norm": 1.8046875, "learning_rate": 6.86047427493592e-05, "loss": 0.6583, "step": 3720 }, { "epoch": 0.8967260471834376, "grad_norm": 1.9296875, "learning_rate": 6.858756344922003e-05, "loss": 0.6701, "step": 3725 }, { "epoch": 0.8979297063071738, "grad_norm": 1.921875, "learning_rate": 6.857036477496424e-05, "loss": 0.6863, "step": 3730 }, { "epoch": 0.8991333654309099, "grad_norm": 1.9765625, "learning_rate": 6.855314674202704e-05, "loss": 0.6299, "step": 3735 }, { "epoch": 0.9003370245546461, "grad_norm": 1.8984375, "learning_rate": 6.853590936586105e-05, "loss": 0.6614, "step": 3740 }, { "epoch": 0.9015406836783822, "grad_norm": 2.15625, "learning_rate": 6.851865266193622e-05, "loss": 0.6342, "step": 3745 }, { "epoch": 0.9027443428021185, "grad_norm": 1.9921875, "learning_rate": 6.850137664573988e-05, "loss": 0.6648, "step": 3750 }, { "epoch": 0.9039480019258546, "grad_norm": 1.8203125, "learning_rate": 6.848408133277669e-05, "loss": 0.6791, "step": 3755 }, { "epoch": 0.9051516610495908, "grad_norm": 1.984375, "learning_rate": 6.84667667385686e-05, "loss": 0.6739, "step": 3760 }, { "epoch": 0.9063553201733269, "grad_norm": 2.140625, "learning_rate": 6.844943287865487e-05, "loss": 0.702, "step": 3765 }, { "epoch": 0.9075589792970631, "grad_norm": 1.8828125, "learning_rate": 6.843207976859207e-05, "loss": 0.6633, "step": 3770 }, { "epoch": 0.9087626384207992, "grad_norm": 2.0, "learning_rate": 6.841470742395405e-05, "loss": 0.6723, "step": 3775 }, { "epoch": 0.9099662975445354, "grad_norm": 2.0625, "learning_rate": 6.839731586033188e-05, "loss": 0.6841, "step": 3780 }, { "epoch": 0.9111699566682715, "grad_norm": 2.0, "learning_rate": 6.837990509333393e-05, "loss": 0.6754, "step": 3785 }, { "epoch": 0.9123736157920077, "grad_norm": 1.984375, "learning_rate": 6.836247513858579e-05, "loss": 0.661, "step": 3790 }, { "epoch": 0.9135772749157438, "grad_norm": 1.8984375, "learning_rate": 6.834502601173023e-05, "loss": 0.6476, "step": 3795 }, { "epoch": 0.91478093403948, "grad_norm": 2.09375, "learning_rate": 6.832755772842727e-05, "loss": 0.6827, "step": 3800 }, { "epoch": 0.9159845931632162, "grad_norm": 1.8828125, "learning_rate": 6.831007030435414e-05, "loss": 0.6691, "step": 3805 }, { "epoch": 0.9171882522869523, "grad_norm": 2.0, "learning_rate": 6.829256375520516e-05, "loss": 0.7024, "step": 3810 }, { "epoch": 0.9183919114106885, "grad_norm": 1.9609375, "learning_rate": 6.827503809669192e-05, "loss": 0.6433, "step": 3815 }, { "epoch": 0.9195955705344246, "grad_norm": 1.9453125, "learning_rate": 6.825749334454311e-05, "loss": 0.6887, "step": 3820 }, { "epoch": 0.9207992296581609, "grad_norm": 1.9765625, "learning_rate": 6.823992951450455e-05, "loss": 0.6566, "step": 3825 }, { "epoch": 0.922002888781897, "grad_norm": 1.9765625, "learning_rate": 6.822234662233916e-05, "loss": 0.6828, "step": 3830 }, { "epoch": 0.9232065479056332, "grad_norm": 1.9609375, "learning_rate": 6.820474468382704e-05, "loss": 0.6761, "step": 3835 }, { "epoch": 0.9244102070293693, "grad_norm": 1.9765625, "learning_rate": 6.818712371476534e-05, "loss": 0.626, "step": 3840 }, { "epoch": 0.9256138661531055, "grad_norm": 2.140625, "learning_rate": 6.816948373096826e-05, "loss": 0.6551, "step": 3845 }, { "epoch": 0.9268175252768416, "grad_norm": 1.859375, "learning_rate": 6.815182474826712e-05, "loss": 0.665, "step": 3850 }, { "epoch": 0.9280211844005778, "grad_norm": 1.9609375, "learning_rate": 6.813414678251028e-05, "loss": 0.7109, "step": 3855 }, { "epoch": 0.9292248435243139, "grad_norm": 1.9453125, "learning_rate": 6.811644984956307e-05, "loss": 0.6588, "step": 3860 }, { "epoch": 0.9304285026480501, "grad_norm": 2.015625, "learning_rate": 6.809873396530795e-05, "loss": 0.6724, "step": 3865 }, { "epoch": 0.9316321617717862, "grad_norm": 2.078125, "learning_rate": 6.808099914564431e-05, "loss": 0.691, "step": 3870 }, { "epoch": 0.9328358208955224, "grad_norm": 1.953125, "learning_rate": 6.806324540648856e-05, "loss": 0.6624, "step": 3875 }, { "epoch": 0.9340394800192585, "grad_norm": 2.0, "learning_rate": 6.80454727637741e-05, "loss": 0.6777, "step": 3880 }, { "epoch": 0.9352431391429947, "grad_norm": 1.9609375, "learning_rate": 6.802768123345126e-05, "loss": 0.6342, "step": 3885 }, { "epoch": 0.9364467982667308, "grad_norm": 1.9921875, "learning_rate": 6.800987083148736e-05, "loss": 0.661, "step": 3890 }, { "epoch": 0.937650457390467, "grad_norm": 2.0625, "learning_rate": 6.799204157386665e-05, "loss": 0.6604, "step": 3895 }, { "epoch": 0.9388541165142031, "grad_norm": 2.09375, "learning_rate": 6.797419347659026e-05, "loss": 0.6768, "step": 3900 }, { "epoch": 0.9400577756379394, "grad_norm": 1.8828125, "learning_rate": 6.795632655567628e-05, "loss": 0.6441, "step": 3905 }, { "epoch": 0.9412614347616755, "grad_norm": 2.046875, "learning_rate": 6.793844082715967e-05, "loss": 0.6903, "step": 3910 }, { "epoch": 0.9424650938854117, "grad_norm": 2.0625, "learning_rate": 6.79205363070923e-05, "loss": 0.6843, "step": 3915 }, { "epoch": 0.9436687530091478, "grad_norm": 1.9921875, "learning_rate": 6.790261301154283e-05, "loss": 0.6827, "step": 3920 }, { "epoch": 0.944872412132884, "grad_norm": 1.859375, "learning_rate": 6.788467095659686e-05, "loss": 0.6374, "step": 3925 }, { "epoch": 0.9460760712566201, "grad_norm": 1.9765625, "learning_rate": 6.786671015835677e-05, "loss": 0.6569, "step": 3930 }, { "epoch": 0.9472797303803563, "grad_norm": 2.140625, "learning_rate": 6.784873063294177e-05, "loss": 0.6511, "step": 3935 }, { "epoch": 0.9484833895040924, "grad_norm": 1.8984375, "learning_rate": 6.783073239648788e-05, "loss": 0.6392, "step": 3940 }, { "epoch": 0.9496870486278286, "grad_norm": 1.90625, "learning_rate": 6.781271546514794e-05, "loss": 0.6284, "step": 3945 }, { "epoch": 0.9508907077515648, "grad_norm": 1.90625, "learning_rate": 6.779467985509152e-05, "loss": 0.6342, "step": 3950 }, { "epoch": 0.9520943668753009, "grad_norm": 1.9296875, "learning_rate": 6.777662558250498e-05, "loss": 0.63, "step": 3955 }, { "epoch": 0.9532980259990371, "grad_norm": 1.9609375, "learning_rate": 6.775855266359144e-05, "loss": 0.6278, "step": 3960 }, { "epoch": 0.9545016851227732, "grad_norm": 1.9765625, "learning_rate": 6.774046111457075e-05, "loss": 0.6682, "step": 3965 }, { "epoch": 0.9557053442465094, "grad_norm": 2.15625, "learning_rate": 6.772235095167942e-05, "loss": 0.6455, "step": 3970 }, { "epoch": 0.9569090033702455, "grad_norm": 1.9921875, "learning_rate": 6.770422219117076e-05, "loss": 0.6545, "step": 3975 }, { "epoch": 0.9581126624939817, "grad_norm": 2.171875, "learning_rate": 6.76860748493147e-05, "loss": 0.6731, "step": 3980 }, { "epoch": 0.9593163216177178, "grad_norm": 2.25, "learning_rate": 6.766790894239793e-05, "loss": 0.6858, "step": 3985 }, { "epoch": 0.9605199807414541, "grad_norm": 2.0, "learning_rate": 6.764972448672365e-05, "loss": 0.6308, "step": 3990 }, { "epoch": 0.9617236398651902, "grad_norm": 1.9375, "learning_rate": 6.763152149861189e-05, "loss": 0.6771, "step": 3995 }, { "epoch": 0.9629272989889264, "grad_norm": 2.4375, "learning_rate": 6.761329999439916e-05, "loss": 0.6341, "step": 4000 }, { "epoch": 0.9629272989889264, "eval_loss": 0.5589016675949097, "eval_runtime": 2.406, "eval_samples_per_second": 83.126, "eval_steps_per_second": 83.126, "step": 4000 }, { "epoch": 0.9641309581126625, "grad_norm": 1.8203125, "learning_rate": 6.759505999043869e-05, "loss": 0.7023, "step": 4005 }, { "epoch": 0.9653346172363987, "grad_norm": 2.078125, "learning_rate": 6.757680150310026e-05, "loss": 0.66, "step": 4010 }, { "epoch": 0.9665382763601348, "grad_norm": 2.0625, "learning_rate": 6.755852454877027e-05, "loss": 0.6577, "step": 4015 }, { "epoch": 0.967741935483871, "grad_norm": 1.96875, "learning_rate": 6.754022914385163e-05, "loss": 0.6657, "step": 4020 }, { "epoch": 0.9689455946076071, "grad_norm": 2.078125, "learning_rate": 6.75219153047639e-05, "loss": 0.6462, "step": 4025 }, { "epoch": 0.9701492537313433, "grad_norm": 1.8828125, "learning_rate": 6.750358304794312e-05, "loss": 0.6606, "step": 4030 }, { "epoch": 0.9713529128550794, "grad_norm": 1.8125, "learning_rate": 6.748523238984188e-05, "loss": 0.6602, "step": 4035 }, { "epoch": 0.9725565719788156, "grad_norm": 1.9453125, "learning_rate": 6.746686334692929e-05, "loss": 0.6587, "step": 4040 }, { "epoch": 0.9737602311025517, "grad_norm": 1.9921875, "learning_rate": 6.744847593569092e-05, "loss": 0.6497, "step": 4045 }, { "epoch": 0.9749638902262879, "grad_norm": 1.9375, "learning_rate": 6.74300701726289e-05, "loss": 0.6741, "step": 4050 }, { "epoch": 0.976167549350024, "grad_norm": 2.1875, "learning_rate": 6.741164607426177e-05, "loss": 0.6446, "step": 4055 }, { "epoch": 0.9773712084737602, "grad_norm": 2.03125, "learning_rate": 6.739320365712451e-05, "loss": 0.6547, "step": 4060 }, { "epoch": 0.9785748675974963, "grad_norm": 2.125, "learning_rate": 6.737474293776865e-05, "loss": 0.6354, "step": 4065 }, { "epoch": 0.9797785267212326, "grad_norm": 1.9453125, "learning_rate": 6.7356263932762e-05, "loss": 0.6489, "step": 4070 }, { "epoch": 0.9809821858449687, "grad_norm": 2.203125, "learning_rate": 6.733776665868885e-05, "loss": 0.7068, "step": 4075 }, { "epoch": 0.9821858449687049, "grad_norm": 1.90625, "learning_rate": 6.731925113214994e-05, "loss": 0.6695, "step": 4080 }, { "epoch": 0.983389504092441, "grad_norm": 2.046875, "learning_rate": 6.730071736976229e-05, "loss": 0.6576, "step": 4085 }, { "epoch": 0.9845931632161772, "grad_norm": 1.8515625, "learning_rate": 6.728216538815934e-05, "loss": 0.6666, "step": 4090 }, { "epoch": 0.9857968223399133, "grad_norm": 2.0625, "learning_rate": 6.726359520399088e-05, "loss": 0.6542, "step": 4095 }, { "epoch": 0.9870004814636495, "grad_norm": 2.09375, "learning_rate": 6.724500683392303e-05, "loss": 0.6726, "step": 4100 }, { "epoch": 0.9882041405873857, "grad_norm": 2.015625, "learning_rate": 6.722640029463823e-05, "loss": 0.6588, "step": 4105 }, { "epoch": 0.9894077997111218, "grad_norm": 1.9921875, "learning_rate": 6.720777560283523e-05, "loss": 0.6522, "step": 4110 }, { "epoch": 0.990611458834858, "grad_norm": 1.8359375, "learning_rate": 6.718913277522905e-05, "loss": 0.6492, "step": 4115 }, { "epoch": 0.9918151179585941, "grad_norm": 2.0, "learning_rate": 6.717047182855104e-05, "loss": 0.6672, "step": 4120 }, { "epoch": 0.9930187770823303, "grad_norm": 1.953125, "learning_rate": 6.715179277954874e-05, "loss": 0.6509, "step": 4125 }, { "epoch": 0.9942224362060664, "grad_norm": 1.9921875, "learning_rate": 6.713309564498599e-05, "loss": 0.6461, "step": 4130 }, { "epoch": 0.9954260953298026, "grad_norm": 1.9375, "learning_rate": 6.711438044164282e-05, "loss": 0.6566, "step": 4135 }, { "epoch": 0.9966297544535387, "grad_norm": 2.046875, "learning_rate": 6.709564718631556e-05, "loss": 0.6447, "step": 4140 }, { "epoch": 0.997833413577275, "grad_norm": 1.9609375, "learning_rate": 6.707689589581662e-05, "loss": 0.6736, "step": 4145 }, { "epoch": 0.999037072701011, "grad_norm": 1.9140625, "learning_rate": 6.705812658697467e-05, "loss": 0.6542, "step": 4150 }, { "epoch": 0.9997592681752527, "eval_loss": 0.5545368194580078, "eval_runtime": 2.4068, "eval_samples_per_second": 83.099, "eval_steps_per_second": 83.099, "step": 4153 } ], "logging_steps": 5, "max_steps": 16616, "num_input_tokens_seen": 0, "num_train_epochs": 4, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 1.04173997654016e+17, "train_batch_size": 48, "trial_name": null, "trial_params": null }