{ "best_global_step": null, "best_metric": null, "best_model_checkpoint": null, "epoch": 2.999277804525758, "eval_steps": 500, "global_step": 12459, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.001203659123736158, "grad_norm": 4.59375, "learning_rate": 2.0735274428673248e-05, "loss": 1.9426, "step": 5 }, { "epoch": 0.002407318247472316, "grad_norm": 3.9375, "learning_rate": 4.66543674645148e-05, "loss": 1.8336, "step": 10 }, { "epoch": 0.0036109773712084737, "grad_norm": 3.09375, "learning_rate": 7.257346050035636e-05, "loss": 1.716, "step": 15 }, { "epoch": 0.004814636494944632, "grad_norm": 3.078125, "learning_rate": 9.849255353619792e-05, "loss": 1.59, "step": 20 }, { "epoch": 0.00601829561868079, "grad_norm": 3.671875, "learning_rate": 0.00012441164657203948, "loss": 1.532, "step": 25 }, { "epoch": 0.007221954742416947, "grad_norm": 3.078125, "learning_rate": 0.00015033073960788105, "loss": 1.4454, "step": 30 }, { "epoch": 0.008425613866153106, "grad_norm": 2.6875, "learning_rate": 0.0001762498326437226, "loss": 1.3962, "step": 35 }, { "epoch": 0.009629272989889264, "grad_norm": 3.03125, "learning_rate": 0.0001814336164647536, "loss": 1.3353, "step": 40 }, { "epoch": 0.010832932113625422, "grad_norm": 3.265625, "learning_rate": 0.0001814334751461317, "loss": 1.3248, "step": 45 }, { "epoch": 0.01203659123736158, "grad_norm": 2.8125, "learning_rate": 0.00018143322512111725, "loss": 1.2706, "step": 50 }, { "epoch": 0.013240250361097737, "grad_norm": 2.96875, "learning_rate": 0.00018143286639010973, "loss": 1.2645, "step": 55 }, { "epoch": 0.014443909484833895, "grad_norm": 2.71875, "learning_rate": 0.00018143239895368233, "loss": 1.2174, "step": 60 }, { "epoch": 0.015647568608570053, "grad_norm": 2.734375, "learning_rate": 0.00018143182281258187, "loss": 1.2203, "step": 65 }, { "epoch": 0.016851227732306212, "grad_norm": 2.703125, "learning_rate": 0.00018143113796772884, "loss": 1.1906, "step": 70 }, { "epoch": 0.018054886856042368, "grad_norm": 2.53125, "learning_rate": 0.00018143034442021747, "loss": 1.1788, "step": 75 }, { "epoch": 0.019258545979778528, "grad_norm": 2.703125, "learning_rate": 0.00018142944217131572, "loss": 1.1606, "step": 80 }, { "epoch": 0.020462205103514684, "grad_norm": 2.421875, "learning_rate": 0.00018142843122246502, "loss": 1.1618, "step": 85 }, { "epoch": 0.021665864227250843, "grad_norm": 2.5, "learning_rate": 0.00018142731157528072, "loss": 1.1545, "step": 90 }, { "epoch": 0.022869523350987, "grad_norm": 2.46875, "learning_rate": 0.00018142608323155166, "loss": 1.129, "step": 95 }, { "epoch": 0.02407318247472316, "grad_norm": 2.40625, "learning_rate": 0.00018142474619324043, "loss": 1.1211, "step": 100 }, { "epoch": 0.025276841598459315, "grad_norm": 2.546875, "learning_rate": 0.00018142330046248332, "loss": 1.0903, "step": 105 }, { "epoch": 0.026480500722195474, "grad_norm": 2.296875, "learning_rate": 0.00018142174604159016, "loss": 1.127, "step": 110 }, { "epoch": 0.027684159845931634, "grad_norm": 2.78125, "learning_rate": 0.0001814200829330446, "loss": 1.1246, "step": 115 }, { "epoch": 0.02888781896966779, "grad_norm": 2.828125, "learning_rate": 0.00018141831113950379, "loss": 1.0981, "step": 120 }, { "epoch": 0.03009147809340395, "grad_norm": 2.359375, "learning_rate": 0.00018141643066379865, "loss": 1.0506, "step": 125 }, { "epoch": 0.031295137217140105, "grad_norm": 2.828125, "learning_rate": 0.00018141444150893366, "loss": 1.1246, "step": 130 }, { "epoch": 0.03249879634087626, "grad_norm": 2.515625, "learning_rate": 0.00018141234367808704, "loss": 1.0545, "step": 135 }, { "epoch": 0.033702455464612424, "grad_norm": 2.53125, "learning_rate": 0.0001814101371746105, "loss": 1.0579, "step": 140 }, { "epoch": 0.03490611458834858, "grad_norm": 2.6875, "learning_rate": 0.00018140782200202953, "loss": 1.0468, "step": 145 }, { "epoch": 0.036109773712084736, "grad_norm": 2.171875, "learning_rate": 0.00018140539816404317, "loss": 1.0484, "step": 150 }, { "epoch": 0.03731343283582089, "grad_norm": 2.4375, "learning_rate": 0.00018140286566452408, "loss": 1.0511, "step": 155 }, { "epoch": 0.038517091959557055, "grad_norm": 2.25, "learning_rate": 0.00018140022450751858, "loss": 1.0796, "step": 160 }, { "epoch": 0.03972075108329321, "grad_norm": 2.15625, "learning_rate": 0.0001813974746972465, "loss": 1.04, "step": 165 }, { "epoch": 0.04092441020702937, "grad_norm": 2.5625, "learning_rate": 0.00018139461623810138, "loss": 1.046, "step": 170 }, { "epoch": 0.04212806933076553, "grad_norm": 2.15625, "learning_rate": 0.0001813916491346503, "loss": 1.0332, "step": 175 }, { "epoch": 0.043331728454501686, "grad_norm": 2.40625, "learning_rate": 0.00018138857339163392, "loss": 1.0369, "step": 180 }, { "epoch": 0.04453538757823784, "grad_norm": 1.9765625, "learning_rate": 0.00018138538901396647, "loss": 0.9817, "step": 185 }, { "epoch": 0.045739046701974, "grad_norm": 2.546875, "learning_rate": 0.0001813820960067358, "loss": 1.0455, "step": 190 }, { "epoch": 0.04694270582571016, "grad_norm": 2.328125, "learning_rate": 0.00018137869437520328, "loss": 0.9987, "step": 195 }, { "epoch": 0.04814636494944632, "grad_norm": 2.421875, "learning_rate": 0.00018137518412480384, "loss": 0.9892, "step": 200 }, { "epoch": 0.04935002407318247, "grad_norm": 1.984375, "learning_rate": 0.00018137156526114596, "loss": 0.9887, "step": 205 }, { "epoch": 0.05055368319691863, "grad_norm": 1.984375, "learning_rate": 0.00018136783779001167, "loss": 0.9929, "step": 210 }, { "epoch": 0.05175734232065479, "grad_norm": 2.3125, "learning_rate": 0.00018136400171735653, "loss": 0.9929, "step": 215 }, { "epoch": 0.05296100144439095, "grad_norm": 2.15625, "learning_rate": 0.00018136005704930957, "loss": 0.9929, "step": 220 }, { "epoch": 0.054164660568127104, "grad_norm": 2.359375, "learning_rate": 0.0001813560037921733, "loss": 0.9898, "step": 225 }, { "epoch": 0.05536831969186327, "grad_norm": 2.171875, "learning_rate": 0.00018135184195242394, "loss": 0.9808, "step": 230 }, { "epoch": 0.05657197881559942, "grad_norm": 2.359375, "learning_rate": 0.00018134757153671089, "loss": 0.9605, "step": 235 }, { "epoch": 0.05777563793933558, "grad_norm": 2.1875, "learning_rate": 0.0001813431925518573, "loss": 0.951, "step": 240 }, { "epoch": 0.058979297063071735, "grad_norm": 2.078125, "learning_rate": 0.00018133870500485955, "loss": 0.9705, "step": 245 }, { "epoch": 0.0601829561868079, "grad_norm": 2.234375, "learning_rate": 0.00018133410890288767, "loss": 0.964, "step": 250 }, { "epoch": 0.061386615310544054, "grad_norm": 2.171875, "learning_rate": 0.00018132940425328505, "loss": 0.9917, "step": 255 }, { "epoch": 0.06259027443428021, "grad_norm": 2.125, "learning_rate": 0.00018132459106356846, "loss": 0.9234, "step": 260 }, { "epoch": 0.06379393355801637, "grad_norm": 2.234375, "learning_rate": 0.0001813196693414282, "loss": 0.9561, "step": 265 }, { "epoch": 0.06499759268175252, "grad_norm": 2.015625, "learning_rate": 0.00018131463909472792, "loss": 0.9422, "step": 270 }, { "epoch": 0.06620125180548869, "grad_norm": 2.28125, "learning_rate": 0.00018130950033150463, "loss": 0.9477, "step": 275 }, { "epoch": 0.06740491092922485, "grad_norm": 2.359375, "learning_rate": 0.00018130425305996873, "loss": 0.9404, "step": 280 }, { "epoch": 0.068608570052961, "grad_norm": 2.203125, "learning_rate": 0.0001812988972885041, "loss": 0.9246, "step": 285 }, { "epoch": 0.06981222917669716, "grad_norm": 2.203125, "learning_rate": 0.0001812934330256678, "loss": 0.928, "step": 290 }, { "epoch": 0.07101588830043332, "grad_norm": 2.25, "learning_rate": 0.0001812878602801904, "loss": 0.9228, "step": 295 }, { "epoch": 0.07221954742416947, "grad_norm": 2.125, "learning_rate": 0.00018128217906097569, "loss": 0.8997, "step": 300 }, { "epoch": 0.07342320654790563, "grad_norm": 2.109375, "learning_rate": 0.0001812763893771008, "loss": 0.9153, "step": 305 }, { "epoch": 0.07462686567164178, "grad_norm": 2.015625, "learning_rate": 0.00018127049123781614, "loss": 0.9652, "step": 310 }, { "epoch": 0.07583052479537795, "grad_norm": 2.09375, "learning_rate": 0.00018126448465254544, "loss": 0.8946, "step": 315 }, { "epoch": 0.07703418391911411, "grad_norm": 2.15625, "learning_rate": 0.0001812583696308857, "loss": 0.9044, "step": 320 }, { "epoch": 0.07823784304285027, "grad_norm": 2.03125, "learning_rate": 0.0001812521461826071, "loss": 0.9066, "step": 325 }, { "epoch": 0.07944150216658642, "grad_norm": 2.09375, "learning_rate": 0.00018124581431765317, "loss": 0.9333, "step": 330 }, { "epoch": 0.08064516129032258, "grad_norm": 2.421875, "learning_rate": 0.00018123937404614064, "loss": 0.9135, "step": 335 }, { "epoch": 0.08184882041405873, "grad_norm": 2.0625, "learning_rate": 0.00018123282537835933, "loss": 0.9078, "step": 340 }, { "epoch": 0.08305247953779489, "grad_norm": 2.234375, "learning_rate": 0.00018122616832477238, "loss": 0.8865, "step": 345 }, { "epoch": 0.08425613866153106, "grad_norm": 1.90625, "learning_rate": 0.00018121940289601602, "loss": 0.9306, "step": 350 }, { "epoch": 0.08545979778526722, "grad_norm": 2.265625, "learning_rate": 0.00018121252910289972, "loss": 0.9014, "step": 355 }, { "epoch": 0.08666345690900337, "grad_norm": 1.78125, "learning_rate": 0.000181205546956406, "loss": 0.9043, "step": 360 }, { "epoch": 0.08786711603273953, "grad_norm": 2.109375, "learning_rate": 0.00018119845646769056, "loss": 0.9037, "step": 365 }, { "epoch": 0.08907077515647568, "grad_norm": 1.8984375, "learning_rate": 0.00018119125764808218, "loss": 0.9022, "step": 370 }, { "epoch": 0.09027443428021184, "grad_norm": 2.109375, "learning_rate": 0.00018118395050908273, "loss": 0.913, "step": 375 }, { "epoch": 0.091478093403948, "grad_norm": 1.8671875, "learning_rate": 0.00018117653506236715, "loss": 0.8847, "step": 380 }, { "epoch": 0.09268175252768417, "grad_norm": 2.203125, "learning_rate": 0.0001811690113197834, "loss": 0.8985, "step": 385 }, { "epoch": 0.09388541165142032, "grad_norm": 2.15625, "learning_rate": 0.00018116137929335253, "loss": 0.8898, "step": 390 }, { "epoch": 0.09508907077515648, "grad_norm": 1.8828125, "learning_rate": 0.0001811536389952685, "loss": 0.8857, "step": 395 }, { "epoch": 0.09629272989889263, "grad_norm": 2.1875, "learning_rate": 0.00018114579043789836, "loss": 0.8846, "step": 400 }, { "epoch": 0.09749638902262879, "grad_norm": 1.8984375, "learning_rate": 0.0001811378336337821, "loss": 0.8833, "step": 405 }, { "epoch": 0.09870004814636495, "grad_norm": 1.953125, "learning_rate": 0.00018112976859563265, "loss": 0.8956, "step": 410 }, { "epoch": 0.0999037072701011, "grad_norm": 2.0, "learning_rate": 0.00018112159533633588, "loss": 0.8969, "step": 415 }, { "epoch": 0.10110736639383726, "grad_norm": 2.0, "learning_rate": 0.00018111331386895052, "loss": 0.8938, "step": 420 }, { "epoch": 0.10231102551757343, "grad_norm": 2.09375, "learning_rate": 0.0001811049242067083, "loss": 0.8883, "step": 425 }, { "epoch": 0.10351468464130958, "grad_norm": 2.15625, "learning_rate": 0.0001810964263630137, "loss": 0.8387, "step": 430 }, { "epoch": 0.10471834376504574, "grad_norm": 1.8125, "learning_rate": 0.0001810878203514441, "loss": 0.8824, "step": 435 }, { "epoch": 0.1059220028887819, "grad_norm": 2.03125, "learning_rate": 0.0001810791061857497, "loss": 0.8522, "step": 440 }, { "epoch": 0.10712566201251805, "grad_norm": 1.984375, "learning_rate": 0.00018107028387985352, "loss": 0.888, "step": 445 }, { "epoch": 0.10832932113625421, "grad_norm": 1.8125, "learning_rate": 0.00018106135344785136, "loss": 0.8926, "step": 450 }, { "epoch": 0.10953298025999036, "grad_norm": 1.6640625, "learning_rate": 0.0001810523149040117, "loss": 0.8939, "step": 455 }, { "epoch": 0.11073663938372653, "grad_norm": 1.765625, "learning_rate": 0.0001810431682627759, "loss": 0.8591, "step": 460 }, { "epoch": 0.11194029850746269, "grad_norm": 2.046875, "learning_rate": 0.0001810339135387579, "loss": 0.8527, "step": 465 }, { "epoch": 0.11314395763119885, "grad_norm": 1.7421875, "learning_rate": 0.00018102455074674433, "loss": 0.8686, "step": 470 }, { "epoch": 0.114347616754935, "grad_norm": 1.9375, "learning_rate": 0.0001810150799016946, "loss": 0.8991, "step": 475 }, { "epoch": 0.11555127587867116, "grad_norm": 1.8203125, "learning_rate": 0.00018100550101874066, "loss": 0.8476, "step": 480 }, { "epoch": 0.11675493500240731, "grad_norm": 1.8828125, "learning_rate": 0.00018099581411318714, "loss": 0.8648, "step": 485 }, { "epoch": 0.11795859412614347, "grad_norm": 1.84375, "learning_rate": 0.00018098601920051122, "loss": 0.801, "step": 490 }, { "epoch": 0.11916225324987964, "grad_norm": 1.84375, "learning_rate": 0.00018097611629636263, "loss": 0.8392, "step": 495 }, { "epoch": 0.1203659123736158, "grad_norm": 1.953125, "learning_rate": 0.00018096610541656368, "loss": 0.8541, "step": 500 }, { "epoch": 0.1203659123736158, "eval_loss": 0.7588192820549011, "eval_runtime": 2.3912, "eval_samples_per_second": 83.641, "eval_steps_per_second": 83.641, "step": 500 }, { "epoch": 0.12156957149735195, "grad_norm": 1.96875, "learning_rate": 0.00018095598657710924, "loss": 0.8072, "step": 505 }, { "epoch": 0.12277323062108811, "grad_norm": 2.03125, "learning_rate": 0.00018094575979416656, "loss": 0.8412, "step": 510 }, { "epoch": 0.12397688974482426, "grad_norm": 1.9765625, "learning_rate": 0.00018093542508407543, "loss": 0.8784, "step": 515 }, { "epoch": 0.12518054886856042, "grad_norm": 1.96875, "learning_rate": 0.00018092498246334812, "loss": 0.8986, "step": 520 }, { "epoch": 0.12638420799229658, "grad_norm": 2.03125, "learning_rate": 0.0001809144319486692, "loss": 0.8843, "step": 525 }, { "epoch": 0.12758786711603273, "grad_norm": 2.078125, "learning_rate": 0.0001809037735568957, "loss": 0.8423, "step": 530 }, { "epoch": 0.1287915262397689, "grad_norm": 1.921875, "learning_rate": 0.00018089300730505704, "loss": 0.8334, "step": 535 }, { "epoch": 0.12999518536350504, "grad_norm": 1.90625, "learning_rate": 0.00018088213321035485, "loss": 0.8376, "step": 540 }, { "epoch": 0.1311988444872412, "grad_norm": 1.9765625, "learning_rate": 0.00018087115129016324, "loss": 0.8541, "step": 545 }, { "epoch": 0.13240250361097738, "grad_norm": 2.015625, "learning_rate": 0.00018086006156202842, "loss": 0.8112, "step": 550 }, { "epoch": 0.13360616273471354, "grad_norm": 1.78125, "learning_rate": 0.00018084886404366897, "loss": 0.8491, "step": 555 }, { "epoch": 0.1348098218584497, "grad_norm": 1.765625, "learning_rate": 0.00018083755875297568, "loss": 0.8348, "step": 560 }, { "epoch": 0.13601348098218585, "grad_norm": 1.921875, "learning_rate": 0.00018082614570801147, "loss": 0.8148, "step": 565 }, { "epoch": 0.137217140105922, "grad_norm": 1.8515625, "learning_rate": 0.00018081462492701146, "loss": 0.8154, "step": 570 }, { "epoch": 0.13842079922965816, "grad_norm": 1.8046875, "learning_rate": 0.0001808029964283829, "loss": 0.7738, "step": 575 }, { "epoch": 0.13962445835339432, "grad_norm": 1.9140625, "learning_rate": 0.00018079126023070517, "loss": 0.8301, "step": 580 }, { "epoch": 0.14082811747713048, "grad_norm": 1.9296875, "learning_rate": 0.00018077941635272972, "loss": 0.8191, "step": 585 }, { "epoch": 0.14203177660086663, "grad_norm": 1.8828125, "learning_rate": 0.00018076746481337998, "loss": 0.8251, "step": 590 }, { "epoch": 0.1432354357246028, "grad_norm": 1.90625, "learning_rate": 0.0001807554056317515, "loss": 0.8273, "step": 595 }, { "epoch": 0.14443909484833894, "grad_norm": 1.7109375, "learning_rate": 0.00018074323882711173, "loss": 0.7992, "step": 600 }, { "epoch": 0.1456427539720751, "grad_norm": 1.7265625, "learning_rate": 0.0001807309644189001, "loss": 0.8212, "step": 605 }, { "epoch": 0.14684641309581126, "grad_norm": 1.7109375, "learning_rate": 0.000180718582426728, "loss": 0.8157, "step": 610 }, { "epoch": 0.1480500722195474, "grad_norm": 1.875, "learning_rate": 0.00018070609287037864, "loss": 0.8365, "step": 615 }, { "epoch": 0.14925373134328357, "grad_norm": 1.8828125, "learning_rate": 0.00018069349576980715, "loss": 0.8554, "step": 620 }, { "epoch": 0.15045739046701975, "grad_norm": 1.890625, "learning_rate": 0.0001806807911451405, "loss": 0.8188, "step": 625 }, { "epoch": 0.1516610495907559, "grad_norm": 1.8671875, "learning_rate": 0.00018066797901667732, "loss": 0.8463, "step": 630 }, { "epoch": 0.15286470871449206, "grad_norm": 1.828125, "learning_rate": 0.00018065505940488823, "loss": 0.8317, "step": 635 }, { "epoch": 0.15406836783822822, "grad_norm": 1.6640625, "learning_rate": 0.00018064203233041537, "loss": 0.8088, "step": 640 }, { "epoch": 0.15527202696196438, "grad_norm": 1.875, "learning_rate": 0.00018062889781407274, "loss": 0.8255, "step": 645 }, { "epoch": 0.15647568608570053, "grad_norm": 1.78125, "learning_rate": 0.00018061565587684583, "loss": 0.8245, "step": 650 }, { "epoch": 0.1576793452094367, "grad_norm": 1.8203125, "learning_rate": 0.00018060230653989197, "loss": 0.8574, "step": 655 }, { "epoch": 0.15888300433317284, "grad_norm": 1.8828125, "learning_rate": 0.0001805888498245399, "loss": 0.82, "step": 660 }, { "epoch": 0.160086663456909, "grad_norm": 1.7890625, "learning_rate": 0.00018057528575229003, "loss": 0.8405, "step": 665 }, { "epoch": 0.16129032258064516, "grad_norm": 2.0, "learning_rate": 0.00018056161434481429, "loss": 0.8203, "step": 670 }, { "epoch": 0.1624939817043813, "grad_norm": 1.703125, "learning_rate": 0.00018054783562395605, "loss": 0.7883, "step": 675 }, { "epoch": 0.16369764082811747, "grad_norm": 1.8046875, "learning_rate": 0.00018053394961173018, "loss": 0.8059, "step": 680 }, { "epoch": 0.16490129995185362, "grad_norm": 1.8515625, "learning_rate": 0.000180519956330323, "loss": 0.8, "step": 685 }, { "epoch": 0.16610495907558978, "grad_norm": 1.78125, "learning_rate": 0.00018050585580209214, "loss": 0.8, "step": 690 }, { "epoch": 0.16730861819932596, "grad_norm": 1.9375, "learning_rate": 0.00018049164804956666, "loss": 0.7914, "step": 695 }, { "epoch": 0.16851227732306212, "grad_norm": 1.875, "learning_rate": 0.0001804773330954469, "loss": 0.7928, "step": 700 }, { "epoch": 0.16971593644679828, "grad_norm": 2.03125, "learning_rate": 0.00018046291096260447, "loss": 0.8329, "step": 705 }, { "epoch": 0.17091959557053443, "grad_norm": 1.9453125, "learning_rate": 0.00018044838167408228, "loss": 0.817, "step": 710 }, { "epoch": 0.1721232546942706, "grad_norm": 1.7265625, "learning_rate": 0.00018043374525309434, "loss": 0.8079, "step": 715 }, { "epoch": 0.17332691381800674, "grad_norm": 1.8359375, "learning_rate": 0.00018041900172302601, "loss": 0.7959, "step": 720 }, { "epoch": 0.1745305729417429, "grad_norm": 1.765625, "learning_rate": 0.00018040415110743356, "loss": 0.8499, "step": 725 }, { "epoch": 0.17573423206547906, "grad_norm": 1.796875, "learning_rate": 0.00018038919343004453, "loss": 0.8052, "step": 730 }, { "epoch": 0.1769378911892152, "grad_norm": 1.875, "learning_rate": 0.0001803741287147574, "loss": 0.8096, "step": 735 }, { "epoch": 0.17814155031295137, "grad_norm": 1.8125, "learning_rate": 0.0001803589569856418, "loss": 0.7846, "step": 740 }, { "epoch": 0.17934520943668752, "grad_norm": 1.7578125, "learning_rate": 0.0001803436782669382, "loss": 0.7706, "step": 745 }, { "epoch": 0.18054886856042368, "grad_norm": 1.71875, "learning_rate": 0.00018032829258305812, "loss": 0.7574, "step": 750 }, { "epoch": 0.18175252768415984, "grad_norm": 1.625, "learning_rate": 0.0001803127999585839, "loss": 0.7831, "step": 755 }, { "epoch": 0.182956186807896, "grad_norm": 1.703125, "learning_rate": 0.00018029720041826878, "loss": 0.7531, "step": 760 }, { "epoch": 0.18415984593163215, "grad_norm": 1.9140625, "learning_rate": 0.00018028149398703687, "loss": 0.7958, "step": 765 }, { "epoch": 0.18536350505536833, "grad_norm": 1.7265625, "learning_rate": 0.000180265680689983, "loss": 0.7503, "step": 770 }, { "epoch": 0.1865671641791045, "grad_norm": 1.6484375, "learning_rate": 0.00018024976055237276, "loss": 0.8319, "step": 775 }, { "epoch": 0.18777082330284064, "grad_norm": 1.9296875, "learning_rate": 0.00018023373359964245, "loss": 0.7884, "step": 780 }, { "epoch": 0.1889744824265768, "grad_norm": 1.765625, "learning_rate": 0.000180217599857399, "loss": 0.7968, "step": 785 }, { "epoch": 0.19017814155031296, "grad_norm": 1.953125, "learning_rate": 0.00018020135935142007, "loss": 0.7777, "step": 790 }, { "epoch": 0.1913818006740491, "grad_norm": 1.6484375, "learning_rate": 0.00018018501210765377, "loss": 0.75, "step": 795 }, { "epoch": 0.19258545979778527, "grad_norm": 1.8984375, "learning_rate": 0.0001801685581522188, "loss": 0.7865, "step": 800 }, { "epoch": 0.19378911892152142, "grad_norm": 1.75, "learning_rate": 0.00018015199751140437, "loss": 0.7916, "step": 805 }, { "epoch": 0.19499277804525758, "grad_norm": 1.75, "learning_rate": 0.00018013533021167018, "loss": 0.7863, "step": 810 }, { "epoch": 0.19619643716899374, "grad_norm": 1.65625, "learning_rate": 0.00018011855627964626, "loss": 0.7675, "step": 815 }, { "epoch": 0.1974000962927299, "grad_norm": 1.8203125, "learning_rate": 0.00018010167574213307, "loss": 0.7847, "step": 820 }, { "epoch": 0.19860375541646605, "grad_norm": 1.7109375, "learning_rate": 0.00018008468862610135, "loss": 0.7765, "step": 825 }, { "epoch": 0.1998074145402022, "grad_norm": 1.8671875, "learning_rate": 0.00018006759495869219, "loss": 0.7872, "step": 830 }, { "epoch": 0.20101107366393836, "grad_norm": 1.703125, "learning_rate": 0.0001800503947672169, "loss": 0.7879, "step": 835 }, { "epoch": 0.20221473278767452, "grad_norm": 1.6328125, "learning_rate": 0.0001800330880791569, "loss": 0.7615, "step": 840 }, { "epoch": 0.2034183919114107, "grad_norm": 1.609375, "learning_rate": 0.00018001567492216398, "loss": 0.7837, "step": 845 }, { "epoch": 0.20462205103514686, "grad_norm": 1.734375, "learning_rate": 0.00017999815532405977, "loss": 0.7927, "step": 850 }, { "epoch": 0.205825710158883, "grad_norm": 1.4609375, "learning_rate": 0.00017998052931283615, "loss": 0.8079, "step": 855 }, { "epoch": 0.20702936928261917, "grad_norm": 1.7109375, "learning_rate": 0.000179962796916655, "loss": 0.786, "step": 860 }, { "epoch": 0.20823302840635532, "grad_norm": 1.6484375, "learning_rate": 0.00017994495816384804, "loss": 0.8247, "step": 865 }, { "epoch": 0.20943668753009148, "grad_norm": 1.7265625, "learning_rate": 0.0001799270130829171, "loss": 0.7969, "step": 870 }, { "epoch": 0.21064034665382764, "grad_norm": 1.890625, "learning_rate": 0.00017990896170253385, "loss": 0.7756, "step": 875 }, { "epoch": 0.2118440057775638, "grad_norm": 1.5703125, "learning_rate": 0.00017989080405153972, "loss": 0.8003, "step": 880 }, { "epoch": 0.21304766490129995, "grad_norm": 1.734375, "learning_rate": 0.00017987254015894595, "loss": 0.7765, "step": 885 }, { "epoch": 0.2142513240250361, "grad_norm": 1.609375, "learning_rate": 0.00017985417005393365, "loss": 0.7648, "step": 890 }, { "epoch": 0.21545498314877226, "grad_norm": 1.8046875, "learning_rate": 0.00017983569376585346, "loss": 0.7684, "step": 895 }, { "epoch": 0.21665864227250842, "grad_norm": 1.5, "learning_rate": 0.00017981711132422577, "loss": 0.7555, "step": 900 }, { "epoch": 0.21786230139624457, "grad_norm": 1.71875, "learning_rate": 0.0001797984227587406, "loss": 0.7881, "step": 905 }, { "epoch": 0.21906596051998073, "grad_norm": 1.84375, "learning_rate": 0.0001797796280992574, "loss": 0.7826, "step": 910 }, { "epoch": 0.2202696196437169, "grad_norm": 1.7109375, "learning_rate": 0.00017976072737580529, "loss": 0.7615, "step": 915 }, { "epoch": 0.22147327876745307, "grad_norm": 1.734375, "learning_rate": 0.00017974172061858276, "loss": 0.7794, "step": 920 }, { "epoch": 0.22267693789118922, "grad_norm": 1.8359375, "learning_rate": 0.00017972260785795776, "loss": 0.7793, "step": 925 }, { "epoch": 0.22388059701492538, "grad_norm": 1.8046875, "learning_rate": 0.00017970338912446756, "loss": 0.7668, "step": 930 }, { "epoch": 0.22508425613866154, "grad_norm": 1.609375, "learning_rate": 0.00017968406444881873, "loss": 0.7745, "step": 935 }, { "epoch": 0.2262879152623977, "grad_norm": 1.765625, "learning_rate": 0.00017966463386188722, "loss": 0.7792, "step": 940 }, { "epoch": 0.22749157438613385, "grad_norm": 1.765625, "learning_rate": 0.00017964509739471806, "loss": 0.7773, "step": 945 }, { "epoch": 0.22869523350987, "grad_norm": 1.6171875, "learning_rate": 0.0001796254550785256, "loss": 0.7977, "step": 950 }, { "epoch": 0.22989889263360616, "grad_norm": 1.6796875, "learning_rate": 0.00017960570694469313, "loss": 0.7452, "step": 955 }, { "epoch": 0.23110255175734232, "grad_norm": 1.8359375, "learning_rate": 0.00017958585302477317, "loss": 0.7605, "step": 960 }, { "epoch": 0.23230621088107847, "grad_norm": 1.6328125, "learning_rate": 0.0001795658933504872, "loss": 0.7691, "step": 965 }, { "epoch": 0.23350987000481463, "grad_norm": 1.6015625, "learning_rate": 0.00017954582795372557, "loss": 0.7643, "step": 970 }, { "epoch": 0.23471352912855079, "grad_norm": 1.6640625, "learning_rate": 0.00017952565686654777, "loss": 0.7389, "step": 975 }, { "epoch": 0.23591718825228694, "grad_norm": 1.59375, "learning_rate": 0.00017950538012118192, "loss": 0.7381, "step": 980 }, { "epoch": 0.2371208473760231, "grad_norm": 1.578125, "learning_rate": 0.00017948499775002514, "loss": 0.7427, "step": 985 }, { "epoch": 0.23832450649975928, "grad_norm": 1.5859375, "learning_rate": 0.00017946450978564317, "loss": 0.7455, "step": 990 }, { "epoch": 0.23952816562349544, "grad_norm": 1.6640625, "learning_rate": 0.00017944391626077057, "loss": 0.7557, "step": 995 }, { "epoch": 0.2407318247472316, "grad_norm": 1.75, "learning_rate": 0.00017942321720831048, "loss": 0.7453, "step": 1000 }, { "epoch": 0.2407318247472316, "eval_loss": 0.6591922044754028, "eval_runtime": 2.3906, "eval_samples_per_second": 83.662, "eval_steps_per_second": 83.662, "step": 1000 }, { "epoch": 0.24193548387096775, "grad_norm": 1.6640625, "learning_rate": 0.00017940241266133474, "loss": 0.7753, "step": 1005 }, { "epoch": 0.2431391429947039, "grad_norm": 1.765625, "learning_rate": 0.0001793815026530836, "loss": 0.7529, "step": 1010 }, { "epoch": 0.24434280211844006, "grad_norm": 1.5390625, "learning_rate": 0.00017936048721696595, "loss": 0.7543, "step": 1015 }, { "epoch": 0.24554646124217622, "grad_norm": 1.6484375, "learning_rate": 0.00017933936638655907, "loss": 0.7304, "step": 1020 }, { "epoch": 0.24675012036591237, "grad_norm": 1.4296875, "learning_rate": 0.0001793181401956086, "loss": 0.7233, "step": 1025 }, { "epoch": 0.24795377948964853, "grad_norm": 1.5625, "learning_rate": 0.00017929680867802855, "loss": 0.7151, "step": 1030 }, { "epoch": 0.24915743861338469, "grad_norm": 1.7890625, "learning_rate": 0.00017927537186790125, "loss": 0.7661, "step": 1035 }, { "epoch": 0.25036109773712084, "grad_norm": 1.6171875, "learning_rate": 0.0001792538297994772, "loss": 0.7578, "step": 1040 }, { "epoch": 0.251564756860857, "grad_norm": 1.734375, "learning_rate": 0.00017923218250717512, "loss": 0.7055, "step": 1045 }, { "epoch": 0.25276841598459315, "grad_norm": 1.5859375, "learning_rate": 0.0001792104300255818, "loss": 0.7427, "step": 1050 }, { "epoch": 0.2539720751083293, "grad_norm": 1.5625, "learning_rate": 0.0001791885723894522, "loss": 0.7011, "step": 1055 }, { "epoch": 0.25517573423206547, "grad_norm": 1.7890625, "learning_rate": 0.00017916660963370915, "loss": 0.7699, "step": 1060 }, { "epoch": 0.2563793933558016, "grad_norm": 1.5859375, "learning_rate": 0.00017914454179344358, "loss": 0.7615, "step": 1065 }, { "epoch": 0.2575830524795378, "grad_norm": 1.6328125, "learning_rate": 0.00017912236890391415, "loss": 0.7437, "step": 1070 }, { "epoch": 0.25878671160327393, "grad_norm": 1.65625, "learning_rate": 0.00017910009100054747, "loss": 0.7752, "step": 1075 }, { "epoch": 0.2599903707270101, "grad_norm": 1.78125, "learning_rate": 0.00017907770811893798, "loss": 0.8086, "step": 1080 }, { "epoch": 0.26119402985074625, "grad_norm": 1.8125, "learning_rate": 0.00017905522029484775, "loss": 0.713, "step": 1085 }, { "epoch": 0.2623976889744824, "grad_norm": 1.53125, "learning_rate": 0.00017903262756420652, "loss": 0.7457, "step": 1090 }, { "epoch": 0.26360134809821856, "grad_norm": 1.75, "learning_rate": 0.00017900992996311175, "loss": 0.7358, "step": 1095 }, { "epoch": 0.26480500722195477, "grad_norm": 1.53125, "learning_rate": 0.00017898712752782838, "loss": 0.7431, "step": 1100 }, { "epoch": 0.2660086663456909, "grad_norm": 1.6015625, "learning_rate": 0.0001789642202947888, "loss": 0.7242, "step": 1105 }, { "epoch": 0.2672123254694271, "grad_norm": 1.4921875, "learning_rate": 0.00017894120830059295, "loss": 0.696, "step": 1110 }, { "epoch": 0.26841598459316324, "grad_norm": 1.734375, "learning_rate": 0.0001789180915820081, "loss": 0.7504, "step": 1115 }, { "epoch": 0.2696196437168994, "grad_norm": 1.6875, "learning_rate": 0.00017889487017596877, "loss": 0.7081, "step": 1120 }, { "epoch": 0.27082330284063555, "grad_norm": 1.78125, "learning_rate": 0.00017887154411957686, "loss": 0.7614, "step": 1125 }, { "epoch": 0.2720269619643717, "grad_norm": 1.7578125, "learning_rate": 0.00017884811345010145, "loss": 0.7445, "step": 1130 }, { "epoch": 0.27323062108810786, "grad_norm": 1.8203125, "learning_rate": 0.00017882457820497872, "loss": 0.7695, "step": 1135 }, { "epoch": 0.274434280211844, "grad_norm": 1.609375, "learning_rate": 0.000178800938421812, "loss": 0.7481, "step": 1140 }, { "epoch": 0.2756379393355802, "grad_norm": 1.6640625, "learning_rate": 0.00017877719413837153, "loss": 0.7543, "step": 1145 }, { "epoch": 0.27684159845931633, "grad_norm": 1.7578125, "learning_rate": 0.00017875334539259463, "loss": 0.7673, "step": 1150 }, { "epoch": 0.2780452575830525, "grad_norm": 1.7421875, "learning_rate": 0.00017872939222258547, "loss": 0.7476, "step": 1155 }, { "epoch": 0.27924891670678864, "grad_norm": 1.5234375, "learning_rate": 0.0001787053346666151, "loss": 0.7663, "step": 1160 }, { "epoch": 0.2804525758305248, "grad_norm": 1.5546875, "learning_rate": 0.00017868117276312128, "loss": 0.7063, "step": 1165 }, { "epoch": 0.28165623495426095, "grad_norm": 1.6953125, "learning_rate": 0.00017865690655070857, "loss": 0.7367, "step": 1170 }, { "epoch": 0.2828598940779971, "grad_norm": 1.4609375, "learning_rate": 0.00017863253606814817, "loss": 0.7308, "step": 1175 }, { "epoch": 0.28406355320173327, "grad_norm": 1.5703125, "learning_rate": 0.00017860806135437784, "loss": 0.7511, "step": 1180 }, { "epoch": 0.2852672123254694, "grad_norm": 1.5546875, "learning_rate": 0.0001785834824485019, "loss": 0.7355, "step": 1185 }, { "epoch": 0.2864708714492056, "grad_norm": 1.640625, "learning_rate": 0.00017855879938979114, "loss": 0.7639, "step": 1190 }, { "epoch": 0.28767453057294173, "grad_norm": 1.5234375, "learning_rate": 0.00017853401221768278, "loss": 0.7357, "step": 1195 }, { "epoch": 0.2888781896966779, "grad_norm": 1.5546875, "learning_rate": 0.00017850912097178034, "loss": 0.7448, "step": 1200 }, { "epoch": 0.29008184882041405, "grad_norm": 1.5, "learning_rate": 0.00017848412569185365, "loss": 0.7036, "step": 1205 }, { "epoch": 0.2912855079441502, "grad_norm": 1.7265625, "learning_rate": 0.0001784590264178388, "loss": 0.7411, "step": 1210 }, { "epoch": 0.29248916706788636, "grad_norm": 1.6484375, "learning_rate": 0.000178433823189838, "loss": 0.7449, "step": 1215 }, { "epoch": 0.2936928261916225, "grad_norm": 1.609375, "learning_rate": 0.00017840851604811946, "loss": 0.7431, "step": 1220 }, { "epoch": 0.29489648531535867, "grad_norm": 1.5546875, "learning_rate": 0.00017838310503311757, "loss": 0.7291, "step": 1225 }, { "epoch": 0.2961001444390948, "grad_norm": 1.5703125, "learning_rate": 0.00017835759018543262, "loss": 0.7442, "step": 1230 }, { "epoch": 0.297303803562831, "grad_norm": 1.6640625, "learning_rate": 0.00017833197154583076, "loss": 0.7111, "step": 1235 }, { "epoch": 0.29850746268656714, "grad_norm": 1.71875, "learning_rate": 0.00017830624915524408, "loss": 0.7175, "step": 1240 }, { "epoch": 0.29971112181030335, "grad_norm": 1.5390625, "learning_rate": 0.0001782804230547703, "loss": 0.7278, "step": 1245 }, { "epoch": 0.3009147809340395, "grad_norm": 1.7578125, "learning_rate": 0.0001782544932856729, "loss": 0.7335, "step": 1250 }, { "epoch": 0.30211844005777566, "grad_norm": 1.6875, "learning_rate": 0.00017822845988938103, "loss": 0.7244, "step": 1255 }, { "epoch": 0.3033220991815118, "grad_norm": 1.4921875, "learning_rate": 0.0001782023229074894, "loss": 0.7328, "step": 1260 }, { "epoch": 0.304525758305248, "grad_norm": 1.59375, "learning_rate": 0.0001781760823817582, "loss": 0.7015, "step": 1265 }, { "epoch": 0.30572941742898413, "grad_norm": 1.5546875, "learning_rate": 0.00017814973835411304, "loss": 0.7289, "step": 1270 }, { "epoch": 0.3069330765527203, "grad_norm": 1.3828125, "learning_rate": 0.00017812329086664495, "loss": 0.7213, "step": 1275 }, { "epoch": 0.30813673567645644, "grad_norm": 1.484375, "learning_rate": 0.00017809673996161023, "loss": 0.7053, "step": 1280 }, { "epoch": 0.3093403948001926, "grad_norm": 1.5703125, "learning_rate": 0.0001780700856814304, "loss": 0.7443, "step": 1285 }, { "epoch": 0.31054405392392875, "grad_norm": 1.6875, "learning_rate": 0.00017804332806869218, "loss": 0.7331, "step": 1290 }, { "epoch": 0.3117477130476649, "grad_norm": 1.5234375, "learning_rate": 0.0001780164671661474, "loss": 0.7462, "step": 1295 }, { "epoch": 0.31295137217140107, "grad_norm": 1.609375, "learning_rate": 0.00017798950301671282, "loss": 0.7001, "step": 1300 }, { "epoch": 0.3141550312951372, "grad_norm": 1.7578125, "learning_rate": 0.00017796243566347032, "loss": 0.7057, "step": 1305 }, { "epoch": 0.3153586904188734, "grad_norm": 1.8515625, "learning_rate": 0.00017793526514966654, "loss": 0.7206, "step": 1310 }, { "epoch": 0.31656234954260953, "grad_norm": 1.7109375, "learning_rate": 0.00017790799151871304, "loss": 0.7501, "step": 1315 }, { "epoch": 0.3177660086663457, "grad_norm": 1.6796875, "learning_rate": 0.00017788061481418608, "loss": 0.7305, "step": 1320 }, { "epoch": 0.31896966779008185, "grad_norm": 1.5859375, "learning_rate": 0.0001778531350798266, "loss": 0.734, "step": 1325 }, { "epoch": 0.320173326913818, "grad_norm": 1.5859375, "learning_rate": 0.00017782555235954013, "loss": 0.7195, "step": 1330 }, { "epoch": 0.32137698603755416, "grad_norm": 1.5390625, "learning_rate": 0.00017779786669739687, "loss": 0.7174, "step": 1335 }, { "epoch": 0.3225806451612903, "grad_norm": 1.4609375, "learning_rate": 0.00017777007813763135, "loss": 0.6927, "step": 1340 }, { "epoch": 0.32378430428502647, "grad_norm": 1.5234375, "learning_rate": 0.0001777421867246426, "loss": 0.6868, "step": 1345 }, { "epoch": 0.3249879634087626, "grad_norm": 1.5390625, "learning_rate": 0.00017771419250299395, "loss": 0.6822, "step": 1350 }, { "epoch": 0.3261916225324988, "grad_norm": 1.5390625, "learning_rate": 0.00017768609551741294, "loss": 0.7231, "step": 1355 }, { "epoch": 0.32739528165623494, "grad_norm": 1.5703125, "learning_rate": 0.00017765789581279142, "loss": 0.7121, "step": 1360 }, { "epoch": 0.3285989407799711, "grad_norm": 1.5625, "learning_rate": 0.00017762959343418523, "loss": 0.7539, "step": 1365 }, { "epoch": 0.32980259990370725, "grad_norm": 1.4453125, "learning_rate": 0.00017760118842681432, "loss": 0.6681, "step": 1370 }, { "epoch": 0.3310062590274434, "grad_norm": 1.5625, "learning_rate": 0.00017757268083606266, "loss": 0.732, "step": 1375 }, { "epoch": 0.33220991815117956, "grad_norm": 1.6328125, "learning_rate": 0.00017754407070747803, "loss": 0.6868, "step": 1380 }, { "epoch": 0.3334135772749157, "grad_norm": 1.421875, "learning_rate": 0.00017751535808677212, "loss": 0.687, "step": 1385 }, { "epoch": 0.33461723639865193, "grad_norm": 1.65625, "learning_rate": 0.00017748654301982027, "loss": 0.7162, "step": 1390 }, { "epoch": 0.3358208955223881, "grad_norm": 1.453125, "learning_rate": 0.00017745762555266162, "loss": 0.7199, "step": 1395 }, { "epoch": 0.33702455464612424, "grad_norm": 1.4375, "learning_rate": 0.00017742860573149888, "loss": 0.7354, "step": 1400 }, { "epoch": 0.3382282137698604, "grad_norm": 1.5703125, "learning_rate": 0.00017739948360269827, "loss": 0.6817, "step": 1405 }, { "epoch": 0.33943187289359655, "grad_norm": 1.390625, "learning_rate": 0.00017737025921278943, "loss": 0.7115, "step": 1410 }, { "epoch": 0.3406355320173327, "grad_norm": 1.5234375, "learning_rate": 0.00017734093260846553, "loss": 0.7204, "step": 1415 }, { "epoch": 0.34183919114106887, "grad_norm": 1.6640625, "learning_rate": 0.00017731150383658293, "loss": 0.7286, "step": 1420 }, { "epoch": 0.343042850264805, "grad_norm": 1.5078125, "learning_rate": 0.00017728197294416124, "loss": 0.7304, "step": 1425 }, { "epoch": 0.3442465093885412, "grad_norm": 1.6953125, "learning_rate": 0.00017725233997838329, "loss": 0.7101, "step": 1430 }, { "epoch": 0.34545016851227733, "grad_norm": 1.46875, "learning_rate": 0.00017722260498659496, "loss": 0.7209, "step": 1435 }, { "epoch": 0.3466538276360135, "grad_norm": 1.5546875, "learning_rate": 0.00017719276801630515, "loss": 0.7022, "step": 1440 }, { "epoch": 0.34785748675974965, "grad_norm": 1.453125, "learning_rate": 0.00017716282911518566, "loss": 0.6727, "step": 1445 }, { "epoch": 0.3490611458834858, "grad_norm": 1.5546875, "learning_rate": 0.0001771327883310712, "loss": 0.6885, "step": 1450 }, { "epoch": 0.35026480500722196, "grad_norm": 1.453125, "learning_rate": 0.00017710264571195922, "loss": 0.6793, "step": 1455 }, { "epoch": 0.3514684641309581, "grad_norm": 1.578125, "learning_rate": 0.00017707240130600992, "loss": 0.6901, "step": 1460 }, { "epoch": 0.35267212325469427, "grad_norm": 1.4765625, "learning_rate": 0.00017704205516154612, "loss": 0.7077, "step": 1465 }, { "epoch": 0.3538757823784304, "grad_norm": 1.53125, "learning_rate": 0.00017701160732705313, "loss": 0.7431, "step": 1470 }, { "epoch": 0.3550794415021666, "grad_norm": 1.421875, "learning_rate": 0.00017698105785117884, "loss": 0.7215, "step": 1475 }, { "epoch": 0.35628310062590274, "grad_norm": 1.5390625, "learning_rate": 0.00017695040678273344, "loss": 0.7106, "step": 1480 }, { "epoch": 0.3574867597496389, "grad_norm": 1.609375, "learning_rate": 0.0001769196541706895, "loss": 0.6923, "step": 1485 }, { "epoch": 0.35869041887337505, "grad_norm": 1.4375, "learning_rate": 0.00017688880006418183, "loss": 0.6773, "step": 1490 }, { "epoch": 0.3598940779971112, "grad_norm": 1.4921875, "learning_rate": 0.00017685784451250733, "loss": 0.6892, "step": 1495 }, { "epoch": 0.36109773712084736, "grad_norm": 1.640625, "learning_rate": 0.00017682678756512509, "loss": 0.6795, "step": 1500 }, { "epoch": 0.36109773712084736, "eval_loss": 0.6025854349136353, "eval_runtime": 2.4049, "eval_samples_per_second": 83.162, "eval_steps_per_second": 83.162, "step": 1500 }, { "epoch": 0.3623013962445835, "grad_norm": 1.578125, "learning_rate": 0.00017679562927165612, "loss": 0.7128, "step": 1505 }, { "epoch": 0.3635050553683197, "grad_norm": 1.5234375, "learning_rate": 0.0001767643696818834, "loss": 0.6946, "step": 1510 }, { "epoch": 0.36470871449205583, "grad_norm": 1.59375, "learning_rate": 0.00017673300884575173, "loss": 0.7061, "step": 1515 }, { "epoch": 0.365912373615792, "grad_norm": 1.5078125, "learning_rate": 0.00017670154681336767, "loss": 0.6766, "step": 1520 }, { "epoch": 0.36711603273952814, "grad_norm": 1.375, "learning_rate": 0.0001766699836349995, "loss": 0.6685, "step": 1525 }, { "epoch": 0.3683196918632643, "grad_norm": 1.5078125, "learning_rate": 0.00017663831936107712, "loss": 0.728, "step": 1530 }, { "epoch": 0.36952335098700045, "grad_norm": 1.7109375, "learning_rate": 0.0001766065540421919, "loss": 0.7238, "step": 1535 }, { "epoch": 0.37072701011073667, "grad_norm": 1.71875, "learning_rate": 0.00017657468772909663, "loss": 0.7126, "step": 1540 }, { "epoch": 0.3719306692344728, "grad_norm": 1.5234375, "learning_rate": 0.00017654272047270558, "loss": 0.6897, "step": 1545 }, { "epoch": 0.373134328358209, "grad_norm": 1.4765625, "learning_rate": 0.00017651065232409416, "loss": 0.6872, "step": 1550 }, { "epoch": 0.37433798748194513, "grad_norm": 1.5390625, "learning_rate": 0.0001764784833344991, "loss": 0.7086, "step": 1555 }, { "epoch": 0.3755416466056813, "grad_norm": 1.5859375, "learning_rate": 0.00017644621355531823, "loss": 0.7068, "step": 1560 }, { "epoch": 0.37674530572941745, "grad_norm": 1.625, "learning_rate": 0.00017641384303811032, "loss": 0.6891, "step": 1565 }, { "epoch": 0.3779489648531536, "grad_norm": 1.4296875, "learning_rate": 0.00017638137183459516, "loss": 0.7255, "step": 1570 }, { "epoch": 0.37915262397688976, "grad_norm": 1.578125, "learning_rate": 0.00017634879999665348, "loss": 0.7103, "step": 1575 }, { "epoch": 0.3803562831006259, "grad_norm": 1.5234375, "learning_rate": 0.00017631612757632663, "loss": 0.6874, "step": 1580 }, { "epoch": 0.38155994222436207, "grad_norm": 1.3671875, "learning_rate": 0.0001762833546258168, "loss": 0.6898, "step": 1585 }, { "epoch": 0.3827636013480982, "grad_norm": 1.5859375, "learning_rate": 0.0001762504811974868, "loss": 0.707, "step": 1590 }, { "epoch": 0.3839672604718344, "grad_norm": 1.5703125, "learning_rate": 0.00017621750734385993, "loss": 0.6924, "step": 1595 }, { "epoch": 0.38517091959557054, "grad_norm": 1.5390625, "learning_rate": 0.0001761844331176199, "loss": 0.7022, "step": 1600 }, { "epoch": 0.3863745787193067, "grad_norm": 1.3984375, "learning_rate": 0.00017615125857161094, "loss": 0.6904, "step": 1605 }, { "epoch": 0.38757823784304285, "grad_norm": 1.4921875, "learning_rate": 0.00017611798375883738, "loss": 0.6895, "step": 1610 }, { "epoch": 0.388781896966779, "grad_norm": 1.6171875, "learning_rate": 0.00017608460873246387, "loss": 0.7075, "step": 1615 }, { "epoch": 0.38998555609051516, "grad_norm": 1.4921875, "learning_rate": 0.00017605113354581515, "loss": 0.6792, "step": 1620 }, { "epoch": 0.3911892152142513, "grad_norm": 1.5859375, "learning_rate": 0.000176017558252376, "loss": 0.7466, "step": 1625 }, { "epoch": 0.3923928743379875, "grad_norm": 1.390625, "learning_rate": 0.00017598388290579115, "loss": 0.6889, "step": 1630 }, { "epoch": 0.39359653346172363, "grad_norm": 1.53125, "learning_rate": 0.0001759501075598651, "loss": 0.7023, "step": 1635 }, { "epoch": 0.3948001925854598, "grad_norm": 1.59375, "learning_rate": 0.00017591623226856224, "loss": 0.707, "step": 1640 }, { "epoch": 0.39600385170919594, "grad_norm": 1.53125, "learning_rate": 0.00017588225708600662, "loss": 0.6763, "step": 1645 }, { "epoch": 0.3972075108329321, "grad_norm": 1.5859375, "learning_rate": 0.0001758481820664818, "loss": 0.683, "step": 1650 }, { "epoch": 0.39841116995666825, "grad_norm": 1.4765625, "learning_rate": 0.00017581400726443102, "loss": 0.7005, "step": 1655 }, { "epoch": 0.3996148290804044, "grad_norm": 1.390625, "learning_rate": 0.0001757797327344568, "loss": 0.6983, "step": 1660 }, { "epoch": 0.40081848820414057, "grad_norm": 1.5234375, "learning_rate": 0.00017574535853132108, "loss": 0.6762, "step": 1665 }, { "epoch": 0.4020221473278767, "grad_norm": 1.6015625, "learning_rate": 0.000175710884709945, "loss": 0.6733, "step": 1670 }, { "epoch": 0.4032258064516129, "grad_norm": 1.609375, "learning_rate": 0.00017567631132540886, "loss": 0.6843, "step": 1675 }, { "epoch": 0.40442946557534903, "grad_norm": 1.5078125, "learning_rate": 0.0001756416384329521, "loss": 0.7135, "step": 1680 }, { "epoch": 0.40563312469908525, "grad_norm": 1.609375, "learning_rate": 0.00017560686608797315, "loss": 0.6691, "step": 1685 }, { "epoch": 0.4068367838228214, "grad_norm": 1.625, "learning_rate": 0.00017557199434602925, "loss": 0.7016, "step": 1690 }, { "epoch": 0.40804044294655756, "grad_norm": 1.4609375, "learning_rate": 0.00017553702326283653, "loss": 0.6793, "step": 1695 }, { "epoch": 0.4092441020702937, "grad_norm": 1.4921875, "learning_rate": 0.0001755019528942698, "loss": 0.6773, "step": 1700 }, { "epoch": 0.41044776119402987, "grad_norm": 1.59375, "learning_rate": 0.00017546678329636252, "loss": 0.7092, "step": 1705 }, { "epoch": 0.411651420317766, "grad_norm": 1.3671875, "learning_rate": 0.00017543151452530673, "loss": 0.6657, "step": 1710 }, { "epoch": 0.4128550794415022, "grad_norm": 1.4296875, "learning_rate": 0.0001753961466374528, "loss": 0.6737, "step": 1715 }, { "epoch": 0.41405873856523834, "grad_norm": 1.5078125, "learning_rate": 0.00017536067968930964, "loss": 0.693, "step": 1720 }, { "epoch": 0.4152623976889745, "grad_norm": 1.4375, "learning_rate": 0.0001753251137375443, "loss": 0.6852, "step": 1725 }, { "epoch": 0.41646605681271065, "grad_norm": 1.359375, "learning_rate": 0.00017528944883898206, "loss": 0.6566, "step": 1730 }, { "epoch": 0.4176697159364468, "grad_norm": 1.53125, "learning_rate": 0.00017525368505060627, "loss": 0.6874, "step": 1735 }, { "epoch": 0.41887337506018296, "grad_norm": 1.5546875, "learning_rate": 0.00017521782242955827, "loss": 0.6762, "step": 1740 }, { "epoch": 0.4200770341839191, "grad_norm": 1.4765625, "learning_rate": 0.00017518186103313742, "loss": 0.6745, "step": 1745 }, { "epoch": 0.4212806933076553, "grad_norm": 1.4375, "learning_rate": 0.00017514580091880077, "loss": 0.6853, "step": 1750 }, { "epoch": 0.42248435243139143, "grad_norm": 1.390625, "learning_rate": 0.0001751096421441631, "loss": 0.69, "step": 1755 }, { "epoch": 0.4236880115551276, "grad_norm": 1.6015625, "learning_rate": 0.00017507338476699692, "loss": 0.6636, "step": 1760 }, { "epoch": 0.42489167067886374, "grad_norm": 1.46875, "learning_rate": 0.00017503702884523218, "loss": 0.6855, "step": 1765 }, { "epoch": 0.4260953298025999, "grad_norm": 1.59375, "learning_rate": 0.00017500057443695636, "loss": 0.6725, "step": 1770 }, { "epoch": 0.42729898892633605, "grad_norm": 1.5546875, "learning_rate": 0.0001749640216004142, "loss": 0.6449, "step": 1775 }, { "epoch": 0.4285026480500722, "grad_norm": 1.390625, "learning_rate": 0.00017492737039400785, "loss": 0.6772, "step": 1780 }, { "epoch": 0.42970630717380837, "grad_norm": 1.3828125, "learning_rate": 0.00017489062087629648, "loss": 0.7016, "step": 1785 }, { "epoch": 0.4309099662975445, "grad_norm": 1.40625, "learning_rate": 0.0001748537731059964, "loss": 0.7142, "step": 1790 }, { "epoch": 0.4321136254212807, "grad_norm": 1.5859375, "learning_rate": 0.00017481682714198097, "loss": 0.6957, "step": 1795 }, { "epoch": 0.43331728454501683, "grad_norm": 1.359375, "learning_rate": 0.00017477978304328024, "loss": 0.6857, "step": 1800 }, { "epoch": 0.434520943668753, "grad_norm": 1.5, "learning_rate": 0.0001747426408690813, "loss": 0.7223, "step": 1805 }, { "epoch": 0.43572460279248915, "grad_norm": 1.3359375, "learning_rate": 0.0001747054006787278, "loss": 0.6781, "step": 1810 }, { "epoch": 0.4369282619162253, "grad_norm": 1.5078125, "learning_rate": 0.00017466806253171996, "loss": 0.664, "step": 1815 }, { "epoch": 0.43813192103996146, "grad_norm": 1.671875, "learning_rate": 0.00017463062648771462, "loss": 0.666, "step": 1820 }, { "epoch": 0.4393355801636976, "grad_norm": 1.484375, "learning_rate": 0.00017459309260652497, "loss": 0.6811, "step": 1825 }, { "epoch": 0.4405392392874338, "grad_norm": 1.484375, "learning_rate": 0.0001745554609481205, "loss": 0.6608, "step": 1830 }, { "epoch": 0.44174289841117, "grad_norm": 1.4375, "learning_rate": 0.00017451773157262703, "loss": 0.6846, "step": 1835 }, { "epoch": 0.44294655753490614, "grad_norm": 1.515625, "learning_rate": 0.00017447990454032636, "loss": 0.7046, "step": 1840 }, { "epoch": 0.4441502166586423, "grad_norm": 1.375, "learning_rate": 0.0001744419799116564, "loss": 0.6599, "step": 1845 }, { "epoch": 0.44535387578237845, "grad_norm": 1.4453125, "learning_rate": 0.00017440395774721102, "loss": 0.6928, "step": 1850 }, { "epoch": 0.4465575349061146, "grad_norm": 1.40625, "learning_rate": 0.00017436583810773987, "loss": 0.6748, "step": 1855 }, { "epoch": 0.44776119402985076, "grad_norm": 1.4375, "learning_rate": 0.00017432762105414832, "loss": 0.6887, "step": 1860 }, { "epoch": 0.4489648531535869, "grad_norm": 1.40625, "learning_rate": 0.00017428930664749749, "loss": 0.6298, "step": 1865 }, { "epoch": 0.4501685122773231, "grad_norm": 1.6875, "learning_rate": 0.00017425089494900394, "loss": 0.6894, "step": 1870 }, { "epoch": 0.45137217140105923, "grad_norm": 1.4453125, "learning_rate": 0.00017421238602003975, "loss": 0.6788, "step": 1875 }, { "epoch": 0.4525758305247954, "grad_norm": 1.3203125, "learning_rate": 0.0001741737799221323, "loss": 0.6706, "step": 1880 }, { "epoch": 0.45377948964853154, "grad_norm": 1.53125, "learning_rate": 0.0001741350767169642, "loss": 0.6857, "step": 1885 }, { "epoch": 0.4549831487722677, "grad_norm": 1.46875, "learning_rate": 0.00017409627646637334, "loss": 0.6344, "step": 1890 }, { "epoch": 0.45618680789600385, "grad_norm": 1.5, "learning_rate": 0.00017405737923235247, "loss": 0.6424, "step": 1895 }, { "epoch": 0.45739046701974, "grad_norm": 1.5859375, "learning_rate": 0.00017401838507704947, "loss": 0.7227, "step": 1900 }, { "epoch": 0.45859412614347617, "grad_norm": 1.484375, "learning_rate": 0.00017397929406276697, "loss": 0.6895, "step": 1905 }, { "epoch": 0.4597977852672123, "grad_norm": 1.484375, "learning_rate": 0.00017394010625196242, "loss": 0.6858, "step": 1910 }, { "epoch": 0.4610014443909485, "grad_norm": 1.390625, "learning_rate": 0.00017390082170724787, "loss": 0.6728, "step": 1915 }, { "epoch": 0.46220510351468463, "grad_norm": 1.515625, "learning_rate": 0.00017386144049138995, "loss": 0.6685, "step": 1920 }, { "epoch": 0.4634087626384208, "grad_norm": 1.515625, "learning_rate": 0.00017382196266730977, "loss": 0.7095, "step": 1925 }, { "epoch": 0.46461242176215695, "grad_norm": 1.53125, "learning_rate": 0.00017378238829808274, "loss": 0.6678, "step": 1930 }, { "epoch": 0.4658160808858931, "grad_norm": 1.640625, "learning_rate": 0.00017374271744693861, "loss": 0.6634, "step": 1935 }, { "epoch": 0.46701974000962926, "grad_norm": 1.359375, "learning_rate": 0.0001737029501772612, "loss": 0.6682, "step": 1940 }, { "epoch": 0.4682233991333654, "grad_norm": 1.3671875, "learning_rate": 0.00017366308655258837, "loss": 0.6648, "step": 1945 }, { "epoch": 0.46942705825710157, "grad_norm": 1.4921875, "learning_rate": 0.00017362312663661203, "loss": 0.6449, "step": 1950 }, { "epoch": 0.4706307173808377, "grad_norm": 1.3984375, "learning_rate": 0.00017358307049317786, "loss": 0.6514, "step": 1955 }, { "epoch": 0.4718343765045739, "grad_norm": 1.3671875, "learning_rate": 0.00017354291818628533, "loss": 0.6347, "step": 1960 }, { "epoch": 0.47303803562831004, "grad_norm": 1.59375, "learning_rate": 0.0001735026697800875, "loss": 0.6859, "step": 1965 }, { "epoch": 0.4742416947520462, "grad_norm": 1.3515625, "learning_rate": 0.00017346232533889104, "loss": 0.6924, "step": 1970 }, { "epoch": 0.4754453538757824, "grad_norm": 1.4375, "learning_rate": 0.00017342188492715596, "loss": 0.6782, "step": 1975 }, { "epoch": 0.47664901299951856, "grad_norm": 1.53125, "learning_rate": 0.00017338134860949578, "loss": 0.6753, "step": 1980 }, { "epoch": 0.4778526721232547, "grad_norm": 1.4296875, "learning_rate": 0.00017334071645067704, "loss": 0.6419, "step": 1985 }, { "epoch": 0.4790563312469909, "grad_norm": 1.4453125, "learning_rate": 0.00017329998851561956, "loss": 0.6609, "step": 1990 }, { "epoch": 0.48025999037072703, "grad_norm": 1.5546875, "learning_rate": 0.00017325916486939617, "loss": 0.6492, "step": 1995 }, { "epoch": 0.4814636494944632, "grad_norm": 1.3046875, "learning_rate": 0.00017321824557723253, "loss": 0.6382, "step": 2000 }, { "epoch": 0.4814636494944632, "eval_loss": 0.5719231963157654, "eval_runtime": 2.395, "eval_samples_per_second": 83.506, "eval_steps_per_second": 83.506, "step": 2000 }, { "epoch": 0.48266730861819934, "grad_norm": 1.5625, "learning_rate": 0.0001731772307045072, "loss": 0.6636, "step": 2005 }, { "epoch": 0.4838709677419355, "grad_norm": 1.421875, "learning_rate": 0.00017313612031675144, "loss": 0.6493, "step": 2010 }, { "epoch": 0.48507462686567165, "grad_norm": 1.4453125, "learning_rate": 0.00017309491447964912, "loss": 0.6684, "step": 2015 }, { "epoch": 0.4862782859894078, "grad_norm": 1.4765625, "learning_rate": 0.0001730536132590366, "loss": 0.6535, "step": 2020 }, { "epoch": 0.48748194511314397, "grad_norm": 1.5390625, "learning_rate": 0.0001730122167209026, "loss": 0.6687, "step": 2025 }, { "epoch": 0.4886856042368801, "grad_norm": 1.5078125, "learning_rate": 0.00017297072493138824, "loss": 0.6337, "step": 2030 }, { "epoch": 0.4898892633606163, "grad_norm": 1.609375, "learning_rate": 0.0001729291379567867, "loss": 0.689, "step": 2035 }, { "epoch": 0.49109292248435243, "grad_norm": 1.65625, "learning_rate": 0.00017288745586354338, "loss": 0.6611, "step": 2040 }, { "epoch": 0.4922965816080886, "grad_norm": 1.4375, "learning_rate": 0.00017284567871825554, "loss": 0.6394, "step": 2045 }, { "epoch": 0.49350024073182475, "grad_norm": 1.453125, "learning_rate": 0.00017280380658767232, "loss": 0.6721, "step": 2050 }, { "epoch": 0.4947038998555609, "grad_norm": 1.4296875, "learning_rate": 0.00017276183953869468, "loss": 0.6445, "step": 2055 }, { "epoch": 0.49590755897929706, "grad_norm": 1.5390625, "learning_rate": 0.0001727197776383752, "loss": 0.6357, "step": 2060 }, { "epoch": 0.4971112181030332, "grad_norm": 1.4609375, "learning_rate": 0.00017267762095391805, "loss": 0.6575, "step": 2065 }, { "epoch": 0.49831487722676937, "grad_norm": 1.4140625, "learning_rate": 0.00017263536955267877, "loss": 0.668, "step": 2070 }, { "epoch": 0.4995185363505055, "grad_norm": 1.40625, "learning_rate": 0.00017259302350216426, "loss": 0.6321, "step": 2075 }, { "epoch": 0.5007221954742417, "grad_norm": 1.703125, "learning_rate": 0.00017255058287003267, "loss": 0.6468, "step": 2080 }, { "epoch": 0.5019258545979779, "grad_norm": 1.546875, "learning_rate": 0.00017250804772409321, "loss": 0.6888, "step": 2085 }, { "epoch": 0.503129513721714, "grad_norm": 1.375, "learning_rate": 0.00017246541813230621, "loss": 0.6862, "step": 2090 }, { "epoch": 0.5043331728454502, "grad_norm": 1.25, "learning_rate": 0.00017242269416278277, "loss": 0.6215, "step": 2095 }, { "epoch": 0.5055368319691863, "grad_norm": 1.3671875, "learning_rate": 0.0001723798758837849, "loss": 0.6863, "step": 2100 }, { "epoch": 0.5067404910929225, "grad_norm": 1.484375, "learning_rate": 0.00017233696336372518, "loss": 0.6396, "step": 2105 }, { "epoch": 0.5079441502166586, "grad_norm": 1.46875, "learning_rate": 0.0001722939566711668, "loss": 0.6635, "step": 2110 }, { "epoch": 0.5091478093403948, "grad_norm": 1.5078125, "learning_rate": 0.0001722508558748235, "loss": 0.676, "step": 2115 }, { "epoch": 0.5103514684641309, "grad_norm": 1.53125, "learning_rate": 0.0001722076610435593, "loss": 0.6546, "step": 2120 }, { "epoch": 0.5115551275878671, "grad_norm": 1.4453125, "learning_rate": 0.00017216437224638835, "loss": 0.6884, "step": 2125 }, { "epoch": 0.5127587867116032, "grad_norm": 1.421875, "learning_rate": 0.0001721209895524752, "loss": 0.6016, "step": 2130 }, { "epoch": 0.5139624458353395, "grad_norm": 1.5, "learning_rate": 0.0001720775130311342, "loss": 0.6292, "step": 2135 }, { "epoch": 0.5151661049590756, "grad_norm": 1.3984375, "learning_rate": 0.00017203394275182966, "loss": 0.6847, "step": 2140 }, { "epoch": 0.5163697640828118, "grad_norm": 1.4296875, "learning_rate": 0.00017199027878417577, "loss": 0.6527, "step": 2145 }, { "epoch": 0.5175734232065479, "grad_norm": 1.2734375, "learning_rate": 0.00017194652119793634, "loss": 0.6383, "step": 2150 }, { "epoch": 0.5187770823302841, "grad_norm": 1.4921875, "learning_rate": 0.0001719026700630248, "loss": 0.6664, "step": 2155 }, { "epoch": 0.5199807414540202, "grad_norm": 1.3046875, "learning_rate": 0.00017185872544950396, "loss": 0.6455, "step": 2160 }, { "epoch": 0.5211844005777564, "grad_norm": 1.4765625, "learning_rate": 0.0001718146874275861, "loss": 0.6697, "step": 2165 }, { "epoch": 0.5223880597014925, "grad_norm": 1.515625, "learning_rate": 0.00017177055606763266, "loss": 0.6764, "step": 2170 }, { "epoch": 0.5235917188252287, "grad_norm": 1.359375, "learning_rate": 0.00017172633144015424, "loss": 0.6458, "step": 2175 }, { "epoch": 0.5247953779489648, "grad_norm": 1.328125, "learning_rate": 0.00017168201361581053, "loss": 0.6561, "step": 2180 }, { "epoch": 0.525999037072701, "grad_norm": 1.390625, "learning_rate": 0.00017163760266540994, "loss": 0.6688, "step": 2185 }, { "epoch": 0.5272026961964371, "grad_norm": 1.3984375, "learning_rate": 0.00017159309865990988, "loss": 0.652, "step": 2190 }, { "epoch": 0.5284063553201733, "grad_norm": 1.3828125, "learning_rate": 0.00017154850167041625, "loss": 0.6565, "step": 2195 }, { "epoch": 0.5296100144439095, "grad_norm": 1.3046875, "learning_rate": 0.0001715038117681837, "loss": 0.6544, "step": 2200 }, { "epoch": 0.5308136735676456, "grad_norm": 1.4609375, "learning_rate": 0.00017145902902461515, "loss": 0.663, "step": 2205 }, { "epoch": 0.5320173326913819, "grad_norm": 1.4140625, "learning_rate": 0.00017141415351126202, "loss": 0.6494, "step": 2210 }, { "epoch": 0.533220991815118, "grad_norm": 1.3203125, "learning_rate": 0.0001713691852998238, "loss": 0.6389, "step": 2215 }, { "epoch": 0.5344246509388542, "grad_norm": 1.484375, "learning_rate": 0.00017132412446214823, "loss": 0.6686, "step": 2220 }, { "epoch": 0.5356283100625903, "grad_norm": 1.578125, "learning_rate": 0.000171278971070231, "loss": 0.6633, "step": 2225 }, { "epoch": 0.5368319691863265, "grad_norm": 1.53125, "learning_rate": 0.00017123372519621557, "loss": 0.6523, "step": 2230 }, { "epoch": 0.5380356283100626, "grad_norm": 1.2890625, "learning_rate": 0.00017118838691239328, "loss": 0.6764, "step": 2235 }, { "epoch": 0.5392392874337988, "grad_norm": 1.5234375, "learning_rate": 0.0001711429562912031, "loss": 0.6358, "step": 2240 }, { "epoch": 0.5404429465575349, "grad_norm": 1.359375, "learning_rate": 0.00017109743340523154, "loss": 0.6273, "step": 2245 }, { "epoch": 0.5416466056812711, "grad_norm": 1.25, "learning_rate": 0.00017105181832721244, "loss": 0.6216, "step": 2250 }, { "epoch": 0.5428502648050072, "grad_norm": 1.421875, "learning_rate": 0.00017100611113002702, "loss": 0.6458, "step": 2255 }, { "epoch": 0.5440539239287434, "grad_norm": 1.375, "learning_rate": 0.0001709603118867037, "loss": 0.6651, "step": 2260 }, { "epoch": 0.5452575830524795, "grad_norm": 1.4375, "learning_rate": 0.00017091442067041792, "loss": 0.6818, "step": 2265 }, { "epoch": 0.5464612421762157, "grad_norm": 1.3203125, "learning_rate": 0.00017086843755449203, "loss": 0.6613, "step": 2270 }, { "epoch": 0.5476649012999518, "grad_norm": 1.46875, "learning_rate": 0.00017082236261239532, "loss": 0.6473, "step": 2275 }, { "epoch": 0.548868560423688, "grad_norm": 1.5, "learning_rate": 0.0001707761959177437, "loss": 0.6797, "step": 2280 }, { "epoch": 0.5500722195474241, "grad_norm": 1.484375, "learning_rate": 0.00017072993754429973, "loss": 0.6512, "step": 2285 }, { "epoch": 0.5512758786711603, "grad_norm": 1.4921875, "learning_rate": 0.00017068358756597242, "loss": 0.6498, "step": 2290 }, { "epoch": 0.5524795377948964, "grad_norm": 1.265625, "learning_rate": 0.00017063714605681718, "loss": 0.6372, "step": 2295 }, { "epoch": 0.5536831969186327, "grad_norm": 1.421875, "learning_rate": 0.00017059061309103563, "loss": 0.6486, "step": 2300 }, { "epoch": 0.5548868560423688, "grad_norm": 1.453125, "learning_rate": 0.00017054398874297548, "loss": 0.6633, "step": 2305 }, { "epoch": 0.556090515166105, "grad_norm": 1.5078125, "learning_rate": 0.00017049727308713052, "loss": 0.6301, "step": 2310 }, { "epoch": 0.5572941742898411, "grad_norm": 1.40625, "learning_rate": 0.00017045046619814036, "loss": 0.6282, "step": 2315 }, { "epoch": 0.5584978334135773, "grad_norm": 1.4453125, "learning_rate": 0.00017040356815079048, "loss": 0.6738, "step": 2320 }, { "epoch": 0.5597014925373134, "grad_norm": 1.3046875, "learning_rate": 0.00017035657902001188, "loss": 0.6275, "step": 2325 }, { "epoch": 0.5609051516610496, "grad_norm": 1.34375, "learning_rate": 0.00017030949888088116, "loss": 0.6388, "step": 2330 }, { "epoch": 0.5621088107847857, "grad_norm": 1.2421875, "learning_rate": 0.0001702623278086203, "loss": 0.6096, "step": 2335 }, { "epoch": 0.5633124699085219, "grad_norm": 1.375, "learning_rate": 0.00017021506587859664, "loss": 0.6627, "step": 2340 }, { "epoch": 0.5645161290322581, "grad_norm": 1.453125, "learning_rate": 0.00017016771316632253, "loss": 0.6482, "step": 2345 }, { "epoch": 0.5657197881559942, "grad_norm": 1.53125, "learning_rate": 0.00017012026974745555, "loss": 0.6469, "step": 2350 }, { "epoch": 0.5669234472797304, "grad_norm": 1.5, "learning_rate": 0.00017007273569779807, "loss": 0.6173, "step": 2355 }, { "epoch": 0.5681271064034665, "grad_norm": 1.3828125, "learning_rate": 0.00017002511109329732, "loss": 0.622, "step": 2360 }, { "epoch": 0.5693307655272027, "grad_norm": 1.3671875, "learning_rate": 0.00016997739601004523, "loss": 0.6142, "step": 2365 }, { "epoch": 0.5705344246509388, "grad_norm": 1.3984375, "learning_rate": 0.0001699295905242783, "loss": 0.6124, "step": 2370 }, { "epoch": 0.571738083774675, "grad_norm": 1.53125, "learning_rate": 0.00016988169471237737, "loss": 0.634, "step": 2375 }, { "epoch": 0.5729417428984112, "grad_norm": 1.34375, "learning_rate": 0.00016983370865086774, "loss": 0.6306, "step": 2380 }, { "epoch": 0.5741454020221474, "grad_norm": 1.34375, "learning_rate": 0.00016978563241641873, "loss": 0.6354, "step": 2385 }, { "epoch": 0.5753490611458835, "grad_norm": 1.375, "learning_rate": 0.00016973746608584396, "loss": 0.683, "step": 2390 }, { "epoch": 0.5765527202696197, "grad_norm": 1.3046875, "learning_rate": 0.00016968920973610082, "loss": 0.635, "step": 2395 }, { "epoch": 0.5777563793933558, "grad_norm": 1.4921875, "learning_rate": 0.0001696408634442906, "loss": 0.6502, "step": 2400 }, { "epoch": 0.578960038517092, "grad_norm": 1.3203125, "learning_rate": 0.00016959242728765828, "loss": 0.6509, "step": 2405 }, { "epoch": 0.5801636976408281, "grad_norm": 1.4453125, "learning_rate": 0.00016954390134359237, "loss": 0.6082, "step": 2410 }, { "epoch": 0.5813673567645643, "grad_norm": 1.359375, "learning_rate": 0.000169495285689625, "loss": 0.6122, "step": 2415 }, { "epoch": 0.5825710158883004, "grad_norm": 1.4296875, "learning_rate": 0.00016944658040343142, "loss": 0.6628, "step": 2420 }, { "epoch": 0.5837746750120366, "grad_norm": 1.25, "learning_rate": 0.0001693977855628302, "loss": 0.6307, "step": 2425 }, { "epoch": 0.5849783341357727, "grad_norm": 2.234375, "learning_rate": 0.00016934890124578305, "loss": 0.6582, "step": 2430 }, { "epoch": 0.5861819932595089, "grad_norm": 1.484375, "learning_rate": 0.00016929992753039454, "loss": 0.6318, "step": 2435 }, { "epoch": 0.587385652383245, "grad_norm": 1.3046875, "learning_rate": 0.00016925086449491212, "loss": 0.626, "step": 2440 }, { "epoch": 0.5885893115069812, "grad_norm": 1.4609375, "learning_rate": 0.0001692017122177259, "loss": 0.6422, "step": 2445 }, { "epoch": 0.5897929706307173, "grad_norm": 1.3671875, "learning_rate": 0.00016915247077736872, "loss": 0.649, "step": 2450 }, { "epoch": 0.5909966297544536, "grad_norm": 1.3984375, "learning_rate": 0.00016910314025251567, "loss": 0.6335, "step": 2455 }, { "epoch": 0.5922002888781897, "grad_norm": 1.28125, "learning_rate": 0.00016905372072198433, "loss": 0.6265, "step": 2460 }, { "epoch": 0.5934039480019259, "grad_norm": 1.4921875, "learning_rate": 0.00016900421226473446, "loss": 0.6399, "step": 2465 }, { "epoch": 0.594607607125662, "grad_norm": 1.5234375, "learning_rate": 0.00016895461495986783, "loss": 0.6459, "step": 2470 }, { "epoch": 0.5958112662493982, "grad_norm": 1.421875, "learning_rate": 0.0001689049288866283, "loss": 0.6353, "step": 2475 }, { "epoch": 0.5970149253731343, "grad_norm": 1.421875, "learning_rate": 0.00016885515412440137, "loss": 0.661, "step": 2480 }, { "epoch": 0.5982185844968705, "grad_norm": 1.3671875, "learning_rate": 0.00016880529075271445, "loss": 0.7094, "step": 2485 }, { "epoch": 0.5994222436206067, "grad_norm": 1.3203125, "learning_rate": 0.00016875533885123636, "loss": 0.6169, "step": 2490 }, { "epoch": 0.6006259027443428, "grad_norm": 1.5859375, "learning_rate": 0.00016870529849977745, "loss": 0.6238, "step": 2495 }, { "epoch": 0.601829561868079, "grad_norm": 1.421875, "learning_rate": 0.0001686551697782894, "loss": 0.6355, "step": 2500 }, { "epoch": 0.601829561868079, "eval_loss": 0.5359562635421753, "eval_runtime": 2.4034, "eval_samples_per_second": 83.214, "eval_steps_per_second": 83.214, "step": 2500 }, { "epoch": 0.6030332209918151, "grad_norm": 1.4296875, "learning_rate": 0.00016860495276686504, "loss": 0.6152, "step": 2505 }, { "epoch": 0.6042368801155513, "grad_norm": 1.2578125, "learning_rate": 0.00016855464754573826, "loss": 0.6355, "step": 2510 }, { "epoch": 0.6054405392392874, "grad_norm": 1.3828125, "learning_rate": 0.00016850425419528395, "loss": 0.6177, "step": 2515 }, { "epoch": 0.6066441983630236, "grad_norm": 1.3046875, "learning_rate": 0.00016845377279601774, "loss": 0.6431, "step": 2520 }, { "epoch": 0.6078478574867597, "grad_norm": 1.421875, "learning_rate": 0.00016840320342859593, "loss": 0.641, "step": 2525 }, { "epoch": 0.609051516610496, "grad_norm": 1.421875, "learning_rate": 0.0001683525461738155, "loss": 0.6243, "step": 2530 }, { "epoch": 0.610255175734232, "grad_norm": 1.3984375, "learning_rate": 0.00016830180111261362, "loss": 0.6137, "step": 2535 }, { "epoch": 0.6114588348579683, "grad_norm": 1.3828125, "learning_rate": 0.00016825096832606806, "loss": 0.6148, "step": 2540 }, { "epoch": 0.6126624939817044, "grad_norm": 1.3203125, "learning_rate": 0.0001682000478953964, "loss": 0.6307, "step": 2545 }, { "epoch": 0.6138661531054406, "grad_norm": 1.296875, "learning_rate": 0.00016814903990195658, "loss": 0.6314, "step": 2550 }, { "epoch": 0.6150698122291767, "grad_norm": 1.3046875, "learning_rate": 0.00016809794442724623, "loss": 0.6148, "step": 2555 }, { "epoch": 0.6162734713529129, "grad_norm": 1.2578125, "learning_rate": 0.00016804676155290284, "loss": 0.6156, "step": 2560 }, { "epoch": 0.617477130476649, "grad_norm": 1.421875, "learning_rate": 0.0001679954913607035, "loss": 0.641, "step": 2565 }, { "epoch": 0.6186807896003852, "grad_norm": 1.3984375, "learning_rate": 0.00016794413393256486, "loss": 0.6264, "step": 2570 }, { "epoch": 0.6198844487241213, "grad_norm": 1.3515625, "learning_rate": 0.0001678926893505429, "loss": 0.5985, "step": 2575 }, { "epoch": 0.6210881078478575, "grad_norm": 1.4140625, "learning_rate": 0.0001678411576968329, "loss": 0.5958, "step": 2580 }, { "epoch": 0.6222917669715936, "grad_norm": 1.4140625, "learning_rate": 0.0001677895390537692, "loss": 0.658, "step": 2585 }, { "epoch": 0.6234954260953298, "grad_norm": 1.5234375, "learning_rate": 0.0001677378335038252, "loss": 0.6186, "step": 2590 }, { "epoch": 0.6246990852190659, "grad_norm": 1.4609375, "learning_rate": 0.0001676860411296131, "loss": 0.6286, "step": 2595 }, { "epoch": 0.6259027443428021, "grad_norm": 1.34375, "learning_rate": 0.0001676341620138838, "loss": 0.6219, "step": 2600 }, { "epoch": 0.6271064034665382, "grad_norm": 1.5, "learning_rate": 0.00016758219623952688, "loss": 0.6304, "step": 2605 }, { "epoch": 0.6283100625902744, "grad_norm": 1.4921875, "learning_rate": 0.00016753014388957028, "loss": 0.653, "step": 2610 }, { "epoch": 0.6295137217140105, "grad_norm": 1.4296875, "learning_rate": 0.00016747800504718035, "loss": 0.6361, "step": 2615 }, { "epoch": 0.6307173808377468, "grad_norm": 1.2890625, "learning_rate": 0.00016742577979566157, "loss": 0.6186, "step": 2620 }, { "epoch": 0.6319210399614829, "grad_norm": 1.3359375, "learning_rate": 0.0001673734682184565, "loss": 0.6639, "step": 2625 }, { "epoch": 0.6331246990852191, "grad_norm": 1.34375, "learning_rate": 0.0001673210703991457, "loss": 0.6343, "step": 2630 }, { "epoch": 0.6343283582089553, "grad_norm": 1.3515625, "learning_rate": 0.00016726858642144733, "loss": 0.6222, "step": 2635 }, { "epoch": 0.6355320173326914, "grad_norm": 1.4140625, "learning_rate": 0.00016721601636921743, "loss": 0.6644, "step": 2640 }, { "epoch": 0.6367356764564276, "grad_norm": 1.5234375, "learning_rate": 0.00016716336032644944, "loss": 0.6349, "step": 2645 }, { "epoch": 0.6379393355801637, "grad_norm": 1.40625, "learning_rate": 0.0001671106183772742, "loss": 0.6215, "step": 2650 }, { "epoch": 0.6391429947038999, "grad_norm": 1.4140625, "learning_rate": 0.00016705779060595985, "loss": 0.641, "step": 2655 }, { "epoch": 0.640346653827636, "grad_norm": 1.4375, "learning_rate": 0.0001670048770969116, "loss": 0.6223, "step": 2660 }, { "epoch": 0.6415503129513722, "grad_norm": 1.359375, "learning_rate": 0.00016695187793467167, "loss": 0.6244, "step": 2665 }, { "epoch": 0.6427539720751083, "grad_norm": 1.3046875, "learning_rate": 0.00016689879320391917, "loss": 0.6003, "step": 2670 }, { "epoch": 0.6439576311988445, "grad_norm": 1.1875, "learning_rate": 0.00016684562298946987, "loss": 0.6195, "step": 2675 }, { "epoch": 0.6451612903225806, "grad_norm": 1.5703125, "learning_rate": 0.00016679236737627613, "loss": 0.6311, "step": 2680 }, { "epoch": 0.6463649494463168, "grad_norm": 1.390625, "learning_rate": 0.0001667390264494268, "loss": 0.6297, "step": 2685 }, { "epoch": 0.6475686085700529, "grad_norm": 1.3125, "learning_rate": 0.00016668560029414699, "loss": 0.6197, "step": 2690 }, { "epoch": 0.6487722676937892, "grad_norm": 1.3984375, "learning_rate": 0.00016663208899579798, "loss": 0.6159, "step": 2695 }, { "epoch": 0.6499759268175253, "grad_norm": 1.328125, "learning_rate": 0.00016657849263987715, "loss": 0.6258, "step": 2700 }, { "epoch": 0.6511795859412615, "grad_norm": 1.25, "learning_rate": 0.00016652481131201768, "loss": 0.6746, "step": 2705 }, { "epoch": 0.6523832450649976, "grad_norm": 1.28125, "learning_rate": 0.00016647104509798867, "loss": 0.6039, "step": 2710 }, { "epoch": 0.6535869041887338, "grad_norm": 1.25, "learning_rate": 0.00016641719408369462, "loss": 0.6381, "step": 2715 }, { "epoch": 0.6547905633124699, "grad_norm": 1.3359375, "learning_rate": 0.0001663632583551757, "loss": 0.6218, "step": 2720 }, { "epoch": 0.6559942224362061, "grad_norm": 1.4296875, "learning_rate": 0.00016630923799860743, "loss": 0.6147, "step": 2725 }, { "epoch": 0.6571978815599422, "grad_norm": 1.4375, "learning_rate": 0.00016625513310030042, "loss": 0.6536, "step": 2730 }, { "epoch": 0.6584015406836784, "grad_norm": 1.3125, "learning_rate": 0.00016620094374670047, "loss": 0.6633, "step": 2735 }, { "epoch": 0.6596051998074145, "grad_norm": 1.2421875, "learning_rate": 0.00016614667002438823, "loss": 0.6164, "step": 2740 }, { "epoch": 0.6608088589311507, "grad_norm": 1.28125, "learning_rate": 0.00016609231202007924, "loss": 0.5894, "step": 2745 }, { "epoch": 0.6620125180548868, "grad_norm": 1.34375, "learning_rate": 0.00016603786982062363, "loss": 0.6336, "step": 2750 }, { "epoch": 0.663216177178623, "grad_norm": 1.34375, "learning_rate": 0.00016598334351300613, "loss": 0.6402, "step": 2755 }, { "epoch": 0.6644198363023591, "grad_norm": 1.3125, "learning_rate": 0.0001659287331843457, "loss": 0.6195, "step": 2760 }, { "epoch": 0.6656234954260953, "grad_norm": 1.34375, "learning_rate": 0.00016587403892189575, "loss": 0.6338, "step": 2765 }, { "epoch": 0.6668271545498314, "grad_norm": 1.3046875, "learning_rate": 0.00016581926081304365, "loss": 0.631, "step": 2770 }, { "epoch": 0.6680308136735676, "grad_norm": 1.453125, "learning_rate": 0.00016576439894531075, "loss": 0.6237, "step": 2775 }, { "epoch": 0.6692344727973039, "grad_norm": 1.34375, "learning_rate": 0.00016570945340635228, "loss": 0.6423, "step": 2780 }, { "epoch": 0.67043813192104, "grad_norm": 1.3359375, "learning_rate": 0.0001656544242839571, "loss": 0.6303, "step": 2785 }, { "epoch": 0.6716417910447762, "grad_norm": 1.3828125, "learning_rate": 0.00016559931166604768, "loss": 0.6014, "step": 2790 }, { "epoch": 0.6728454501685123, "grad_norm": 1.28125, "learning_rate": 0.00016554411564067984, "loss": 0.6299, "step": 2795 }, { "epoch": 0.6740491092922485, "grad_norm": 1.3515625, "learning_rate": 0.00016548883629604266, "loss": 0.6262, "step": 2800 }, { "epoch": 0.6752527684159846, "grad_norm": 1.2734375, "learning_rate": 0.00016543347372045838, "loss": 0.6097, "step": 2805 }, { "epoch": 0.6764564275397208, "grad_norm": 1.546875, "learning_rate": 0.0001653780280023822, "loss": 0.6265, "step": 2810 }, { "epoch": 0.6776600866634569, "grad_norm": 1.3671875, "learning_rate": 0.00016532249923040214, "loss": 0.6533, "step": 2815 }, { "epoch": 0.6788637457871931, "grad_norm": 1.4296875, "learning_rate": 0.00016526688749323901, "loss": 0.6321, "step": 2820 }, { "epoch": 0.6800674049109292, "grad_norm": 1.6328125, "learning_rate": 0.00016521119287974602, "loss": 0.6148, "step": 2825 }, { "epoch": 0.6812710640346654, "grad_norm": 1.296875, "learning_rate": 0.00016515541547890895, "loss": 0.6007, "step": 2830 }, { "epoch": 0.6824747231584015, "grad_norm": 1.2265625, "learning_rate": 0.00016509955537984575, "loss": 0.6219, "step": 2835 }, { "epoch": 0.6836783822821377, "grad_norm": 1.359375, "learning_rate": 0.00016504361267180657, "loss": 0.6367, "step": 2840 }, { "epoch": 0.6848820414058738, "grad_norm": 1.359375, "learning_rate": 0.00016498758744417353, "loss": 0.6188, "step": 2845 }, { "epoch": 0.68608570052961, "grad_norm": 1.3125, "learning_rate": 0.0001649314797864605, "loss": 0.6162, "step": 2850 }, { "epoch": 0.6872893596533461, "grad_norm": 1.8984375, "learning_rate": 0.00016487528978831323, "loss": 0.6024, "step": 2855 }, { "epoch": 0.6884930187770824, "grad_norm": 1.4140625, "learning_rate": 0.00016481901753950888, "loss": 0.6152, "step": 2860 }, { "epoch": 0.6896966779008185, "grad_norm": 1.3125, "learning_rate": 0.0001647626631299561, "loss": 0.6176, "step": 2865 }, { "epoch": 0.6909003370245547, "grad_norm": 1.375, "learning_rate": 0.00016470622664969473, "loss": 0.6142, "step": 2870 }, { "epoch": 0.6921039961482908, "grad_norm": 1.3671875, "learning_rate": 0.00016464970818889587, "loss": 0.6095, "step": 2875 }, { "epoch": 0.693307655272027, "grad_norm": 1.265625, "learning_rate": 0.00016459310783786154, "loss": 0.6172, "step": 2880 }, { "epoch": 0.6945113143957631, "grad_norm": 1.3203125, "learning_rate": 0.00016453642568702454, "loss": 0.6371, "step": 2885 }, { "epoch": 0.6957149735194993, "grad_norm": 1.3515625, "learning_rate": 0.00016447966182694842, "loss": 0.6234, "step": 2890 }, { "epoch": 0.6969186326432354, "grad_norm": 1.375, "learning_rate": 0.0001644228163483273, "loss": 0.6368, "step": 2895 }, { "epoch": 0.6981222917669716, "grad_norm": 1.4296875, "learning_rate": 0.0001643658893419857, "loss": 0.5879, "step": 2900 }, { "epoch": 0.6993259508907077, "grad_norm": 1.359375, "learning_rate": 0.00016430888089887834, "loss": 0.6275, "step": 2905 }, { "epoch": 0.7005296100144439, "grad_norm": 1.21875, "learning_rate": 0.00016425179111009009, "loss": 0.5837, "step": 2910 }, { "epoch": 0.70173326913818, "grad_norm": 1.421875, "learning_rate": 0.00016419462006683584, "loss": 0.6286, "step": 2915 }, { "epoch": 0.7029369282619162, "grad_norm": 1.3046875, "learning_rate": 0.00016413736786046024, "loss": 0.5522, "step": 2920 }, { "epoch": 0.7041405873856523, "grad_norm": 1.3828125, "learning_rate": 0.00016408003458243768, "loss": 0.6287, "step": 2925 }, { "epoch": 0.7053442465093885, "grad_norm": 1.328125, "learning_rate": 0.00016402262032437194, "loss": 0.5847, "step": 2930 }, { "epoch": 0.7065479056331248, "grad_norm": 1.4296875, "learning_rate": 0.00016396512517799638, "loss": 0.5989, "step": 2935 }, { "epoch": 0.7077515647568609, "grad_norm": 1.3125, "learning_rate": 0.00016390754923517348, "loss": 0.5815, "step": 2940 }, { "epoch": 0.7089552238805971, "grad_norm": 1.2578125, "learning_rate": 0.00016384989258789483, "loss": 0.6299, "step": 2945 }, { "epoch": 0.7101588830043332, "grad_norm": 1.3828125, "learning_rate": 0.00016379215532828098, "loss": 0.6205, "step": 2950 }, { "epoch": 0.7113625421280694, "grad_norm": 1.3203125, "learning_rate": 0.00016373433754858128, "loss": 0.6041, "step": 2955 }, { "epoch": 0.7125662012518055, "grad_norm": 1.265625, "learning_rate": 0.00016367643934117368, "loss": 0.5704, "step": 2960 }, { "epoch": 0.7137698603755417, "grad_norm": 1.2890625, "learning_rate": 0.00016361846079856467, "loss": 0.593, "step": 2965 }, { "epoch": 0.7149735194992778, "grad_norm": 1.3359375, "learning_rate": 0.00016356040201338917, "loss": 0.6131, "step": 2970 }, { "epoch": 0.716177178623014, "grad_norm": 1.4296875, "learning_rate": 0.00016350226307841015, "loss": 0.5934, "step": 2975 }, { "epoch": 0.7173808377467501, "grad_norm": 1.21875, "learning_rate": 0.0001634440440865188, "loss": 0.6044, "step": 2980 }, { "epoch": 0.7185844968704863, "grad_norm": 1.2421875, "learning_rate": 0.0001633857451307341, "loss": 0.6189, "step": 2985 }, { "epoch": 0.7197881559942224, "grad_norm": 1.2734375, "learning_rate": 0.00016332736630420282, "loss": 0.6159, "step": 2990 }, { "epoch": 0.7209918151179586, "grad_norm": 1.359375, "learning_rate": 0.00016326890770019942, "loss": 0.5984, "step": 2995 }, { "epoch": 0.7221954742416947, "grad_norm": 1.375, "learning_rate": 0.0001632103694121257, "loss": 0.5956, "step": 3000 }, { "epoch": 0.7221954742416947, "eval_loss": 0.5172761082649231, "eval_runtime": 2.4017, "eval_samples_per_second": 83.276, "eval_steps_per_second": 83.276, "step": 3000 }, { "epoch": 0.7233991333654309, "grad_norm": 1.34375, "learning_rate": 0.0001631517515335109, "loss": 0.6393, "step": 3005 }, { "epoch": 0.724602792489167, "grad_norm": 1.390625, "learning_rate": 0.00016309305415801128, "loss": 0.6718, "step": 3010 }, { "epoch": 0.7258064516129032, "grad_norm": 1.234375, "learning_rate": 0.0001630342773794103, "loss": 0.6037, "step": 3015 }, { "epoch": 0.7270101107366393, "grad_norm": 1.2578125, "learning_rate": 0.0001629754212916181, "loss": 0.5953, "step": 3020 }, { "epoch": 0.7282137698603756, "grad_norm": 1.34375, "learning_rate": 0.00016291648598867163, "loss": 0.5897, "step": 3025 }, { "epoch": 0.7294174289841117, "grad_norm": 1.3515625, "learning_rate": 0.00016285747156473445, "loss": 0.6198, "step": 3030 }, { "epoch": 0.7306210881078479, "grad_norm": 1.2265625, "learning_rate": 0.00016279837811409645, "loss": 0.5962, "step": 3035 }, { "epoch": 0.731824747231584, "grad_norm": 1.3125, "learning_rate": 0.0001627392057311738, "loss": 0.5809, "step": 3040 }, { "epoch": 0.7330284063553202, "grad_norm": 1.296875, "learning_rate": 0.00016267995451050885, "loss": 0.5829, "step": 3045 }, { "epoch": 0.7342320654790563, "grad_norm": 1.2578125, "learning_rate": 0.00016262062454676978, "loss": 0.632, "step": 3050 }, { "epoch": 0.7354357246027925, "grad_norm": 1.4921875, "learning_rate": 0.00016256121593475072, "loss": 0.5775, "step": 3055 }, { "epoch": 0.7366393837265286, "grad_norm": 1.3984375, "learning_rate": 0.0001625017287693714, "loss": 0.623, "step": 3060 }, { "epoch": 0.7378430428502648, "grad_norm": 1.375, "learning_rate": 0.00016244216314567704, "loss": 0.611, "step": 3065 }, { "epoch": 0.7390467019740009, "grad_norm": 1.2890625, "learning_rate": 0.00016238251915883827, "loss": 0.6363, "step": 3070 }, { "epoch": 0.7402503610977371, "grad_norm": 1.296875, "learning_rate": 0.00016232279690415086, "loss": 0.633, "step": 3075 }, { "epoch": 0.7414540202214733, "grad_norm": 1.265625, "learning_rate": 0.00016226299647703564, "loss": 0.612, "step": 3080 }, { "epoch": 0.7426576793452094, "grad_norm": 1.3046875, "learning_rate": 0.0001622031179730384, "loss": 0.5371, "step": 3085 }, { "epoch": 0.7438613384689456, "grad_norm": 1.3046875, "learning_rate": 0.00016214316148782968, "loss": 0.6312, "step": 3090 }, { "epoch": 0.7450649975926817, "grad_norm": 1.21875, "learning_rate": 0.0001620831271172045, "loss": 0.595, "step": 3095 }, { "epoch": 0.746268656716418, "grad_norm": 1.2890625, "learning_rate": 0.00016202301495708244, "loss": 0.615, "step": 3100 }, { "epoch": 0.7474723158401541, "grad_norm": 1.484375, "learning_rate": 0.00016196282510350725, "loss": 0.6047, "step": 3105 }, { "epoch": 0.7486759749638903, "grad_norm": 1.3359375, "learning_rate": 0.000161902557652647, "loss": 0.5688, "step": 3110 }, { "epoch": 0.7498796340876264, "grad_norm": 1.1328125, "learning_rate": 0.00016184221270079352, "loss": 0.6099, "step": 3115 }, { "epoch": 0.7510832932113626, "grad_norm": 1.2734375, "learning_rate": 0.00016178179034436263, "loss": 0.5904, "step": 3120 }, { "epoch": 0.7522869523350987, "grad_norm": 1.234375, "learning_rate": 0.0001617212906798938, "loss": 0.5969, "step": 3125 }, { "epoch": 0.7534906114588349, "grad_norm": 1.375, "learning_rate": 0.00016166071380404988, "loss": 0.6169, "step": 3130 }, { "epoch": 0.754694270582571, "grad_norm": 1.3828125, "learning_rate": 0.00016160005981361727, "loss": 0.5742, "step": 3135 }, { "epoch": 0.7558979297063072, "grad_norm": 1.1953125, "learning_rate": 0.0001615393288055055, "loss": 0.6365, "step": 3140 }, { "epoch": 0.7571015888300433, "grad_norm": 1.3359375, "learning_rate": 0.0001614785208767471, "loss": 0.5881, "step": 3145 }, { "epoch": 0.7583052479537795, "grad_norm": 1.2890625, "learning_rate": 0.00016141763612449758, "loss": 0.6109, "step": 3150 }, { "epoch": 0.7595089070775156, "grad_norm": 1.2421875, "learning_rate": 0.0001613566746460352, "loss": 0.5787, "step": 3155 }, { "epoch": 0.7607125662012518, "grad_norm": 1.234375, "learning_rate": 0.00016129563653876069, "loss": 0.6149, "step": 3160 }, { "epoch": 0.7619162253249879, "grad_norm": 1.3671875, "learning_rate": 0.0001612345219001974, "loss": 0.6038, "step": 3165 }, { "epoch": 0.7631198844487241, "grad_norm": 1.28125, "learning_rate": 0.0001611733308279908, "loss": 0.5831, "step": 3170 }, { "epoch": 0.7643235435724602, "grad_norm": 1.3984375, "learning_rate": 0.00016111206341990855, "loss": 0.6032, "step": 3175 }, { "epoch": 0.7655272026961965, "grad_norm": 1.28125, "learning_rate": 0.00016105071977384023, "loss": 0.5921, "step": 3180 }, { "epoch": 0.7667308618199326, "grad_norm": 1.3515625, "learning_rate": 0.0001609892999877973, "loss": 0.6032, "step": 3185 }, { "epoch": 0.7679345209436688, "grad_norm": 1.234375, "learning_rate": 0.0001609278041599128, "loss": 0.6096, "step": 3190 }, { "epoch": 0.7691381800674049, "grad_norm": 1.28125, "learning_rate": 0.00016086623238844132, "loss": 0.6154, "step": 3195 }, { "epoch": 0.7703418391911411, "grad_norm": 1.3515625, "learning_rate": 0.00016080458477175878, "loss": 0.6341, "step": 3200 }, { "epoch": 0.7715454983148772, "grad_norm": 1.2890625, "learning_rate": 0.00016074286140836224, "loss": 0.5924, "step": 3205 }, { "epoch": 0.7727491574386134, "grad_norm": 1.265625, "learning_rate": 0.00016068106239686985, "loss": 0.619, "step": 3210 }, { "epoch": 0.7739528165623495, "grad_norm": 1.203125, "learning_rate": 0.00016061918783602056, "loss": 0.6052, "step": 3215 }, { "epoch": 0.7751564756860857, "grad_norm": 1.3125, "learning_rate": 0.00016055723782467402, "loss": 0.5785, "step": 3220 }, { "epoch": 0.7763601348098219, "grad_norm": 1.21875, "learning_rate": 0.00016049521246181054, "loss": 0.6208, "step": 3225 }, { "epoch": 0.777563793933558, "grad_norm": 1.21875, "learning_rate": 0.0001604331118465307, "loss": 0.6546, "step": 3230 }, { "epoch": 0.7787674530572942, "grad_norm": 1.359375, "learning_rate": 0.00016037093607805536, "loss": 0.6166, "step": 3235 }, { "epoch": 0.7799711121810303, "grad_norm": 1.328125, "learning_rate": 0.00016030868525572555, "loss": 0.5876, "step": 3240 }, { "epoch": 0.7811747713047665, "grad_norm": 1.4375, "learning_rate": 0.000160246359479002, "loss": 0.5557, "step": 3245 }, { "epoch": 0.7823784304285026, "grad_norm": 1.15625, "learning_rate": 0.00016018395884746534, "loss": 0.6041, "step": 3250 }, { "epoch": 0.7835820895522388, "grad_norm": 1.546875, "learning_rate": 0.0001601214834608159, "loss": 0.5732, "step": 3255 }, { "epoch": 0.784785748675975, "grad_norm": 1.2890625, "learning_rate": 0.00016005893341887318, "loss": 0.6156, "step": 3260 }, { "epoch": 0.7859894077997112, "grad_norm": 1.1796875, "learning_rate": 0.00015999630882157623, "loss": 0.5772, "step": 3265 }, { "epoch": 0.7871930669234473, "grad_norm": 1.28125, "learning_rate": 0.00015993360976898304, "loss": 0.5651, "step": 3270 }, { "epoch": 0.7883967260471835, "grad_norm": 1.4453125, "learning_rate": 0.00015987083636127066, "loss": 0.6199, "step": 3275 }, { "epoch": 0.7896003851709196, "grad_norm": 1.171875, "learning_rate": 0.00015980798869873487, "loss": 0.5641, "step": 3280 }, { "epoch": 0.7908040442946558, "grad_norm": 1.2890625, "learning_rate": 0.00015974506688179015, "loss": 0.6227, "step": 3285 }, { "epoch": 0.7920077034183919, "grad_norm": 1.265625, "learning_rate": 0.0001596820710109694, "loss": 0.5881, "step": 3290 }, { "epoch": 0.7932113625421281, "grad_norm": 1.3515625, "learning_rate": 0.00015961900118692388, "loss": 0.5989, "step": 3295 }, { "epoch": 0.7944150216658642, "grad_norm": 1.3984375, "learning_rate": 0.00015955585751042305, "loss": 0.5997, "step": 3300 }, { "epoch": 0.7956186807896004, "grad_norm": 1.2890625, "learning_rate": 0.00015949264008235426, "loss": 0.598, "step": 3305 }, { "epoch": 0.7968223399133365, "grad_norm": 1.4453125, "learning_rate": 0.0001594293490037228, "loss": 0.5824, "step": 3310 }, { "epoch": 0.7980259990370727, "grad_norm": 1.3828125, "learning_rate": 0.00015936598437565154, "loss": 0.6047, "step": 3315 }, { "epoch": 0.7992296581608088, "grad_norm": 1.21875, "learning_rate": 0.00015930254629938095, "loss": 0.6081, "step": 3320 }, { "epoch": 0.800433317284545, "grad_norm": 1.25, "learning_rate": 0.0001592390348762688, "loss": 0.6084, "step": 3325 }, { "epoch": 0.8016369764082811, "grad_norm": 1.2421875, "learning_rate": 0.00015917545020779005, "loss": 0.5911, "step": 3330 }, { "epoch": 0.8028406355320173, "grad_norm": 1.2890625, "learning_rate": 0.0001591117923955367, "loss": 0.5856, "step": 3335 }, { "epoch": 0.8040442946557534, "grad_norm": 1.421875, "learning_rate": 0.00015904806154121766, "loss": 0.6114, "step": 3340 }, { "epoch": 0.8052479537794897, "grad_norm": 1.28125, "learning_rate": 0.00015898425774665844, "loss": 0.5982, "step": 3345 }, { "epoch": 0.8064516129032258, "grad_norm": 1.234375, "learning_rate": 0.00015892038111380122, "loss": 0.5542, "step": 3350 }, { "epoch": 0.807655272026962, "grad_norm": 1.375, "learning_rate": 0.00015885643174470438, "loss": 0.5927, "step": 3355 }, { "epoch": 0.8088589311506981, "grad_norm": 1.2265625, "learning_rate": 0.0001587924097415427, "loss": 0.5313, "step": 3360 }, { "epoch": 0.8100625902744343, "grad_norm": 1.3984375, "learning_rate": 0.0001587283152066069, "loss": 0.575, "step": 3365 }, { "epoch": 0.8112662493981705, "grad_norm": 1.3359375, "learning_rate": 0.00015866414824230362, "loss": 0.5912, "step": 3370 }, { "epoch": 0.8124699085219066, "grad_norm": 1.1875, "learning_rate": 0.00015859990895115525, "loss": 0.5957, "step": 3375 }, { "epoch": 0.8136735676456428, "grad_norm": 1.3828125, "learning_rate": 0.00015853559743579966, "loss": 0.5992, "step": 3380 }, { "epoch": 0.8148772267693789, "grad_norm": 1.2109375, "learning_rate": 0.00015847121379899018, "loss": 0.55, "step": 3385 }, { "epoch": 0.8160808858931151, "grad_norm": 1.453125, "learning_rate": 0.00015840675814359533, "loss": 0.6401, "step": 3390 }, { "epoch": 0.8172845450168512, "grad_norm": 1.3984375, "learning_rate": 0.0001583422305725988, "loss": 0.5889, "step": 3395 }, { "epoch": 0.8184882041405874, "grad_norm": 1.2578125, "learning_rate": 0.00015827763118909905, "loss": 0.5919, "step": 3400 }, { "epoch": 0.8196918632643235, "grad_norm": 1.4375, "learning_rate": 0.00015821296009630936, "loss": 0.633, "step": 3405 }, { "epoch": 0.8208955223880597, "grad_norm": 1.234375, "learning_rate": 0.00015814821739755752, "loss": 0.5937, "step": 3410 }, { "epoch": 0.8220991815117958, "grad_norm": 1.375, "learning_rate": 0.00015808340319628582, "loss": 0.5949, "step": 3415 }, { "epoch": 0.823302840635532, "grad_norm": 1.390625, "learning_rate": 0.00015801851759605069, "loss": 0.5666, "step": 3420 }, { "epoch": 0.8245064997592682, "grad_norm": 1.2421875, "learning_rate": 0.0001579535607005227, "loss": 0.6208, "step": 3425 }, { "epoch": 0.8257101588830044, "grad_norm": 1.3046875, "learning_rate": 0.00015788853261348635, "loss": 0.6313, "step": 3430 }, { "epoch": 0.8269138180067405, "grad_norm": 1.28125, "learning_rate": 0.00015782343343883978, "loss": 0.5513, "step": 3435 }, { "epoch": 0.8281174771304767, "grad_norm": 1.296875, "learning_rate": 0.00015775826328059484, "loss": 0.5594, "step": 3440 }, { "epoch": 0.8293211362542128, "grad_norm": 1.21875, "learning_rate": 0.0001576930222428767, "loss": 0.57, "step": 3445 }, { "epoch": 0.830524795377949, "grad_norm": 1.2890625, "learning_rate": 0.0001576277104299239, "loss": 0.5756, "step": 3450 }, { "epoch": 0.8317284545016851, "grad_norm": 1.3359375, "learning_rate": 0.00015756232794608788, "loss": 0.6003, "step": 3455 }, { "epoch": 0.8329321136254213, "grad_norm": 1.4921875, "learning_rate": 0.00015749687489583313, "loss": 0.6116, "step": 3460 }, { "epoch": 0.8341357727491574, "grad_norm": 1.2265625, "learning_rate": 0.0001574313513837368, "loss": 0.5729, "step": 3465 }, { "epoch": 0.8353394318728936, "grad_norm": 1.2421875, "learning_rate": 0.00015736575751448877, "loss": 0.5828, "step": 3470 }, { "epoch": 0.8365430909966297, "grad_norm": 1.359375, "learning_rate": 0.0001573000933928911, "loss": 0.5651, "step": 3475 }, { "epoch": 0.8377467501203659, "grad_norm": 1.265625, "learning_rate": 0.0001572343591238583, "loss": 0.5863, "step": 3480 }, { "epoch": 0.838950409244102, "grad_norm": 1.265625, "learning_rate": 0.00015716855481241686, "loss": 0.6015, "step": 3485 }, { "epoch": 0.8401540683678382, "grad_norm": 1.2578125, "learning_rate": 0.0001571026805637052, "loss": 0.5433, "step": 3490 }, { "epoch": 0.8413577274915743, "grad_norm": 1.453125, "learning_rate": 0.00015703673648297345, "loss": 0.613, "step": 3495 }, { "epoch": 0.8425613866153105, "grad_norm": 1.234375, "learning_rate": 0.00015697072267558345, "loss": 0.6195, "step": 3500 }, { "epoch": 0.8425613866153105, "eval_loss": 0.49514415860176086, "eval_runtime": 2.4002, "eval_samples_per_second": 83.326, "eval_steps_per_second": 83.326, "step": 3500 }, { "epoch": 0.8437650457390466, "grad_norm": 1.2421875, "learning_rate": 0.00015690463924700822, "loss": 0.5976, "step": 3505 }, { "epoch": 0.8449687048627829, "grad_norm": 1.28125, "learning_rate": 0.00015683848630283222, "loss": 0.5928, "step": 3510 }, { "epoch": 0.8461723639865191, "grad_norm": 1.359375, "learning_rate": 0.00015677226394875086, "loss": 0.5912, "step": 3515 }, { "epoch": 0.8473760231102552, "grad_norm": 1.296875, "learning_rate": 0.0001567059722905705, "loss": 0.5744, "step": 3520 }, { "epoch": 0.8485796822339914, "grad_norm": 1.2890625, "learning_rate": 0.00015663961143420817, "loss": 0.6012, "step": 3525 }, { "epoch": 0.8497833413577275, "grad_norm": 1.3359375, "learning_rate": 0.0001565731814856916, "loss": 0.5899, "step": 3530 }, { "epoch": 0.8509870004814637, "grad_norm": 1.4140625, "learning_rate": 0.0001565066825511587, "loss": 0.6465, "step": 3535 }, { "epoch": 0.8521906596051998, "grad_norm": 1.1796875, "learning_rate": 0.00015644011473685786, "loss": 0.5968, "step": 3540 }, { "epoch": 0.853394318728936, "grad_norm": 1.3671875, "learning_rate": 0.00015637347814914724, "loss": 0.588, "step": 3545 }, { "epoch": 0.8545979778526721, "grad_norm": 1.265625, "learning_rate": 0.00015630677289449514, "loss": 0.565, "step": 3550 }, { "epoch": 0.8558016369764083, "grad_norm": 1.296875, "learning_rate": 0.00015623999907947935, "loss": 0.6125, "step": 3555 }, { "epoch": 0.8570052961001444, "grad_norm": 1.2265625, "learning_rate": 0.0001561731568107874, "loss": 0.5903, "step": 3560 }, { "epoch": 0.8582089552238806, "grad_norm": 1.140625, "learning_rate": 0.0001561062461952161, "loss": 0.6142, "step": 3565 }, { "epoch": 0.8594126143476167, "grad_norm": 1.3984375, "learning_rate": 0.00015603926733967138, "loss": 0.5918, "step": 3570 }, { "epoch": 0.8606162734713529, "grad_norm": 1.21875, "learning_rate": 0.00015597222035116835, "loss": 0.5974, "step": 3575 }, { "epoch": 0.861819932595089, "grad_norm": 1.3515625, "learning_rate": 0.00015590510533683088, "loss": 0.5726, "step": 3580 }, { "epoch": 0.8630235917188253, "grad_norm": 1.2421875, "learning_rate": 0.00015583792240389158, "loss": 0.5648, "step": 3585 }, { "epoch": 0.8642272508425614, "grad_norm": 1.2890625, "learning_rate": 0.00015577067165969152, "loss": 0.5807, "step": 3590 }, { "epoch": 0.8654309099662976, "grad_norm": 1.2578125, "learning_rate": 0.00015570335321168018, "loss": 0.6013, "step": 3595 }, { "epoch": 0.8666345690900337, "grad_norm": 1.3515625, "learning_rate": 0.00015563596716741516, "loss": 0.5767, "step": 3600 }, { "epoch": 0.8678382282137699, "grad_norm": 1.3125, "learning_rate": 0.00015556851363456213, "loss": 0.5825, "step": 3605 }, { "epoch": 0.869041887337506, "grad_norm": 1.328125, "learning_rate": 0.0001555009927208945, "loss": 0.5685, "step": 3610 }, { "epoch": 0.8702455464612422, "grad_norm": 1.21875, "learning_rate": 0.00015543340453429337, "loss": 0.6157, "step": 3615 }, { "epoch": 0.8714492055849783, "grad_norm": 1.3203125, "learning_rate": 0.00015536574918274736, "loss": 0.5748, "step": 3620 }, { "epoch": 0.8726528647087145, "grad_norm": 1.296875, "learning_rate": 0.0001552980267743524, "loss": 0.5961, "step": 3625 }, { "epoch": 0.8738565238324506, "grad_norm": 1.2578125, "learning_rate": 0.0001552302374173115, "loss": 0.6097, "step": 3630 }, { "epoch": 0.8750601829561868, "grad_norm": 1.1953125, "learning_rate": 0.00015516238121993466, "loss": 0.5665, "step": 3635 }, { "epoch": 0.8762638420799229, "grad_norm": 1.171875, "learning_rate": 0.00015509445829063877, "loss": 0.619, "step": 3640 }, { "epoch": 0.8774675012036591, "grad_norm": 1.265625, "learning_rate": 0.00015502646873794717, "loss": 0.5902, "step": 3645 }, { "epoch": 0.8786711603273952, "grad_norm": 1.3046875, "learning_rate": 0.00015495841267048978, "loss": 0.5833, "step": 3650 }, { "epoch": 0.8798748194511314, "grad_norm": 1.15625, "learning_rate": 0.00015489029019700274, "loss": 0.5734, "step": 3655 }, { "epoch": 0.8810784785748677, "grad_norm": 1.203125, "learning_rate": 0.00015482210142632832, "loss": 0.5526, "step": 3660 }, { "epoch": 0.8822821376986038, "grad_norm": 1.2578125, "learning_rate": 0.00015475384646741465, "loss": 0.5419, "step": 3665 }, { "epoch": 0.88348579682234, "grad_norm": 1.2578125, "learning_rate": 0.0001546855254293157, "loss": 0.6102, "step": 3670 }, { "epoch": 0.8846894559460761, "grad_norm": 1.203125, "learning_rate": 0.00015461713842119092, "loss": 0.5931, "step": 3675 }, { "epoch": 0.8858931150698123, "grad_norm": 1.3125, "learning_rate": 0.00015454868555230526, "loss": 0.5969, "step": 3680 }, { "epoch": 0.8870967741935484, "grad_norm": 1.296875, "learning_rate": 0.00015448016693202888, "loss": 0.555, "step": 3685 }, { "epoch": 0.8883004333172846, "grad_norm": 1.34375, "learning_rate": 0.0001544115826698369, "loss": 0.5798, "step": 3690 }, { "epoch": 0.8895040924410207, "grad_norm": 1.3984375, "learning_rate": 0.00015434293287530947, "loss": 0.5854, "step": 3695 }, { "epoch": 0.8907077515647569, "grad_norm": 1.359375, "learning_rate": 0.00015427421765813133, "loss": 0.6015, "step": 3700 }, { "epoch": 0.891911410688493, "grad_norm": 1.125, "learning_rate": 0.00015420543712809176, "loss": 0.5548, "step": 3705 }, { "epoch": 0.8931150698122292, "grad_norm": 1.1953125, "learning_rate": 0.00015413659139508444, "loss": 0.5799, "step": 3710 }, { "epoch": 0.8943187289359653, "grad_norm": 1.2890625, "learning_rate": 0.00015406768056910717, "loss": 0.5821, "step": 3715 }, { "epoch": 0.8955223880597015, "grad_norm": 1.1015625, "learning_rate": 0.0001539987047602618, "loss": 0.5711, "step": 3720 }, { "epoch": 0.8967260471834376, "grad_norm": 1.1875, "learning_rate": 0.00015392966407875403, "loss": 0.5839, "step": 3725 }, { "epoch": 0.8979297063071738, "grad_norm": 1.1484375, "learning_rate": 0.0001538605586348931, "loss": 0.5959, "step": 3730 }, { "epoch": 0.8991333654309099, "grad_norm": 1.234375, "learning_rate": 0.0001537913885390919, "loss": 0.545, "step": 3735 }, { "epoch": 0.9003370245546461, "grad_norm": 1.1953125, "learning_rate": 0.0001537221539018664, "loss": 0.5711, "step": 3740 }, { "epoch": 0.9015406836783822, "grad_norm": 1.296875, "learning_rate": 0.00015365285483383582, "loss": 0.5497, "step": 3745 }, { "epoch": 0.9027443428021185, "grad_norm": 1.3359375, "learning_rate": 0.00015358349144572236, "loss": 0.5744, "step": 3750 }, { "epoch": 0.9039480019258546, "grad_norm": 1.1484375, "learning_rate": 0.0001535140638483509, "loss": 0.5886, "step": 3755 }, { "epoch": 0.9051516610495908, "grad_norm": 1.2578125, "learning_rate": 0.00015344457215264894, "loss": 0.5872, "step": 3760 }, { "epoch": 0.9063553201733269, "grad_norm": 1.2890625, "learning_rate": 0.0001533750164696464, "loss": 0.6167, "step": 3765 }, { "epoch": 0.9075589792970631, "grad_norm": 1.234375, "learning_rate": 0.00015330539691047544, "loss": 0.5705, "step": 3770 }, { "epoch": 0.9087626384207992, "grad_norm": 1.1875, "learning_rate": 0.00015323571358637024, "loss": 0.5854, "step": 3775 }, { "epoch": 0.9099662975445354, "grad_norm": 1.28125, "learning_rate": 0.00015316596660866694, "loss": 0.5895, "step": 3780 }, { "epoch": 0.9111699566682715, "grad_norm": 1.2265625, "learning_rate": 0.00015309615608880329, "loss": 0.5797, "step": 3785 }, { "epoch": 0.9123736157920077, "grad_norm": 1.203125, "learning_rate": 0.0001530262821383186, "loss": 0.5766, "step": 3790 }, { "epoch": 0.9135772749157438, "grad_norm": 1.3125, "learning_rate": 0.00015295634486885357, "loss": 0.5612, "step": 3795 }, { "epoch": 0.91478093403948, "grad_norm": 1.296875, "learning_rate": 0.00015288634439215, "loss": 0.5973, "step": 3800 }, { "epoch": 0.9159845931632162, "grad_norm": 1.25, "learning_rate": 0.00015281628082005068, "loss": 0.5812, "step": 3805 }, { "epoch": 0.9171882522869523, "grad_norm": 1.2578125, "learning_rate": 0.00015274615426449928, "loss": 0.6128, "step": 3810 }, { "epoch": 0.9183919114106885, "grad_norm": 1.234375, "learning_rate": 0.00015267596483754006, "loss": 0.5636, "step": 3815 }, { "epoch": 0.9195955705344246, "grad_norm": 1.2734375, "learning_rate": 0.0001526057126513177, "loss": 0.6068, "step": 3820 }, { "epoch": 0.9207992296581609, "grad_norm": 1.359375, "learning_rate": 0.00015253539781807723, "loss": 0.5671, "step": 3825 }, { "epoch": 0.922002888781897, "grad_norm": 1.3203125, "learning_rate": 0.00015246502045016368, "loss": 0.5896, "step": 3830 }, { "epoch": 0.9232065479056332, "grad_norm": 1.3125, "learning_rate": 0.00015239458066002205, "loss": 0.5953, "step": 3835 }, { "epoch": 0.9244102070293693, "grad_norm": 1.25, "learning_rate": 0.0001523240785601971, "loss": 0.5414, "step": 3840 }, { "epoch": 0.9256138661531055, "grad_norm": 1.3046875, "learning_rate": 0.00015225351426333307, "loss": 0.5688, "step": 3845 }, { "epoch": 0.9268175252768416, "grad_norm": 1.28125, "learning_rate": 0.00015218288788217365, "loss": 0.5837, "step": 3850 }, { "epoch": 0.9280211844005778, "grad_norm": 1.390625, "learning_rate": 0.0001521121995295617, "loss": 0.6235, "step": 3855 }, { "epoch": 0.9292248435243139, "grad_norm": 1.28125, "learning_rate": 0.00015204144931843907, "loss": 0.5753, "step": 3860 }, { "epoch": 0.9304285026480501, "grad_norm": 1.2734375, "learning_rate": 0.00015197063736184638, "loss": 0.5954, "step": 3865 }, { "epoch": 0.9316321617717862, "grad_norm": 1.359375, "learning_rate": 0.00015189976377292313, "loss": 0.6025, "step": 3870 }, { "epoch": 0.9328358208955224, "grad_norm": 1.1953125, "learning_rate": 0.00015182882866490705, "loss": 0.568, "step": 3875 }, { "epoch": 0.9340394800192585, "grad_norm": 1.21875, "learning_rate": 0.0001517578321511343, "loss": 0.5858, "step": 3880 }, { "epoch": 0.9352431391429947, "grad_norm": 1.2265625, "learning_rate": 0.0001516867743450391, "loss": 0.5447, "step": 3885 }, { "epoch": 0.9364467982667308, "grad_norm": 1.3046875, "learning_rate": 0.0001516156553601536, "loss": 0.5706, "step": 3890 }, { "epoch": 0.937650457390467, "grad_norm": 1.296875, "learning_rate": 0.0001515444753101077, "loss": 0.5655, "step": 3895 }, { "epoch": 0.9388541165142031, "grad_norm": 1.296875, "learning_rate": 0.00015147323430862885, "loss": 0.5894, "step": 3900 }, { "epoch": 0.9400577756379394, "grad_norm": 1.2109375, "learning_rate": 0.000151401932469542, "loss": 0.5615, "step": 3905 }, { "epoch": 0.9412614347616755, "grad_norm": 1.203125, "learning_rate": 0.0001513305699067691, "loss": 0.5968, "step": 3910 }, { "epoch": 0.9424650938854117, "grad_norm": 1.2265625, "learning_rate": 0.0001512591467343293, "loss": 0.5947, "step": 3915 }, { "epoch": 0.9436687530091478, "grad_norm": 1.171875, "learning_rate": 0.00015118766306633852, "loss": 0.5947, "step": 3920 }, { "epoch": 0.944872412132884, "grad_norm": 1.1796875, "learning_rate": 0.0001511161190170093, "loss": 0.5479, "step": 3925 }, { "epoch": 0.9460760712566201, "grad_norm": 1.2734375, "learning_rate": 0.00015104451470065072, "loss": 0.5633, "step": 3930 }, { "epoch": 0.9472797303803563, "grad_norm": 1.296875, "learning_rate": 0.00015097285023166806, "loss": 0.5664, "step": 3935 }, { "epoch": 0.9484833895040924, "grad_norm": 1.1875, "learning_rate": 0.00015090112572456287, "loss": 0.5505, "step": 3940 }, { "epoch": 0.9496870486278286, "grad_norm": 1.3203125, "learning_rate": 0.00015082934129393246, "loss": 0.5501, "step": 3945 }, { "epoch": 0.9508907077515648, "grad_norm": 1.25, "learning_rate": 0.00015075749705447, "loss": 0.5473, "step": 3950 }, { "epoch": 0.9520943668753009, "grad_norm": 1.21875, "learning_rate": 0.0001506855931209641, "loss": 0.5399, "step": 3955 }, { "epoch": 0.9532980259990371, "grad_norm": 1.2578125, "learning_rate": 0.0001506136296082988, "loss": 0.538, "step": 3960 }, { "epoch": 0.9545016851227732, "grad_norm": 1.2890625, "learning_rate": 0.00015054160663145348, "loss": 0.5856, "step": 3965 }, { "epoch": 0.9557053442465094, "grad_norm": 1.296875, "learning_rate": 0.00015046952430550222, "loss": 0.5595, "step": 3970 }, { "epoch": 0.9569090033702455, "grad_norm": 1.2578125, "learning_rate": 0.00015039738274561423, "loss": 0.5672, "step": 3975 }, { "epoch": 0.9581126624939817, "grad_norm": 1.3515625, "learning_rate": 0.00015032518206705317, "loss": 0.5799, "step": 3980 }, { "epoch": 0.9593163216177178, "grad_norm": 1.46875, "learning_rate": 0.00015025292238517721, "loss": 0.5946, "step": 3985 }, { "epoch": 0.9605199807414541, "grad_norm": 1.25, "learning_rate": 0.00015018060381543884, "loss": 0.5416, "step": 3990 }, { "epoch": 0.9617236398651902, "grad_norm": 1.2890625, "learning_rate": 0.00015010822647338454, "loss": 0.5897, "step": 3995 }, { "epoch": 0.9629272989889264, "grad_norm": 1.296875, "learning_rate": 0.0001500357904746548, "loss": 0.5483, "step": 4000 }, { "epoch": 0.9629272989889264, "eval_loss": 0.4734587073326111, "eval_runtime": 2.3912, "eval_samples_per_second": 83.641, "eval_steps_per_second": 83.641, "step": 4000 }, { "epoch": 0.9641309581126625, "grad_norm": 1.234375, "learning_rate": 0.00014996329593498374, "loss": 0.6136, "step": 4005 }, { "epoch": 0.9653346172363987, "grad_norm": 1.265625, "learning_rate": 0.00014989074297019905, "loss": 0.57, "step": 4010 }, { "epoch": 0.9665382763601348, "grad_norm": 1.3203125, "learning_rate": 0.0001498181316962218, "loss": 0.5694, "step": 4015 }, { "epoch": 0.967741935483871, "grad_norm": 1.3359375, "learning_rate": 0.00014974546222906615, "loss": 0.5752, "step": 4020 }, { "epoch": 0.9689455946076071, "grad_norm": 1.28125, "learning_rate": 0.00014967273468483936, "loss": 0.5556, "step": 4025 }, { "epoch": 0.9701492537313433, "grad_norm": 1.234375, "learning_rate": 0.00014959994917974133, "loss": 0.5719, "step": 4030 }, { "epoch": 0.9713529128550794, "grad_norm": 1.171875, "learning_rate": 0.00014952710583006462, "loss": 0.5756, "step": 4035 }, { "epoch": 0.9725565719788156, "grad_norm": 1.28125, "learning_rate": 0.00014945420475219427, "loss": 0.5698, "step": 4040 }, { "epoch": 0.9737602311025517, "grad_norm": 1.1953125, "learning_rate": 0.00014938124606260752, "loss": 0.5599, "step": 4045 }, { "epoch": 0.9749638902262879, "grad_norm": 1.296875, "learning_rate": 0.00014930822987787367, "loss": 0.5838, "step": 4050 }, { "epoch": 0.976167549350024, "grad_norm": 1.28125, "learning_rate": 0.0001492351563146538, "loss": 0.557, "step": 4055 }, { "epoch": 0.9773712084737602, "grad_norm": 1.3203125, "learning_rate": 0.00014916202548970076, "loss": 0.5653, "step": 4060 }, { "epoch": 0.9785748675974963, "grad_norm": 1.2734375, "learning_rate": 0.0001490888375198589, "loss": 0.5479, "step": 4065 }, { "epoch": 0.9797785267212326, "grad_norm": 1.2265625, "learning_rate": 0.00014901559252206373, "loss": 0.556, "step": 4070 }, { "epoch": 0.9809821858449687, "grad_norm": 1.4375, "learning_rate": 0.00014894229061334212, "loss": 0.617, "step": 4075 }, { "epoch": 0.9821858449687049, "grad_norm": 1.171875, "learning_rate": 0.0001488689319108117, "loss": 0.5858, "step": 4080 }, { "epoch": 0.983389504092441, "grad_norm": 1.1953125, "learning_rate": 0.00014879551653168075, "loss": 0.5617, "step": 4085 }, { "epoch": 0.9845931632161772, "grad_norm": 1.140625, "learning_rate": 0.00014872204459324834, "loss": 0.5746, "step": 4090 }, { "epoch": 0.9857968223399133, "grad_norm": 1.2421875, "learning_rate": 0.00014864851621290375, "loss": 0.5643, "step": 4095 }, { "epoch": 0.9870004814636495, "grad_norm": 1.34375, "learning_rate": 0.0001485749315081265, "loss": 0.5819, "step": 4100 }, { "epoch": 0.9882041405873857, "grad_norm": 1.2734375, "learning_rate": 0.00014850129059648608, "loss": 0.5738, "step": 4105 }, { "epoch": 0.9894077997111218, "grad_norm": 1.328125, "learning_rate": 0.0001484275935956418, "loss": 0.5664, "step": 4110 }, { "epoch": 0.990611458834858, "grad_norm": 1.171875, "learning_rate": 0.00014835384062334255, "loss": 0.5583, "step": 4115 }, { "epoch": 0.9918151179585941, "grad_norm": 1.25, "learning_rate": 0.00014828003179742665, "loss": 0.5798, "step": 4120 }, { "epoch": 0.9930187770823303, "grad_norm": 1.2890625, "learning_rate": 0.00014820616723582175, "loss": 0.5652, "step": 4125 }, { "epoch": 0.9942224362060664, "grad_norm": 1.1875, "learning_rate": 0.00014813224705654448, "loss": 0.5567, "step": 4130 }, { "epoch": 0.9954260953298026, "grad_norm": 1.203125, "learning_rate": 0.00014805827137770027, "loss": 0.5679, "step": 4135 }, { "epoch": 0.9966297544535387, "grad_norm": 1.3671875, "learning_rate": 0.00014798424031748335, "loss": 0.5511, "step": 4140 }, { "epoch": 0.997833413577275, "grad_norm": 1.34375, "learning_rate": 0.00014791015399417634, "loss": 0.584, "step": 4145 }, { "epoch": 0.999037072701011, "grad_norm": 1.4921875, "learning_rate": 0.00014783601252615021, "loss": 0.5678, "step": 4150 }, { "epoch": 0.9997592681752527, "eval_loss": 0.473895788192749, "eval_runtime": 2.3815, "eval_samples_per_second": 83.979, "eval_steps_per_second": 83.979, "step": 4153 }, { "epoch": 1.0002407318247473, "grad_norm": 1.2890625, "learning_rate": 0.000147761816031864, "loss": 0.5771, "step": 4155 }, { "epoch": 1.0014443909484834, "grad_norm": 1.1875, "learning_rate": 0.00014768756462986472, "loss": 0.5225, "step": 4160 }, { "epoch": 1.0026480500722195, "grad_norm": 1.15625, "learning_rate": 0.00014761325843878708, "loss": 0.5087, "step": 4165 }, { "epoch": 1.0038517091959558, "grad_norm": 1.234375, "learning_rate": 0.0001475388975773533, "loss": 0.5132, "step": 4170 }, { "epoch": 1.0050553683196919, "grad_norm": 1.1484375, "learning_rate": 0.00014746448216437297, "loss": 0.5072, "step": 4175 }, { "epoch": 1.006259027443428, "grad_norm": 1.2109375, "learning_rate": 0.00014739001231874282, "loss": 0.5182, "step": 4180 }, { "epoch": 1.007462686567164, "grad_norm": 1.28125, "learning_rate": 0.00014731548815944667, "loss": 0.5508, "step": 4185 }, { "epoch": 1.0086663456909004, "grad_norm": 1.34375, "learning_rate": 0.00014724090980555493, "loss": 0.5124, "step": 4190 }, { "epoch": 1.0098700048146365, "grad_norm": 1.2265625, "learning_rate": 0.00014716627737622472, "loss": 0.531, "step": 4195 }, { "epoch": 1.0110736639383726, "grad_norm": 1.265625, "learning_rate": 0.00014709159099069954, "loss": 0.5358, "step": 4200 }, { "epoch": 1.0122773230621087, "grad_norm": 1.3125, "learning_rate": 0.00014701685076830905, "loss": 0.5555, "step": 4205 }, { "epoch": 1.013480982185845, "grad_norm": 1.140625, "learning_rate": 0.00014694205682846902, "loss": 0.5441, "step": 4210 }, { "epoch": 1.0146846413095811, "grad_norm": 1.2265625, "learning_rate": 0.00014686720929068097, "loss": 0.4988, "step": 4215 }, { "epoch": 1.0158883004333172, "grad_norm": 1.25, "learning_rate": 0.00014679230827453201, "loss": 0.5267, "step": 4220 }, { "epoch": 1.0170919595570533, "grad_norm": 1.3515625, "learning_rate": 0.00014671735389969488, "loss": 0.5199, "step": 4225 }, { "epoch": 1.0182956186807897, "grad_norm": 1.2890625, "learning_rate": 0.0001466423462859274, "loss": 0.5331, "step": 4230 }, { "epoch": 1.0194992778045258, "grad_norm": 1.34375, "learning_rate": 0.0001465672855530725, "loss": 0.5153, "step": 4235 }, { "epoch": 1.0207029369282619, "grad_norm": 1.28125, "learning_rate": 0.000146492171821058, "loss": 0.5189, "step": 4240 }, { "epoch": 1.021906596051998, "grad_norm": 1.078125, "learning_rate": 0.00014641700520989638, "loss": 0.5432, "step": 4245 }, { "epoch": 1.0231102551757343, "grad_norm": 1.171875, "learning_rate": 0.0001463417858396846, "loss": 0.5175, "step": 4250 }, { "epoch": 1.0243139142994704, "grad_norm": 1.140625, "learning_rate": 0.000146266513830604, "loss": 0.5426, "step": 4255 }, { "epoch": 1.0255175734232065, "grad_norm": 1.2890625, "learning_rate": 0.00014619118930291993, "loss": 0.5428, "step": 4260 }, { "epoch": 1.0267212325469428, "grad_norm": 1.2578125, "learning_rate": 0.0001461158123769816, "loss": 0.5442, "step": 4265 }, { "epoch": 1.027924891670679, "grad_norm": 1.2109375, "learning_rate": 0.0001460403831732221, "loss": 0.5237, "step": 4270 }, { "epoch": 1.029128550794415, "grad_norm": 1.3359375, "learning_rate": 0.0001459649018121579, "loss": 0.5397, "step": 4275 }, { "epoch": 1.030332209918151, "grad_norm": 1.1875, "learning_rate": 0.00014588936841438893, "loss": 0.5196, "step": 4280 }, { "epoch": 1.0315358690418874, "grad_norm": 1.1484375, "learning_rate": 0.00014581378310059813, "loss": 0.5221, "step": 4285 }, { "epoch": 1.0327395281656235, "grad_norm": 1.3203125, "learning_rate": 0.0001457381459915515, "loss": 0.4995, "step": 4290 }, { "epoch": 1.0339431872893596, "grad_norm": 1.1875, "learning_rate": 0.00014566245720809776, "loss": 0.4799, "step": 4295 }, { "epoch": 1.0351468464130957, "grad_norm": 1.125, "learning_rate": 0.00014558671687116812, "loss": 0.4823, "step": 4300 }, { "epoch": 1.036350505536832, "grad_norm": 1.34375, "learning_rate": 0.00014551092510177628, "loss": 0.5294, "step": 4305 }, { "epoch": 1.0375541646605682, "grad_norm": 1.3125, "learning_rate": 0.00014543508202101804, "loss": 0.5223, "step": 4310 }, { "epoch": 1.0387578237843043, "grad_norm": 1.3203125, "learning_rate": 0.0001453591877500712, "loss": 0.5217, "step": 4315 }, { "epoch": 1.0399614829080404, "grad_norm": 1.1875, "learning_rate": 0.0001452832424101954, "loss": 0.5155, "step": 4320 }, { "epoch": 1.0411651420317767, "grad_norm": 1.1875, "learning_rate": 0.00014520724612273169, "loss": 0.5396, "step": 4325 }, { "epoch": 1.0423688011555128, "grad_norm": 1.2265625, "learning_rate": 0.0001451311990091028, "loss": 0.5092, "step": 4330 }, { "epoch": 1.0435724602792489, "grad_norm": 1.2890625, "learning_rate": 0.00014505510119081237, "loss": 0.5019, "step": 4335 }, { "epoch": 1.044776119402985, "grad_norm": 1.1796875, "learning_rate": 0.0001449789527894454, "loss": 0.5046, "step": 4340 }, { "epoch": 1.0459797785267213, "grad_norm": 1.2578125, "learning_rate": 0.00014490275392666738, "loss": 0.5062, "step": 4345 }, { "epoch": 1.0471834376504574, "grad_norm": 1.203125, "learning_rate": 0.0001448265047242246, "loss": 0.5312, "step": 4350 }, { "epoch": 1.0483870967741935, "grad_norm": 1.4375, "learning_rate": 0.0001447502053039437, "loss": 0.4988, "step": 4355 }, { "epoch": 1.0495907558979296, "grad_norm": 1.2734375, "learning_rate": 0.00014467385578773167, "loss": 0.5001, "step": 4360 }, { "epoch": 1.050794415021666, "grad_norm": 1.328125, "learning_rate": 0.00014459745629757538, "loss": 0.5061, "step": 4365 }, { "epoch": 1.051998074145402, "grad_norm": 1.171875, "learning_rate": 0.0001445210069555417, "loss": 0.5037, "step": 4370 }, { "epoch": 1.0532017332691381, "grad_norm": 1.1328125, "learning_rate": 0.00014444450788377703, "loss": 0.5185, "step": 4375 }, { "epoch": 1.0544053923928742, "grad_norm": 1.265625, "learning_rate": 0.00014436795920450734, "loss": 0.4907, "step": 4380 }, { "epoch": 1.0556090515166106, "grad_norm": 1.390625, "learning_rate": 0.0001442913610400377, "loss": 0.5318, "step": 4385 }, { "epoch": 1.0568127106403467, "grad_norm": 1.296875, "learning_rate": 0.0001442147135127524, "loss": 0.5462, "step": 4390 }, { "epoch": 1.0580163697640828, "grad_norm": 1.328125, "learning_rate": 0.00014413801674511453, "loss": 0.5337, "step": 4395 }, { "epoch": 1.059220028887819, "grad_norm": 1.2578125, "learning_rate": 0.0001440612708596659, "loss": 0.5172, "step": 4400 }, { "epoch": 1.0604236880115552, "grad_norm": 1.2578125, "learning_rate": 0.0001439844759790267, "loss": 0.5241, "step": 4405 }, { "epoch": 1.0616273471352913, "grad_norm": 1.2734375, "learning_rate": 0.00014390763222589547, "loss": 0.5169, "step": 4410 }, { "epoch": 1.0628310062590274, "grad_norm": 1.2421875, "learning_rate": 0.00014383073972304888, "loss": 0.521, "step": 4415 }, { "epoch": 1.0640346653827637, "grad_norm": 1.28125, "learning_rate": 0.0001437537985933414, "loss": 0.524, "step": 4420 }, { "epoch": 1.0652383245064998, "grad_norm": 1.3203125, "learning_rate": 0.00014367680895970524, "loss": 0.538, "step": 4425 }, { "epoch": 1.066441983630236, "grad_norm": 1.21875, "learning_rate": 0.0001435997709451501, "loss": 0.5229, "step": 4430 }, { "epoch": 1.067645642753972, "grad_norm": 1.2109375, "learning_rate": 0.00014352268467276297, "loss": 0.5578, "step": 4435 }, { "epoch": 1.0688493018777083, "grad_norm": 1.3125, "learning_rate": 0.00014344555026570794, "loss": 0.5176, "step": 4440 }, { "epoch": 1.0700529610014444, "grad_norm": 1.2265625, "learning_rate": 0.00014336836784722601, "loss": 0.5166, "step": 4445 }, { "epoch": 1.0712566201251805, "grad_norm": 1.1640625, "learning_rate": 0.000143291137540635, "loss": 0.5137, "step": 4450 }, { "epoch": 1.0724602792489166, "grad_norm": 1.21875, "learning_rate": 0.00014321385946932904, "loss": 0.5232, "step": 4455 }, { "epoch": 1.073663938372653, "grad_norm": 1.1484375, "learning_rate": 0.00014313653375677866, "loss": 0.497, "step": 4460 }, { "epoch": 1.074867597496389, "grad_norm": 1.1875, "learning_rate": 0.00014305916052653059, "loss": 0.5122, "step": 4465 }, { "epoch": 1.0760712566201251, "grad_norm": 1.1796875, "learning_rate": 0.0001429817399022074, "loss": 0.5198, "step": 4470 }, { "epoch": 1.0772749157438612, "grad_norm": 1.265625, "learning_rate": 0.0001429042720075074, "loss": 0.509, "step": 4475 }, { "epoch": 1.0784785748675976, "grad_norm": 1.25, "learning_rate": 0.00014282675696620443, "loss": 0.4838, "step": 4480 }, { "epoch": 1.0796822339913337, "grad_norm": 1.3046875, "learning_rate": 0.00014274919490214763, "loss": 0.5123, "step": 4485 }, { "epoch": 1.0808858931150698, "grad_norm": 1.1796875, "learning_rate": 0.0001426715859392613, "loss": 0.5272, "step": 4490 }, { "epoch": 1.0820895522388059, "grad_norm": 1.1640625, "learning_rate": 0.00014259393020154475, "loss": 0.5045, "step": 4495 }, { "epoch": 1.0832932113625422, "grad_norm": 1.1015625, "learning_rate": 0.0001425162278130718, "loss": 0.5249, "step": 4500 }, { "epoch": 1.0832932113625422, "eval_loss": 0.4617222547531128, "eval_runtime": 2.4499, "eval_samples_per_second": 81.637, "eval_steps_per_second": 81.637, "step": 4500 }, { "epoch": 1.0844968704862783, "grad_norm": 1.125, "learning_rate": 0.00014243847889799108, "loss": 0.5066, "step": 4505 }, { "epoch": 1.0857005296100144, "grad_norm": 1.1796875, "learning_rate": 0.00014236068358052531, "loss": 0.5523, "step": 4510 }, { "epoch": 1.0869041887337505, "grad_norm": 1.234375, "learning_rate": 0.00014228284198497155, "loss": 0.512, "step": 4515 }, { "epoch": 1.0881078478574868, "grad_norm": 1.15625, "learning_rate": 0.00014220495423570068, "loss": 0.5772, "step": 4520 }, { "epoch": 1.089311506981223, "grad_norm": 1.265625, "learning_rate": 0.00014212702045715735, "loss": 0.5035, "step": 4525 }, { "epoch": 1.090515166104959, "grad_norm": 1.265625, "learning_rate": 0.0001420490407738598, "loss": 0.5199, "step": 4530 }, { "epoch": 1.0917188252286953, "grad_norm": 1.265625, "learning_rate": 0.0001419710153103995, "loss": 0.5553, "step": 4535 }, { "epoch": 1.0929224843524314, "grad_norm": 1.28125, "learning_rate": 0.0001418929441914412, "loss": 0.4998, "step": 4540 }, { "epoch": 1.0941261434761675, "grad_norm": 1.1875, "learning_rate": 0.00014181482754172252, "loss": 0.5263, "step": 4545 }, { "epoch": 1.0953298025999036, "grad_norm": 1.34375, "learning_rate": 0.00014173666548605385, "loss": 0.498, "step": 4550 }, { "epoch": 1.09653346172364, "grad_norm": 1.21875, "learning_rate": 0.0001416584581493181, "loss": 0.5324, "step": 4555 }, { "epoch": 1.097737120847376, "grad_norm": 1.4296875, "learning_rate": 0.00014158020565647057, "loss": 0.5463, "step": 4560 }, { "epoch": 1.0989407799711122, "grad_norm": 1.1875, "learning_rate": 0.00014150190813253864, "loss": 0.5182, "step": 4565 }, { "epoch": 1.1001444390948483, "grad_norm": 1.1796875, "learning_rate": 0.0001414235657026217, "loss": 0.5274, "step": 4570 }, { "epoch": 1.1013480982185846, "grad_norm": 1.21875, "learning_rate": 0.0001413451784918909, "loss": 0.4993, "step": 4575 }, { "epoch": 1.1025517573423207, "grad_norm": 1.3125, "learning_rate": 0.00014126674662558885, "loss": 0.4948, "step": 4580 }, { "epoch": 1.1037554164660568, "grad_norm": 1.28125, "learning_rate": 0.00014118827022902963, "loss": 0.5347, "step": 4585 }, { "epoch": 1.104959075589793, "grad_norm": 1.328125, "learning_rate": 0.0001411097494275984, "loss": 0.523, "step": 4590 }, { "epoch": 1.1061627347135292, "grad_norm": 1.2109375, "learning_rate": 0.00014103118434675117, "loss": 0.4897, "step": 4595 }, { "epoch": 1.1073663938372653, "grad_norm": 1.296875, "learning_rate": 0.00014095257511201494, "loss": 0.5214, "step": 4600 }, { "epoch": 1.1085700529610014, "grad_norm": 1.2265625, "learning_rate": 0.000140873921848987, "loss": 0.4806, "step": 4605 }, { "epoch": 1.1097737120847375, "grad_norm": 1.1796875, "learning_rate": 0.00014079522468333518, "loss": 0.5315, "step": 4610 }, { "epoch": 1.1109773712084738, "grad_norm": 1.28125, "learning_rate": 0.00014071648374079738, "loss": 0.5276, "step": 4615 }, { "epoch": 1.11218103033221, "grad_norm": 1.28125, "learning_rate": 0.00014063769914718135, "loss": 0.4961, "step": 4620 }, { "epoch": 1.113384689455946, "grad_norm": 1.2890625, "learning_rate": 0.00014055887102836477, "loss": 0.5384, "step": 4625 }, { "epoch": 1.1145883485796821, "grad_norm": 1.265625, "learning_rate": 0.0001404799995102947, "loss": 0.5496, "step": 4630 }, { "epoch": 1.1157920077034185, "grad_norm": 1.375, "learning_rate": 0.00014040108471898767, "loss": 0.5306, "step": 4635 }, { "epoch": 1.1169956668271546, "grad_norm": 1.25, "learning_rate": 0.00014032212678052923, "loss": 0.511, "step": 4640 }, { "epoch": 1.1181993259508907, "grad_norm": 1.2578125, "learning_rate": 0.00014024312582107396, "loss": 0.5264, "step": 4645 }, { "epoch": 1.1194029850746268, "grad_norm": 1.3125, "learning_rate": 0.00014016408196684512, "loss": 0.4961, "step": 4650 }, { "epoch": 1.120606644198363, "grad_norm": 1.2734375, "learning_rate": 0.00014008499534413454, "loss": 0.5045, "step": 4655 }, { "epoch": 1.1218103033220992, "grad_norm": 1.2109375, "learning_rate": 0.00014000586607930236, "loss": 0.506, "step": 4660 }, { "epoch": 1.1230139624458353, "grad_norm": 1.2734375, "learning_rate": 0.0001399266942987769, "loss": 0.5392, "step": 4665 }, { "epoch": 1.1242176215695716, "grad_norm": 1.2109375, "learning_rate": 0.0001398474801290543, "loss": 0.4822, "step": 4670 }, { "epoch": 1.1254212806933077, "grad_norm": 1.140625, "learning_rate": 0.00013976822369669847, "loss": 0.5352, "step": 4675 }, { "epoch": 1.1266249398170438, "grad_norm": 1.296875, "learning_rate": 0.00013968892512834102, "loss": 0.5048, "step": 4680 }, { "epoch": 1.12782859894078, "grad_norm": 1.2734375, "learning_rate": 0.0001396095845506806, "loss": 0.5026, "step": 4685 }, { "epoch": 1.129032258064516, "grad_norm": 1.21875, "learning_rate": 0.0001395302020904832, "loss": 0.5228, "step": 4690 }, { "epoch": 1.1302359171882523, "grad_norm": 1.1796875, "learning_rate": 0.0001394507778745816, "loss": 0.5086, "step": 4695 }, { "epoch": 1.1314395763119884, "grad_norm": 1.1171875, "learning_rate": 0.00013937131202987528, "loss": 0.5125, "step": 4700 }, { "epoch": 1.1326432354357245, "grad_norm": 1.234375, "learning_rate": 0.00013929180468333038, "loss": 0.4953, "step": 4705 }, { "epoch": 1.1338468945594609, "grad_norm": 1.2421875, "learning_rate": 0.0001392122559619792, "loss": 0.5134, "step": 4710 }, { "epoch": 1.135050553683197, "grad_norm": 1.28125, "learning_rate": 0.00013913266599292025, "loss": 0.5201, "step": 4715 }, { "epoch": 1.136254212806933, "grad_norm": 1.21875, "learning_rate": 0.0001390530349033179, "loss": 0.5389, "step": 4720 }, { "epoch": 1.1374578719306692, "grad_norm": 1.234375, "learning_rate": 0.00013897336282040214, "loss": 0.5209, "step": 4725 }, { "epoch": 1.1386615310544055, "grad_norm": 1.4296875, "learning_rate": 0.0001388936498714686, "loss": 0.5175, "step": 4730 }, { "epoch": 1.1398651901781416, "grad_norm": 1.140625, "learning_rate": 0.00013881389618387814, "loss": 0.5347, "step": 4735 }, { "epoch": 1.1410688493018777, "grad_norm": 1.1640625, "learning_rate": 0.00013873410188505668, "loss": 0.4957, "step": 4740 }, { "epoch": 1.1422725084256138, "grad_norm": 1.2109375, "learning_rate": 0.00013865426710249511, "loss": 0.5297, "step": 4745 }, { "epoch": 1.14347616754935, "grad_norm": 1.2265625, "learning_rate": 0.00013857439196374884, "loss": 0.523, "step": 4750 }, { "epoch": 1.1446798266730862, "grad_norm": 1.2265625, "learning_rate": 0.00013849447659643798, "loss": 0.5046, "step": 4755 }, { "epoch": 1.1458834857968223, "grad_norm": 1.1796875, "learning_rate": 0.0001384145211282467, "loss": 0.5278, "step": 4760 }, { "epoch": 1.1470871449205584, "grad_norm": 1.28125, "learning_rate": 0.00013833452568692343, "loss": 0.5012, "step": 4765 }, { "epoch": 1.1482908040442947, "grad_norm": 1.3125, "learning_rate": 0.00013825449040028034, "loss": 0.5116, "step": 4770 }, { "epoch": 1.1494944631680308, "grad_norm": 1.1953125, "learning_rate": 0.0001381744153961933, "loss": 0.4871, "step": 4775 }, { "epoch": 1.150698122291767, "grad_norm": 1.3515625, "learning_rate": 0.0001380943008026016, "loss": 0.5277, "step": 4780 }, { "epoch": 1.151901781415503, "grad_norm": 1.3671875, "learning_rate": 0.0001380141467475079, "loss": 0.5294, "step": 4785 }, { "epoch": 1.1531054405392394, "grad_norm": 1.15625, "learning_rate": 0.00013793395335897778, "loss": 0.5147, "step": 4790 }, { "epoch": 1.1543090996629755, "grad_norm": 1.1875, "learning_rate": 0.00013785372076513972, "loss": 0.5138, "step": 4795 }, { "epoch": 1.1555127587867116, "grad_norm": 1.2421875, "learning_rate": 0.00013777344909418488, "loss": 0.4957, "step": 4800 }, { "epoch": 1.1567164179104479, "grad_norm": 1.15625, "learning_rate": 0.00013769313847436675, "loss": 0.5136, "step": 4805 }, { "epoch": 1.157920077034184, "grad_norm": 1.21875, "learning_rate": 0.00013761278903400115, "loss": 0.5179, "step": 4810 }, { "epoch": 1.15912373615792, "grad_norm": 1.2890625, "learning_rate": 0.00013753240090146588, "loss": 0.5299, "step": 4815 }, { "epoch": 1.1603273952816562, "grad_norm": 1.3046875, "learning_rate": 0.00013745197420520058, "loss": 0.5254, "step": 4820 }, { "epoch": 1.1615310544053923, "grad_norm": 1.234375, "learning_rate": 0.00013737150907370646, "loss": 0.5321, "step": 4825 }, { "epoch": 1.1627347135291286, "grad_norm": 1.1640625, "learning_rate": 0.00013729100563554623, "loss": 0.4723, "step": 4830 }, { "epoch": 1.1639383726528647, "grad_norm": 1.28125, "learning_rate": 0.00013721046401934367, "loss": 0.5046, "step": 4835 }, { "epoch": 1.1651420317766008, "grad_norm": 1.171875, "learning_rate": 0.00013712988435378372, "loss": 0.5246, "step": 4840 }, { "epoch": 1.1663456909003371, "grad_norm": 1.1875, "learning_rate": 0.00013704926676761197, "loss": 0.5104, "step": 4845 }, { "epoch": 1.1675493500240732, "grad_norm": 1.1875, "learning_rate": 0.00013696861138963467, "loss": 0.5346, "step": 4850 }, { "epoch": 1.1687530091478093, "grad_norm": 1.265625, "learning_rate": 0.00013688791834871845, "loss": 0.4683, "step": 4855 }, { "epoch": 1.1699566682715454, "grad_norm": 1.3046875, "learning_rate": 0.00013680718777379008, "loss": 0.5208, "step": 4860 }, { "epoch": 1.1711603273952818, "grad_norm": 1.1796875, "learning_rate": 0.00013672641979383635, "loss": 0.4973, "step": 4865 }, { "epoch": 1.1723639865190179, "grad_norm": 1.2734375, "learning_rate": 0.00013664561453790376, "loss": 0.4948, "step": 4870 }, { "epoch": 1.173567645642754, "grad_norm": 1.2421875, "learning_rate": 0.00013656477213509844, "loss": 0.5358, "step": 4875 }, { "epoch": 1.17477130476649, "grad_norm": 1.2734375, "learning_rate": 0.00013648389271458575, "loss": 0.5246, "step": 4880 }, { "epoch": 1.1759749638902264, "grad_norm": 1.2578125, "learning_rate": 0.00013640297640559035, "loss": 0.5285, "step": 4885 }, { "epoch": 1.1771786230139625, "grad_norm": 1.265625, "learning_rate": 0.0001363220233373957, "loss": 0.4932, "step": 4890 }, { "epoch": 1.1783822821376986, "grad_norm": 1.1484375, "learning_rate": 0.0001362410336393441, "loss": 0.4945, "step": 4895 }, { "epoch": 1.1795859412614347, "grad_norm": 1.2578125, "learning_rate": 0.00013616000744083627, "loss": 0.4816, "step": 4900 }, { "epoch": 1.180789600385171, "grad_norm": 1.1015625, "learning_rate": 0.00013607894487133133, "loss": 0.5009, "step": 4905 }, { "epoch": 1.181993259508907, "grad_norm": 1.28125, "learning_rate": 0.00013599784606034652, "loss": 0.557, "step": 4910 }, { "epoch": 1.1831969186326432, "grad_norm": 1.296875, "learning_rate": 0.00013591671113745692, "loss": 0.4899, "step": 4915 }, { "epoch": 1.1844005777563793, "grad_norm": 1.15625, "learning_rate": 0.0001358355402322953, "loss": 0.5142, "step": 4920 }, { "epoch": 1.1856042368801156, "grad_norm": 1.265625, "learning_rate": 0.000135754333474552, "loss": 0.4999, "step": 4925 }, { "epoch": 1.1868078960038517, "grad_norm": 1.1875, "learning_rate": 0.00013567309099397464, "loss": 0.5319, "step": 4930 }, { "epoch": 1.1880115551275878, "grad_norm": 1.2578125, "learning_rate": 0.0001355918129203678, "loss": 0.5222, "step": 4935 }, { "epoch": 1.1892152142513241, "grad_norm": 1.1875, "learning_rate": 0.00013551049938359305, "loss": 0.4894, "step": 4940 }, { "epoch": 1.1904188733750602, "grad_norm": 1.1953125, "learning_rate": 0.00013542915051356855, "loss": 0.5252, "step": 4945 }, { "epoch": 1.1916225324987963, "grad_norm": 1.171875, "learning_rate": 0.00013534776644026896, "loss": 0.5026, "step": 4950 }, { "epoch": 1.1928261916225325, "grad_norm": 1.1796875, "learning_rate": 0.00013526634729372518, "loss": 0.4818, "step": 4955 }, { "epoch": 1.1940298507462686, "grad_norm": 1.296875, "learning_rate": 0.00013518489320402412, "loss": 0.5361, "step": 4960 }, { "epoch": 1.1952335098700049, "grad_norm": 1.28125, "learning_rate": 0.0001351034043013085, "loss": 0.5016, "step": 4965 }, { "epoch": 1.196437168993741, "grad_norm": 1.2421875, "learning_rate": 0.00013502188071577676, "loss": 0.481, "step": 4970 }, { "epoch": 1.197640828117477, "grad_norm": 1.2890625, "learning_rate": 0.00013494032257768264, "loss": 0.5144, "step": 4975 }, { "epoch": 1.1988444872412134, "grad_norm": 1.21875, "learning_rate": 0.00013485873001733514, "loss": 0.4989, "step": 4980 }, { "epoch": 1.2000481463649495, "grad_norm": 1.296875, "learning_rate": 0.0001347771031650983, "loss": 0.5184, "step": 4985 }, { "epoch": 1.2012518054886856, "grad_norm": 1.1796875, "learning_rate": 0.00013469544215139086, "loss": 0.5012, "step": 4990 }, { "epoch": 1.2024554646124217, "grad_norm": 1.359375, "learning_rate": 0.0001346137471066862, "loss": 0.5221, "step": 4995 }, { "epoch": 1.2036591237361578, "grad_norm": 1.3125, "learning_rate": 0.0001345320181615121, "loss": 0.5348, "step": 5000 }, { "epoch": 1.2036591237361578, "eval_loss": 0.4517897665500641, "eval_runtime": 2.4035, "eval_samples_per_second": 83.211, "eval_steps_per_second": 83.211, "step": 5000 }, { "epoch": 1.2048627828598941, "grad_norm": 1.21875, "learning_rate": 0.00013445025544645036, "loss": 0.4839, "step": 5005 }, { "epoch": 1.2060664419836302, "grad_norm": 1.2109375, "learning_rate": 0.00013436845909213694, "loss": 0.4808, "step": 5010 }, { "epoch": 1.2072701011073663, "grad_norm": 1.203125, "learning_rate": 0.0001342866292292614, "loss": 0.5058, "step": 5015 }, { "epoch": 1.2084737602311026, "grad_norm": 1.1875, "learning_rate": 0.0001342047659885669, "loss": 0.505, "step": 5020 }, { "epoch": 1.2096774193548387, "grad_norm": 1.078125, "learning_rate": 0.00013412286950084988, "loss": 0.5294, "step": 5025 }, { "epoch": 1.2108810784785748, "grad_norm": 1.3203125, "learning_rate": 0.00013404093989695997, "loss": 0.5163, "step": 5030 }, { "epoch": 1.212084737602311, "grad_norm": 1.1875, "learning_rate": 0.00013395897730779965, "loss": 0.5139, "step": 5035 }, { "epoch": 1.2132883967260473, "grad_norm": 1.25, "learning_rate": 0.00013387698186432417, "loss": 0.528, "step": 5040 }, { "epoch": 1.2144920558497834, "grad_norm": 1.2265625, "learning_rate": 0.00013379495369754115, "loss": 0.5191, "step": 5045 }, { "epoch": 1.2156957149735195, "grad_norm": 1.1484375, "learning_rate": 0.00013371289293851065, "loss": 0.5063, "step": 5050 }, { "epoch": 1.2168993740972556, "grad_norm": 1.234375, "learning_rate": 0.00013363079971834467, "loss": 0.5163, "step": 5055 }, { "epoch": 1.218103033220992, "grad_norm": 1.0546875, "learning_rate": 0.00013354867416820717, "loss": 0.5024, "step": 5060 }, { "epoch": 1.219306692344728, "grad_norm": 1.125, "learning_rate": 0.00013346651641931372, "loss": 0.4864, "step": 5065 }, { "epoch": 1.220510351468464, "grad_norm": 1.3046875, "learning_rate": 0.00013338432660293134, "loss": 0.5081, "step": 5070 }, { "epoch": 1.2217140105922002, "grad_norm": 1.1796875, "learning_rate": 0.00013330210485037826, "loss": 0.5035, "step": 5075 }, { "epoch": 1.2229176697159365, "grad_norm": 1.2578125, "learning_rate": 0.0001332198512930238, "loss": 0.5395, "step": 5080 }, { "epoch": 1.2241213288396726, "grad_norm": 1.109375, "learning_rate": 0.00013313756606228802, "loss": 0.4689, "step": 5085 }, { "epoch": 1.2253249879634087, "grad_norm": 1.2734375, "learning_rate": 0.00013305524928964164, "loss": 0.5177, "step": 5090 }, { "epoch": 1.2265286470871448, "grad_norm": 1.1875, "learning_rate": 0.00013297290110660578, "loss": 0.5019, "step": 5095 }, { "epoch": 1.2277323062108811, "grad_norm": 1.1875, "learning_rate": 0.00013289052164475174, "loss": 0.5231, "step": 5100 }, { "epoch": 1.2289359653346172, "grad_norm": 1.3046875, "learning_rate": 0.00013280811103570073, "loss": 0.4896, "step": 5105 }, { "epoch": 1.2301396244583533, "grad_norm": 1.25, "learning_rate": 0.00013272566941112382, "loss": 0.4972, "step": 5110 }, { "epoch": 1.2313432835820897, "grad_norm": 1.1328125, "learning_rate": 0.00013264319690274153, "loss": 0.5249, "step": 5115 }, { "epoch": 1.2325469427058258, "grad_norm": 1.171875, "learning_rate": 0.00013256069364232388, "loss": 0.5052, "step": 5120 }, { "epoch": 1.2337506018295619, "grad_norm": 1.34375, "learning_rate": 0.00013247815976168985, "loss": 0.4939, "step": 5125 }, { "epoch": 1.234954260953298, "grad_norm": 1.25, "learning_rate": 0.00013239559539270746, "loss": 0.4873, "step": 5130 }, { "epoch": 1.236157920077034, "grad_norm": 1.046875, "learning_rate": 0.00013231300066729343, "loss": 0.4558, "step": 5135 }, { "epoch": 1.2373615792007704, "grad_norm": 1.1796875, "learning_rate": 0.0001322303757174129, "loss": 0.5328, "step": 5140 }, { "epoch": 1.2385652383245065, "grad_norm": 1.171875, "learning_rate": 0.0001321477206750794, "loss": 0.5181, "step": 5145 }, { "epoch": 1.2397688974482426, "grad_norm": 1.2109375, "learning_rate": 0.00013206503567235448, "loss": 0.4987, "step": 5150 }, { "epoch": 1.240972556571979, "grad_norm": 1.296875, "learning_rate": 0.00013198232084134757, "loss": 0.4767, "step": 5155 }, { "epoch": 1.242176215695715, "grad_norm": 1.1953125, "learning_rate": 0.0001318995763142158, "loss": 0.5226, "step": 5160 }, { "epoch": 1.2433798748194511, "grad_norm": 1.3359375, "learning_rate": 0.0001318168022231637, "loss": 0.4951, "step": 5165 }, { "epoch": 1.2445835339431872, "grad_norm": 1.1796875, "learning_rate": 0.00013173399870044302, "loss": 0.5172, "step": 5170 }, { "epoch": 1.2457871930669235, "grad_norm": 1.265625, "learning_rate": 0.00013165116587835258, "loss": 0.4999, "step": 5175 }, { "epoch": 1.2469908521906596, "grad_norm": 1.2578125, "learning_rate": 0.000131568303889238, "loss": 0.5176, "step": 5180 }, { "epoch": 1.2481945113143957, "grad_norm": 1.25, "learning_rate": 0.00013148541286549153, "loss": 0.4963, "step": 5185 }, { "epoch": 1.2493981704381318, "grad_norm": 1.1796875, "learning_rate": 0.00013140249293955173, "loss": 0.4923, "step": 5190 }, { "epoch": 1.2506018295618682, "grad_norm": 1.21875, "learning_rate": 0.00013131954424390338, "loss": 0.5051, "step": 5195 }, { "epoch": 1.2518054886856043, "grad_norm": 1.1484375, "learning_rate": 0.00013123656691107723, "loss": 0.5134, "step": 5200 }, { "epoch": 1.2530091478093404, "grad_norm": 1.1953125, "learning_rate": 0.00013115356107364982, "loss": 0.503, "step": 5205 }, { "epoch": 1.2542128069330767, "grad_norm": 1.1640625, "learning_rate": 0.0001310705268642432, "loss": 0.5029, "step": 5210 }, { "epoch": 1.2554164660568128, "grad_norm": 1.234375, "learning_rate": 0.00013098746441552473, "loss": 0.4801, "step": 5215 }, { "epoch": 1.2566201251805489, "grad_norm": 1.234375, "learning_rate": 0.0001309043738602069, "loss": 0.5105, "step": 5220 }, { "epoch": 1.257823784304285, "grad_norm": 1.2109375, "learning_rate": 0.00013082125533104714, "loss": 0.4996, "step": 5225 }, { "epoch": 1.259027443428021, "grad_norm": 1.2109375, "learning_rate": 0.00013073810896084757, "loss": 0.4947, "step": 5230 }, { "epoch": 1.2602311025517574, "grad_norm": 1.1953125, "learning_rate": 0.0001306549348824547, "loss": 0.5176, "step": 5235 }, { "epoch": 1.2614347616754935, "grad_norm": 1.3125, "learning_rate": 0.00013057173322875945, "loss": 0.513, "step": 5240 }, { "epoch": 1.2626384207992296, "grad_norm": 1.2109375, "learning_rate": 0.00013048850413269672, "loss": 0.4998, "step": 5245 }, { "epoch": 1.263842079922966, "grad_norm": 1.203125, "learning_rate": 0.00013040524772724526, "loss": 0.5221, "step": 5250 }, { "epoch": 1.265045739046702, "grad_norm": 1.1796875, "learning_rate": 0.00013032196414542748, "loss": 0.4986, "step": 5255 }, { "epoch": 1.2662493981704381, "grad_norm": 1.3046875, "learning_rate": 0.0001302386535203092, "loss": 0.5107, "step": 5260 }, { "epoch": 1.2674530572941742, "grad_norm": 1.2265625, "learning_rate": 0.0001301553159849994, "loss": 0.5451, "step": 5265 }, { "epoch": 1.2686567164179103, "grad_norm": 1.234375, "learning_rate": 0.00013007195167265016, "loss": 0.5035, "step": 5270 }, { "epoch": 1.2698603755416467, "grad_norm": 1.1640625, "learning_rate": 0.0001299885607164562, "loss": 0.4897, "step": 5275 }, { "epoch": 1.2710640346653828, "grad_norm": 1.296875, "learning_rate": 0.00012990514324965496, "loss": 0.5223, "step": 5280 }, { "epoch": 1.2722676937891189, "grad_norm": 1.2109375, "learning_rate": 0.00012982169940552613, "loss": 0.523, "step": 5285 }, { "epoch": 1.2734713529128552, "grad_norm": 1.1875, "learning_rate": 0.0001297382293173916, "loss": 0.4896, "step": 5290 }, { "epoch": 1.2746750120365913, "grad_norm": 1.125, "learning_rate": 0.00012965473311861517, "loss": 0.5007, "step": 5295 }, { "epoch": 1.2758786711603274, "grad_norm": 1.125, "learning_rate": 0.00012957121094260234, "loss": 0.5159, "step": 5300 }, { "epoch": 1.2770823302840635, "grad_norm": 1.1953125, "learning_rate": 0.00012948766292280017, "loss": 0.4975, "step": 5305 }, { "epoch": 1.2782859894077996, "grad_norm": 1.2421875, "learning_rate": 0.00012940408919269697, "loss": 0.5584, "step": 5310 }, { "epoch": 1.279489648531536, "grad_norm": 1.2265625, "learning_rate": 0.0001293204898858221, "loss": 0.4937, "step": 5315 }, { "epoch": 1.280693307655272, "grad_norm": 1.125, "learning_rate": 0.0001292368651357459, "loss": 0.5058, "step": 5320 }, { "epoch": 1.281896966779008, "grad_norm": 1.2265625, "learning_rate": 0.00012915321507607917, "loss": 0.5049, "step": 5325 }, { "epoch": 1.2831006259027444, "grad_norm": 1.2265625, "learning_rate": 0.0001290695398404734, "loss": 0.5174, "step": 5330 }, { "epoch": 1.2843042850264805, "grad_norm": 1.28125, "learning_rate": 0.00012898583956262003, "loss": 0.522, "step": 5335 }, { "epoch": 1.2855079441502166, "grad_norm": 1.328125, "learning_rate": 0.0001289021143762507, "loss": 0.5216, "step": 5340 }, { "epoch": 1.286711603273953, "grad_norm": 1.2421875, "learning_rate": 0.00012881836441513687, "loss": 0.4868, "step": 5345 }, { "epoch": 1.287915262397689, "grad_norm": 1.1640625, "learning_rate": 0.0001287345898130894, "loss": 0.4973, "step": 5350 }, { "epoch": 1.2891189215214252, "grad_norm": 1.171875, "learning_rate": 0.00012865079070395867, "loss": 0.5306, "step": 5355 }, { "epoch": 1.2903225806451613, "grad_norm": 1.328125, "learning_rate": 0.00012856696722163418, "loss": 0.5288, "step": 5360 }, { "epoch": 1.2915262397688974, "grad_norm": 1.1171875, "learning_rate": 0.00012848311950004436, "loss": 0.5023, "step": 5365 }, { "epoch": 1.2927298988926337, "grad_norm": 1.03125, "learning_rate": 0.00012839924767315636, "loss": 0.4719, "step": 5370 }, { "epoch": 1.2939335580163698, "grad_norm": 1.3125, "learning_rate": 0.0001283153518749759, "loss": 0.4913, "step": 5375 }, { "epoch": 1.2951372171401059, "grad_norm": 1.203125, "learning_rate": 0.0001282314322395469, "loss": 0.5153, "step": 5380 }, { "epoch": 1.2963408762638422, "grad_norm": 1.1640625, "learning_rate": 0.0001281474889009515, "loss": 0.5297, "step": 5385 }, { "epoch": 1.2975445353875783, "grad_norm": 1.1875, "learning_rate": 0.00012806352199330954, "loss": 0.516, "step": 5390 }, { "epoch": 1.2987481945113144, "grad_norm": 1.34375, "learning_rate": 0.0001279795316507787, "loss": 0.5134, "step": 5395 }, { "epoch": 1.2999518536350505, "grad_norm": 1.140625, "learning_rate": 0.00012789551800755397, "loss": 0.4592, "step": 5400 }, { "epoch": 1.3011555127587866, "grad_norm": 1.1875, "learning_rate": 0.00012781148119786763, "loss": 0.5122, "step": 5405 }, { "epoch": 1.302359171882523, "grad_norm": 1.203125, "learning_rate": 0.000127727421355989, "loss": 0.5261, "step": 5410 }, { "epoch": 1.303562831006259, "grad_norm": 1.109375, "learning_rate": 0.00012764333861622405, "loss": 0.4806, "step": 5415 }, { "epoch": 1.3047664901299951, "grad_norm": 1.203125, "learning_rate": 0.00012755923311291554, "loss": 0.4957, "step": 5420 }, { "epoch": 1.3059701492537314, "grad_norm": 1.2421875, "learning_rate": 0.00012747510498044247, "loss": 0.4763, "step": 5425 }, { "epoch": 1.3071738083774675, "grad_norm": 1.21875, "learning_rate": 0.00012739095435322, "loss": 0.4947, "step": 5430 }, { "epoch": 1.3083774675012037, "grad_norm": 1.1640625, "learning_rate": 0.00012730678136569935, "loss": 0.5029, "step": 5435 }, { "epoch": 1.3095811266249398, "grad_norm": 1.171875, "learning_rate": 0.00012722258615236725, "loss": 0.509, "step": 5440 }, { "epoch": 1.3107847857486759, "grad_norm": 1.2578125, "learning_rate": 0.00012713836884774618, "loss": 0.5267, "step": 5445 }, { "epoch": 1.3119884448724122, "grad_norm": 1.2109375, "learning_rate": 0.00012705412958639373, "loss": 0.4885, "step": 5450 }, { "epoch": 1.3131921039961483, "grad_norm": 1.1796875, "learning_rate": 0.00012696986850290268, "loss": 0.4938, "step": 5455 }, { "epoch": 1.3143957631198844, "grad_norm": 1.1640625, "learning_rate": 0.00012688558573190062, "loss": 0.5027, "step": 5460 }, { "epoch": 1.3155994222436207, "grad_norm": 1.2109375, "learning_rate": 0.00012680128140804987, "loss": 0.5026, "step": 5465 }, { "epoch": 1.3168030813673568, "grad_norm": 1.21875, "learning_rate": 0.00012671695566604706, "loss": 0.4694, "step": 5470 }, { "epoch": 1.318006740491093, "grad_norm": 1.1328125, "learning_rate": 0.00012663260864062312, "loss": 0.4949, "step": 5475 }, { "epoch": 1.319210399614829, "grad_norm": 1.1484375, "learning_rate": 0.000126548240466543, "loss": 0.493, "step": 5480 }, { "epoch": 1.320414058738565, "grad_norm": 1.265625, "learning_rate": 0.00012646385127860536, "loss": 0.4796, "step": 5485 }, { "epoch": 1.3216177178623014, "grad_norm": 1.171875, "learning_rate": 0.00012637944121164258, "loss": 0.511, "step": 5490 }, { "epoch": 1.3228213769860375, "grad_norm": 1.125, "learning_rate": 0.00012629501040052024, "loss": 0.4867, "step": 5495 }, { "epoch": 1.3240250361097736, "grad_norm": 1.125, "learning_rate": 0.00012621055898013713, "loss": 0.5021, "step": 5500 }, { "epoch": 1.3240250361097736, "eval_loss": 0.4372206926345825, "eval_runtime": 2.3993, "eval_samples_per_second": 83.357, "eval_steps_per_second": 83.357, "step": 5500 }, { "epoch": 1.32522869523351, "grad_norm": 1.2578125, "learning_rate": 0.00012612608708542503, "loss": 0.5143, "step": 5505 }, { "epoch": 1.326432354357246, "grad_norm": 1.25, "learning_rate": 0.0001260415948513483, "loss": 0.522, "step": 5510 }, { "epoch": 1.3276360134809821, "grad_norm": 1.2109375, "learning_rate": 0.00012595708241290392, "loss": 0.4713, "step": 5515 }, { "epoch": 1.3288396726047185, "grad_norm": 1.3203125, "learning_rate": 0.0001258725499051211, "loss": 0.4949, "step": 5520 }, { "epoch": 1.3300433317284546, "grad_norm": 1.28125, "learning_rate": 0.00012578799746306108, "loss": 0.5109, "step": 5525 }, { "epoch": 1.3312469908521907, "grad_norm": 1.171875, "learning_rate": 0.000125703425221817, "loss": 0.4836, "step": 5530 }, { "epoch": 1.3324506499759268, "grad_norm": 1.2890625, "learning_rate": 0.00012561883331651364, "loss": 0.497, "step": 5535 }, { "epoch": 1.3336543090996629, "grad_norm": 1.25, "learning_rate": 0.00012553422188230716, "loss": 0.4835, "step": 5540 }, { "epoch": 1.3348579682233992, "grad_norm": 0.99609375, "learning_rate": 0.00012544959105438498, "loss": 0.5083, "step": 5545 }, { "epoch": 1.3360616273471353, "grad_norm": 1.25, "learning_rate": 0.00012536494096796543, "loss": 0.5072, "step": 5550 }, { "epoch": 1.3372652864708714, "grad_norm": 1.2734375, "learning_rate": 0.00012528027175829763, "loss": 0.4937, "step": 5555 }, { "epoch": 1.3384689455946077, "grad_norm": 1.328125, "learning_rate": 0.0001251955835606613, "loss": 0.5118, "step": 5560 }, { "epoch": 1.3396726047183438, "grad_norm": 1.3515625, "learning_rate": 0.00012511087651036648, "loss": 0.525, "step": 5565 }, { "epoch": 1.34087626384208, "grad_norm": 1.1875, "learning_rate": 0.00012502615074275335, "loss": 0.4801, "step": 5570 }, { "epoch": 1.342079922965816, "grad_norm": 1.15625, "learning_rate": 0.0001249414063931919, "loss": 0.5221, "step": 5575 }, { "epoch": 1.3432835820895521, "grad_norm": 1.0859375, "learning_rate": 0.0001248566435970819, "loss": 0.4969, "step": 5580 }, { "epoch": 1.3444872412132884, "grad_norm": 1.1875, "learning_rate": 0.00012477186248985257, "loss": 0.4779, "step": 5585 }, { "epoch": 1.3456909003370245, "grad_norm": 1.25, "learning_rate": 0.00012468706320696238, "loss": 0.4816, "step": 5590 }, { "epoch": 1.3468945594607606, "grad_norm": 1.1640625, "learning_rate": 0.00012460224588389885, "loss": 0.5386, "step": 5595 }, { "epoch": 1.348098218584497, "grad_norm": 1.328125, "learning_rate": 0.00012451741065617835, "loss": 0.5128, "step": 5600 }, { "epoch": 1.349301877708233, "grad_norm": 1.2265625, "learning_rate": 0.0001244325576593458, "loss": 0.4803, "step": 5605 }, { "epoch": 1.3505055368319692, "grad_norm": 1.328125, "learning_rate": 0.0001243476870289745, "loss": 0.4959, "step": 5610 }, { "epoch": 1.3517091959557053, "grad_norm": 1.171875, "learning_rate": 0.000124262798900666, "loss": 0.4922, "step": 5615 }, { "epoch": 1.3529128550794414, "grad_norm": 1.2578125, "learning_rate": 0.0001241778934100498, "loss": 0.4875, "step": 5620 }, { "epoch": 1.3541165142031777, "grad_norm": 1.21875, "learning_rate": 0.00012409297069278306, "loss": 0.5178, "step": 5625 }, { "epoch": 1.3553201733269138, "grad_norm": 1.203125, "learning_rate": 0.00012400803088455053, "loss": 0.4882, "step": 5630 }, { "epoch": 1.35652383245065, "grad_norm": 1.2265625, "learning_rate": 0.00012392307412106423, "loss": 0.4877, "step": 5635 }, { "epoch": 1.3577274915743862, "grad_norm": 1.203125, "learning_rate": 0.00012383810053806328, "loss": 0.5071, "step": 5640 }, { "epoch": 1.3589311506981223, "grad_norm": 1.2265625, "learning_rate": 0.00012375311027131372, "loss": 0.5296, "step": 5645 }, { "epoch": 1.3601348098218584, "grad_norm": 1.21875, "learning_rate": 0.00012366810345660823, "loss": 0.5262, "step": 5650 }, { "epoch": 1.3613384689455947, "grad_norm": 1.2890625, "learning_rate": 0.00012358308022976584, "loss": 0.4921, "step": 5655 }, { "epoch": 1.3625421280693308, "grad_norm": 1.171875, "learning_rate": 0.0001234980407266319, "loss": 0.4937, "step": 5660 }, { "epoch": 1.363745787193067, "grad_norm": 1.2265625, "learning_rate": 0.00012341298508307773, "loss": 0.4961, "step": 5665 }, { "epoch": 1.364949446316803, "grad_norm": 1.25, "learning_rate": 0.00012332791343500047, "loss": 0.4887, "step": 5670 }, { "epoch": 1.3661531054405391, "grad_norm": 1.2109375, "learning_rate": 0.0001232428259183228, "loss": 0.4992, "step": 5675 }, { "epoch": 1.3673567645642755, "grad_norm": 1.015625, "learning_rate": 0.00012315772266899275, "loss": 0.4733, "step": 5680 }, { "epoch": 1.3685604236880116, "grad_norm": 1.140625, "learning_rate": 0.00012307260382298345, "loss": 0.5134, "step": 5685 }, { "epoch": 1.3697640828117477, "grad_norm": 1.2578125, "learning_rate": 0.00012298746951629308, "loss": 0.4616, "step": 5690 }, { "epoch": 1.370967741935484, "grad_norm": 1.25, "learning_rate": 0.0001229023198849444, "loss": 0.5291, "step": 5695 }, { "epoch": 1.37217140105922, "grad_norm": 1.265625, "learning_rate": 0.00012281715506498464, "loss": 0.5034, "step": 5700 }, { "epoch": 1.3733750601829562, "grad_norm": 1.1796875, "learning_rate": 0.00012273197519248547, "loss": 0.4917, "step": 5705 }, { "epoch": 1.3745787193066923, "grad_norm": 1.1796875, "learning_rate": 0.00012264678040354237, "loss": 0.4709, "step": 5710 }, { "epoch": 1.3757823784304284, "grad_norm": 1.28125, "learning_rate": 0.00012256157083427482, "loss": 0.5178, "step": 5715 }, { "epoch": 1.3769860375541647, "grad_norm": 1.1640625, "learning_rate": 0.00012247634662082587, "loss": 0.4827, "step": 5720 }, { "epoch": 1.3781896966779008, "grad_norm": 1.125, "learning_rate": 0.00012239110789936199, "loss": 0.4989, "step": 5725 }, { "epoch": 1.379393355801637, "grad_norm": 1.2890625, "learning_rate": 0.0001223058548060728, "loss": 0.4845, "step": 5730 }, { "epoch": 1.3805970149253732, "grad_norm": 1.1953125, "learning_rate": 0.00012222058747717084, "loss": 0.5135, "step": 5735 }, { "epoch": 1.3818006740491093, "grad_norm": 1.15625, "learning_rate": 0.00012213530604889147, "loss": 0.5021, "step": 5740 }, { "epoch": 1.3830043331728454, "grad_norm": 1.1171875, "learning_rate": 0.00012205001065749255, "loss": 0.4686, "step": 5745 }, { "epoch": 1.3842079922965815, "grad_norm": 1.2109375, "learning_rate": 0.00012196470143925426, "loss": 0.5039, "step": 5750 }, { "epoch": 1.3854116514203176, "grad_norm": 1.0703125, "learning_rate": 0.00012187937853047884, "loss": 0.505, "step": 5755 }, { "epoch": 1.386615310544054, "grad_norm": 1.171875, "learning_rate": 0.00012179404206749049, "loss": 0.5195, "step": 5760 }, { "epoch": 1.38781896966779, "grad_norm": 1.1328125, "learning_rate": 0.00012170869218663492, "loss": 0.4773, "step": 5765 }, { "epoch": 1.3890226287915262, "grad_norm": 1.2265625, "learning_rate": 0.00012162332902427938, "loss": 0.4814, "step": 5770 }, { "epoch": 1.3902262879152625, "grad_norm": 1.1953125, "learning_rate": 0.00012153795271681234, "loss": 0.5127, "step": 5775 }, { "epoch": 1.3914299470389986, "grad_norm": 1.2265625, "learning_rate": 0.00012145256340064325, "loss": 0.4639, "step": 5780 }, { "epoch": 1.3926336061627347, "grad_norm": 1.21875, "learning_rate": 0.00012136716121220235, "loss": 0.5135, "step": 5785 }, { "epoch": 1.393837265286471, "grad_norm": 1.203125, "learning_rate": 0.00012128174628794044, "loss": 0.4896, "step": 5790 }, { "epoch": 1.395040924410207, "grad_norm": 1.125, "learning_rate": 0.00012119631876432865, "loss": 0.4836, "step": 5795 }, { "epoch": 1.3962445835339432, "grad_norm": 1.1328125, "learning_rate": 0.00012111087877785829, "loss": 0.4916, "step": 5800 }, { "epoch": 1.3974482426576793, "grad_norm": 1.1328125, "learning_rate": 0.00012102542646504054, "loss": 0.5167, "step": 5805 }, { "epoch": 1.3986519017814154, "grad_norm": 1.2734375, "learning_rate": 0.00012093996196240631, "loss": 0.5174, "step": 5810 }, { "epoch": 1.3998555609051517, "grad_norm": 1.1328125, "learning_rate": 0.00012085448540650597, "loss": 0.4978, "step": 5815 }, { "epoch": 1.4010592200288878, "grad_norm": 1.203125, "learning_rate": 0.0001207689969339091, "loss": 0.4887, "step": 5820 }, { "epoch": 1.402262879152624, "grad_norm": 1.1953125, "learning_rate": 0.00012068349668120441, "loss": 0.4983, "step": 5825 }, { "epoch": 1.4034665382763603, "grad_norm": 1.140625, "learning_rate": 0.00012059798478499935, "loss": 0.5057, "step": 5830 }, { "epoch": 1.4046701974000964, "grad_norm": 1.21875, "learning_rate": 0.00012051246138192003, "loss": 0.5114, "step": 5835 }, { "epoch": 1.4058738565238325, "grad_norm": 1.21875, "learning_rate": 0.00012042692660861097, "loss": 0.4961, "step": 5840 }, { "epoch": 1.4070775156475686, "grad_norm": 1.328125, "learning_rate": 0.00012034138060173472, "loss": 0.4637, "step": 5845 }, { "epoch": 1.4082811747713047, "grad_norm": 1.078125, "learning_rate": 0.00012025582349797189, "loss": 0.464, "step": 5850 }, { "epoch": 1.409484833895041, "grad_norm": 1.1640625, "learning_rate": 0.00012017025543402084, "loss": 0.474, "step": 5855 }, { "epoch": 1.410688493018777, "grad_norm": 1.1484375, "learning_rate": 0.00012008467654659739, "loss": 0.467, "step": 5860 }, { "epoch": 1.4118921521425132, "grad_norm": 1.1875, "learning_rate": 0.00011999908697243468, "loss": 0.4678, "step": 5865 }, { "epoch": 1.4130958112662495, "grad_norm": 1.25, "learning_rate": 0.00011991348684828284, "loss": 0.4767, "step": 5870 }, { "epoch": 1.4142994703899856, "grad_norm": 1.1484375, "learning_rate": 0.00011982787631090896, "loss": 0.4711, "step": 5875 }, { "epoch": 1.4155031295137217, "grad_norm": 1.234375, "learning_rate": 0.00011974225549709677, "loss": 0.4824, "step": 5880 }, { "epoch": 1.4167067886374578, "grad_norm": 1.1796875, "learning_rate": 0.00011965662454364634, "loss": 0.4644, "step": 5885 }, { "epoch": 1.417910447761194, "grad_norm": 1.1875, "learning_rate": 0.00011957098358737399, "loss": 0.4832, "step": 5890 }, { "epoch": 1.4191141068849302, "grad_norm": 1.171875, "learning_rate": 0.00011948533276511205, "loss": 0.4851, "step": 5895 }, { "epoch": 1.4203177660086663, "grad_norm": 1.125, "learning_rate": 0.00011939967221370848, "loss": 0.4649, "step": 5900 }, { "epoch": 1.4215214251324024, "grad_norm": 1.1953125, "learning_rate": 0.000119314002070027, "loss": 0.4774, "step": 5905 }, { "epoch": 1.4227250842561388, "grad_norm": 1.140625, "learning_rate": 0.00011922832247094646, "loss": 0.4969, "step": 5910 }, { "epoch": 1.4239287433798749, "grad_norm": 1.0859375, "learning_rate": 0.00011914263355336094, "loss": 0.5128, "step": 5915 }, { "epoch": 1.425132402503611, "grad_norm": 1.1953125, "learning_rate": 0.00011905693545417933, "loss": 0.4906, "step": 5920 }, { "epoch": 1.4263360616273473, "grad_norm": 1.1953125, "learning_rate": 0.00011897122831032525, "loss": 0.4675, "step": 5925 }, { "epoch": 1.4275397207510834, "grad_norm": 1.1640625, "learning_rate": 0.0001188855122587367, "loss": 0.4809, "step": 5930 }, { "epoch": 1.4287433798748195, "grad_norm": 1.1328125, "learning_rate": 0.000118799787436366, "loss": 0.4919, "step": 5935 }, { "epoch": 1.4299470389985556, "grad_norm": 1.1484375, "learning_rate": 0.00011871405398017943, "loss": 0.4994, "step": 5940 }, { "epoch": 1.4311506981222917, "grad_norm": 1.203125, "learning_rate": 0.0001186283120271571, "loss": 0.4926, "step": 5945 }, { "epoch": 1.432354357246028, "grad_norm": 1.203125, "learning_rate": 0.00011854256171429261, "loss": 0.4811, "step": 5950 }, { "epoch": 1.433558016369764, "grad_norm": 1.125, "learning_rate": 0.00011845680317859303, "loss": 0.514, "step": 5955 }, { "epoch": 1.4347616754935002, "grad_norm": 1.234375, "learning_rate": 0.00011837103655707849, "loss": 0.4987, "step": 5960 }, { "epoch": 1.4359653346172365, "grad_norm": 1.1640625, "learning_rate": 0.00011828526198678208, "loss": 0.4906, "step": 5965 }, { "epoch": 1.4371689937409726, "grad_norm": 1.171875, "learning_rate": 0.00011819947960474954, "loss": 0.4696, "step": 5970 }, { "epoch": 1.4383726528647087, "grad_norm": 1.3046875, "learning_rate": 0.0001181136895480392, "loss": 0.4913, "step": 5975 }, { "epoch": 1.4395763119884448, "grad_norm": 1.21875, "learning_rate": 0.0001180278919537215, "loss": 0.5125, "step": 5980 }, { "epoch": 1.440779971112181, "grad_norm": 1.234375, "learning_rate": 0.00011794208695887903, "loss": 0.4988, "step": 5985 }, { "epoch": 1.4419836302359172, "grad_norm": 1.0859375, "learning_rate": 0.00011785627470060619, "loss": 0.4803, "step": 5990 }, { "epoch": 1.4431872893596533, "grad_norm": 1.2734375, "learning_rate": 0.00011777045531600896, "loss": 0.5158, "step": 5995 }, { "epoch": 1.4443909484833894, "grad_norm": 1.15625, "learning_rate": 0.00011768462894220472, "loss": 0.4966, "step": 6000 }, { "epoch": 1.4443909484833894, "eval_loss": 0.4236730933189392, "eval_runtime": 2.3901, "eval_samples_per_second": 83.68, "eval_steps_per_second": 83.68, "step": 6000 }, { "epoch": 1.4455946076071258, "grad_norm": 1.1796875, "learning_rate": 0.00011759879571632197, "loss": 0.4952, "step": 6005 }, { "epoch": 1.4467982667308619, "grad_norm": 1.1640625, "learning_rate": 0.00011751295577550028, "loss": 0.477, "step": 6010 }, { "epoch": 1.448001925854598, "grad_norm": 1.234375, "learning_rate": 0.00011742710925688983, "loss": 0.4742, "step": 6015 }, { "epoch": 1.449205584978334, "grad_norm": 1.21875, "learning_rate": 0.00011734125629765133, "loss": 0.4721, "step": 6020 }, { "epoch": 1.4504092441020702, "grad_norm": 1.1015625, "learning_rate": 0.00011725539703495584, "loss": 0.5081, "step": 6025 }, { "epoch": 1.4516129032258065, "grad_norm": 1.1484375, "learning_rate": 0.00011716953160598443, "loss": 0.4644, "step": 6030 }, { "epoch": 1.4528165623495426, "grad_norm": 1.0859375, "learning_rate": 0.00011708366014792801, "loss": 0.4901, "step": 6035 }, { "epoch": 1.4540202214732787, "grad_norm": 1.171875, "learning_rate": 0.00011699778279798723, "loss": 0.4536, "step": 6040 }, { "epoch": 1.455223880597015, "grad_norm": 1.0859375, "learning_rate": 0.00011691189969337203, "loss": 0.4701, "step": 6045 }, { "epoch": 1.4564275397207511, "grad_norm": 1.28125, "learning_rate": 0.00011682601097130162, "loss": 0.4936, "step": 6050 }, { "epoch": 1.4576311988444872, "grad_norm": 1.125, "learning_rate": 0.00011674011676900414, "loss": 0.5204, "step": 6055 }, { "epoch": 1.4588348579682233, "grad_norm": 1.265625, "learning_rate": 0.0001166542172237165, "loss": 0.4949, "step": 6060 }, { "epoch": 1.4600385170919594, "grad_norm": 1.203125, "learning_rate": 0.00011656831247268417, "loss": 0.4835, "step": 6065 }, { "epoch": 1.4612421762156957, "grad_norm": 1.203125, "learning_rate": 0.00011648240265316094, "loss": 0.4727, "step": 6070 }, { "epoch": 1.4624458353394318, "grad_norm": 1.1328125, "learning_rate": 0.00011639648790240863, "loss": 0.4894, "step": 6075 }, { "epoch": 1.463649494463168, "grad_norm": 1.1640625, "learning_rate": 0.00011631056835769706, "loss": 0.5233, "step": 6080 }, { "epoch": 1.4648531535869043, "grad_norm": 1.1640625, "learning_rate": 0.00011622464415630352, "loss": 0.4761, "step": 6085 }, { "epoch": 1.4660568127106404, "grad_norm": 1.0625, "learning_rate": 0.00011613871543551298, "loss": 0.4713, "step": 6090 }, { "epoch": 1.4672604718343765, "grad_norm": 1.1328125, "learning_rate": 0.00011605278233261741, "loss": 0.4944, "step": 6095 }, { "epoch": 1.4684641309581128, "grad_norm": 1.1484375, "learning_rate": 0.00011596684498491593, "loss": 0.4928, "step": 6100 }, { "epoch": 1.469667790081849, "grad_norm": 1.296875, "learning_rate": 0.00011588090352971445, "loss": 0.4932, "step": 6105 }, { "epoch": 1.470871449205585, "grad_norm": 1.140625, "learning_rate": 0.00011579495810432523, "loss": 0.4658, "step": 6110 }, { "epoch": 1.472075108329321, "grad_norm": 1.140625, "learning_rate": 0.00011570900884606716, "loss": 0.5048, "step": 6115 }, { "epoch": 1.4732787674530572, "grad_norm": 1.1953125, "learning_rate": 0.00011562305589226506, "loss": 0.5027, "step": 6120 }, { "epoch": 1.4744824265767935, "grad_norm": 1.171875, "learning_rate": 0.00011553709938024977, "loss": 0.4814, "step": 6125 }, { "epoch": 1.4756860857005296, "grad_norm": 1.2109375, "learning_rate": 0.00011545113944735769, "loss": 0.4922, "step": 6130 }, { "epoch": 1.4768897448242657, "grad_norm": 1.328125, "learning_rate": 0.0001153651762309308, "loss": 0.5091, "step": 6135 }, { "epoch": 1.478093403948002, "grad_norm": 1.2734375, "learning_rate": 0.00011527920986831631, "loss": 0.5113, "step": 6140 }, { "epoch": 1.4792970630717381, "grad_norm": 1.1328125, "learning_rate": 0.00011519324049686634, "loss": 0.5091, "step": 6145 }, { "epoch": 1.4805007221954742, "grad_norm": 1.3515625, "learning_rate": 0.00011510726825393796, "loss": 0.5244, "step": 6150 }, { "epoch": 1.4817043813192103, "grad_norm": 1.0859375, "learning_rate": 0.00011502129327689282, "loss": 0.4535, "step": 6155 }, { "epoch": 1.4829080404429464, "grad_norm": 1.078125, "learning_rate": 0.00011493531570309687, "loss": 0.4736, "step": 6160 }, { "epoch": 1.4841116995666828, "grad_norm": 1.2890625, "learning_rate": 0.00011484933566992013, "loss": 0.4997, "step": 6165 }, { "epoch": 1.4853153586904189, "grad_norm": 1.1875, "learning_rate": 0.00011476335331473677, "loss": 0.5114, "step": 6170 }, { "epoch": 1.486519017814155, "grad_norm": 1.234375, "learning_rate": 0.00011467736877492454, "loss": 0.4885, "step": 6175 }, { "epoch": 1.4877226769378913, "grad_norm": 1.1875, "learning_rate": 0.00011459138218786465, "loss": 0.4865, "step": 6180 }, { "epoch": 1.4889263360616274, "grad_norm": 1.1796875, "learning_rate": 0.00011450539369094162, "loss": 0.488, "step": 6185 }, { "epoch": 1.4901299951853635, "grad_norm": 1.1953125, "learning_rate": 0.00011441940342154308, "loss": 0.4846, "step": 6190 }, { "epoch": 1.4913336543090996, "grad_norm": 1.25, "learning_rate": 0.00011433341151705937, "loss": 0.4878, "step": 6195 }, { "epoch": 1.4925373134328357, "grad_norm": 1.2421875, "learning_rate": 0.00011424741811488358, "loss": 0.5122, "step": 6200 }, { "epoch": 1.493740972556572, "grad_norm": 1.265625, "learning_rate": 0.00011416142335241106, "loss": 0.4755, "step": 6205 }, { "epoch": 1.4949446316803081, "grad_norm": 1.2109375, "learning_rate": 0.00011407542736703943, "loss": 0.5136, "step": 6210 }, { "epoch": 1.4961482908040442, "grad_norm": 1.1328125, "learning_rate": 0.00011398943029616821, "loss": 0.4667, "step": 6215 }, { "epoch": 1.4973519499277805, "grad_norm": 1.1796875, "learning_rate": 0.00011390343227719868, "loss": 0.4883, "step": 6220 }, { "epoch": 1.4985556090515166, "grad_norm": 1.3515625, "learning_rate": 0.00011381743344753364, "loss": 0.4689, "step": 6225 }, { "epoch": 1.4997592681752527, "grad_norm": 1.1484375, "learning_rate": 0.00011373143394457716, "loss": 0.4806, "step": 6230 }, { "epoch": 1.500962927298989, "grad_norm": 1.125, "learning_rate": 0.00011364543390573441, "loss": 0.471, "step": 6235 }, { "epoch": 1.502166586422725, "grad_norm": 1.3046875, "learning_rate": 0.0001135594334684114, "loss": 0.4961, "step": 6240 }, { "epoch": 1.5033702455464613, "grad_norm": 1.1953125, "learning_rate": 0.0001134734327700148, "loss": 0.4623, "step": 6245 }, { "epoch": 1.5045739046701974, "grad_norm": 1.2109375, "learning_rate": 0.00011338743194795163, "loss": 0.4349, "step": 6250 }, { "epoch": 1.5057775637939335, "grad_norm": 1.234375, "learning_rate": 0.00011330143113962918, "loss": 0.4838, "step": 6255 }, { "epoch": 1.5069812229176698, "grad_norm": 1.140625, "learning_rate": 0.00011321543048245474, "loss": 0.4885, "step": 6260 }, { "epoch": 1.5081848820414059, "grad_norm": 1.25, "learning_rate": 0.00011312943011383526, "loss": 0.4958, "step": 6265 }, { "epoch": 1.509388541165142, "grad_norm": 1.1875, "learning_rate": 0.0001130434301711773, "loss": 0.5054, "step": 6270 }, { "epoch": 1.5105922002888783, "grad_norm": 1.0859375, "learning_rate": 0.00011295743079188675, "loss": 0.4916, "step": 6275 }, { "epoch": 1.5117958594126142, "grad_norm": 1.28125, "learning_rate": 0.0001128714321133685, "loss": 0.4826, "step": 6280 }, { "epoch": 1.5129995185363505, "grad_norm": 1.234375, "learning_rate": 0.00011278543427302645, "loss": 0.5122, "step": 6285 }, { "epoch": 1.5142031776600868, "grad_norm": 1.1640625, "learning_rate": 0.0001126994374082631, "loss": 0.4941, "step": 6290 }, { "epoch": 1.5154068367838227, "grad_norm": 1.1875, "learning_rate": 0.0001126134416564794, "loss": 0.4636, "step": 6295 }, { "epoch": 1.516610495907559, "grad_norm": 1.15625, "learning_rate": 0.0001125274471550745, "loss": 0.4767, "step": 6300 }, { "epoch": 1.5178141550312951, "grad_norm": 1.2734375, "learning_rate": 0.00011244145404144554, "loss": 0.4698, "step": 6305 }, { "epoch": 1.5190178141550312, "grad_norm": 1.1328125, "learning_rate": 0.0001123554624529875, "loss": 0.4936, "step": 6310 }, { "epoch": 1.5202214732787676, "grad_norm": 1.0859375, "learning_rate": 0.00011226947252709293, "loss": 0.5105, "step": 6315 }, { "epoch": 1.5214251324025037, "grad_norm": 1.3125, "learning_rate": 0.00011218348440115162, "loss": 0.5167, "step": 6320 }, { "epoch": 1.5226287915262398, "grad_norm": 1.0859375, "learning_rate": 0.00011209749821255062, "loss": 0.4814, "step": 6325 }, { "epoch": 1.523832450649976, "grad_norm": 1.1875, "learning_rate": 0.00011201151409867374, "loss": 0.4943, "step": 6330 }, { "epoch": 1.525036109773712, "grad_norm": 1.3203125, "learning_rate": 0.0001119255321969016, "loss": 0.4951, "step": 6335 }, { "epoch": 1.5262397688974483, "grad_norm": 1.1640625, "learning_rate": 0.00011183955264461125, "loss": 0.4885, "step": 6340 }, { "epoch": 1.5274434280211844, "grad_norm": 1.1640625, "learning_rate": 0.00011175357557917597, "loss": 0.4522, "step": 6345 }, { "epoch": 1.5286470871449205, "grad_norm": 1.1171875, "learning_rate": 0.00011166760113796501, "loss": 0.4875, "step": 6350 }, { "epoch": 1.5298507462686568, "grad_norm": 1.234375, "learning_rate": 0.00011158162945834355, "loss": 0.4492, "step": 6355 }, { "epoch": 1.531054405392393, "grad_norm": 1.1484375, "learning_rate": 0.00011149566067767227, "loss": 0.48, "step": 6360 }, { "epoch": 1.532258064516129, "grad_norm": 1.2109375, "learning_rate": 0.00011140969493330727, "loss": 0.4518, "step": 6365 }, { "epoch": 1.5334617236398653, "grad_norm": 1.234375, "learning_rate": 0.00011132373236259974, "loss": 0.435, "step": 6370 }, { "epoch": 1.5346653827636012, "grad_norm": 1.1484375, "learning_rate": 0.00011123777310289585, "loss": 0.5019, "step": 6375 }, { "epoch": 1.5358690418873375, "grad_norm": 1.109375, "learning_rate": 0.00011115181729153642, "loss": 0.4437, "step": 6380 }, { "epoch": 1.5370727010110736, "grad_norm": 1.2734375, "learning_rate": 0.00011106586506585685, "loss": 0.4861, "step": 6385 }, { "epoch": 1.5382763601348097, "grad_norm": 1.1328125, "learning_rate": 0.00011097991656318678, "loss": 0.4799, "step": 6390 }, { "epoch": 1.539480019258546, "grad_norm": 1.1640625, "learning_rate": 0.00011089397192084982, "loss": 0.5087, "step": 6395 }, { "epoch": 1.5406836783822822, "grad_norm": 1.1796875, "learning_rate": 0.00011080803127616352, "loss": 0.4612, "step": 6400 }, { "epoch": 1.5418873375060183, "grad_norm": 1.15625, "learning_rate": 0.00011072209476643897, "loss": 0.4816, "step": 6405 }, { "epoch": 1.5430909966297546, "grad_norm": 1.1640625, "learning_rate": 0.00011063616252898071, "loss": 0.5003, "step": 6410 }, { "epoch": 1.5442946557534905, "grad_norm": 1.2578125, "learning_rate": 0.00011055023470108644, "loss": 0.4642, "step": 6415 }, { "epoch": 1.5454983148772268, "grad_norm": 1.109375, "learning_rate": 0.00011046431142004676, "loss": 0.5087, "step": 6420 }, { "epoch": 1.5467019740009629, "grad_norm": 1.1953125, "learning_rate": 0.00011037839282314504, "loss": 0.4676, "step": 6425 }, { "epoch": 1.547905633124699, "grad_norm": 1.25, "learning_rate": 0.00011029247904765724, "loss": 0.5149, "step": 6430 }, { "epoch": 1.5491092922484353, "grad_norm": 1.1953125, "learning_rate": 0.00011020657023085149, "loss": 0.4821, "step": 6435 }, { "epoch": 1.5503129513721714, "grad_norm": 1.1328125, "learning_rate": 0.00011012066650998805, "loss": 0.4744, "step": 6440 }, { "epoch": 1.5515166104959075, "grad_norm": 1.140625, "learning_rate": 0.00011003476802231909, "loss": 0.4787, "step": 6445 }, { "epoch": 1.5527202696196438, "grad_norm": 1.21875, "learning_rate": 0.00010994887490508834, "loss": 0.4899, "step": 6450 }, { "epoch": 1.55392392874338, "grad_norm": 1.1953125, "learning_rate": 0.000109862987295531, "loss": 0.4836, "step": 6455 }, { "epoch": 1.555127587867116, "grad_norm": 1.2109375, "learning_rate": 0.00010977710533087346, "loss": 0.4532, "step": 6460 }, { "epoch": 1.5563312469908523, "grad_norm": 1.3046875, "learning_rate": 0.00010969122914833304, "loss": 0.4874, "step": 6465 }, { "epoch": 1.5575349061145882, "grad_norm": 1.1484375, "learning_rate": 0.00010960535888511789, "loss": 0.4735, "step": 6470 }, { "epoch": 1.5587385652383245, "grad_norm": 1.1015625, "learning_rate": 0.00010951949467842669, "loss": 0.4959, "step": 6475 }, { "epoch": 1.5599422243620606, "grad_norm": 1.1953125, "learning_rate": 0.00010943363666544845, "loss": 0.4706, "step": 6480 }, { "epoch": 1.5611458834857967, "grad_norm": 1.0625, "learning_rate": 0.00010934778498336222, "loss": 0.4864, "step": 6485 }, { "epoch": 1.562349542609533, "grad_norm": 1.1640625, "learning_rate": 0.00010926193976933696, "loss": 0.5178, "step": 6490 }, { "epoch": 1.5635532017332692, "grad_norm": 1.15625, "learning_rate": 0.00010917610116053135, "loss": 0.4709, "step": 6495 }, { "epoch": 1.5647568608570053, "grad_norm": 1.203125, "learning_rate": 0.00010909026929409354, "loss": 0.5032, "step": 6500 }, { "epoch": 1.5647568608570053, "eval_loss": 0.41056159138679504, "eval_runtime": 2.398, "eval_samples_per_second": 83.402, "eval_steps_per_second": 83.402, "step": 6500 }, { "epoch": 1.5659605199807416, "grad_norm": 1.2578125, "learning_rate": 0.00010900444430716075, "loss": 0.495, "step": 6505 }, { "epoch": 1.5671641791044775, "grad_norm": 1.203125, "learning_rate": 0.00010891862633685939, "loss": 0.4829, "step": 6510 }, { "epoch": 1.5683678382282138, "grad_norm": 1.203125, "learning_rate": 0.0001088328155203045, "loss": 0.4818, "step": 6515 }, { "epoch": 1.56957149735195, "grad_norm": 1.0859375, "learning_rate": 0.0001087470119945998, "loss": 0.476, "step": 6520 }, { "epoch": 1.570775156475686, "grad_norm": 1.109375, "learning_rate": 0.00010866121589683738, "loss": 0.4425, "step": 6525 }, { "epoch": 1.5719788155994223, "grad_norm": 1.1171875, "learning_rate": 0.00010857542736409733, "loss": 0.5139, "step": 6530 }, { "epoch": 1.5731824747231584, "grad_norm": 1.15625, "learning_rate": 0.00010848964653344778, "loss": 0.4963, "step": 6535 }, { "epoch": 1.5743861338468945, "grad_norm": 1.1171875, "learning_rate": 0.00010840387354194445, "loss": 0.4689, "step": 6540 }, { "epoch": 1.5755897929706308, "grad_norm": 1.203125, "learning_rate": 0.00010831810852663065, "loss": 0.4876, "step": 6545 }, { "epoch": 1.5767934520943667, "grad_norm": 1.1640625, "learning_rate": 0.0001082323516245369, "loss": 0.4751, "step": 6550 }, { "epoch": 1.577997111218103, "grad_norm": 1.21875, "learning_rate": 0.00010814660297268071, "loss": 0.4719, "step": 6555 }, { "epoch": 1.5792007703418391, "grad_norm": 1.2578125, "learning_rate": 0.00010806086270806646, "loss": 0.4712, "step": 6560 }, { "epoch": 1.5804044294655752, "grad_norm": 1.203125, "learning_rate": 0.0001079751309676851, "loss": 0.4717, "step": 6565 }, { "epoch": 1.5816080885893116, "grad_norm": 1.1328125, "learning_rate": 0.00010788940788851398, "loss": 0.4869, "step": 6570 }, { "epoch": 1.5828117477130477, "grad_norm": 1.1328125, "learning_rate": 0.00010780369360751663, "loss": 0.504, "step": 6575 }, { "epoch": 1.5840154068367838, "grad_norm": 1.0546875, "learning_rate": 0.00010771798826164241, "loss": 0.4856, "step": 6580 }, { "epoch": 1.58521906596052, "grad_norm": 1.2109375, "learning_rate": 0.00010763229198782662, "loss": 0.4648, "step": 6585 }, { "epoch": 1.5864227250842562, "grad_norm": 1.1953125, "learning_rate": 0.00010754660492298984, "loss": 0.4842, "step": 6590 }, { "epoch": 1.5876263842079923, "grad_norm": 1.2109375, "learning_rate": 0.00010746092720403805, "loss": 0.4613, "step": 6595 }, { "epoch": 1.5888300433317286, "grad_norm": 1.3359375, "learning_rate": 0.00010737525896786234, "loss": 0.4635, "step": 6600 }, { "epoch": 1.5900337024554645, "grad_norm": 1.1171875, "learning_rate": 0.00010728960035133852, "loss": 0.4909, "step": 6605 }, { "epoch": 1.5912373615792008, "grad_norm": 1.171875, "learning_rate": 0.00010720395149132714, "loss": 0.4731, "step": 6610 }, { "epoch": 1.592441020702937, "grad_norm": 1.203125, "learning_rate": 0.0001071183125246731, "loss": 0.5077, "step": 6615 }, { "epoch": 1.593644679826673, "grad_norm": 1.2265625, "learning_rate": 0.00010703268358820552, "loss": 0.472, "step": 6620 }, { "epoch": 1.5948483389504093, "grad_norm": 1.2578125, "learning_rate": 0.00010694706481873753, "loss": 0.4272, "step": 6625 }, { "epoch": 1.5960519980741454, "grad_norm": 1.1015625, "learning_rate": 0.00010686145635306588, "loss": 0.4776, "step": 6630 }, { "epoch": 1.5972556571978815, "grad_norm": 1.125, "learning_rate": 0.00010677585832797107, "loss": 0.4335, "step": 6635 }, { "epoch": 1.5984593163216179, "grad_norm": 1.1171875, "learning_rate": 0.0001066902708802167, "loss": 0.4629, "step": 6640 }, { "epoch": 1.5996629754453537, "grad_norm": 1.1875, "learning_rate": 0.00010660469414654962, "loss": 0.492, "step": 6645 }, { "epoch": 1.60086663456909, "grad_norm": 1.21875, "learning_rate": 0.00010651912826369955, "loss": 0.477, "step": 6650 }, { "epoch": 1.6020702936928262, "grad_norm": 1.1484375, "learning_rate": 0.00010643357336837872, "loss": 0.4892, "step": 6655 }, { "epoch": 1.6032739528165623, "grad_norm": 1.171875, "learning_rate": 0.00010634802959728206, "loss": 0.5039, "step": 6660 }, { "epoch": 1.6044776119402986, "grad_norm": 1.0546875, "learning_rate": 0.00010626249708708647, "loss": 0.5071, "step": 6665 }, { "epoch": 1.6056812710640347, "grad_norm": 1.21875, "learning_rate": 0.00010617697597445104, "loss": 0.5055, "step": 6670 }, { "epoch": 1.6068849301877708, "grad_norm": 1.09375, "learning_rate": 0.00010609146639601659, "loss": 0.489, "step": 6675 }, { "epoch": 1.608088589311507, "grad_norm": 1.1796875, "learning_rate": 0.00010600596848840544, "loss": 0.5205, "step": 6680 }, { "epoch": 1.609292248435243, "grad_norm": 1.1953125, "learning_rate": 0.0001059204823882214, "loss": 0.495, "step": 6685 }, { "epoch": 1.6104959075589793, "grad_norm": 1.15625, "learning_rate": 0.00010583500823204933, "loss": 0.4569, "step": 6690 }, { "epoch": 1.6116995666827154, "grad_norm": 1.1484375, "learning_rate": 0.00010574954615645499, "loss": 0.4698, "step": 6695 }, { "epoch": 1.6129032258064515, "grad_norm": 1.1796875, "learning_rate": 0.00010566409629798486, "loss": 0.4777, "step": 6700 }, { "epoch": 1.6141068849301878, "grad_norm": 1.078125, "learning_rate": 0.00010557865879316595, "loss": 0.4874, "step": 6705 }, { "epoch": 1.615310544053924, "grad_norm": 1.140625, "learning_rate": 0.0001054932337785055, "loss": 0.4672, "step": 6710 }, { "epoch": 1.61651420317766, "grad_norm": 1.0859375, "learning_rate": 0.00010540782139049073, "loss": 0.427, "step": 6715 }, { "epoch": 1.6177178623013964, "grad_norm": 1.203125, "learning_rate": 0.00010532242176558877, "loss": 0.4983, "step": 6720 }, { "epoch": 1.6189215214251322, "grad_norm": 1.15625, "learning_rate": 0.00010523703504024633, "loss": 0.4716, "step": 6725 }, { "epoch": 1.6201251805488686, "grad_norm": 1.2421875, "learning_rate": 0.0001051516613508895, "loss": 0.4437, "step": 6730 }, { "epoch": 1.6213288396726049, "grad_norm": 1.15625, "learning_rate": 0.00010506630083392358, "loss": 0.49, "step": 6735 }, { "epoch": 1.6225324987963408, "grad_norm": 1.1328125, "learning_rate": 0.00010498095362573273, "loss": 0.4688, "step": 6740 }, { "epoch": 1.623736157920077, "grad_norm": 1.1640625, "learning_rate": 0.00010489561986267997, "loss": 0.482, "step": 6745 }, { "epoch": 1.6249398170438132, "grad_norm": 1.1640625, "learning_rate": 0.00010481029968110677, "loss": 0.4841, "step": 6750 }, { "epoch": 1.6261434761675493, "grad_norm": 1.109375, "learning_rate": 0.00010472499321733286, "loss": 0.5009, "step": 6755 }, { "epoch": 1.6273471352912856, "grad_norm": 1.1015625, "learning_rate": 0.00010463970060765619, "loss": 0.4861, "step": 6760 }, { "epoch": 1.6285507944150217, "grad_norm": 1.234375, "learning_rate": 0.00010455442198835239, "loss": 0.4887, "step": 6765 }, { "epoch": 1.6297544535387578, "grad_norm": 1.25, "learning_rate": 0.00010446915749567487, "loss": 0.4773, "step": 6770 }, { "epoch": 1.6309581126624941, "grad_norm": 1.171875, "learning_rate": 0.00010438390726585444, "loss": 0.4515, "step": 6775 }, { "epoch": 1.63216177178623, "grad_norm": 1.328125, "learning_rate": 0.00010429867143509911, "loss": 0.4575, "step": 6780 }, { "epoch": 1.6333654309099663, "grad_norm": 1.1484375, "learning_rate": 0.0001042134501395939, "loss": 0.4457, "step": 6785 }, { "epoch": 1.6345690900337024, "grad_norm": 1.203125, "learning_rate": 0.00010412824351550055, "loss": 0.4922, "step": 6790 }, { "epoch": 1.6357727491574385, "grad_norm": 1.2734375, "learning_rate": 0.00010404305169895746, "loss": 0.4796, "step": 6795 }, { "epoch": 1.6369764082811749, "grad_norm": 1.171875, "learning_rate": 0.00010395787482607926, "loss": 0.4898, "step": 6800 }, { "epoch": 1.638180067404911, "grad_norm": 1.234375, "learning_rate": 0.00010387271303295677, "loss": 0.4823, "step": 6805 }, { "epoch": 1.639383726528647, "grad_norm": 1.21875, "learning_rate": 0.00010378756645565674, "loss": 0.4629, "step": 6810 }, { "epoch": 1.6405873856523834, "grad_norm": 1.234375, "learning_rate": 0.00010370243523022154, "loss": 0.4652, "step": 6815 }, { "epoch": 1.6417910447761193, "grad_norm": 1.21875, "learning_rate": 0.00010361731949266905, "loss": 0.4604, "step": 6820 }, { "epoch": 1.6429947038998556, "grad_norm": 1.1484375, "learning_rate": 0.00010353221937899241, "loss": 0.4678, "step": 6825 }, { "epoch": 1.6441983630235917, "grad_norm": 1.1484375, "learning_rate": 0.00010344713502515978, "loss": 0.4846, "step": 6830 }, { "epoch": 1.6454020221473278, "grad_norm": 1.203125, "learning_rate": 0.00010336206656711416, "loss": 0.4587, "step": 6835 }, { "epoch": 1.646605681271064, "grad_norm": 1.0546875, "learning_rate": 0.00010327701414077307, "loss": 0.4833, "step": 6840 }, { "epoch": 1.6478093403948002, "grad_norm": 1.0859375, "learning_rate": 0.00010319197788202859, "loss": 0.4579, "step": 6845 }, { "epoch": 1.6490129995185363, "grad_norm": 1.1796875, "learning_rate": 0.00010310695792674681, "loss": 0.4999, "step": 6850 }, { "epoch": 1.6502166586422726, "grad_norm": 1.1796875, "learning_rate": 0.00010302195441076776, "loss": 0.4468, "step": 6855 }, { "epoch": 1.6514203177660085, "grad_norm": 1.1953125, "learning_rate": 0.0001029369674699054, "loss": 0.4587, "step": 6860 }, { "epoch": 1.6526239768897448, "grad_norm": 1.1640625, "learning_rate": 0.00010285199723994696, "loss": 0.4643, "step": 6865 }, { "epoch": 1.653827636013481, "grad_norm": 1.140625, "learning_rate": 0.00010276704385665316, "loss": 0.5027, "step": 6870 }, { "epoch": 1.655031295137217, "grad_norm": 1.109375, "learning_rate": 0.00010268210745575766, "loss": 0.4241, "step": 6875 }, { "epoch": 1.6562349542609534, "grad_norm": 1.2109375, "learning_rate": 0.00010259718817296711, "loss": 0.4809, "step": 6880 }, { "epoch": 1.6574386133846895, "grad_norm": 1.1015625, "learning_rate": 0.00010251228614396078, "loss": 0.5063, "step": 6885 }, { "epoch": 1.6586422725084256, "grad_norm": 1.203125, "learning_rate": 0.00010242740150439024, "loss": 0.4927, "step": 6890 }, { "epoch": 1.6598459316321619, "grad_norm": 1.15625, "learning_rate": 0.0001023425343898795, "loss": 0.4651, "step": 6895 }, { "epoch": 1.661049590755898, "grad_norm": 1.1953125, "learning_rate": 0.00010225768493602437, "loss": 0.4627, "step": 6900 }, { "epoch": 1.662253249879634, "grad_norm": 1.1640625, "learning_rate": 0.0001021728532783926, "loss": 0.4753, "step": 6905 }, { "epoch": 1.6634569090033704, "grad_norm": 1.1875, "learning_rate": 0.00010208803955252335, "loss": 0.4603, "step": 6910 }, { "epoch": 1.6646605681271063, "grad_norm": 1.1796875, "learning_rate": 0.00010200324389392724, "loss": 0.4871, "step": 6915 }, { "epoch": 1.6658642272508426, "grad_norm": 1.140625, "learning_rate": 0.00010191846643808607, "loss": 0.4724, "step": 6920 }, { "epoch": 1.6670678863745787, "grad_norm": 1.1328125, "learning_rate": 0.00010183370732045236, "loss": 0.473, "step": 6925 }, { "epoch": 1.6682715454983148, "grad_norm": 1.296875, "learning_rate": 0.00010174896667644954, "loss": 0.5103, "step": 6930 }, { "epoch": 1.6694752046220511, "grad_norm": 1.125, "learning_rate": 0.0001016642446414714, "loss": 0.4525, "step": 6935 }, { "epoch": 1.6706788637457872, "grad_norm": 1.21875, "learning_rate": 0.00010157954135088202, "loss": 0.4582, "step": 6940 }, { "epoch": 1.6718825228695233, "grad_norm": 1.3125, "learning_rate": 0.0001014948569400156, "loss": 0.4983, "step": 6945 }, { "epoch": 1.6730861819932596, "grad_norm": 1.171875, "learning_rate": 0.00010141019154417605, "loss": 0.4344, "step": 6950 }, { "epoch": 1.6742898411169955, "grad_norm": 1.1796875, "learning_rate": 0.00010132554529863701, "loss": 0.4866, "step": 6955 }, { "epoch": 1.6754935002407318, "grad_norm": 1.234375, "learning_rate": 0.00010124091833864149, "loss": 0.445, "step": 6960 }, { "epoch": 1.676697159364468, "grad_norm": 1.21875, "learning_rate": 0.00010115631079940164, "loss": 0.4949, "step": 6965 }, { "epoch": 1.677900818488204, "grad_norm": 1.1171875, "learning_rate": 0.00010107172281609865, "loss": 0.4533, "step": 6970 }, { "epoch": 1.6791044776119404, "grad_norm": 1.234375, "learning_rate": 0.0001009871545238824, "loss": 0.4666, "step": 6975 }, { "epoch": 1.6803081367356765, "grad_norm": 1.171875, "learning_rate": 0.0001009026060578714, "loss": 0.4385, "step": 6980 }, { "epoch": 1.6815117958594126, "grad_norm": 1.09375, "learning_rate": 0.00010081807755315239, "loss": 0.4661, "step": 6985 }, { "epoch": 1.682715454983149, "grad_norm": 1.2890625, "learning_rate": 0.00010073356914478021, "loss": 0.4982, "step": 6990 }, { "epoch": 1.6839191141068848, "grad_norm": 1.1953125, "learning_rate": 0.00010064908096777771, "loss": 0.4564, "step": 6995 }, { "epoch": 1.685122773230621, "grad_norm": 1.1640625, "learning_rate": 0.00010056461315713526, "loss": 0.4728, "step": 7000 }, { "epoch": 1.685122773230621, "eval_loss": 0.4017234146595001, "eval_runtime": 2.4276, "eval_samples_per_second": 82.387, "eval_steps_per_second": 82.387, "step": 7000 }, { "epoch": 1.6863264323543572, "grad_norm": 1.21875, "learning_rate": 0.00010048016584781085, "loss": 0.4775, "step": 7005 }, { "epoch": 1.6875300914780933, "grad_norm": 1.1953125, "learning_rate": 0.00010039573917472955, "loss": 0.4673, "step": 7010 }, { "epoch": 1.6887337506018296, "grad_norm": 1.1796875, "learning_rate": 0.0001003113332727836, "loss": 0.4877, "step": 7015 }, { "epoch": 1.6899374097255657, "grad_norm": 1.109375, "learning_rate": 0.000100226948276832, "loss": 0.4254, "step": 7020 }, { "epoch": 1.6911410688493018, "grad_norm": 1.078125, "learning_rate": 0.00010014258432170028, "loss": 0.4543, "step": 7025 }, { "epoch": 1.6923447279730381, "grad_norm": 1.1484375, "learning_rate": 0.00010005824154218047, "loss": 0.4555, "step": 7030 }, { "epoch": 1.6935483870967742, "grad_norm": 1.234375, "learning_rate": 9.997392007303066e-05, "loss": 0.4845, "step": 7035 }, { "epoch": 1.6947520462205103, "grad_norm": 1.0546875, "learning_rate": 9.9889620048975e-05, "loss": 0.4641, "step": 7040 }, { "epoch": 1.6959557053442467, "grad_norm": 1.2265625, "learning_rate": 9.98053416047033e-05, "loss": 0.4804, "step": 7045 }, { "epoch": 1.6971593644679825, "grad_norm": 1.203125, "learning_rate": 9.972108487487087e-05, "loss": 0.4517, "step": 7050 }, { "epoch": 1.6983630235917189, "grad_norm": 1.15625, "learning_rate": 9.963684999409843e-05, "loss": 0.4462, "step": 7055 }, { "epoch": 1.699566682715455, "grad_norm": 1.2109375, "learning_rate": 9.955263709697163e-05, "loss": 0.4604, "step": 7060 }, { "epoch": 1.700770341839191, "grad_norm": 1.125, "learning_rate": 9.946844631804118e-05, "loss": 0.44, "step": 7065 }, { "epoch": 1.7019740009629274, "grad_norm": 1.2109375, "learning_rate": 9.938427779182235e-05, "loss": 0.4756, "step": 7070 }, { "epoch": 1.7031776600866635, "grad_norm": 1.21875, "learning_rate": 9.930013165279483e-05, "loss": 0.4968, "step": 7075 }, { "epoch": 1.7043813192103996, "grad_norm": 1.1171875, "learning_rate": 9.92160080354026e-05, "loss": 0.4313, "step": 7080 }, { "epoch": 1.705584978334136, "grad_norm": 1.15625, "learning_rate": 9.913190707405364e-05, "loss": 0.453, "step": 7085 }, { "epoch": 1.7067886374578718, "grad_norm": 1.1875, "learning_rate": 9.90478289031197e-05, "loss": 0.4536, "step": 7090 }, { "epoch": 1.7079922965816081, "grad_norm": 1.09375, "learning_rate": 9.89637736569362e-05, "loss": 0.4394, "step": 7095 }, { "epoch": 1.7091959557053442, "grad_norm": 1.0703125, "learning_rate": 9.887974146980183e-05, "loss": 0.4743, "step": 7100 }, { "epoch": 1.7103996148290803, "grad_norm": 1.1953125, "learning_rate": 9.879573247597849e-05, "loss": 0.4516, "step": 7105 }, { "epoch": 1.7116032739528166, "grad_norm": 1.1796875, "learning_rate": 9.8711746809691e-05, "loss": 0.4844, "step": 7110 }, { "epoch": 1.7128069330765527, "grad_norm": 1.140625, "learning_rate": 9.862778460512697e-05, "loss": 0.4653, "step": 7115 }, { "epoch": 1.7140105922002888, "grad_norm": 1.109375, "learning_rate": 9.854384599643646e-05, "loss": 0.4733, "step": 7120 }, { "epoch": 1.7152142513240252, "grad_norm": 1.15625, "learning_rate": 9.845993111773181e-05, "loss": 0.4359, "step": 7125 }, { "epoch": 1.716417910447761, "grad_norm": 1.3359375, "learning_rate": 9.837604010308757e-05, "loss": 0.5006, "step": 7130 }, { "epoch": 1.7176215695714974, "grad_norm": 1.2109375, "learning_rate": 9.829217308653996e-05, "loss": 0.4801, "step": 7135 }, { "epoch": 1.7188252286952335, "grad_norm": 1.1328125, "learning_rate": 9.820833020208709e-05, "loss": 0.4744, "step": 7140 }, { "epoch": 1.7200288878189696, "grad_norm": 1.203125, "learning_rate": 9.81245115836883e-05, "loss": 0.4371, "step": 7145 }, { "epoch": 1.7212325469427059, "grad_norm": 1.046875, "learning_rate": 9.804071736526432e-05, "loss": 0.4655, "step": 7150 }, { "epoch": 1.722436206066442, "grad_norm": 1.078125, "learning_rate": 9.795694768069681e-05, "loss": 0.4688, "step": 7155 }, { "epoch": 1.723639865190178, "grad_norm": 1.28125, "learning_rate": 9.787320266382821e-05, "loss": 0.4847, "step": 7160 }, { "epoch": 1.7248435243139144, "grad_norm": 1.125, "learning_rate": 9.778948244846171e-05, "loss": 0.436, "step": 7165 }, { "epoch": 1.7260471834376505, "grad_norm": 1.1484375, "learning_rate": 9.770578716836058e-05, "loss": 0.4586, "step": 7170 }, { "epoch": 1.7272508425613866, "grad_norm": 1.2109375, "learning_rate": 9.762211695724857e-05, "loss": 0.508, "step": 7175 }, { "epoch": 1.728454501685123, "grad_norm": 1.265625, "learning_rate": 9.753847194880916e-05, "loss": 0.4431, "step": 7180 }, { "epoch": 1.7296581608088588, "grad_norm": 1.21875, "learning_rate": 9.745485227668559e-05, "loss": 0.4897, "step": 7185 }, { "epoch": 1.7308618199325951, "grad_norm": 1.203125, "learning_rate": 9.73712580744808e-05, "loss": 0.5014, "step": 7190 }, { "epoch": 1.7320654790563312, "grad_norm": 1.1796875, "learning_rate": 9.728768947575673e-05, "loss": 0.4469, "step": 7195 }, { "epoch": 1.7332691381800673, "grad_norm": 1.1328125, "learning_rate": 9.720414661403469e-05, "loss": 0.4667, "step": 7200 }, { "epoch": 1.7344727973038037, "grad_norm": 1.125, "learning_rate": 9.712062962279472e-05, "loss": 0.4498, "step": 7205 }, { "epoch": 1.7356764564275398, "grad_norm": 1.1875, "learning_rate": 9.703713863547554e-05, "loss": 0.4739, "step": 7210 }, { "epoch": 1.7368801155512759, "grad_norm": 1.265625, "learning_rate": 9.69536737854744e-05, "loss": 0.4298, "step": 7215 }, { "epoch": 1.7380837746750122, "grad_norm": 1.21875, "learning_rate": 9.687023520614666e-05, "loss": 0.4709, "step": 7220 }, { "epoch": 1.739287433798748, "grad_norm": 1.1875, "learning_rate": 9.678682303080585e-05, "loss": 0.4882, "step": 7225 }, { "epoch": 1.7404910929224844, "grad_norm": 1.15625, "learning_rate": 9.670343739272324e-05, "loss": 0.4488, "step": 7230 }, { "epoch": 1.7416947520462205, "grad_norm": 1.21875, "learning_rate": 9.662007842512765e-05, "loss": 0.4741, "step": 7235 }, { "epoch": 1.7428984111699566, "grad_norm": 1.15625, "learning_rate": 9.653674626120536e-05, "loss": 0.4721, "step": 7240 }, { "epoch": 1.744102070293693, "grad_norm": 1.078125, "learning_rate": 9.645344103409982e-05, "loss": 0.4671, "step": 7245 }, { "epoch": 1.745305729417429, "grad_norm": 1.640625, "learning_rate": 9.637016287691144e-05, "loss": 0.4693, "step": 7250 }, { "epoch": 1.746509388541165, "grad_norm": 1.1015625, "learning_rate": 9.628691192269735e-05, "loss": 0.4494, "step": 7255 }, { "epoch": 1.7477130476649014, "grad_norm": 1.140625, "learning_rate": 9.62036883044712e-05, "loss": 0.464, "step": 7260 }, { "epoch": 1.7489167067886373, "grad_norm": 1.21875, "learning_rate": 9.612049215520306e-05, "loss": 0.4814, "step": 7265 }, { "epoch": 1.7501203659123736, "grad_norm": 1.1328125, "learning_rate": 9.603732360781898e-05, "loss": 0.4608, "step": 7270 }, { "epoch": 1.7513240250361097, "grad_norm": 1.09375, "learning_rate": 9.595418279520106e-05, "loss": 0.445, "step": 7275 }, { "epoch": 1.7525276841598458, "grad_norm": 1.140625, "learning_rate": 9.587106985018696e-05, "loss": 0.4452, "step": 7280 }, { "epoch": 1.7537313432835822, "grad_norm": 1.09375, "learning_rate": 9.578798490556983e-05, "loss": 0.4525, "step": 7285 }, { "epoch": 1.7549350024073183, "grad_norm": 1.15625, "learning_rate": 9.570492809409817e-05, "loss": 0.4515, "step": 7290 }, { "epoch": 1.7561386615310544, "grad_norm": 1.0859375, "learning_rate": 9.562189954847543e-05, "loss": 0.4457, "step": 7295 }, { "epoch": 1.7573423206547907, "grad_norm": 1.1484375, "learning_rate": 9.553889940135997e-05, "loss": 0.4403, "step": 7300 }, { "epoch": 1.7585459797785266, "grad_norm": 1.078125, "learning_rate": 9.545592778536476e-05, "loss": 0.458, "step": 7305 }, { "epoch": 1.7597496389022629, "grad_norm": 1.1875, "learning_rate": 9.537298483305711e-05, "loss": 0.4915, "step": 7310 }, { "epoch": 1.7609532980259992, "grad_norm": 1.0703125, "learning_rate": 9.529007067695869e-05, "loss": 0.4384, "step": 7315 }, { "epoch": 1.762156957149735, "grad_norm": 1.1484375, "learning_rate": 9.520718544954493e-05, "loss": 0.4502, "step": 7320 }, { "epoch": 1.7633606162734714, "grad_norm": 1.125, "learning_rate": 9.51243292832453e-05, "loss": 0.4817, "step": 7325 }, { "epoch": 1.7645642753972075, "grad_norm": 1.1015625, "learning_rate": 9.504150231044268e-05, "loss": 0.4458, "step": 7330 }, { "epoch": 1.7657679345209436, "grad_norm": 1.140625, "learning_rate": 9.495870466347332e-05, "loss": 0.4879, "step": 7335 }, { "epoch": 1.76697159364468, "grad_norm": 1.1640625, "learning_rate": 9.487593647462665e-05, "loss": 0.4667, "step": 7340 }, { "epoch": 1.768175252768416, "grad_norm": 1.2109375, "learning_rate": 9.479319787614498e-05, "loss": 0.4865, "step": 7345 }, { "epoch": 1.7693789118921521, "grad_norm": 1.1875, "learning_rate": 9.471048900022342e-05, "loss": 0.4443, "step": 7350 }, { "epoch": 1.7705825710158885, "grad_norm": 1.0703125, "learning_rate": 9.462780997900958e-05, "loss": 0.4703, "step": 7355 }, { "epoch": 1.7717862301396243, "grad_norm": 1.125, "learning_rate": 9.454516094460328e-05, "loss": 0.4557, "step": 7360 }, { "epoch": 1.7729898892633607, "grad_norm": 1.234375, "learning_rate": 9.446254202905652e-05, "loss": 0.4903, "step": 7365 }, { "epoch": 1.7741935483870968, "grad_norm": 1.078125, "learning_rate": 9.437995336437311e-05, "loss": 0.4427, "step": 7370 }, { "epoch": 1.7753972075108329, "grad_norm": 1.203125, "learning_rate": 9.429739508250864e-05, "loss": 0.4733, "step": 7375 }, { "epoch": 1.7766008666345692, "grad_norm": 1.21875, "learning_rate": 9.421486731537e-05, "loss": 0.4731, "step": 7380 }, { "epoch": 1.7778045257583053, "grad_norm": 1.15625, "learning_rate": 9.413237019481546e-05, "loss": 0.4572, "step": 7385 }, { "epoch": 1.7790081848820414, "grad_norm": 1.1171875, "learning_rate": 9.404990385265427e-05, "loss": 0.4595, "step": 7390 }, { "epoch": 1.7802118440057777, "grad_norm": 1.1015625, "learning_rate": 9.396746842064645e-05, "loss": 0.4541, "step": 7395 }, { "epoch": 1.7814155031295136, "grad_norm": 1.0625, "learning_rate": 9.388506403050277e-05, "loss": 0.4592, "step": 7400 }, { "epoch": 1.78261916225325, "grad_norm": 1.28125, "learning_rate": 9.380269081388421e-05, "loss": 0.4734, "step": 7405 }, { "epoch": 1.783822821376986, "grad_norm": 1.125, "learning_rate": 9.372034890240215e-05, "loss": 0.4551, "step": 7410 }, { "epoch": 1.785026480500722, "grad_norm": 1.0625, "learning_rate": 9.363803842761786e-05, "loss": 0.4583, "step": 7415 }, { "epoch": 1.7862301396244584, "grad_norm": 1.140625, "learning_rate": 9.35557595210423e-05, "loss": 0.473, "step": 7420 }, { "epoch": 1.7874337987481945, "grad_norm": 1.0859375, "learning_rate": 9.347351231413613e-05, "loss": 0.441, "step": 7425 }, { "epoch": 1.7886374578719306, "grad_norm": 1.0859375, "learning_rate": 9.339129693830927e-05, "loss": 0.4616, "step": 7430 }, { "epoch": 1.789841116995667, "grad_norm": 1.1796875, "learning_rate": 9.330911352492083e-05, "loss": 0.4818, "step": 7435 }, { "epoch": 1.7910447761194028, "grad_norm": 1.15625, "learning_rate": 9.322696220527885e-05, "loss": 0.4542, "step": 7440 }, { "epoch": 1.7922484352431391, "grad_norm": 1.140625, "learning_rate": 9.314484311064006e-05, "loss": 0.4578, "step": 7445 }, { "epoch": 1.7934520943668752, "grad_norm": 1.25, "learning_rate": 9.306275637220975e-05, "loss": 0.4762, "step": 7450 }, { "epoch": 1.7946557534906113, "grad_norm": 1.109375, "learning_rate": 9.298070212114141e-05, "loss": 0.4547, "step": 7455 }, { "epoch": 1.7958594126143477, "grad_norm": 1.203125, "learning_rate": 9.28986804885368e-05, "loss": 0.4402, "step": 7460 }, { "epoch": 1.7970630717380838, "grad_norm": 1.0625, "learning_rate": 9.281669160544539e-05, "loss": 0.4647, "step": 7465 }, { "epoch": 1.7982667308618199, "grad_norm": 1.1875, "learning_rate": 9.273473560286441e-05, "loss": 0.4407, "step": 7470 }, { "epoch": 1.7994703899855562, "grad_norm": 1.2265625, "learning_rate": 9.26528126117386e-05, "loss": 0.4796, "step": 7475 }, { "epoch": 1.8006740491092923, "grad_norm": 1.0546875, "learning_rate": 9.257092276295981e-05, "loss": 0.446, "step": 7480 }, { "epoch": 1.8018777082330284, "grad_norm": 1.140625, "learning_rate": 9.248906618736706e-05, "loss": 0.4775, "step": 7485 }, { "epoch": 1.8030813673567647, "grad_norm": 1.1484375, "learning_rate": 9.240724301574621e-05, "loss": 0.4508, "step": 7490 }, { "epoch": 1.8042850264805006, "grad_norm": 1.21875, "learning_rate": 9.232545337882966e-05, "loss": 0.4432, "step": 7495 }, { "epoch": 1.805488685604237, "grad_norm": 1.0078125, "learning_rate": 9.224369740729636e-05, "loss": 0.437, "step": 7500 }, { "epoch": 1.805488685604237, "eval_loss": 0.3923625946044922, "eval_runtime": 2.4326, "eval_samples_per_second": 82.217, "eval_steps_per_second": 82.217, "step": 7500 }, { "epoch": 1.806692344727973, "grad_norm": 1.1796875, "learning_rate": 9.216197523177126e-05, "loss": 0.4852, "step": 7505 }, { "epoch": 1.8078960038517091, "grad_norm": 1.171875, "learning_rate": 9.208028698282558e-05, "loss": 0.4466, "step": 7510 }, { "epoch": 1.8090996629754454, "grad_norm": 1.171875, "learning_rate": 9.199863279097612e-05, "loss": 0.4602, "step": 7515 }, { "epoch": 1.8103033220991815, "grad_norm": 1.1328125, "learning_rate": 9.19170127866854e-05, "loss": 0.4465, "step": 7520 }, { "epoch": 1.8115069812229176, "grad_norm": 1.1640625, "learning_rate": 9.183542710036125e-05, "loss": 0.4367, "step": 7525 }, { "epoch": 1.812710640346654, "grad_norm": 1.109375, "learning_rate": 9.175387586235664e-05, "loss": 0.4722, "step": 7530 }, { "epoch": 1.8139142994703898, "grad_norm": 1.203125, "learning_rate": 9.167235920296956e-05, "loss": 0.4959, "step": 7535 }, { "epoch": 1.8151179585941262, "grad_norm": 1.234375, "learning_rate": 9.159087725244279e-05, "loss": 0.4682, "step": 7540 }, { "epoch": 1.8163216177178623, "grad_norm": 1.1171875, "learning_rate": 9.150943014096355e-05, "loss": 0.4735, "step": 7545 }, { "epoch": 1.8175252768415984, "grad_norm": 1.0625, "learning_rate": 9.142801799866344e-05, "loss": 0.4483, "step": 7550 }, { "epoch": 1.8187289359653347, "grad_norm": 1.265625, "learning_rate": 9.134664095561819e-05, "loss": 0.4961, "step": 7555 }, { "epoch": 1.8199325950890708, "grad_norm": 1.21875, "learning_rate": 9.126529914184744e-05, "loss": 0.4515, "step": 7560 }, { "epoch": 1.821136254212807, "grad_norm": 1.171875, "learning_rate": 9.118399268731464e-05, "loss": 0.461, "step": 7565 }, { "epoch": 1.8223399133365432, "grad_norm": 1.1015625, "learning_rate": 9.11027217219266e-05, "loss": 0.446, "step": 7570 }, { "epoch": 1.823543572460279, "grad_norm": 1.2421875, "learning_rate": 9.10214863755335e-05, "loss": 0.4625, "step": 7575 }, { "epoch": 1.8247472315840154, "grad_norm": 1.1953125, "learning_rate": 9.094028677792856e-05, "loss": 0.4575, "step": 7580 }, { "epoch": 1.8259508907077515, "grad_norm": 1.1015625, "learning_rate": 9.085912305884799e-05, "loss": 0.4514, "step": 7585 }, { "epoch": 1.8271545498314876, "grad_norm": 1.25, "learning_rate": 9.077799534797055e-05, "loss": 0.4618, "step": 7590 }, { "epoch": 1.828358208955224, "grad_norm": 1.0625, "learning_rate": 9.069690377491755e-05, "loss": 0.4788, "step": 7595 }, { "epoch": 1.82956186807896, "grad_norm": 1.265625, "learning_rate": 9.061584846925254e-05, "loss": 0.4367, "step": 7600 }, { "epoch": 1.8307655272026961, "grad_norm": 1.2265625, "learning_rate": 9.053482956048107e-05, "loss": 0.4539, "step": 7605 }, { "epoch": 1.8319691863264325, "grad_norm": 1.0859375, "learning_rate": 9.045384717805068e-05, "loss": 0.4725, "step": 7610 }, { "epoch": 1.8331728454501686, "grad_norm": 1.046875, "learning_rate": 9.037290145135037e-05, "loss": 0.4512, "step": 7615 }, { "epoch": 1.8343765045739047, "grad_norm": 1.265625, "learning_rate": 9.02919925097107e-05, "loss": 0.4691, "step": 7620 }, { "epoch": 1.835580163697641, "grad_norm": 1.2578125, "learning_rate": 9.021112048240343e-05, "loss": 0.439, "step": 7625 }, { "epoch": 1.8367838228213769, "grad_norm": 1.2109375, "learning_rate": 9.01302854986413e-05, "loss": 0.4738, "step": 7630 }, { "epoch": 1.8379874819451132, "grad_norm": 1.296875, "learning_rate": 9.004948768757791e-05, "loss": 0.4421, "step": 7635 }, { "epoch": 1.8391911410688493, "grad_norm": 1.0859375, "learning_rate": 8.996872717830741e-05, "loss": 0.4801, "step": 7640 }, { "epoch": 1.8403948001925854, "grad_norm": 1.1328125, "learning_rate": 8.988800409986444e-05, "loss": 0.4642, "step": 7645 }, { "epoch": 1.8415984593163217, "grad_norm": 1.1484375, "learning_rate": 8.980731858122381e-05, "loss": 0.4792, "step": 7650 }, { "epoch": 1.8428021184400578, "grad_norm": 1.171875, "learning_rate": 8.97266707513002e-05, "loss": 0.4573, "step": 7655 }, { "epoch": 1.844005777563794, "grad_norm": 1.234375, "learning_rate": 8.964606073894827e-05, "loss": 0.4592, "step": 7660 }, { "epoch": 1.8452094366875302, "grad_norm": 1.1953125, "learning_rate": 8.956548867296207e-05, "loss": 0.4835, "step": 7665 }, { "epoch": 1.8464130958112661, "grad_norm": 1.1640625, "learning_rate": 8.948495468207513e-05, "loss": 0.4719, "step": 7670 }, { "epoch": 1.8476167549350024, "grad_norm": 1.0546875, "learning_rate": 8.940445889496017e-05, "loss": 0.4719, "step": 7675 }, { "epoch": 1.8488204140587385, "grad_norm": 1.0703125, "learning_rate": 8.932400144022878e-05, "loss": 0.4302, "step": 7680 }, { "epoch": 1.8500240731824746, "grad_norm": 1.140625, "learning_rate": 8.924358244643136e-05, "loss": 0.4676, "step": 7685 }, { "epoch": 1.851227732306211, "grad_norm": 1.265625, "learning_rate": 8.916320204205683e-05, "loss": 0.4808, "step": 7690 }, { "epoch": 1.852431391429947, "grad_norm": 1.09375, "learning_rate": 8.908286035553246e-05, "loss": 0.4307, "step": 7695 }, { "epoch": 1.8536350505536832, "grad_norm": 1.2109375, "learning_rate": 8.900255751522374e-05, "loss": 0.4679, "step": 7700 }, { "epoch": 1.8548387096774195, "grad_norm": 1.1796875, "learning_rate": 8.892229364943395e-05, "loss": 0.4465, "step": 7705 }, { "epoch": 1.8560423688011554, "grad_norm": 1.0546875, "learning_rate": 8.884206888640423e-05, "loss": 0.4463, "step": 7710 }, { "epoch": 1.8572460279248917, "grad_norm": 1.1796875, "learning_rate": 8.876188335431309e-05, "loss": 0.4698, "step": 7715 }, { "epoch": 1.8584496870486278, "grad_norm": 1.125, "learning_rate": 8.868173718127654e-05, "loss": 0.4684, "step": 7720 }, { "epoch": 1.8596533461723639, "grad_norm": 1.140625, "learning_rate": 8.860163049534762e-05, "loss": 0.4427, "step": 7725 }, { "epoch": 1.8608570052961002, "grad_norm": 1.265625, "learning_rate": 8.852156342451623e-05, "loss": 0.4535, "step": 7730 }, { "epoch": 1.8620606644198363, "grad_norm": 1.140625, "learning_rate": 8.844153609670907e-05, "loss": 0.476, "step": 7735 }, { "epoch": 1.8632643235435724, "grad_norm": 1.0703125, "learning_rate": 8.836154863978926e-05, "loss": 0.4558, "step": 7740 }, { "epoch": 1.8644679826673087, "grad_norm": 1.0625, "learning_rate": 8.828160118155623e-05, "loss": 0.4243, "step": 7745 }, { "epoch": 1.8656716417910446, "grad_norm": 1.1875, "learning_rate": 8.82016938497456e-05, "loss": 0.4738, "step": 7750 }, { "epoch": 1.866875300914781, "grad_norm": 1.234375, "learning_rate": 8.812182677202875e-05, "loss": 0.4459, "step": 7755 }, { "epoch": 1.8680789600385173, "grad_norm": 1.375, "learning_rate": 8.80420000760128e-05, "loss": 0.4569, "step": 7760 }, { "epoch": 1.8692826191622531, "grad_norm": 1.1953125, "learning_rate": 8.796221388924035e-05, "loss": 0.4515, "step": 7765 }, { "epoch": 1.8704862782859895, "grad_norm": 1.140625, "learning_rate": 8.788246833918926e-05, "loss": 0.4504, "step": 7770 }, { "epoch": 1.8716899374097256, "grad_norm": 1.0703125, "learning_rate": 8.780276355327253e-05, "loss": 0.4491, "step": 7775 }, { "epoch": 1.8728935965334617, "grad_norm": 1.25, "learning_rate": 8.772309965883792e-05, "loss": 0.4417, "step": 7780 }, { "epoch": 1.874097255657198, "grad_norm": 1.125, "learning_rate": 8.764347678316792e-05, "loss": 0.4526, "step": 7785 }, { "epoch": 1.875300914780934, "grad_norm": 1.203125, "learning_rate": 8.756389505347948e-05, "loss": 0.4433, "step": 7790 }, { "epoch": 1.8765045739046702, "grad_norm": 1.1640625, "learning_rate": 8.748435459692378e-05, "loss": 0.4938, "step": 7795 }, { "epoch": 1.8777082330284065, "grad_norm": 1.2578125, "learning_rate": 8.740485554058615e-05, "loss": 0.4613, "step": 7800 }, { "epoch": 1.8789118921521424, "grad_norm": 1.125, "learning_rate": 8.732539801148562e-05, "loss": 0.4855, "step": 7805 }, { "epoch": 1.8801155512758787, "grad_norm": 1.125, "learning_rate": 8.724598213657503e-05, "loss": 0.4591, "step": 7810 }, { "epoch": 1.8813192103996148, "grad_norm": 1.140625, "learning_rate": 8.716660804274052e-05, "loss": 0.436, "step": 7815 }, { "epoch": 1.882522869523351, "grad_norm": 1.25, "learning_rate": 8.708727585680157e-05, "loss": 0.4879, "step": 7820 }, { "epoch": 1.8837265286470872, "grad_norm": 1.171875, "learning_rate": 8.700798570551068e-05, "loss": 0.4676, "step": 7825 }, { "epoch": 1.8849301877708233, "grad_norm": 1.109375, "learning_rate": 8.692873771555317e-05, "loss": 0.4801, "step": 7830 }, { "epoch": 1.8861338468945594, "grad_norm": 1.3046875, "learning_rate": 8.684953201354705e-05, "loss": 0.4642, "step": 7835 }, { "epoch": 1.8873375060182958, "grad_norm": 1.125, "learning_rate": 8.677036872604268e-05, "loss": 0.4721, "step": 7840 }, { "epoch": 1.8885411651420316, "grad_norm": 1.171875, "learning_rate": 8.669124797952277e-05, "loss": 0.462, "step": 7845 }, { "epoch": 1.889744824265768, "grad_norm": 1.2890625, "learning_rate": 8.661216990040188e-05, "loss": 0.4289, "step": 7850 }, { "epoch": 1.890948483389504, "grad_norm": 1.1796875, "learning_rate": 8.65331346150266e-05, "loss": 0.4524, "step": 7855 }, { "epoch": 1.8921521425132402, "grad_norm": 1.1875, "learning_rate": 8.645414224967503e-05, "loss": 0.4301, "step": 7860 }, { "epoch": 1.8933558016369765, "grad_norm": 1.109375, "learning_rate": 8.637519293055672e-05, "loss": 0.4577, "step": 7865 }, { "epoch": 1.8945594607607126, "grad_norm": 1.1171875, "learning_rate": 8.629628678381245e-05, "loss": 0.435, "step": 7870 }, { "epoch": 1.8957631198844487, "grad_norm": 1.140625, "learning_rate": 8.621742393551398e-05, "loss": 0.4733, "step": 7875 }, { "epoch": 1.896966779008185, "grad_norm": 1.0703125, "learning_rate": 8.613860451166396e-05, "loss": 0.4073, "step": 7880 }, { "epoch": 1.8981704381319209, "grad_norm": 1.171875, "learning_rate": 8.605982863819561e-05, "loss": 0.4518, "step": 7885 }, { "epoch": 1.8993740972556572, "grad_norm": 1.1328125, "learning_rate": 8.598109644097259e-05, "loss": 0.4688, "step": 7890 }, { "epoch": 1.9005777563793933, "grad_norm": 1.0859375, "learning_rate": 8.590240804578877e-05, "loss": 0.4641, "step": 7895 }, { "epoch": 1.9017814155031294, "grad_norm": 1.0234375, "learning_rate": 8.582376357836801e-05, "loss": 0.4241, "step": 7900 }, { "epoch": 1.9029850746268657, "grad_norm": 1.2421875, "learning_rate": 8.574516316436402e-05, "loss": 0.4692, "step": 7905 }, { "epoch": 1.9041887337506018, "grad_norm": 1.171875, "learning_rate": 8.566660692936014e-05, "loss": 0.4423, "step": 7910 }, { "epoch": 1.905392392874338, "grad_norm": 1.1015625, "learning_rate": 8.558809499886906e-05, "loss": 0.4413, "step": 7915 }, { "epoch": 1.9065960519980742, "grad_norm": 1.1875, "learning_rate": 8.550962749833274e-05, "loss": 0.4658, "step": 7920 }, { "epoch": 1.9077997111218103, "grad_norm": 1.2734375, "learning_rate": 8.543120455312211e-05, "loss": 0.4967, "step": 7925 }, { "epoch": 1.9090033702455464, "grad_norm": 1.1640625, "learning_rate": 8.535282628853693e-05, "loss": 0.4573, "step": 7930 }, { "epoch": 1.9102070293692828, "grad_norm": 1.15625, "learning_rate": 8.527449282980564e-05, "loss": 0.4532, "step": 7935 }, { "epoch": 1.9114106884930187, "grad_norm": 1.0625, "learning_rate": 8.519620430208491e-05, "loss": 0.4343, "step": 7940 }, { "epoch": 1.912614347616755, "grad_norm": 1.1953125, "learning_rate": 8.511796083045985e-05, "loss": 0.4682, "step": 7945 }, { "epoch": 1.913818006740491, "grad_norm": 1.0859375, "learning_rate": 8.503976253994338e-05, "loss": 0.4351, "step": 7950 }, { "epoch": 1.9150216658642272, "grad_norm": 1.2421875, "learning_rate": 8.496160955547641e-05, "loss": 0.4599, "step": 7955 }, { "epoch": 1.9162253249879635, "grad_norm": 1.15625, "learning_rate": 8.488350200192731e-05, "loss": 0.4671, "step": 7960 }, { "epoch": 1.9174289841116996, "grad_norm": 1.1328125, "learning_rate": 8.480544000409193e-05, "loss": 0.4484, "step": 7965 }, { "epoch": 1.9186326432354357, "grad_norm": 1.0078125, "learning_rate": 8.472742368669337e-05, "loss": 0.4137, "step": 7970 }, { "epoch": 1.919836302359172, "grad_norm": 1.046875, "learning_rate": 8.464945317438164e-05, "loss": 0.4392, "step": 7975 }, { "epoch": 1.921039961482908, "grad_norm": 1.296875, "learning_rate": 8.457152859173371e-05, "loss": 0.4482, "step": 7980 }, { "epoch": 1.9222436206066442, "grad_norm": 1.1171875, "learning_rate": 8.449365006325304e-05, "loss": 0.4845, "step": 7985 }, { "epoch": 1.9234472797303803, "grad_norm": 1.1015625, "learning_rate": 8.441581771336956e-05, "loss": 0.4533, "step": 7990 }, { "epoch": 1.9246509388541164, "grad_norm": 1.1328125, "learning_rate": 8.433803166643944e-05, "loss": 0.4669, "step": 7995 }, { "epoch": 1.9258545979778527, "grad_norm": 1.0703125, "learning_rate": 8.426029204674479e-05, "loss": 0.4378, "step": 8000 }, { "epoch": 1.9258545979778527, "eval_loss": 0.38515639305114746, "eval_runtime": 2.416, "eval_samples_per_second": 82.78, "eval_steps_per_second": 82.78, "step": 8000 }, { "epoch": 1.9270582571015888, "grad_norm": 1.1796875, "learning_rate": 8.418259897849365e-05, "loss": 0.4709, "step": 8005 }, { "epoch": 1.928261916225325, "grad_norm": 1.140625, "learning_rate": 8.410495258581957e-05, "loss": 0.432, "step": 8010 }, { "epoch": 1.9294655753490613, "grad_norm": 1.2265625, "learning_rate": 8.402735299278164e-05, "loss": 0.4623, "step": 8015 }, { "epoch": 1.9306692344727971, "grad_norm": 1.3046875, "learning_rate": 8.394980032336409e-05, "loss": 0.4701, "step": 8020 }, { "epoch": 1.9318728935965335, "grad_norm": 1.1171875, "learning_rate": 8.387229470147617e-05, "loss": 0.4867, "step": 8025 }, { "epoch": 1.9330765527202696, "grad_norm": 1.203125, "learning_rate": 8.379483625095202e-05, "loss": 0.4878, "step": 8030 }, { "epoch": 1.9342802118440057, "grad_norm": 1.1328125, "learning_rate": 8.371742509555042e-05, "loss": 0.4176, "step": 8035 }, { "epoch": 1.935483870967742, "grad_norm": 1.1796875, "learning_rate": 8.36400613589545e-05, "loss": 0.482, "step": 8040 }, { "epoch": 1.936687530091478, "grad_norm": 1.2265625, "learning_rate": 8.356274516477175e-05, "loss": 0.4373, "step": 8045 }, { "epoch": 1.9378911892152142, "grad_norm": 1.140625, "learning_rate": 8.348547663653352e-05, "loss": 0.4531, "step": 8050 }, { "epoch": 1.9390948483389505, "grad_norm": 1.1328125, "learning_rate": 8.340825589769523e-05, "loss": 0.4447, "step": 8055 }, { "epoch": 1.9402985074626866, "grad_norm": 1.0859375, "learning_rate": 8.333108307163573e-05, "loss": 0.4565, "step": 8060 }, { "epoch": 1.9415021665864227, "grad_norm": 1.1875, "learning_rate": 8.325395828165749e-05, "loss": 0.4464, "step": 8065 }, { "epoch": 1.942705825710159, "grad_norm": 1.0859375, "learning_rate": 8.31768816509861e-05, "loss": 0.4316, "step": 8070 }, { "epoch": 1.943909484833895, "grad_norm": 1.15625, "learning_rate": 8.309985330277024e-05, "loss": 0.443, "step": 8075 }, { "epoch": 1.9451131439576312, "grad_norm": 1.234375, "learning_rate": 8.302287336008153e-05, "loss": 0.4285, "step": 8080 }, { "epoch": 1.9463168030813673, "grad_norm": 1.1328125, "learning_rate": 8.294594194591412e-05, "loss": 0.4081, "step": 8085 }, { "epoch": 1.9475204622051034, "grad_norm": 1.0546875, "learning_rate": 8.286905918318472e-05, "loss": 0.4254, "step": 8090 }, { "epoch": 1.9487241213288398, "grad_norm": 1.0234375, "learning_rate": 8.279222519473229e-05, "loss": 0.4297, "step": 8095 }, { "epoch": 1.9499277804525759, "grad_norm": 1.3515625, "learning_rate": 8.27154401033178e-05, "loss": 0.462, "step": 8100 }, { "epoch": 1.951131439576312, "grad_norm": 1.0390625, "learning_rate": 8.263870403162422e-05, "loss": 0.4902, "step": 8105 }, { "epoch": 1.9523350987000483, "grad_norm": 1.1640625, "learning_rate": 8.256201710225603e-05, "loss": 0.4382, "step": 8110 }, { "epoch": 1.9535387578237842, "grad_norm": 1.109375, "learning_rate": 8.248537943773936e-05, "loss": 0.4516, "step": 8115 }, { "epoch": 1.9547424169475205, "grad_norm": 1.03125, "learning_rate": 8.240879116052151e-05, "loss": 0.4287, "step": 8120 }, { "epoch": 1.9559460760712566, "grad_norm": 1.234375, "learning_rate": 8.23322523929709e-05, "loss": 0.4558, "step": 8125 }, { "epoch": 1.9571497351949927, "grad_norm": 1.296875, "learning_rate": 8.225576325737692e-05, "loss": 0.4495, "step": 8130 }, { "epoch": 1.958353394318729, "grad_norm": 1.21875, "learning_rate": 8.217932387594952e-05, "loss": 0.4783, "step": 8135 }, { "epoch": 1.9595570534424651, "grad_norm": 1.0390625, "learning_rate": 8.210293437081929e-05, "loss": 0.447, "step": 8140 }, { "epoch": 1.9607607125662012, "grad_norm": 1.1640625, "learning_rate": 8.202659486403709e-05, "loss": 0.4473, "step": 8145 }, { "epoch": 1.9619643716899375, "grad_norm": 1.2109375, "learning_rate": 8.195030547757383e-05, "loss": 0.4188, "step": 8150 }, { "epoch": 1.9631680308136734, "grad_norm": 1.1875, "learning_rate": 8.187406633332045e-05, "loss": 0.463, "step": 8155 }, { "epoch": 1.9643716899374097, "grad_norm": 1.1796875, "learning_rate": 8.179787755308749e-05, "loss": 0.4376, "step": 8160 }, { "epoch": 1.9655753490611458, "grad_norm": 1.15625, "learning_rate": 8.172173925860515e-05, "loss": 0.4411, "step": 8165 }, { "epoch": 1.966779008184882, "grad_norm": 1.125, "learning_rate": 8.164565157152289e-05, "loss": 0.4409, "step": 8170 }, { "epoch": 1.9679826673086183, "grad_norm": 1.1953125, "learning_rate": 8.156961461340931e-05, "loss": 0.4583, "step": 8175 }, { "epoch": 1.9691863264323544, "grad_norm": 1.0390625, "learning_rate": 8.149362850575201e-05, "loss": 0.4748, "step": 8180 }, { "epoch": 1.9703899855560905, "grad_norm": 1.1796875, "learning_rate": 8.141769336995727e-05, "loss": 0.4574, "step": 8185 }, { "epoch": 1.9715936446798268, "grad_norm": 1.1796875, "learning_rate": 8.134180932734998e-05, "loss": 0.4537, "step": 8190 }, { "epoch": 1.9727973038035629, "grad_norm": 1.1015625, "learning_rate": 8.126597649917345e-05, "loss": 0.4421, "step": 8195 }, { "epoch": 1.974000962927299, "grad_norm": 1.125, "learning_rate": 8.1190195006589e-05, "loss": 0.4426, "step": 8200 }, { "epoch": 1.9752046220510353, "grad_norm": 1.15625, "learning_rate": 8.11144649706761e-05, "loss": 0.4775, "step": 8205 }, { "epoch": 1.9764082811747712, "grad_norm": 1.234375, "learning_rate": 8.103878651243189e-05, "loss": 0.4769, "step": 8210 }, { "epoch": 1.9776119402985075, "grad_norm": 1.0703125, "learning_rate": 8.096315975277116e-05, "loss": 0.4242, "step": 8215 }, { "epoch": 1.9788155994222436, "grad_norm": 1.1328125, "learning_rate": 8.088758481252612e-05, "loss": 0.4515, "step": 8220 }, { "epoch": 1.9800192585459797, "grad_norm": 1.0390625, "learning_rate": 8.081206181244612e-05, "loss": 0.4238, "step": 8225 }, { "epoch": 1.981222917669716, "grad_norm": 1.15625, "learning_rate": 8.073659087319757e-05, "loss": 0.447, "step": 8230 }, { "epoch": 1.9824265767934521, "grad_norm": 1.125, "learning_rate": 8.066117211536362e-05, "loss": 0.438, "step": 8235 }, { "epoch": 1.9836302359171882, "grad_norm": 1.046875, "learning_rate": 8.058580565944419e-05, "loss": 0.4456, "step": 8240 }, { "epoch": 1.9848338950409246, "grad_norm": 1.109375, "learning_rate": 8.051049162585553e-05, "loss": 0.4426, "step": 8245 }, { "epoch": 1.9860375541646604, "grad_norm": 1.2109375, "learning_rate": 8.043523013493014e-05, "loss": 0.4746, "step": 8250 }, { "epoch": 1.9872412132883968, "grad_norm": 0.99609375, "learning_rate": 8.036002130691662e-05, "loss": 0.4391, "step": 8255 }, { "epoch": 1.9884448724121329, "grad_norm": 1.1328125, "learning_rate": 8.028486526197935e-05, "loss": 0.4454, "step": 8260 }, { "epoch": 1.989648531535869, "grad_norm": 1.0859375, "learning_rate": 8.020976212019847e-05, "loss": 0.4457, "step": 8265 }, { "epoch": 1.9908521906596053, "grad_norm": 1.1015625, "learning_rate": 8.013471200156956e-05, "loss": 0.4616, "step": 8270 }, { "epoch": 1.9920558497833414, "grad_norm": 1.140625, "learning_rate": 8.005971502600344e-05, "loss": 0.4433, "step": 8275 }, { "epoch": 1.9932595089070775, "grad_norm": 1.203125, "learning_rate": 7.998477131332605e-05, "loss": 0.4558, "step": 8280 }, { "epoch": 1.9944631680308138, "grad_norm": 1.140625, "learning_rate": 7.990988098327821e-05, "loss": 0.4677, "step": 8285 }, { "epoch": 1.9956668271545497, "grad_norm": 1.140625, "learning_rate": 7.983504415551553e-05, "loss": 0.4546, "step": 8290 }, { "epoch": 1.996870486278286, "grad_norm": 1.1171875, "learning_rate": 7.9760260949608e-05, "loss": 0.4472, "step": 8295 }, { "epoch": 1.998074145402022, "grad_norm": 1.1171875, "learning_rate": 7.968553148504007e-05, "loss": 0.4175, "step": 8300 }, { "epoch": 1.9992778045257582, "grad_norm": 1.140625, "learning_rate": 7.961085588121026e-05, "loss": 0.4297, "step": 8305 }, { "epoch": 1.9995185363505055, "eval_loss": 0.37979966402053833, "eval_runtime": 2.3917, "eval_samples_per_second": 83.624, "eval_steps_per_second": 83.624, "step": 8306 }, { "epoch": 2.0004814636494945, "grad_norm": 1.1328125, "learning_rate": 7.9536234257431e-05, "loss": 0.4353, "step": 8310 }, { "epoch": 2.0016851227732304, "grad_norm": 1.109375, "learning_rate": 7.946166673292858e-05, "loss": 0.3902, "step": 8315 }, { "epoch": 2.0028887818969667, "grad_norm": 1.0390625, "learning_rate": 7.938715342684274e-05, "loss": 0.4374, "step": 8320 }, { "epoch": 2.004092441020703, "grad_norm": 1.1484375, "learning_rate": 7.931269445822666e-05, "loss": 0.4068, "step": 8325 }, { "epoch": 2.005296100144439, "grad_norm": 1.078125, "learning_rate": 7.923828994604671e-05, "loss": 0.3911, "step": 8330 }, { "epoch": 2.0064997592681753, "grad_norm": 1.078125, "learning_rate": 7.916394000918217e-05, "loss": 0.4198, "step": 8335 }, { "epoch": 2.0077034183919116, "grad_norm": 1.140625, "learning_rate": 7.908964476642523e-05, "loss": 0.4418, "step": 8340 }, { "epoch": 2.0089070775156475, "grad_norm": 1.046875, "learning_rate": 7.901540433648059e-05, "loss": 0.4011, "step": 8345 }, { "epoch": 2.0101107366393838, "grad_norm": 1.078125, "learning_rate": 7.894121883796545e-05, "loss": 0.399, "step": 8350 }, { "epoch": 2.01131439576312, "grad_norm": 1.0859375, "learning_rate": 7.886708838940925e-05, "loss": 0.4171, "step": 8355 }, { "epoch": 2.012518054886856, "grad_norm": 1.09375, "learning_rate": 7.879301310925339e-05, "loss": 0.3966, "step": 8360 }, { "epoch": 2.0137217140105923, "grad_norm": 1.0703125, "learning_rate": 7.871899311585123e-05, "loss": 0.4054, "step": 8365 }, { "epoch": 2.014925373134328, "grad_norm": 1.09375, "learning_rate": 7.86450285274677e-05, "loss": 0.393, "step": 8370 }, { "epoch": 2.0161290322580645, "grad_norm": 1.1484375, "learning_rate": 7.857111946227928e-05, "loss": 0.421, "step": 8375 }, { "epoch": 2.017332691381801, "grad_norm": 1.0546875, "learning_rate": 7.849726603837372e-05, "loss": 0.3687, "step": 8380 }, { "epoch": 2.0185363505055367, "grad_norm": 1.1484375, "learning_rate": 7.842346837374982e-05, "loss": 0.4218, "step": 8385 }, { "epoch": 2.019740009629273, "grad_norm": 1.09375, "learning_rate": 7.834972658631736e-05, "loss": 0.3992, "step": 8390 }, { "epoch": 2.0209436687530093, "grad_norm": 1.171875, "learning_rate": 7.827604079389679e-05, "loss": 0.3854, "step": 8395 }, { "epoch": 2.0221473278767452, "grad_norm": 1.1640625, "learning_rate": 7.82024111142191e-05, "loss": 0.4073, "step": 8400 }, { "epoch": 2.0233509870004815, "grad_norm": 1.125, "learning_rate": 7.812883766492573e-05, "loss": 0.4257, "step": 8405 }, { "epoch": 2.0245546461242174, "grad_norm": 1.0546875, "learning_rate": 7.805532056356809e-05, "loss": 0.3998, "step": 8410 }, { "epoch": 2.0257583052479537, "grad_norm": 1.1171875, "learning_rate": 7.798185992760773e-05, "loss": 0.3866, "step": 8415 }, { "epoch": 2.02696196437169, "grad_norm": 1.0546875, "learning_rate": 7.790845587441587e-05, "loss": 0.3957, "step": 8420 }, { "epoch": 2.028165623495426, "grad_norm": 1.125, "learning_rate": 7.783510852127338e-05, "loss": 0.4383, "step": 8425 }, { "epoch": 2.0293692826191623, "grad_norm": 1.0390625, "learning_rate": 7.776181798537056e-05, "loss": 0.3989, "step": 8430 }, { "epoch": 2.0305729417428986, "grad_norm": 1.078125, "learning_rate": 7.768858438380688e-05, "loss": 0.4121, "step": 8435 }, { "epoch": 2.0317766008666345, "grad_norm": 1.2421875, "learning_rate": 7.761540783359083e-05, "loss": 0.4311, "step": 8440 }, { "epoch": 2.032980259990371, "grad_norm": 1.1015625, "learning_rate": 7.754228845163984e-05, "loss": 0.4227, "step": 8445 }, { "epoch": 2.0341839191141067, "grad_norm": 1.21875, "learning_rate": 7.746922635477986e-05, "loss": 0.4153, "step": 8450 }, { "epoch": 2.035387578237843, "grad_norm": 1.0546875, "learning_rate": 7.739622165974547e-05, "loss": 0.4029, "step": 8455 }, { "epoch": 2.0365912373615793, "grad_norm": 1.21875, "learning_rate": 7.732327448317938e-05, "loss": 0.4005, "step": 8460 }, { "epoch": 2.037794896485315, "grad_norm": 1.0703125, "learning_rate": 7.725038494163256e-05, "loss": 0.4064, "step": 8465 }, { "epoch": 2.0389985556090515, "grad_norm": 1.15625, "learning_rate": 7.717755315156373e-05, "loss": 0.4134, "step": 8470 }, { "epoch": 2.040202214732788, "grad_norm": 1.1875, "learning_rate": 7.710477922933947e-05, "loss": 0.3978, "step": 8475 }, { "epoch": 2.0414058738565237, "grad_norm": 1.15625, "learning_rate": 7.703206329123387e-05, "loss": 0.3932, "step": 8480 }, { "epoch": 2.04260953298026, "grad_norm": 1.1328125, "learning_rate": 7.69594054534283e-05, "loss": 0.4132, "step": 8485 }, { "epoch": 2.043813192103996, "grad_norm": 1.0703125, "learning_rate": 7.688680583201143e-05, "loss": 0.4215, "step": 8490 }, { "epoch": 2.0450168512277322, "grad_norm": 1.1328125, "learning_rate": 7.68142645429788e-05, "loss": 0.3968, "step": 8495 }, { "epoch": 2.0462205103514686, "grad_norm": 1.203125, "learning_rate": 7.674178170223279e-05, "loss": 0.418, "step": 8500 }, { "epoch": 2.0462205103514686, "eval_loss": 0.3806837499141693, "eval_runtime": 2.3897, "eval_samples_per_second": 83.691, "eval_steps_per_second": 83.691, "step": 8500 }, { "epoch": 2.0474241694752044, "grad_norm": 1.1015625, "learning_rate": 7.666935742558238e-05, "loss": 0.3989, "step": 8505 }, { "epoch": 2.0486278285989408, "grad_norm": 1.2109375, "learning_rate": 7.659699182874302e-05, "loss": 0.4223, "step": 8510 }, { "epoch": 2.049831487722677, "grad_norm": 1.0625, "learning_rate": 7.652468502733641e-05, "loss": 0.4093, "step": 8515 }, { "epoch": 2.051035146846413, "grad_norm": 1.03125, "learning_rate": 7.645243713689025e-05, "loss": 0.4229, "step": 8520 }, { "epoch": 2.0522388059701493, "grad_norm": 1.171875, "learning_rate": 7.638024827283818e-05, "loss": 0.385, "step": 8525 }, { "epoch": 2.0534424650938856, "grad_norm": 1.046875, "learning_rate": 7.630811855051943e-05, "loss": 0.3694, "step": 8530 }, { "epoch": 2.0546461242176215, "grad_norm": 1.109375, "learning_rate": 7.623604808517888e-05, "loss": 0.4145, "step": 8535 }, { "epoch": 2.055849783341358, "grad_norm": 1.0390625, "learning_rate": 7.616403699196664e-05, "loss": 0.3964, "step": 8540 }, { "epoch": 2.0570534424650937, "grad_norm": 1.25, "learning_rate": 7.609208538593797e-05, "loss": 0.3957, "step": 8545 }, { "epoch": 2.05825710158883, "grad_norm": 1.078125, "learning_rate": 7.602019338205308e-05, "loss": 0.3741, "step": 8550 }, { "epoch": 2.0594607607125663, "grad_norm": 1.1484375, "learning_rate": 7.594836109517698e-05, "loss": 0.4318, "step": 8555 }, { "epoch": 2.060664419836302, "grad_norm": 1.171875, "learning_rate": 7.58765886400792e-05, "loss": 0.4193, "step": 8560 }, { "epoch": 2.0618680789600385, "grad_norm": 1.1328125, "learning_rate": 7.580487613143381e-05, "loss": 0.4035, "step": 8565 }, { "epoch": 2.063071738083775, "grad_norm": 1.125, "learning_rate": 7.573322368381895e-05, "loss": 0.4023, "step": 8570 }, { "epoch": 2.0642753972075107, "grad_norm": 1.09375, "learning_rate": 7.566163141171689e-05, "loss": 0.3735, "step": 8575 }, { "epoch": 2.065479056331247, "grad_norm": 1.171875, "learning_rate": 7.559009942951371e-05, "loss": 0.4156, "step": 8580 }, { "epoch": 2.066682715454983, "grad_norm": 1.109375, "learning_rate": 7.551862785149918e-05, "loss": 0.4007, "step": 8585 }, { "epoch": 2.0678863745787193, "grad_norm": 1.1171875, "learning_rate": 7.544721679186659e-05, "loss": 0.4138, "step": 8590 }, { "epoch": 2.0690900337024556, "grad_norm": 1.078125, "learning_rate": 7.53758663647125e-05, "loss": 0.39, "step": 8595 }, { "epoch": 2.0702936928261915, "grad_norm": 1.109375, "learning_rate": 7.530457668403657e-05, "loss": 0.4167, "step": 8600 }, { "epoch": 2.071497351949928, "grad_norm": 1.0859375, "learning_rate": 7.523334786374148e-05, "loss": 0.417, "step": 8605 }, { "epoch": 2.072701011073664, "grad_norm": 1.203125, "learning_rate": 7.516218001763256e-05, "loss": 0.4115, "step": 8610 }, { "epoch": 2.0739046701974, "grad_norm": 1.1015625, "learning_rate": 7.509107325941786e-05, "loss": 0.4022, "step": 8615 }, { "epoch": 2.0751083293211363, "grad_norm": 1.125, "learning_rate": 7.502002770270769e-05, "loss": 0.3862, "step": 8620 }, { "epoch": 2.076311988444872, "grad_norm": 1.2890625, "learning_rate": 7.494904346101468e-05, "loss": 0.4426, "step": 8625 }, { "epoch": 2.0775156475686085, "grad_norm": 1.125, "learning_rate": 7.487812064775342e-05, "loss": 0.3998, "step": 8630 }, { "epoch": 2.078719306692345, "grad_norm": 1.1953125, "learning_rate": 7.480725937624039e-05, "loss": 0.415, "step": 8635 }, { "epoch": 2.0799229658160807, "grad_norm": 1.0625, "learning_rate": 7.473645975969376e-05, "loss": 0.4114, "step": 8640 }, { "epoch": 2.081126624939817, "grad_norm": 1.0546875, "learning_rate": 7.466572191123314e-05, "loss": 0.3811, "step": 8645 }, { "epoch": 2.0823302840635534, "grad_norm": 1.1171875, "learning_rate": 7.45950459438795e-05, "loss": 0.3939, "step": 8650 }, { "epoch": 2.0835339431872892, "grad_norm": 1.109375, "learning_rate": 7.452443197055491e-05, "loss": 0.3883, "step": 8655 }, { "epoch": 2.0847376023110256, "grad_norm": 1.1015625, "learning_rate": 7.445388010408239e-05, "loss": 0.4078, "step": 8660 }, { "epoch": 2.085941261434762, "grad_norm": 1.21875, "learning_rate": 7.438339045718574e-05, "loss": 0.3887, "step": 8665 }, { "epoch": 2.0871449205584978, "grad_norm": 1.1953125, "learning_rate": 7.431296314248934e-05, "loss": 0.4039, "step": 8670 }, { "epoch": 2.088348579682234, "grad_norm": 1.1015625, "learning_rate": 7.4242598272518e-05, "loss": 0.4015, "step": 8675 }, { "epoch": 2.08955223880597, "grad_norm": 1.109375, "learning_rate": 7.417229595969673e-05, "loss": 0.394, "step": 8680 }, { "epoch": 2.0907558979297063, "grad_norm": 1.078125, "learning_rate": 7.410205631635059e-05, "loss": 0.3929, "step": 8685 }, { "epoch": 2.0919595570534426, "grad_norm": 1.15625, "learning_rate": 7.403187945470457e-05, "loss": 0.4202, "step": 8690 }, { "epoch": 2.0931632161771785, "grad_norm": 1.2109375, "learning_rate": 7.396176548688328e-05, "loss": 0.397, "step": 8695 }, { "epoch": 2.094366875300915, "grad_norm": 1.171875, "learning_rate": 7.389171452491087e-05, "loss": 0.3991, "step": 8700 }, { "epoch": 2.095570534424651, "grad_norm": 1.078125, "learning_rate": 7.382172668071079e-05, "loss": 0.3698, "step": 8705 }, { "epoch": 2.096774193548387, "grad_norm": 1.09375, "learning_rate": 7.375180206610572e-05, "loss": 0.3854, "step": 8710 }, { "epoch": 2.0979778526721233, "grad_norm": 1.2265625, "learning_rate": 7.368194079281723e-05, "loss": 0.4216, "step": 8715 }, { "epoch": 2.099181511795859, "grad_norm": 1.1328125, "learning_rate": 7.361214297246573e-05, "loss": 0.4296, "step": 8720 }, { "epoch": 2.1003851709195955, "grad_norm": 1.09375, "learning_rate": 7.354240871657028e-05, "loss": 0.4315, "step": 8725 }, { "epoch": 2.101588830043332, "grad_norm": 1.140625, "learning_rate": 7.347273813654831e-05, "loss": 0.4026, "step": 8730 }, { "epoch": 2.1027924891670677, "grad_norm": 1.1875, "learning_rate": 7.340313134371558e-05, "loss": 0.4065, "step": 8735 }, { "epoch": 2.103996148290804, "grad_norm": 1.1796875, "learning_rate": 7.333358844928584e-05, "loss": 0.4018, "step": 8740 }, { "epoch": 2.1051998074145404, "grad_norm": 1.046875, "learning_rate": 7.326410956437086e-05, "loss": 0.3906, "step": 8745 }, { "epoch": 2.1064034665382763, "grad_norm": 1.109375, "learning_rate": 7.319469479998007e-05, "loss": 0.3965, "step": 8750 }, { "epoch": 2.1076071256620126, "grad_norm": 1.1875, "learning_rate": 7.31253442670205e-05, "loss": 0.3783, "step": 8755 }, { "epoch": 2.1088107847857485, "grad_norm": 1.1328125, "learning_rate": 7.305605807629646e-05, "loss": 0.3664, "step": 8760 }, { "epoch": 2.110014443909485, "grad_norm": 1.1953125, "learning_rate": 7.298683633850958e-05, "loss": 0.3972, "step": 8765 }, { "epoch": 2.111218103033221, "grad_norm": 1.21875, "learning_rate": 7.291767916425842e-05, "loss": 0.4058, "step": 8770 }, { "epoch": 2.112421762156957, "grad_norm": 1.2734375, "learning_rate": 7.284858666403843e-05, "loss": 0.4173, "step": 8775 }, { "epoch": 2.1136254212806933, "grad_norm": 1.1171875, "learning_rate": 7.27795589482417e-05, "loss": 0.4106, "step": 8780 }, { "epoch": 2.1148290804044296, "grad_norm": 1.140625, "learning_rate": 7.271059612715683e-05, "loss": 0.4197, "step": 8785 }, { "epoch": 2.1160327395281655, "grad_norm": 1.1640625, "learning_rate": 7.264169831096873e-05, "loss": 0.4326, "step": 8790 }, { "epoch": 2.117236398651902, "grad_norm": 1.15625, "learning_rate": 7.257286560975844e-05, "loss": 0.4073, "step": 8795 }, { "epoch": 2.118440057775638, "grad_norm": 1.078125, "learning_rate": 7.250409813350301e-05, "loss": 0.3785, "step": 8800 }, { "epoch": 2.119643716899374, "grad_norm": 1.125, "learning_rate": 7.243539599207518e-05, "loss": 0.3959, "step": 8805 }, { "epoch": 2.1208473760231104, "grad_norm": 1.125, "learning_rate": 7.236675929524341e-05, "loss": 0.4263, "step": 8810 }, { "epoch": 2.1220510351468462, "grad_norm": 1.2578125, "learning_rate": 7.229818815267155e-05, "loss": 0.4307, "step": 8815 }, { "epoch": 2.1232546942705826, "grad_norm": 1.1796875, "learning_rate": 7.222968267391863e-05, "loss": 0.4089, "step": 8820 }, { "epoch": 2.124458353394319, "grad_norm": 1.109375, "learning_rate": 7.216124296843894e-05, "loss": 0.4076, "step": 8825 }, { "epoch": 2.1256620125180548, "grad_norm": 1.1640625, "learning_rate": 7.20928691455815e-05, "loss": 0.4176, "step": 8830 }, { "epoch": 2.126865671641791, "grad_norm": 1.09375, "learning_rate": 7.202456131459023e-05, "loss": 0.3637, "step": 8835 }, { "epoch": 2.1280693307655274, "grad_norm": 1.390625, "learning_rate": 7.195631958460345e-05, "loss": 0.4148, "step": 8840 }, { "epoch": 2.1292729898892633, "grad_norm": 1.0625, "learning_rate": 7.188814406465402e-05, "loss": 0.4053, "step": 8845 }, { "epoch": 2.1304766490129996, "grad_norm": 1.2265625, "learning_rate": 7.182003486366893e-05, "loss": 0.3777, "step": 8850 }, { "epoch": 2.1316803081367355, "grad_norm": 1.1640625, "learning_rate": 7.17519920904692e-05, "loss": 0.3965, "step": 8855 }, { "epoch": 2.132883967260472, "grad_norm": 1.125, "learning_rate": 7.168401585376977e-05, "loss": 0.391, "step": 8860 }, { "epoch": 2.134087626384208, "grad_norm": 1.078125, "learning_rate": 7.16161062621792e-05, "loss": 0.4147, "step": 8865 }, { "epoch": 2.135291285507944, "grad_norm": 1.0703125, "learning_rate": 7.154826342419964e-05, "loss": 0.3957, "step": 8870 }, { "epoch": 2.1364949446316803, "grad_norm": 1.015625, "learning_rate": 7.148048744822656e-05, "loss": 0.3883, "step": 8875 }, { "epoch": 2.1376986037554166, "grad_norm": 1.09375, "learning_rate": 7.141277844254854e-05, "loss": 0.387, "step": 8880 }, { "epoch": 2.1389022628791525, "grad_norm": 1.109375, "learning_rate": 7.134513651534729e-05, "loss": 0.4049, "step": 8885 }, { "epoch": 2.140105922002889, "grad_norm": 1.0625, "learning_rate": 7.127756177469721e-05, "loss": 0.3873, "step": 8890 }, { "epoch": 2.1413095811266247, "grad_norm": 1.234375, "learning_rate": 7.121005432856543e-05, "loss": 0.3964, "step": 8895 }, { "epoch": 2.142513240250361, "grad_norm": 1.1171875, "learning_rate": 7.114261428481157e-05, "loss": 0.4228, "step": 8900 }, { "epoch": 2.1437168993740974, "grad_norm": 1.203125, "learning_rate": 7.107524175118749e-05, "loss": 0.3901, "step": 8905 }, { "epoch": 2.1449205584978333, "grad_norm": 1.2734375, "learning_rate": 7.100793683533727e-05, "loss": 0.4142, "step": 8910 }, { "epoch": 2.1461242176215696, "grad_norm": 1.1796875, "learning_rate": 7.094069964479686e-05, "loss": 0.4214, "step": 8915 }, { "epoch": 2.147327876745306, "grad_norm": 1.1640625, "learning_rate": 7.087353028699411e-05, "loss": 0.3911, "step": 8920 }, { "epoch": 2.1485315358690418, "grad_norm": 1.171875, "learning_rate": 7.080642886924841e-05, "loss": 0.3991, "step": 8925 }, { "epoch": 2.149735194992778, "grad_norm": 1.2578125, "learning_rate": 7.07393954987706e-05, "loss": 0.3907, "step": 8930 }, { "epoch": 2.1509388541165144, "grad_norm": 1.140625, "learning_rate": 7.067243028266287e-05, "loss": 0.3983, "step": 8935 }, { "epoch": 2.1521425132402503, "grad_norm": 1.203125, "learning_rate": 7.06055333279184e-05, "loss": 0.4067, "step": 8940 }, { "epoch": 2.1533461723639866, "grad_norm": 1.1484375, "learning_rate": 7.053870474142144e-05, "loss": 0.4131, "step": 8945 }, { "epoch": 2.1545498314877225, "grad_norm": 1.0859375, "learning_rate": 7.047194462994693e-05, "loss": 0.3966, "step": 8950 }, { "epoch": 2.155753490611459, "grad_norm": 1.2109375, "learning_rate": 7.040525310016038e-05, "loss": 0.377, "step": 8955 }, { "epoch": 2.156957149735195, "grad_norm": 1.1953125, "learning_rate": 7.03386302586178e-05, "loss": 0.4072, "step": 8960 }, { "epoch": 2.158160808858931, "grad_norm": 1.21875, "learning_rate": 7.027207621176537e-05, "loss": 0.3732, "step": 8965 }, { "epoch": 2.1593644679826673, "grad_norm": 1.046875, "learning_rate": 7.020559106593943e-05, "loss": 0.4195, "step": 8970 }, { "epoch": 2.1605681271064037, "grad_norm": 1.2578125, "learning_rate": 7.01391749273662e-05, "loss": 0.3931, "step": 8975 }, { "epoch": 2.1617717862301395, "grad_norm": 1.1171875, "learning_rate": 7.007282790216158e-05, "loss": 0.3809, "step": 8980 }, { "epoch": 2.162975445353876, "grad_norm": 1.1015625, "learning_rate": 7.00065500963312e-05, "loss": 0.3792, "step": 8985 }, { "epoch": 2.1641791044776117, "grad_norm": 1.3359375, "learning_rate": 6.994034161576994e-05, "loss": 0.421, "step": 8990 }, { "epoch": 2.165382763601348, "grad_norm": 1.1328125, "learning_rate": 6.987420256626201e-05, "loss": 0.3905, "step": 8995 }, { "epoch": 2.1665864227250844, "grad_norm": 1.0546875, "learning_rate": 6.980813305348063e-05, "loss": 0.4093, "step": 9000 }, { "epoch": 2.1665864227250844, "eval_loss": 0.37502264976501465, "eval_runtime": 2.3896, "eval_samples_per_second": 83.696, "eval_steps_per_second": 83.696, "step": 9000 }, { "epoch": 2.1677900818488203, "grad_norm": 1.0546875, "learning_rate": 6.974213318298798e-05, "loss": 0.3898, "step": 9005 }, { "epoch": 2.1689937409725566, "grad_norm": 1.1953125, "learning_rate": 6.96762030602349e-05, "loss": 0.4196, "step": 9010 }, { "epoch": 2.170197400096293, "grad_norm": 1.1796875, "learning_rate": 6.961034279056084e-05, "loss": 0.3951, "step": 9015 }, { "epoch": 2.171401059220029, "grad_norm": 1.140625, "learning_rate": 6.954455247919364e-05, "loss": 0.3994, "step": 9020 }, { "epoch": 2.172604718343765, "grad_norm": 1.1953125, "learning_rate": 6.947883223124934e-05, "loss": 0.3945, "step": 9025 }, { "epoch": 2.173808377467501, "grad_norm": 1.0078125, "learning_rate": 6.941318215173203e-05, "loss": 0.42, "step": 9030 }, { "epoch": 2.1750120365912373, "grad_norm": 1.1953125, "learning_rate": 6.934760234553373e-05, "loss": 0.4054, "step": 9035 }, { "epoch": 2.1762156957149736, "grad_norm": 1.1171875, "learning_rate": 6.928209291743416e-05, "loss": 0.4085, "step": 9040 }, { "epoch": 2.1774193548387095, "grad_norm": 1.0625, "learning_rate": 6.921665397210058e-05, "loss": 0.404, "step": 9045 }, { "epoch": 2.178623013962446, "grad_norm": 1.1796875, "learning_rate": 6.915128561408764e-05, "loss": 0.4078, "step": 9050 }, { "epoch": 2.179826673086182, "grad_norm": 1.1953125, "learning_rate": 6.908598794783725e-05, "loss": 0.387, "step": 9055 }, { "epoch": 2.181030332209918, "grad_norm": 1.1640625, "learning_rate": 6.902076107767832e-05, "loss": 0.3812, "step": 9060 }, { "epoch": 2.1822339913336544, "grad_norm": 1.1484375, "learning_rate": 6.895560510782667e-05, "loss": 0.4122, "step": 9065 }, { "epoch": 2.1834376504573907, "grad_norm": 1.1796875, "learning_rate": 6.889052014238485e-05, "loss": 0.397, "step": 9070 }, { "epoch": 2.1846413095811266, "grad_norm": 1.125, "learning_rate": 6.882550628534193e-05, "loss": 0.402, "step": 9075 }, { "epoch": 2.185844968704863, "grad_norm": 1.1171875, "learning_rate": 6.876056364057341e-05, "loss": 0.416, "step": 9080 }, { "epoch": 2.1870486278285988, "grad_norm": 1.0, "learning_rate": 6.869569231184099e-05, "loss": 0.3931, "step": 9085 }, { "epoch": 2.188252286952335, "grad_norm": 1.15625, "learning_rate": 6.863089240279237e-05, "loss": 0.4056, "step": 9090 }, { "epoch": 2.1894559460760714, "grad_norm": 1.1171875, "learning_rate": 6.856616401696126e-05, "loss": 0.4032, "step": 9095 }, { "epoch": 2.1906596051998073, "grad_norm": 1.2421875, "learning_rate": 6.850150725776697e-05, "loss": 0.4192, "step": 9100 }, { "epoch": 2.1918632643235436, "grad_norm": 1.2890625, "learning_rate": 6.843692222851447e-05, "loss": 0.4087, "step": 9105 }, { "epoch": 2.19306692344728, "grad_norm": 1.078125, "learning_rate": 6.837240903239406e-05, "loss": 0.3928, "step": 9110 }, { "epoch": 2.194270582571016, "grad_norm": 1.1015625, "learning_rate": 6.83079677724813e-05, "loss": 0.411, "step": 9115 }, { "epoch": 2.195474241694752, "grad_norm": 1.21875, "learning_rate": 6.824359855173681e-05, "loss": 0.4334, "step": 9120 }, { "epoch": 2.196677900818488, "grad_norm": 1.0625, "learning_rate": 6.817930147300608e-05, "loss": 0.4062, "step": 9125 }, { "epoch": 2.1978815599422243, "grad_norm": 1.1640625, "learning_rate": 6.811507663901937e-05, "loss": 0.4079, "step": 9130 }, { "epoch": 2.1990852190659607, "grad_norm": 1.0546875, "learning_rate": 6.805092415239152e-05, "loss": 0.4133, "step": 9135 }, { "epoch": 2.2002888781896965, "grad_norm": 1.125, "learning_rate": 6.798684411562172e-05, "loss": 0.4149, "step": 9140 }, { "epoch": 2.201492537313433, "grad_norm": 1.1953125, "learning_rate": 6.792283663109347e-05, "loss": 0.4132, "step": 9145 }, { "epoch": 2.202696196437169, "grad_norm": 1.21875, "learning_rate": 6.785890180107429e-05, "loss": 0.4258, "step": 9150 }, { "epoch": 2.203899855560905, "grad_norm": 1.0703125, "learning_rate": 6.779503972771568e-05, "loss": 0.4073, "step": 9155 }, { "epoch": 2.2051035146846414, "grad_norm": 1.21875, "learning_rate": 6.773125051305288e-05, "loss": 0.4025, "step": 9160 }, { "epoch": 2.2063071738083773, "grad_norm": 1.140625, "learning_rate": 6.766753425900465e-05, "loss": 0.3807, "step": 9165 }, { "epoch": 2.2075108329321136, "grad_norm": 1.015625, "learning_rate": 6.760389106737327e-05, "loss": 0.3883, "step": 9170 }, { "epoch": 2.20871449205585, "grad_norm": 1.171875, "learning_rate": 6.754032103984421e-05, "loss": 0.3972, "step": 9175 }, { "epoch": 2.209918151179586, "grad_norm": 1.078125, "learning_rate": 6.747682427798612e-05, "loss": 0.3811, "step": 9180 }, { "epoch": 2.211121810303322, "grad_norm": 1.296875, "learning_rate": 6.741340088325055e-05, "loss": 0.4075, "step": 9185 }, { "epoch": 2.2123254694270584, "grad_norm": 1.1953125, "learning_rate": 6.735005095697175e-05, "loss": 0.3911, "step": 9190 }, { "epoch": 2.2135291285507943, "grad_norm": 1.0859375, "learning_rate": 6.728677460036675e-05, "loss": 0.3871, "step": 9195 }, { "epoch": 2.2147327876745306, "grad_norm": 1.234375, "learning_rate": 6.72235719145349e-05, "loss": 0.412, "step": 9200 }, { "epoch": 2.215936446798267, "grad_norm": 1.2578125, "learning_rate": 6.716044300045792e-05, "loss": 0.3858, "step": 9205 }, { "epoch": 2.217140105922003, "grad_norm": 1.109375, "learning_rate": 6.709738795899959e-05, "loss": 0.3828, "step": 9210 }, { "epoch": 2.218343765045739, "grad_norm": 1.3046875, "learning_rate": 6.703440689090572e-05, "loss": 0.4583, "step": 9215 }, { "epoch": 2.219547424169475, "grad_norm": 1.0859375, "learning_rate": 6.697149989680394e-05, "loss": 0.3934, "step": 9220 }, { "epoch": 2.2207510832932114, "grad_norm": 1.109375, "learning_rate": 6.690866707720346e-05, "loss": 0.4065, "step": 9225 }, { "epoch": 2.2219547424169477, "grad_norm": 1.2890625, "learning_rate": 6.684590853249505e-05, "loss": 0.4261, "step": 9230 }, { "epoch": 2.2231584015406836, "grad_norm": 1.0859375, "learning_rate": 6.678322436295073e-05, "loss": 0.4086, "step": 9235 }, { "epoch": 2.22436206066442, "grad_norm": 1.140625, "learning_rate": 6.672061466872381e-05, "loss": 0.3873, "step": 9240 }, { "epoch": 2.2255657197881558, "grad_norm": 1.1015625, "learning_rate": 6.665807954984848e-05, "loss": 0.3935, "step": 9245 }, { "epoch": 2.226769378911892, "grad_norm": 1.2265625, "learning_rate": 6.659561910623981e-05, "loss": 0.3903, "step": 9250 }, { "epoch": 2.2279730380356284, "grad_norm": 1.0625, "learning_rate": 6.653323343769362e-05, "loss": 0.3877, "step": 9255 }, { "epoch": 2.2291766971593643, "grad_norm": 1.21875, "learning_rate": 6.647092264388618e-05, "loss": 0.4119, "step": 9260 }, { "epoch": 2.2303803562831006, "grad_norm": 1.1875, "learning_rate": 6.64086868243742e-05, "loss": 0.3823, "step": 9265 }, { "epoch": 2.231584015406837, "grad_norm": 1.078125, "learning_rate": 6.634652607859455e-05, "loss": 0.3907, "step": 9270 }, { "epoch": 2.232787674530573, "grad_norm": 1.0390625, "learning_rate": 6.628444050586414e-05, "loss": 0.3758, "step": 9275 }, { "epoch": 2.233991333654309, "grad_norm": 1.0625, "learning_rate": 6.622243020537987e-05, "loss": 0.4156, "step": 9280 }, { "epoch": 2.2351949927780455, "grad_norm": 1.078125, "learning_rate": 6.616049527621821e-05, "loss": 0.3866, "step": 9285 }, { "epoch": 2.2363986519017813, "grad_norm": 1.28125, "learning_rate": 6.609863581733538e-05, "loss": 0.4107, "step": 9290 }, { "epoch": 2.2376023110255177, "grad_norm": 1.125, "learning_rate": 6.60368519275669e-05, "loss": 0.4376, "step": 9295 }, { "epoch": 2.2388059701492535, "grad_norm": 1.078125, "learning_rate": 6.597514370562758e-05, "loss": 0.4332, "step": 9300 }, { "epoch": 2.24000962927299, "grad_norm": 1.0859375, "learning_rate": 6.591351125011137e-05, "loss": 0.3892, "step": 9305 }, { "epoch": 2.241213288396726, "grad_norm": 1.0703125, "learning_rate": 6.58519546594911e-05, "loss": 0.3966, "step": 9310 }, { "epoch": 2.242416947520462, "grad_norm": 1.1171875, "learning_rate": 6.579047403211843e-05, "loss": 0.3974, "step": 9315 }, { "epoch": 2.2436206066441984, "grad_norm": 1.1875, "learning_rate": 6.572906946622367e-05, "loss": 0.4176, "step": 9320 }, { "epoch": 2.2448242657679347, "grad_norm": 1.1953125, "learning_rate": 6.566774105991555e-05, "loss": 0.4012, "step": 9325 }, { "epoch": 2.2460279248916706, "grad_norm": 1.203125, "learning_rate": 6.560648891118115e-05, "loss": 0.4174, "step": 9330 }, { "epoch": 2.247231584015407, "grad_norm": 1.15625, "learning_rate": 6.554531311788569e-05, "loss": 0.3792, "step": 9335 }, { "epoch": 2.2484352431391432, "grad_norm": 1.140625, "learning_rate": 6.548421377777243e-05, "loss": 0.4256, "step": 9340 }, { "epoch": 2.249638902262879, "grad_norm": 1.125, "learning_rate": 6.542319098846242e-05, "loss": 0.4099, "step": 9345 }, { "epoch": 2.2508425613866154, "grad_norm": 1.2734375, "learning_rate": 6.536224484745446e-05, "loss": 0.4194, "step": 9350 }, { "epoch": 2.2520462205103513, "grad_norm": 1.109375, "learning_rate": 6.530137545212485e-05, "loss": 0.3866, "step": 9355 }, { "epoch": 2.2532498796340876, "grad_norm": 1.1171875, "learning_rate": 6.524058289972727e-05, "loss": 0.4307, "step": 9360 }, { "epoch": 2.254453538757824, "grad_norm": 1.125, "learning_rate": 6.517986728739263e-05, "loss": 0.3917, "step": 9365 }, { "epoch": 2.25565719788156, "grad_norm": 1.0859375, "learning_rate": 6.511922871212896e-05, "loss": 0.3917, "step": 9370 }, { "epoch": 2.256860857005296, "grad_norm": 1.046875, "learning_rate": 6.50586672708211e-05, "loss": 0.3924, "step": 9375 }, { "epoch": 2.258064516129032, "grad_norm": 1.1875, "learning_rate": 6.499818306023076e-05, "loss": 0.4108, "step": 9380 }, { "epoch": 2.2592681752527684, "grad_norm": 1.109375, "learning_rate": 6.493777617699615e-05, "loss": 0.4137, "step": 9385 }, { "epoch": 2.2604718343765047, "grad_norm": 1.1953125, "learning_rate": 6.487744671763201e-05, "loss": 0.3925, "step": 9390 }, { "epoch": 2.2616754935002406, "grad_norm": 1.2265625, "learning_rate": 6.481719477852939e-05, "loss": 0.3929, "step": 9395 }, { "epoch": 2.262879152623977, "grad_norm": 1.1171875, "learning_rate": 6.475702045595539e-05, "loss": 0.3885, "step": 9400 }, { "epoch": 2.264082811747713, "grad_norm": 1.1640625, "learning_rate": 6.469692384605316e-05, "loss": 0.3875, "step": 9405 }, { "epoch": 2.265286470871449, "grad_norm": 1.1328125, "learning_rate": 6.463690504484168e-05, "loss": 0.3813, "step": 9410 }, { "epoch": 2.2664901299951854, "grad_norm": 1.234375, "learning_rate": 6.457696414821561e-05, "loss": 0.4028, "step": 9415 }, { "epoch": 2.2676937891189217, "grad_norm": 1.1875, "learning_rate": 6.451710125194513e-05, "loss": 0.407, "step": 9420 }, { "epoch": 2.2688974482426576, "grad_norm": 1.140625, "learning_rate": 6.445731645167583e-05, "loss": 0.408, "step": 9425 }, { "epoch": 2.270101107366394, "grad_norm": 1.15625, "learning_rate": 6.439760984292848e-05, "loss": 0.3812, "step": 9430 }, { "epoch": 2.27130476649013, "grad_norm": 1.2265625, "learning_rate": 6.43379815210989e-05, "loss": 0.4072, "step": 9435 }, { "epoch": 2.272508425613866, "grad_norm": 1.171875, "learning_rate": 6.427843158145793e-05, "loss": 0.3998, "step": 9440 }, { "epoch": 2.2737120847376024, "grad_norm": 1.078125, "learning_rate": 6.421896011915104e-05, "loss": 0.3917, "step": 9445 }, { "epoch": 2.2749157438613383, "grad_norm": 1.2734375, "learning_rate": 6.415956722919843e-05, "loss": 0.4218, "step": 9450 }, { "epoch": 2.2761194029850746, "grad_norm": 1.1015625, "learning_rate": 6.410025300649471e-05, "loss": 0.3923, "step": 9455 }, { "epoch": 2.277323062108811, "grad_norm": 1.0625, "learning_rate": 6.404101754580875e-05, "loss": 0.3824, "step": 9460 }, { "epoch": 2.278526721232547, "grad_norm": 1.171875, "learning_rate": 6.39818609417837e-05, "loss": 0.3949, "step": 9465 }, { "epoch": 2.279730380356283, "grad_norm": 1.1484375, "learning_rate": 6.392278328893659e-05, "loss": 0.4075, "step": 9470 }, { "epoch": 2.2809340394800195, "grad_norm": 1.0859375, "learning_rate": 6.386378468165839e-05, "loss": 0.4196, "step": 9475 }, { "epoch": 2.2821376986037554, "grad_norm": 1.4375, "learning_rate": 6.380486521421378e-05, "loss": 0.4198, "step": 9480 }, { "epoch": 2.2833413577274917, "grad_norm": 1.234375, "learning_rate": 6.37460249807409e-05, "loss": 0.3953, "step": 9485 }, { "epoch": 2.2845450168512276, "grad_norm": 1.1328125, "learning_rate": 6.368726407525143e-05, "loss": 0.4056, "step": 9490 }, { "epoch": 2.285748675974964, "grad_norm": 1.21875, "learning_rate": 6.362858259163018e-05, "loss": 0.4337, "step": 9495 }, { "epoch": 2.2869523350987, "grad_norm": 1.2421875, "learning_rate": 6.356998062363514e-05, "loss": 0.421, "step": 9500 }, { "epoch": 2.2869523350987, "eval_loss": 0.3711702227592468, "eval_runtime": 2.3967, "eval_samples_per_second": 83.447, "eval_steps_per_second": 83.447, "step": 9500 }, { "epoch": 2.288155994222436, "grad_norm": 1.171875, "learning_rate": 6.351145826489722e-05, "loss": 0.3913, "step": 9505 }, { "epoch": 2.2893596533461724, "grad_norm": 1.03125, "learning_rate": 6.345301560892017e-05, "loss": 0.3862, "step": 9510 }, { "epoch": 2.2905633124699083, "grad_norm": 1.21875, "learning_rate": 6.339465274908037e-05, "loss": 0.3951, "step": 9515 }, { "epoch": 2.2917669715936446, "grad_norm": 1.078125, "learning_rate": 6.333636977862667e-05, "loss": 0.4166, "step": 9520 }, { "epoch": 2.292970630717381, "grad_norm": 1.0703125, "learning_rate": 6.327816679068032e-05, "loss": 0.3928, "step": 9525 }, { "epoch": 2.294174289841117, "grad_norm": 1.1171875, "learning_rate": 6.322004387823483e-05, "loss": 0.4188, "step": 9530 }, { "epoch": 2.295377948964853, "grad_norm": 1.171875, "learning_rate": 6.316200113415568e-05, "loss": 0.4166, "step": 9535 }, { "epoch": 2.2965816080885895, "grad_norm": 1.1171875, "learning_rate": 6.31040386511803e-05, "loss": 0.4072, "step": 9540 }, { "epoch": 2.2977852672123253, "grad_norm": 1.0546875, "learning_rate": 6.304615652191786e-05, "loss": 0.4007, "step": 9545 }, { "epoch": 2.2989889263360617, "grad_norm": 1.2421875, "learning_rate": 6.298835483884917e-05, "loss": 0.4006, "step": 9550 }, { "epoch": 2.300192585459798, "grad_norm": 1.046875, "learning_rate": 6.293063369432653e-05, "loss": 0.3938, "step": 9555 }, { "epoch": 2.301396244583534, "grad_norm": 1.140625, "learning_rate": 6.287299318057353e-05, "loss": 0.4106, "step": 9560 }, { "epoch": 2.30259990370727, "grad_norm": 1.21875, "learning_rate": 6.28154333896849e-05, "loss": 0.3769, "step": 9565 }, { "epoch": 2.303803562831006, "grad_norm": 1.171875, "learning_rate": 6.275795441362644e-05, "loss": 0.424, "step": 9570 }, { "epoch": 2.3050072219547424, "grad_norm": 1.109375, "learning_rate": 6.270055634423482e-05, "loss": 0.3756, "step": 9575 }, { "epoch": 2.3062108810784787, "grad_norm": 1.1171875, "learning_rate": 6.264323927321748e-05, "loss": 0.4229, "step": 9580 }, { "epoch": 2.3074145402022146, "grad_norm": 1.1015625, "learning_rate": 6.258600329215234e-05, "loss": 0.3925, "step": 9585 }, { "epoch": 2.308618199325951, "grad_norm": 1.125, "learning_rate": 6.25288484924879e-05, "loss": 0.4099, "step": 9590 }, { "epoch": 2.3098218584496872, "grad_norm": 1.125, "learning_rate": 6.24717749655428e-05, "loss": 0.4305, "step": 9595 }, { "epoch": 2.311025517573423, "grad_norm": 1.109375, "learning_rate": 6.241478280250594e-05, "loss": 0.4097, "step": 9600 }, { "epoch": 2.3122291766971594, "grad_norm": 1.1875, "learning_rate": 6.23578720944362e-05, "loss": 0.3966, "step": 9605 }, { "epoch": 2.3134328358208958, "grad_norm": 1.1484375, "learning_rate": 6.23010429322623e-05, "loss": 0.3747, "step": 9610 }, { "epoch": 2.3146364949446316, "grad_norm": 1.1796875, "learning_rate": 6.224429540678268e-05, "loss": 0.3955, "step": 9615 }, { "epoch": 2.315840154068368, "grad_norm": 1.140625, "learning_rate": 6.218762960866529e-05, "loss": 0.4318, "step": 9620 }, { "epoch": 2.317043813192104, "grad_norm": 1.140625, "learning_rate": 6.213104562844759e-05, "loss": 0.3901, "step": 9625 }, { "epoch": 2.31824747231584, "grad_norm": 1.0703125, "learning_rate": 6.207454355653628e-05, "loss": 0.4159, "step": 9630 }, { "epoch": 2.3194511314395765, "grad_norm": 1.109375, "learning_rate": 6.201812348320715e-05, "loss": 0.3858, "step": 9635 }, { "epoch": 2.3206547905633124, "grad_norm": 1.1328125, "learning_rate": 6.196178549860506e-05, "loss": 0.3864, "step": 9640 }, { "epoch": 2.3218584496870487, "grad_norm": 1.0390625, "learning_rate": 6.190552969274362e-05, "loss": 0.3958, "step": 9645 }, { "epoch": 2.3230621088107846, "grad_norm": 1.2109375, "learning_rate": 6.184935615550523e-05, "loss": 0.4007, "step": 9650 }, { "epoch": 2.324265767934521, "grad_norm": 1.1015625, "learning_rate": 6.179326497664076e-05, "loss": 0.3984, "step": 9655 }, { "epoch": 2.325469427058257, "grad_norm": 1.2109375, "learning_rate": 6.173725624576958e-05, "loss": 0.3919, "step": 9660 }, { "epoch": 2.326673086181993, "grad_norm": 1.125, "learning_rate": 6.168133005237925e-05, "loss": 0.3825, "step": 9665 }, { "epoch": 2.3278767453057294, "grad_norm": 1.234375, "learning_rate": 6.162548648582552e-05, "loss": 0.4195, "step": 9670 }, { "epoch": 2.3290804044294657, "grad_norm": 1.125, "learning_rate": 6.156972563533207e-05, "loss": 0.4048, "step": 9675 }, { "epoch": 2.3302840635532016, "grad_norm": 1.1328125, "learning_rate": 6.151404758999041e-05, "loss": 0.3994, "step": 9680 }, { "epoch": 2.331487722676938, "grad_norm": 1.125, "learning_rate": 6.145845243875986e-05, "loss": 0.4194, "step": 9685 }, { "epoch": 2.3326913818006743, "grad_norm": 1.0859375, "learning_rate": 6.140294027046718e-05, "loss": 0.3838, "step": 9690 }, { "epoch": 2.33389504092441, "grad_norm": 1.171875, "learning_rate": 6.134751117380658e-05, "loss": 0.3954, "step": 9695 }, { "epoch": 2.3350987000481465, "grad_norm": 1.0859375, "learning_rate": 6.129216523733955e-05, "loss": 0.4078, "step": 9700 }, { "epoch": 2.3363023591718823, "grad_norm": 1.1328125, "learning_rate": 6.123690254949472e-05, "loss": 0.3872, "step": 9705 }, { "epoch": 2.3375060182956187, "grad_norm": 1.1953125, "learning_rate": 6.118172319856768e-05, "loss": 0.3944, "step": 9710 }, { "epoch": 2.338709677419355, "grad_norm": 1.140625, "learning_rate": 6.112662727272092e-05, "loss": 0.3888, "step": 9715 }, { "epoch": 2.339913336543091, "grad_norm": 1.1015625, "learning_rate": 6.107161485998356e-05, "loss": 0.3875, "step": 9720 }, { "epoch": 2.341116995666827, "grad_norm": 1.1796875, "learning_rate": 6.10166860482514e-05, "loss": 0.3993, "step": 9725 }, { "epoch": 2.3423206547905635, "grad_norm": 1.1328125, "learning_rate": 6.0961840925286505e-05, "loss": 0.4037, "step": 9730 }, { "epoch": 2.3435243139142994, "grad_norm": 1.1015625, "learning_rate": 6.090707957871739e-05, "loss": 0.4084, "step": 9735 }, { "epoch": 2.3447279730380357, "grad_norm": 1.125, "learning_rate": 6.085240209603864e-05, "loss": 0.3996, "step": 9740 }, { "epoch": 2.345931632161772, "grad_norm": 1.125, "learning_rate": 6.0797808564610835e-05, "loss": 0.3843, "step": 9745 }, { "epoch": 2.347135291285508, "grad_norm": 1.21875, "learning_rate": 6.074329907166048e-05, "loss": 0.4346, "step": 9750 }, { "epoch": 2.3483389504092442, "grad_norm": 1.1328125, "learning_rate": 6.068887370427973e-05, "loss": 0.3709, "step": 9755 }, { "epoch": 2.34954260953298, "grad_norm": 1.109375, "learning_rate": 6.0634532549426374e-05, "loss": 0.3787, "step": 9760 }, { "epoch": 2.3507462686567164, "grad_norm": 1.2578125, "learning_rate": 6.058027569392368e-05, "loss": 0.397, "step": 9765 }, { "epoch": 2.3519499277804528, "grad_norm": 1.0625, "learning_rate": 6.052610322446018e-05, "loss": 0.3798, "step": 9770 }, { "epoch": 2.3531535869041886, "grad_norm": 1.2890625, "learning_rate": 6.047201522758959e-05, "loss": 0.3975, "step": 9775 }, { "epoch": 2.354357246027925, "grad_norm": 1.171875, "learning_rate": 6.041801178973063e-05, "loss": 0.3933, "step": 9780 }, { "epoch": 2.355560905151661, "grad_norm": 1.09375, "learning_rate": 6.0364092997167e-05, "loss": 0.4018, "step": 9785 }, { "epoch": 2.356764564275397, "grad_norm": 1.2109375, "learning_rate": 6.0310258936047094e-05, "loss": 0.4337, "step": 9790 }, { "epoch": 2.3579682233991335, "grad_norm": 1.1015625, "learning_rate": 6.0256509692383914e-05, "loss": 0.3952, "step": 9795 }, { "epoch": 2.3591718825228694, "grad_norm": 1.1640625, "learning_rate": 6.020284535205502e-05, "loss": 0.3888, "step": 9800 }, { "epoch": 2.3603755416466057, "grad_norm": 1.0703125, "learning_rate": 6.014926600080223e-05, "loss": 0.3849, "step": 9805 }, { "epoch": 2.361579200770342, "grad_norm": 1.2265625, "learning_rate": 6.0095771724231624e-05, "loss": 0.3895, "step": 9810 }, { "epoch": 2.362782859894078, "grad_norm": 1.078125, "learning_rate": 6.004236260781337e-05, "loss": 0.4174, "step": 9815 }, { "epoch": 2.363986519017814, "grad_norm": 1.1171875, "learning_rate": 5.998903873688149e-05, "loss": 0.4015, "step": 9820 }, { "epoch": 2.3651901781415505, "grad_norm": 1.109375, "learning_rate": 5.993580019663392e-05, "loss": 0.4158, "step": 9825 }, { "epoch": 2.3663938372652864, "grad_norm": 1.0390625, "learning_rate": 5.988264707213218e-05, "loss": 0.3738, "step": 9830 }, { "epoch": 2.3675974963890227, "grad_norm": 1.109375, "learning_rate": 5.9829579448301284e-05, "loss": 0.4382, "step": 9835 }, { "epoch": 2.3688011555127586, "grad_norm": 1.09375, "learning_rate": 5.9776597409929765e-05, "loss": 0.401, "step": 9840 }, { "epoch": 2.370004814636495, "grad_norm": 1.1875, "learning_rate": 5.9723701041669286e-05, "loss": 0.3907, "step": 9845 }, { "epoch": 2.3712084737602313, "grad_norm": 1.234375, "learning_rate": 5.967089042803473e-05, "loss": 0.4297, "step": 9850 }, { "epoch": 2.372412132883967, "grad_norm": 1.0390625, "learning_rate": 5.961816565340387e-05, "loss": 0.3821, "step": 9855 }, { "epoch": 2.3736157920077035, "grad_norm": 1.078125, "learning_rate": 5.956552680201741e-05, "loss": 0.3814, "step": 9860 }, { "epoch": 2.3748194511314393, "grad_norm": 1.140625, "learning_rate": 5.9512973957978736e-05, "loss": 0.395, "step": 9865 }, { "epoch": 2.3760231102551757, "grad_norm": 1.234375, "learning_rate": 5.946050720525381e-05, "loss": 0.3967, "step": 9870 }, { "epoch": 2.377226769378912, "grad_norm": 1.28125, "learning_rate": 5.940812662767108e-05, "loss": 0.3961, "step": 9875 }, { "epoch": 2.3784304285026483, "grad_norm": 1.15625, "learning_rate": 5.935583230892127e-05, "loss": 0.4415, "step": 9880 }, { "epoch": 2.379634087626384, "grad_norm": 1.171875, "learning_rate": 5.930362433255728e-05, "loss": 0.3949, "step": 9885 }, { "epoch": 2.3808377467501205, "grad_norm": 1.234375, "learning_rate": 5.925150278199407e-05, "loss": 0.3874, "step": 9890 }, { "epoch": 2.3820414058738564, "grad_norm": 1.125, "learning_rate": 5.9199467740508536e-05, "loss": 0.428, "step": 9895 }, { "epoch": 2.3832450649975927, "grad_norm": 1.1796875, "learning_rate": 5.914751929123933e-05, "loss": 0.3851, "step": 9900 }, { "epoch": 2.384448724121329, "grad_norm": 1.1875, "learning_rate": 5.9095657517186743e-05, "loss": 0.3884, "step": 9905 }, { "epoch": 2.385652383245065, "grad_norm": 1.2265625, "learning_rate": 5.9043882501212613e-05, "loss": 0.392, "step": 9910 }, { "epoch": 2.3868560423688012, "grad_norm": 1.125, "learning_rate": 5.899219432604012e-05, "loss": 0.4125, "step": 9915 }, { "epoch": 2.388059701492537, "grad_norm": 1.1171875, "learning_rate": 5.8940593074253744e-05, "loss": 0.3937, "step": 9920 }, { "epoch": 2.3892633606162734, "grad_norm": 1.15625, "learning_rate": 5.888907882829906e-05, "loss": 0.3936, "step": 9925 }, { "epoch": 2.3904670197400097, "grad_norm": 1.1796875, "learning_rate": 5.8837651670482604e-05, "loss": 0.3803, "step": 9930 }, { "epoch": 2.3916706788637456, "grad_norm": 1.078125, "learning_rate": 5.878631168297181e-05, "loss": 0.3818, "step": 9935 }, { "epoch": 2.392874337987482, "grad_norm": 1.2890625, "learning_rate": 5.873505894779479e-05, "loss": 0.383, "step": 9940 }, { "epoch": 2.3940779971112183, "grad_norm": 1.203125, "learning_rate": 5.8683893546840286e-05, "loss": 0.3882, "step": 9945 }, { "epoch": 2.395281656234954, "grad_norm": 1.125, "learning_rate": 5.863281556185752e-05, "loss": 0.4125, "step": 9950 }, { "epoch": 2.3964853153586905, "grad_norm": 1.1953125, "learning_rate": 5.858182507445598e-05, "loss": 0.3848, "step": 9955 }, { "epoch": 2.397688974482427, "grad_norm": 1.109375, "learning_rate": 5.853092216610542e-05, "loss": 0.4149, "step": 9960 }, { "epoch": 2.3988926336061627, "grad_norm": 1.1796875, "learning_rate": 5.8480106918135636e-05, "loss": 0.4103, "step": 9965 }, { "epoch": 2.400096292729899, "grad_norm": 1.203125, "learning_rate": 5.842937941173635e-05, "loss": 0.3975, "step": 9970 }, { "epoch": 2.401299951853635, "grad_norm": 1.171875, "learning_rate": 5.837873972795713e-05, "loss": 0.4351, "step": 9975 }, { "epoch": 2.402503610977371, "grad_norm": 1.125, "learning_rate": 5.832818794770718e-05, "loss": 0.3843, "step": 9980 }, { "epoch": 2.4037072701011075, "grad_norm": 1.1953125, "learning_rate": 5.8277724151755356e-05, "loss": 0.4056, "step": 9985 }, { "epoch": 2.4049109292248434, "grad_norm": 1.0390625, "learning_rate": 5.8227348420729805e-05, "loss": 0.3652, "step": 9990 }, { "epoch": 2.4061145883485797, "grad_norm": 1.1875, "learning_rate": 5.8177060835118026e-05, "loss": 0.3646, "step": 9995 }, { "epoch": 2.4073182474723156, "grad_norm": 1.1640625, "learning_rate": 5.812686147526673e-05, "loss": 0.4058, "step": 10000 }, { "epoch": 2.4073182474723156, "eval_loss": 0.36641818284988403, "eval_runtime": 2.3939, "eval_samples_per_second": 83.545, "eval_steps_per_second": 83.545, "step": 10000 }, { "epoch": 2.408521906596052, "grad_norm": 1.109375, "learning_rate": 5.8076750421381594e-05, "loss": 0.3825, "step": 10005 }, { "epoch": 2.4097255657197882, "grad_norm": 1.109375, "learning_rate": 5.802672775352725e-05, "loss": 0.3807, "step": 10010 }, { "epoch": 2.410929224843524, "grad_norm": 1.0625, "learning_rate": 5.7976793551627094e-05, "loss": 0.392, "step": 10015 }, { "epoch": 2.4121328839672604, "grad_norm": 1.1796875, "learning_rate": 5.7926947895463165e-05, "loss": 0.392, "step": 10020 }, { "epoch": 2.4133365430909968, "grad_norm": 1.2421875, "learning_rate": 5.787719086467608e-05, "loss": 0.3858, "step": 10025 }, { "epoch": 2.4145402022147326, "grad_norm": 1.09375, "learning_rate": 5.782752253876479e-05, "loss": 0.4164, "step": 10030 }, { "epoch": 2.415743861338469, "grad_norm": 1.0859375, "learning_rate": 5.777794299708658e-05, "loss": 0.3956, "step": 10035 }, { "epoch": 2.4169475204622053, "grad_norm": 1.203125, "learning_rate": 5.772845231885681e-05, "loss": 0.3958, "step": 10040 }, { "epoch": 2.418151179585941, "grad_norm": 1.2265625, "learning_rate": 5.767905058314889e-05, "loss": 0.424, "step": 10045 }, { "epoch": 2.4193548387096775, "grad_norm": 1.3046875, "learning_rate": 5.762973786889418e-05, "loss": 0.4156, "step": 10050 }, { "epoch": 2.4205584978334134, "grad_norm": 1.171875, "learning_rate": 5.75805142548817e-05, "loss": 0.39, "step": 10055 }, { "epoch": 2.4217621569571497, "grad_norm": 1.203125, "learning_rate": 5.7531379819758214e-05, "loss": 0.4127, "step": 10060 }, { "epoch": 2.422965816080886, "grad_norm": 1.140625, "learning_rate": 5.748233464202791e-05, "loss": 0.3849, "step": 10065 }, { "epoch": 2.424169475204622, "grad_norm": 1.171875, "learning_rate": 5.743337880005243e-05, "loss": 0.3841, "step": 10070 }, { "epoch": 2.425373134328358, "grad_norm": 1.1640625, "learning_rate": 5.738451237205068e-05, "loss": 0.3903, "step": 10075 }, { "epoch": 2.4265767934520945, "grad_norm": 1.203125, "learning_rate": 5.7335735436098656e-05, "loss": 0.3989, "step": 10080 }, { "epoch": 2.4277804525758304, "grad_norm": 1.203125, "learning_rate": 5.72870480701294e-05, "loss": 0.3997, "step": 10085 }, { "epoch": 2.4289841116995667, "grad_norm": 1.0703125, "learning_rate": 5.723845035193287e-05, "loss": 0.3758, "step": 10090 }, { "epoch": 2.430187770823303, "grad_norm": 1.171875, "learning_rate": 5.718994235915571e-05, "loss": 0.3731, "step": 10095 }, { "epoch": 2.431391429947039, "grad_norm": 1.1640625, "learning_rate": 5.714152416930132e-05, "loss": 0.4003, "step": 10100 }, { "epoch": 2.4325950890707753, "grad_norm": 1.1328125, "learning_rate": 5.709319585972951e-05, "loss": 0.4216, "step": 10105 }, { "epoch": 2.433798748194511, "grad_norm": 1.0234375, "learning_rate": 5.704495750765654e-05, "loss": 0.382, "step": 10110 }, { "epoch": 2.4350024073182475, "grad_norm": 1.2265625, "learning_rate": 5.6996809190154927e-05, "loss": 0.4103, "step": 10115 }, { "epoch": 2.436206066441984, "grad_norm": 1.1875, "learning_rate": 5.694875098415336e-05, "loss": 0.3976, "step": 10120 }, { "epoch": 2.4374097255657197, "grad_norm": 1.125, "learning_rate": 5.6900782966436505e-05, "loss": 0.3941, "step": 10125 }, { "epoch": 2.438613384689456, "grad_norm": 1.109375, "learning_rate": 5.685290521364497e-05, "loss": 0.3954, "step": 10130 }, { "epoch": 2.439817043813192, "grad_norm": 1.28125, "learning_rate": 5.6805117802275146e-05, "loss": 0.3974, "step": 10135 }, { "epoch": 2.441020702936928, "grad_norm": 1.203125, "learning_rate": 5.675742080867903e-05, "loss": 0.3952, "step": 10140 }, { "epoch": 2.4422243620606645, "grad_norm": 1.171875, "learning_rate": 5.670981430906424e-05, "loss": 0.4087, "step": 10145 }, { "epoch": 2.4434280211844004, "grad_norm": 1.1015625, "learning_rate": 5.666229837949371e-05, "loss": 0.4009, "step": 10150 }, { "epoch": 2.4446316803081367, "grad_norm": 1.1171875, "learning_rate": 5.6614873095885734e-05, "loss": 0.3799, "step": 10155 }, { "epoch": 2.445835339431873, "grad_norm": 1.21875, "learning_rate": 5.6567538534013765e-05, "loss": 0.418, "step": 10160 }, { "epoch": 2.447038998555609, "grad_norm": 1.1875, "learning_rate": 5.652029476950628e-05, "loss": 0.3949, "step": 10165 }, { "epoch": 2.4482426576793452, "grad_norm": 1.1328125, "learning_rate": 5.6473141877846724e-05, "loss": 0.4035, "step": 10170 }, { "epoch": 2.4494463168030816, "grad_norm": 1.1796875, "learning_rate": 5.642607993437333e-05, "loss": 0.4067, "step": 10175 }, { "epoch": 2.4506499759268174, "grad_norm": 1.1875, "learning_rate": 5.6379109014278985e-05, "loss": 0.4057, "step": 10180 }, { "epoch": 2.4518536350505538, "grad_norm": 1.1328125, "learning_rate": 5.633222919261122e-05, "loss": 0.3889, "step": 10185 }, { "epoch": 2.4530572941742896, "grad_norm": 1.0859375, "learning_rate": 5.6285440544271955e-05, "loss": 0.3872, "step": 10190 }, { "epoch": 2.454260953298026, "grad_norm": 1.0703125, "learning_rate": 5.623874314401746e-05, "loss": 0.4022, "step": 10195 }, { "epoch": 2.4554646124217623, "grad_norm": 1.1171875, "learning_rate": 5.619213706645821e-05, "loss": 0.3894, "step": 10200 }, { "epoch": 2.456668271545498, "grad_norm": 1.2265625, "learning_rate": 5.614562238605875e-05, "loss": 0.3941, "step": 10205 }, { "epoch": 2.4578719306692345, "grad_norm": 1.1484375, "learning_rate": 5.609919917713765e-05, "loss": 0.3998, "step": 10210 }, { "epoch": 2.459075589792971, "grad_norm": 1.203125, "learning_rate": 5.605286751386726e-05, "loss": 0.424, "step": 10215 }, { "epoch": 2.4602792489167067, "grad_norm": 1.1015625, "learning_rate": 5.600662747027373e-05, "loss": 0.4156, "step": 10220 }, { "epoch": 2.461482908040443, "grad_norm": 1.1640625, "learning_rate": 5.596047912023678e-05, "loss": 0.3905, "step": 10225 }, { "epoch": 2.4626865671641793, "grad_norm": 1.1875, "learning_rate": 5.591442253748964e-05, "loss": 0.3959, "step": 10230 }, { "epoch": 2.463890226287915, "grad_norm": 1.203125, "learning_rate": 5.5868457795618946e-05, "loss": 0.3953, "step": 10235 }, { "epoch": 2.4650938854116515, "grad_norm": 1.1328125, "learning_rate": 5.582258496806455e-05, "loss": 0.3779, "step": 10240 }, { "epoch": 2.4662975445353874, "grad_norm": 1.1328125, "learning_rate": 5.5776804128119475e-05, "loss": 0.4194, "step": 10245 }, { "epoch": 2.4675012036591237, "grad_norm": 1.15625, "learning_rate": 5.573111534892978e-05, "loss": 0.3878, "step": 10250 }, { "epoch": 2.46870486278286, "grad_norm": 1.1875, "learning_rate": 5.5685518703494395e-05, "loss": 0.4049, "step": 10255 }, { "epoch": 2.469908521906596, "grad_norm": 1.09375, "learning_rate": 5.564001426466511e-05, "loss": 0.4012, "step": 10260 }, { "epoch": 2.4711121810303323, "grad_norm": 1.1328125, "learning_rate": 5.5594602105146296e-05, "loss": 0.4015, "step": 10265 }, { "epoch": 2.472315840154068, "grad_norm": 1.203125, "learning_rate": 5.5549282297495004e-05, "loss": 0.4032, "step": 10270 }, { "epoch": 2.4735194992778045, "grad_norm": 1.203125, "learning_rate": 5.550405491412062e-05, "loss": 0.4149, "step": 10275 }, { "epoch": 2.474723158401541, "grad_norm": 1.109375, "learning_rate": 5.5458920027284944e-05, "loss": 0.3948, "step": 10280 }, { "epoch": 2.4759268175252767, "grad_norm": 1.1953125, "learning_rate": 5.541387770910194e-05, "loss": 0.3952, "step": 10285 }, { "epoch": 2.477130476649013, "grad_norm": 1.21875, "learning_rate": 5.536892803153768e-05, "loss": 0.3861, "step": 10290 }, { "epoch": 2.4783341357727493, "grad_norm": 1.2109375, "learning_rate": 5.5324071066410244e-05, "loss": 0.3851, "step": 10295 }, { "epoch": 2.479537794896485, "grad_norm": 1.046875, "learning_rate": 5.527930688538955e-05, "loss": 0.39, "step": 10300 }, { "epoch": 2.4807414540202215, "grad_norm": 1.21875, "learning_rate": 5.523463555999728e-05, "loss": 0.4146, "step": 10305 }, { "epoch": 2.481945113143958, "grad_norm": 1.15625, "learning_rate": 5.519005716160677e-05, "loss": 0.374, "step": 10310 }, { "epoch": 2.4831487722676937, "grad_norm": 1.2109375, "learning_rate": 5.514557176144285e-05, "loss": 0.4023, "step": 10315 }, { "epoch": 2.48435243139143, "grad_norm": 1.1640625, "learning_rate": 5.510117943058183e-05, "loss": 0.3898, "step": 10320 }, { "epoch": 2.485556090515166, "grad_norm": 1.15625, "learning_rate": 5.505688023995122e-05, "loss": 0.4129, "step": 10325 }, { "epoch": 2.4867597496389022, "grad_norm": 1.109375, "learning_rate": 5.50126742603298e-05, "loss": 0.3818, "step": 10330 }, { "epoch": 2.4879634087626386, "grad_norm": 1.140625, "learning_rate": 5.496856156234742e-05, "loss": 0.3979, "step": 10335 }, { "epoch": 2.4891670678863744, "grad_norm": 1.2578125, "learning_rate": 5.492454221648481e-05, "loss": 0.3965, "step": 10340 }, { "epoch": 2.4903707270101108, "grad_norm": 1.1328125, "learning_rate": 5.488061629307364e-05, "loss": 0.4336, "step": 10345 }, { "epoch": 2.491574386133847, "grad_norm": 1.1171875, "learning_rate": 5.483678386229625e-05, "loss": 0.3737, "step": 10350 }, { "epoch": 2.492778045257583, "grad_norm": 1.2265625, "learning_rate": 5.4793044994185646e-05, "loss": 0.391, "step": 10355 }, { "epoch": 2.4939817043813193, "grad_norm": 1.0859375, "learning_rate": 5.474939975862532e-05, "loss": 0.4015, "step": 10360 }, { "epoch": 2.4951853635050556, "grad_norm": 1.1953125, "learning_rate": 5.470584822534917e-05, "loss": 0.4185, "step": 10365 }, { "epoch": 2.4963890226287915, "grad_norm": 1.1796875, "learning_rate": 5.466239046394138e-05, "loss": 0.4132, "step": 10370 }, { "epoch": 2.497592681752528, "grad_norm": 1.15625, "learning_rate": 5.461902654383629e-05, "loss": 0.4081, "step": 10375 }, { "epoch": 2.4987963408762637, "grad_norm": 1.3359375, "learning_rate": 5.4575756534318385e-05, "loss": 0.3876, "step": 10380 }, { "epoch": 2.5, "grad_norm": 1.1875, "learning_rate": 5.453258050452198e-05, "loss": 0.4256, "step": 10385 }, { "epoch": 2.5012036591237363, "grad_norm": 1.125, "learning_rate": 5.4489498523431344e-05, "loss": 0.3719, "step": 10390 }, { "epoch": 2.502407318247472, "grad_norm": 1.203125, "learning_rate": 5.444651065988045e-05, "loss": 0.4402, "step": 10395 }, { "epoch": 2.5036109773712085, "grad_norm": 1.078125, "learning_rate": 5.4403616982552844e-05, "loss": 0.3807, "step": 10400 }, { "epoch": 2.5048146364949444, "grad_norm": 1.140625, "learning_rate": 5.436081755998166e-05, "loss": 0.3837, "step": 10405 }, { "epoch": 2.5060182956186807, "grad_norm": 1.078125, "learning_rate": 5.431811246054939e-05, "loss": 0.3918, "step": 10410 }, { "epoch": 2.507221954742417, "grad_norm": 1.1484375, "learning_rate": 5.427550175248782e-05, "loss": 0.3935, "step": 10415 }, { "epoch": 2.5084256138661534, "grad_norm": 1.1171875, "learning_rate": 5.423298550387798e-05, "loss": 0.3998, "step": 10420 }, { "epoch": 2.5096292729898892, "grad_norm": 1.21875, "learning_rate": 5.419056378264988e-05, "loss": 0.4105, "step": 10425 }, { "epoch": 2.5108329321136256, "grad_norm": 1.203125, "learning_rate": 5.4148236656582606e-05, "loss": 0.4108, "step": 10430 }, { "epoch": 2.5120365912373614, "grad_norm": 1.1328125, "learning_rate": 5.4106004193304016e-05, "loss": 0.3828, "step": 10435 }, { "epoch": 2.5132402503610978, "grad_norm": 1.203125, "learning_rate": 5.406386646029078e-05, "loss": 0.3813, "step": 10440 }, { "epoch": 2.514443909484834, "grad_norm": 1.21875, "learning_rate": 5.4021823524868216e-05, "loss": 0.4137, "step": 10445 }, { "epoch": 2.51564756860857, "grad_norm": 1.125, "learning_rate": 5.397987545421011e-05, "loss": 0.3782, "step": 10450 }, { "epoch": 2.5168512277323063, "grad_norm": 1.2265625, "learning_rate": 5.393802231533876e-05, "loss": 0.3988, "step": 10455 }, { "epoch": 2.518054886856042, "grad_norm": 1.203125, "learning_rate": 5.389626417512473e-05, "loss": 0.3872, "step": 10460 }, { "epoch": 2.5192585459797785, "grad_norm": 1.25, "learning_rate": 5.3854601100286845e-05, "loss": 0.3967, "step": 10465 }, { "epoch": 2.520462205103515, "grad_norm": 1.0625, "learning_rate": 5.3813033157391994e-05, "loss": 0.3891, "step": 10470 }, { "epoch": 2.5216658642272507, "grad_norm": 1.21875, "learning_rate": 5.3771560412855116e-05, "loss": 0.4044, "step": 10475 }, { "epoch": 2.522869523350987, "grad_norm": 1.1875, "learning_rate": 5.373018293293902e-05, "loss": 0.3816, "step": 10480 }, { "epoch": 2.524073182474723, "grad_norm": 1.1796875, "learning_rate": 5.368890078375429e-05, "loss": 0.3851, "step": 10485 }, { "epoch": 2.525276841598459, "grad_norm": 1.2265625, "learning_rate": 5.364771403125924e-05, "loss": 0.3807, "step": 10490 }, { "epoch": 2.5264805007221955, "grad_norm": 1.0859375, "learning_rate": 5.3606622741259745e-05, "loss": 0.3809, "step": 10495 }, { "epoch": 2.527684159845932, "grad_norm": 1.046875, "learning_rate": 5.356562697940913e-05, "loss": 0.384, "step": 10500 }, { "epoch": 2.527684159845932, "eval_loss": 0.363790363073349, "eval_runtime": 2.4019, "eval_samples_per_second": 83.267, "eval_steps_per_second": 83.267, "step": 10500 }, { "epoch": 2.5288878189696677, "grad_norm": 1.203125, "learning_rate": 5.3524726811208153e-05, "loss": 0.3944, "step": 10505 }, { "epoch": 2.530091478093404, "grad_norm": 1.171875, "learning_rate": 5.348392230200476e-05, "loss": 0.4003, "step": 10510 }, { "epoch": 2.53129513721714, "grad_norm": 1.15625, "learning_rate": 5.344321351699411e-05, "loss": 0.3861, "step": 10515 }, { "epoch": 2.5324987963408763, "grad_norm": 1.2421875, "learning_rate": 5.340260052121841e-05, "loss": 0.3834, "step": 10520 }, { "epoch": 2.5337024554646126, "grad_norm": 1.1015625, "learning_rate": 5.3362083379566796e-05, "loss": 0.3901, "step": 10525 }, { "epoch": 2.5349061145883485, "grad_norm": 1.1953125, "learning_rate": 5.3321662156775306e-05, "loss": 0.367, "step": 10530 }, { "epoch": 2.536109773712085, "grad_norm": 1.0859375, "learning_rate": 5.3281336917426657e-05, "loss": 0.3956, "step": 10535 }, { "epoch": 2.5373134328358207, "grad_norm": 1.15625, "learning_rate": 5.324110772595028e-05, "loss": 0.3953, "step": 10540 }, { "epoch": 2.538517091959557, "grad_norm": 1.1484375, "learning_rate": 5.320097464662208e-05, "loss": 0.3922, "step": 10545 }, { "epoch": 2.5397207510832933, "grad_norm": 1.2109375, "learning_rate": 5.316093774356444e-05, "loss": 0.3925, "step": 10550 }, { "epoch": 2.5409244102070296, "grad_norm": 1.2265625, "learning_rate": 5.3120997080746086e-05, "loss": 0.4013, "step": 10555 }, { "epoch": 2.5421280693307655, "grad_norm": 1.265625, "learning_rate": 5.3081152721981914e-05, "loss": 0.3857, "step": 10560 }, { "epoch": 2.543331728454502, "grad_norm": 1.1328125, "learning_rate": 5.304140473093304e-05, "loss": 0.3848, "step": 10565 }, { "epoch": 2.5445353875782377, "grad_norm": 1.296875, "learning_rate": 5.3001753171106526e-05, "loss": 0.3886, "step": 10570 }, { "epoch": 2.545739046701974, "grad_norm": 1.109375, "learning_rate": 5.2962198105855456e-05, "loss": 0.3627, "step": 10575 }, { "epoch": 2.5469427058257104, "grad_norm": 1.0703125, "learning_rate": 5.2922739598378634e-05, "loss": 0.4065, "step": 10580 }, { "epoch": 2.5481463649494462, "grad_norm": 1.140625, "learning_rate": 5.288337771172064e-05, "loss": 0.4338, "step": 10585 }, { "epoch": 2.5493500240731826, "grad_norm": 1.2265625, "learning_rate": 5.2844112508771706e-05, "loss": 0.4, "step": 10590 }, { "epoch": 2.5505536831969184, "grad_norm": 1.078125, "learning_rate": 5.280494405226753e-05, "loss": 0.4274, "step": 10595 }, { "epoch": 2.5517573423206548, "grad_norm": 1.125, "learning_rate": 5.27658724047893e-05, "loss": 0.3895, "step": 10600 }, { "epoch": 2.552961001444391, "grad_norm": 1.4140625, "learning_rate": 5.2726897628763484e-05, "loss": 0.4171, "step": 10605 }, { "epoch": 2.554164660568127, "grad_norm": 1.2109375, "learning_rate": 5.268801978646174e-05, "loss": 0.379, "step": 10610 }, { "epoch": 2.5553683196918633, "grad_norm": 1.21875, "learning_rate": 5.264923894000096e-05, "loss": 0.3883, "step": 10615 }, { "epoch": 2.556571978815599, "grad_norm": 1.125, "learning_rate": 5.261055515134293e-05, "loss": 0.377, "step": 10620 }, { "epoch": 2.5577756379393355, "grad_norm": 1.140625, "learning_rate": 5.257196848229449e-05, "loss": 0.3833, "step": 10625 }, { "epoch": 2.558979297063072, "grad_norm": 1.125, "learning_rate": 5.25334789945072e-05, "loss": 0.4117, "step": 10630 }, { "epoch": 2.560182956186808, "grad_norm": 1.125, "learning_rate": 5.2495086749477405e-05, "loss": 0.403, "step": 10635 }, { "epoch": 2.561386615310544, "grad_norm": 1.125, "learning_rate": 5.245679180854609e-05, "loss": 0.3941, "step": 10640 }, { "epoch": 2.5625902744342803, "grad_norm": 1.0078125, "learning_rate": 5.2418594232898726e-05, "loss": 0.4176, "step": 10645 }, { "epoch": 2.563793933558016, "grad_norm": 1.09375, "learning_rate": 5.238049408356525e-05, "loss": 0.385, "step": 10650 }, { "epoch": 2.5649975926817525, "grad_norm": 1.109375, "learning_rate": 5.2342491421419956e-05, "loss": 0.3962, "step": 10655 }, { "epoch": 2.566201251805489, "grad_norm": 1.0625, "learning_rate": 5.230458630718134e-05, "loss": 0.3933, "step": 10660 }, { "epoch": 2.5674049109292247, "grad_norm": 1.2421875, "learning_rate": 5.226677880141207e-05, "loss": 0.4155, "step": 10665 }, { "epoch": 2.568608570052961, "grad_norm": 1.1171875, "learning_rate": 5.2229068964518826e-05, "loss": 0.4007, "step": 10670 }, { "epoch": 2.569812229176697, "grad_norm": 1.1328125, "learning_rate": 5.219145685675229e-05, "loss": 0.3848, "step": 10675 }, { "epoch": 2.5710158883004333, "grad_norm": 1.140625, "learning_rate": 5.2153942538206934e-05, "loss": 0.3625, "step": 10680 }, { "epoch": 2.5722195474241696, "grad_norm": 1.1640625, "learning_rate": 5.211652606882104e-05, "loss": 0.3854, "step": 10685 }, { "epoch": 2.573423206547906, "grad_norm": 1.2109375, "learning_rate": 5.2079207508376544e-05, "loss": 0.4038, "step": 10690 }, { "epoch": 2.574626865671642, "grad_norm": 1.1171875, "learning_rate": 5.2041986916498915e-05, "loss": 0.4136, "step": 10695 }, { "epoch": 2.575830524795378, "grad_norm": 1.15625, "learning_rate": 5.200486435265714e-05, "loss": 0.3833, "step": 10700 }, { "epoch": 2.577034183919114, "grad_norm": 1.1875, "learning_rate": 5.1967839876163556e-05, "loss": 0.3981, "step": 10705 }, { "epoch": 2.5782378430428503, "grad_norm": 1.2890625, "learning_rate": 5.1930913546173766e-05, "loss": 0.4059, "step": 10710 }, { "epoch": 2.5794415021665866, "grad_norm": 1.1875, "learning_rate": 5.1894085421686605e-05, "loss": 0.399, "step": 10715 }, { "epoch": 2.5806451612903225, "grad_norm": 1.1484375, "learning_rate": 5.185735556154395e-05, "loss": 0.4223, "step": 10720 }, { "epoch": 2.581848820414059, "grad_norm": 1.171875, "learning_rate": 5.1820724024430737e-05, "loss": 0.3819, "step": 10725 }, { "epoch": 2.5830524795377947, "grad_norm": 1.0625, "learning_rate": 5.178419086887472e-05, "loss": 0.4061, "step": 10730 }, { "epoch": 2.584256138661531, "grad_norm": 1.1875, "learning_rate": 5.174775615324656e-05, "loss": 0.4162, "step": 10735 }, { "epoch": 2.5854597977852674, "grad_norm": 1.15625, "learning_rate": 5.1711419935759594e-05, "loss": 0.3761, "step": 10740 }, { "epoch": 2.5866634569090032, "grad_norm": 1.1953125, "learning_rate": 5.1675182274469744e-05, "loss": 0.4141, "step": 10745 }, { "epoch": 2.5878671160327396, "grad_norm": 1.1171875, "learning_rate": 5.163904322727552e-05, "loss": 0.3762, "step": 10750 }, { "epoch": 2.5890707751564754, "grad_norm": 1.0546875, "learning_rate": 5.160300285191788e-05, "loss": 0.3992, "step": 10755 }, { "epoch": 2.5902744342802118, "grad_norm": 1.1953125, "learning_rate": 5.156706120598007e-05, "loss": 0.3924, "step": 10760 }, { "epoch": 2.591478093403948, "grad_norm": 1.21875, "learning_rate": 5.153121834688766e-05, "loss": 0.3643, "step": 10765 }, { "epoch": 2.5926817525276844, "grad_norm": 1.2734375, "learning_rate": 5.1495474331908306e-05, "loss": 0.3923, "step": 10770 }, { "epoch": 2.5938854116514203, "grad_norm": 1.1484375, "learning_rate": 5.145982921815182e-05, "loss": 0.3632, "step": 10775 }, { "epoch": 2.5950890707751566, "grad_norm": 1.25, "learning_rate": 5.142428306256995e-05, "loss": 0.4253, "step": 10780 }, { "epoch": 2.5962927298988925, "grad_norm": 1.1171875, "learning_rate": 5.138883592195634e-05, "loss": 0.3692, "step": 10785 }, { "epoch": 2.597496389022629, "grad_norm": 1.1796875, "learning_rate": 5.135348785294642e-05, "loss": 0.4008, "step": 10790 }, { "epoch": 2.598700048146365, "grad_norm": 1.0546875, "learning_rate": 5.1318238912017345e-05, "loss": 0.395, "step": 10795 }, { "epoch": 2.599903707270101, "grad_norm": 1.046875, "learning_rate": 5.12830891554879e-05, "loss": 0.3927, "step": 10800 }, { "epoch": 2.6011073663938373, "grad_norm": 1.1015625, "learning_rate": 5.124803863951837e-05, "loss": 0.374, "step": 10805 }, { "epoch": 2.602311025517573, "grad_norm": 1.2265625, "learning_rate": 5.12130874201105e-05, "loss": 0.4077, "step": 10810 }, { "epoch": 2.6035146846413095, "grad_norm": 1.15625, "learning_rate": 5.117823555310738e-05, "loss": 0.3914, "step": 10815 }, { "epoch": 2.604718343765046, "grad_norm": 1.1171875, "learning_rate": 5.1143483094193366e-05, "loss": 0.3755, "step": 10820 }, { "epoch": 2.605922002888782, "grad_norm": 1.0390625, "learning_rate": 5.110883009889396e-05, "loss": 0.4147, "step": 10825 }, { "epoch": 2.607125662012518, "grad_norm": 1.1484375, "learning_rate": 5.107427662257577e-05, "loss": 0.4125, "step": 10830 }, { "epoch": 2.6083293211362544, "grad_norm": 1.234375, "learning_rate": 5.103982272044643e-05, "loss": 0.4214, "step": 10835 }, { "epoch": 2.6095329802599903, "grad_norm": 1.1171875, "learning_rate": 5.10054684475544e-05, "loss": 0.3757, "step": 10840 }, { "epoch": 2.6107366393837266, "grad_norm": 1.1796875, "learning_rate": 5.0971213858789e-05, "loss": 0.4079, "step": 10845 }, { "epoch": 2.611940298507463, "grad_norm": 1.2109375, "learning_rate": 5.093705900888031e-05, "loss": 0.4148, "step": 10850 }, { "epoch": 2.6131439576311988, "grad_norm": 1.171875, "learning_rate": 5.090300395239901e-05, "loss": 0.3974, "step": 10855 }, { "epoch": 2.614347616754935, "grad_norm": 1.125, "learning_rate": 5.0869048743756336e-05, "loss": 0.4012, "step": 10860 }, { "epoch": 2.615551275878671, "grad_norm": 1.109375, "learning_rate": 5.083519343720404e-05, "loss": 0.3988, "step": 10865 }, { "epoch": 2.6167549350024073, "grad_norm": 1.171875, "learning_rate": 5.080143808683419e-05, "loss": 0.3792, "step": 10870 }, { "epoch": 2.6179585941261436, "grad_norm": 1.1640625, "learning_rate": 5.0767782746579206e-05, "loss": 0.411, "step": 10875 }, { "epoch": 2.6191622532498795, "grad_norm": 1.234375, "learning_rate": 5.073422747021168e-05, "loss": 0.4114, "step": 10880 }, { "epoch": 2.620365912373616, "grad_norm": 1.25, "learning_rate": 5.0700772311344345e-05, "loss": 0.4095, "step": 10885 }, { "epoch": 2.6215695714973517, "grad_norm": 1.125, "learning_rate": 5.066741732342999e-05, "loss": 0.3815, "step": 10890 }, { "epoch": 2.622773230621088, "grad_norm": 1.2265625, "learning_rate": 5.063416255976131e-05, "loss": 0.3941, "step": 10895 }, { "epoch": 2.6239768897448243, "grad_norm": 1.078125, "learning_rate": 5.0601008073470886e-05, "loss": 0.3619, "step": 10900 }, { "epoch": 2.6251805488685607, "grad_norm": 1.1171875, "learning_rate": 5.0567953917531106e-05, "loss": 0.3967, "step": 10905 }, { "epoch": 2.6263842079922965, "grad_norm": 1.09375, "learning_rate": 5.053500014475402e-05, "loss": 0.4058, "step": 10910 }, { "epoch": 2.627587867116033, "grad_norm": 1.15625, "learning_rate": 5.0502146807791325e-05, "loss": 0.3839, "step": 10915 }, { "epoch": 2.6287915262397687, "grad_norm": 1.28125, "learning_rate": 5.046939395913423e-05, "loss": 0.3853, "step": 10920 }, { "epoch": 2.629995185363505, "grad_norm": 1.203125, "learning_rate": 5.0436741651113396e-05, "loss": 0.3941, "step": 10925 }, { "epoch": 2.6311988444872414, "grad_norm": 1.1171875, "learning_rate": 5.0404189935898824e-05, "loss": 0.3901, "step": 10930 }, { "epoch": 2.6324025036109773, "grad_norm": 1.1328125, "learning_rate": 5.037173886549981e-05, "loss": 0.3999, "step": 10935 }, { "epoch": 2.6336061627347136, "grad_norm": 1.09375, "learning_rate": 5.033938849176487e-05, "loss": 0.4189, "step": 10940 }, { "epoch": 2.6348098218584495, "grad_norm": 1.15625, "learning_rate": 5.0307138866381594e-05, "loss": 0.398, "step": 10945 }, { "epoch": 2.636013480982186, "grad_norm": 1.1796875, "learning_rate": 5.027499004087661e-05, "loss": 0.4178, "step": 10950 }, { "epoch": 2.637217140105922, "grad_norm": 1.1328125, "learning_rate": 5.02429420666155e-05, "loss": 0.3906, "step": 10955 }, { "epoch": 2.638420799229658, "grad_norm": 1.1015625, "learning_rate": 5.021099499480272e-05, "loss": 0.3867, "step": 10960 }, { "epoch": 2.6396244583533943, "grad_norm": 1.2265625, "learning_rate": 5.01791488764815e-05, "loss": 0.411, "step": 10965 }, { "epoch": 2.64082811747713, "grad_norm": 1.2265625, "learning_rate": 5.0147403762533755e-05, "loss": 0.4381, "step": 10970 }, { "epoch": 2.6420317766008665, "grad_norm": 1.09375, "learning_rate": 5.011575970368007e-05, "loss": 0.3743, "step": 10975 }, { "epoch": 2.643235435724603, "grad_norm": 1.078125, "learning_rate": 5.0084216750479514e-05, "loss": 0.3951, "step": 10980 }, { "epoch": 2.644439094848339, "grad_norm": 1.078125, "learning_rate": 5.005277495332965e-05, "loss": 0.3753, "step": 10985 }, { "epoch": 2.645642753972075, "grad_norm": 1.1640625, "learning_rate": 5.002143436246642e-05, "loss": 0.4305, "step": 10990 }, { "epoch": 2.6468464130958114, "grad_norm": 1.1796875, "learning_rate": 4.9990195027964035e-05, "loss": 0.4126, "step": 10995 }, { "epoch": 2.6480500722195472, "grad_norm": 1.1640625, "learning_rate": 4.9959056999734956e-05, "loss": 0.405, "step": 11000 }, { "epoch": 2.6480500722195472, "eval_loss": 0.35999855399131775, "eval_runtime": 2.3992, "eval_samples_per_second": 83.362, "eval_steps_per_second": 83.362, "step": 11000 }, { "epoch": 2.6492537313432836, "grad_norm": 1.2109375, "learning_rate": 4.992802032752976e-05, "loss": 0.3871, "step": 11005 }, { "epoch": 2.65045739046702, "grad_norm": 1.125, "learning_rate": 4.989708506093709e-05, "loss": 0.3773, "step": 11010 }, { "epoch": 2.6516610495907558, "grad_norm": 1.1875, "learning_rate": 4.9866251249383596e-05, "loss": 0.3991, "step": 11015 }, { "epoch": 2.652864708714492, "grad_norm": 1.0625, "learning_rate": 4.983551894213378e-05, "loss": 0.4101, "step": 11020 }, { "epoch": 2.654068367838228, "grad_norm": 1.1875, "learning_rate": 4.980488818829e-05, "loss": 0.4147, "step": 11025 }, { "epoch": 2.6552720269619643, "grad_norm": 1.1640625, "learning_rate": 4.977435903679234e-05, "loss": 0.3749, "step": 11030 }, { "epoch": 2.6564756860857006, "grad_norm": 1.1484375, "learning_rate": 4.9743931536418574e-05, "loss": 0.3976, "step": 11035 }, { "epoch": 2.657679345209437, "grad_norm": 1.2421875, "learning_rate": 4.971360573578403e-05, "loss": 0.3958, "step": 11040 }, { "epoch": 2.658883004333173, "grad_norm": 1.0546875, "learning_rate": 4.968338168334157e-05, "loss": 0.3748, "step": 11045 }, { "epoch": 2.660086663456909, "grad_norm": 1.1015625, "learning_rate": 4.96532594273815e-05, "loss": 0.3949, "step": 11050 }, { "epoch": 2.661290322580645, "grad_norm": 1.109375, "learning_rate": 4.962323901603145e-05, "loss": 0.3771, "step": 11055 }, { "epoch": 2.6624939817043813, "grad_norm": 1.21875, "learning_rate": 4.959332049725632e-05, "loss": 0.4241, "step": 11060 }, { "epoch": 2.6636976408281177, "grad_norm": 1.0390625, "learning_rate": 4.9563503918858254e-05, "loss": 0.3762, "step": 11065 }, { "epoch": 2.6649012999518535, "grad_norm": 1.265625, "learning_rate": 4.9533789328476504e-05, "loss": 0.4294, "step": 11070 }, { "epoch": 2.66610495907559, "grad_norm": 1.109375, "learning_rate": 4.9504176773587365e-05, "loss": 0.39, "step": 11075 }, { "epoch": 2.6673086181993257, "grad_norm": 1.109375, "learning_rate": 4.947466630150409e-05, "loss": 0.4092, "step": 11080 }, { "epoch": 2.668512277323062, "grad_norm": 1.1484375, "learning_rate": 4.944525795937689e-05, "loss": 0.4245, "step": 11085 }, { "epoch": 2.6697159364467984, "grad_norm": 1.2421875, "learning_rate": 4.94159517941927e-05, "loss": 0.446, "step": 11090 }, { "epoch": 2.6709195955705343, "grad_norm": 1.109375, "learning_rate": 4.93867478527753e-05, "loss": 0.382, "step": 11095 }, { "epoch": 2.6721232546942706, "grad_norm": 1.125, "learning_rate": 4.935764618178511e-05, "loss": 0.3752, "step": 11100 }, { "epoch": 2.6733269138180065, "grad_norm": 1.078125, "learning_rate": 4.932864682771912e-05, "loss": 0.3823, "step": 11105 }, { "epoch": 2.674530572941743, "grad_norm": 1.234375, "learning_rate": 4.9299749836910855e-05, "loss": 0.3903, "step": 11110 }, { "epoch": 2.675734232065479, "grad_norm": 1.2109375, "learning_rate": 4.9270955255530304e-05, "loss": 0.4138, "step": 11115 }, { "epoch": 2.6769378911892154, "grad_norm": 1.0625, "learning_rate": 4.924226312958383e-05, "loss": 0.4069, "step": 11120 }, { "epoch": 2.6781415503129513, "grad_norm": 1.0546875, "learning_rate": 4.9213673504914116e-05, "loss": 0.3896, "step": 11125 }, { "epoch": 2.6793452094366876, "grad_norm": 1.078125, "learning_rate": 4.918518642720002e-05, "loss": 0.3947, "step": 11130 }, { "epoch": 2.6805488685604235, "grad_norm": 1.1171875, "learning_rate": 4.9156801941956615e-05, "loss": 0.3621, "step": 11135 }, { "epoch": 2.68175252768416, "grad_norm": 1.046875, "learning_rate": 4.9128520094535e-05, "loss": 0.3774, "step": 11140 }, { "epoch": 2.682956186807896, "grad_norm": 1.140625, "learning_rate": 4.910034093012235e-05, "loss": 0.4004, "step": 11145 }, { "epoch": 2.684159845931632, "grad_norm": 2.46875, "learning_rate": 4.907226449374173e-05, "loss": 0.3884, "step": 11150 }, { "epoch": 2.6853635050553684, "grad_norm": 1.0546875, "learning_rate": 4.9044290830252106e-05, "loss": 0.3853, "step": 11155 }, { "epoch": 2.6865671641791042, "grad_norm": 1.0703125, "learning_rate": 4.901641998434819e-05, "loss": 0.3866, "step": 11160 }, { "epoch": 2.6877708233028406, "grad_norm": 1.140625, "learning_rate": 4.8988652000560474e-05, "loss": 0.3965, "step": 11165 }, { "epoch": 2.688974482426577, "grad_norm": 1.0703125, "learning_rate": 4.8960986923255066e-05, "loss": 0.4114, "step": 11170 }, { "epoch": 2.690178141550313, "grad_norm": 1.0859375, "learning_rate": 4.893342479663367e-05, "loss": 0.3998, "step": 11175 }, { "epoch": 2.691381800674049, "grad_norm": 1.15625, "learning_rate": 4.890596566473349e-05, "loss": 0.3753, "step": 11180 }, { "epoch": 2.6925854597977854, "grad_norm": 1.0859375, "learning_rate": 4.8878609571427204e-05, "loss": 0.3839, "step": 11185 }, { "epoch": 2.6937891189215213, "grad_norm": 1.03125, "learning_rate": 4.885135656042281e-05, "loss": 0.3886, "step": 11190 }, { "epoch": 2.6949927780452576, "grad_norm": 1.15625, "learning_rate": 4.8824206675263646e-05, "loss": 0.4186, "step": 11195 }, { "epoch": 2.696196437168994, "grad_norm": 1.1484375, "learning_rate": 4.879715995932828e-05, "loss": 0.3876, "step": 11200 }, { "epoch": 2.69740009629273, "grad_norm": 1.1171875, "learning_rate": 4.877021645583041e-05, "loss": 0.3745, "step": 11205 }, { "epoch": 2.698603755416466, "grad_norm": 1.1796875, "learning_rate": 4.8743376207818855e-05, "loss": 0.399, "step": 11210 }, { "epoch": 2.699807414540202, "grad_norm": 1.1640625, "learning_rate": 4.871663925817746e-05, "loss": 0.3409, "step": 11215 }, { "epoch": 2.7010110736639383, "grad_norm": 1.1328125, "learning_rate": 4.869000564962499e-05, "loss": 0.3794, "step": 11220 }, { "epoch": 2.7022147327876747, "grad_norm": 1.1796875, "learning_rate": 4.866347542471515e-05, "loss": 0.3588, "step": 11225 }, { "epoch": 2.7034183919114105, "grad_norm": 1.171875, "learning_rate": 4.8637048625836426e-05, "loss": 0.3864, "step": 11230 }, { "epoch": 2.704622051035147, "grad_norm": 1.1640625, "learning_rate": 4.8610725295212084e-05, "loss": 0.386, "step": 11235 }, { "epoch": 2.7058257101588827, "grad_norm": 1.203125, "learning_rate": 4.858450547490003e-05, "loss": 0.409, "step": 11240 }, { "epoch": 2.707029369282619, "grad_norm": 1.1171875, "learning_rate": 4.855838920679282e-05, "loss": 0.3893, "step": 11245 }, { "epoch": 2.7082330284063554, "grad_norm": 1.1796875, "learning_rate": 4.85323765326176e-05, "loss": 0.4206, "step": 11250 }, { "epoch": 2.7094366875300917, "grad_norm": 1.0546875, "learning_rate": 4.8506467493935896e-05, "loss": 0.382, "step": 11255 }, { "epoch": 2.7106403466538276, "grad_norm": 1.0546875, "learning_rate": 4.8480662132143754e-05, "loss": 0.3788, "step": 11260 }, { "epoch": 2.711844005777564, "grad_norm": 1.109375, "learning_rate": 4.84549604884715e-05, "loss": 0.3799, "step": 11265 }, { "epoch": 2.7130476649013, "grad_norm": 1.1640625, "learning_rate": 4.8429362603983786e-05, "loss": 0.3809, "step": 11270 }, { "epoch": 2.714251324025036, "grad_norm": 1.15625, "learning_rate": 4.840386851957946e-05, "loss": 0.4125, "step": 11275 }, { "epoch": 2.7154549831487724, "grad_norm": 1.171875, "learning_rate": 4.837847827599152e-05, "loss": 0.3811, "step": 11280 }, { "epoch": 2.7166586422725083, "grad_norm": 1.1015625, "learning_rate": 4.8353191913787104e-05, "loss": 0.3844, "step": 11285 }, { "epoch": 2.7178623013962446, "grad_norm": 1.0703125, "learning_rate": 4.832800947336731e-05, "loss": 0.3761, "step": 11290 }, { "epoch": 2.7190659605199805, "grad_norm": 1.140625, "learning_rate": 4.830293099496724e-05, "loss": 0.4067, "step": 11295 }, { "epoch": 2.720269619643717, "grad_norm": 1.2265625, "learning_rate": 4.827795651865584e-05, "loss": 0.3837, "step": 11300 }, { "epoch": 2.721473278767453, "grad_norm": 1.1640625, "learning_rate": 4.825308608433596e-05, "loss": 0.4123, "step": 11305 }, { "epoch": 2.7226769378911895, "grad_norm": 1.203125, "learning_rate": 4.822831973174415e-05, "loss": 0.3933, "step": 11310 }, { "epoch": 2.7238805970149254, "grad_norm": 1.1328125, "learning_rate": 4.820365750045072e-05, "loss": 0.3705, "step": 11315 }, { "epoch": 2.7250842561386617, "grad_norm": 1.21875, "learning_rate": 4.817909942985956e-05, "loss": 0.396, "step": 11320 }, { "epoch": 2.7262879152623976, "grad_norm": 1.1796875, "learning_rate": 4.81546455592082e-05, "loss": 0.3852, "step": 11325 }, { "epoch": 2.727491574386134, "grad_norm": 1.109375, "learning_rate": 4.813029592756763e-05, "loss": 0.3748, "step": 11330 }, { "epoch": 2.72869523350987, "grad_norm": 1.2109375, "learning_rate": 4.810605057384235e-05, "loss": 0.4004, "step": 11335 }, { "epoch": 2.729898892633606, "grad_norm": 1.1171875, "learning_rate": 4.8081909536770195e-05, "loss": 0.3843, "step": 11340 }, { "epoch": 2.7311025517573424, "grad_norm": 1.1015625, "learning_rate": 4.805787285492237e-05, "loss": 0.3786, "step": 11345 }, { "epoch": 2.7323062108810783, "grad_norm": 1.0703125, "learning_rate": 4.803394056670332e-05, "loss": 0.368, "step": 11350 }, { "epoch": 2.7335098700048146, "grad_norm": 1.1484375, "learning_rate": 4.80101127103507e-05, "loss": 0.3719, "step": 11355 }, { "epoch": 2.734713529128551, "grad_norm": 1.1328125, "learning_rate": 4.7986389323935345e-05, "loss": 0.3925, "step": 11360 }, { "epoch": 2.735917188252287, "grad_norm": 1.1015625, "learning_rate": 4.796277044536111e-05, "loss": 0.4133, "step": 11365 }, { "epoch": 2.737120847376023, "grad_norm": 1.1015625, "learning_rate": 4.793925611236494e-05, "loss": 0.3963, "step": 11370 }, { "epoch": 2.738324506499759, "grad_norm": 1.21875, "learning_rate": 4.791584636251668e-05, "loss": 0.4214, "step": 11375 }, { "epoch": 2.7395281656234953, "grad_norm": 1.0390625, "learning_rate": 4.789254123321914e-05, "loss": 0.3743, "step": 11380 }, { "epoch": 2.7407318247472316, "grad_norm": 1.0859375, "learning_rate": 4.7869340761707926e-05, "loss": 0.3723, "step": 11385 }, { "epoch": 2.741935483870968, "grad_norm": 1.171875, "learning_rate": 4.7846244985051426e-05, "loss": 0.4027, "step": 11390 }, { "epoch": 2.743139142994704, "grad_norm": 1.1796875, "learning_rate": 4.7823253940150814e-05, "loss": 0.3928, "step": 11395 }, { "epoch": 2.74434280211844, "grad_norm": 1.0625, "learning_rate": 4.7800367663739836e-05, "loss": 0.3909, "step": 11400 }, { "epoch": 2.745546461242176, "grad_norm": 1.140625, "learning_rate": 4.777758619238492e-05, "loss": 0.4082, "step": 11405 }, { "epoch": 2.7467501203659124, "grad_norm": 1.28125, "learning_rate": 4.775490956248504e-05, "loss": 0.4194, "step": 11410 }, { "epoch": 2.7479537794896487, "grad_norm": 1.1328125, "learning_rate": 4.77323378102716e-05, "loss": 0.3857, "step": 11415 }, { "epoch": 2.7491574386133846, "grad_norm": 1.171875, "learning_rate": 4.7709870971808497e-05, "loss": 0.4026, "step": 11420 }, { "epoch": 2.750361097737121, "grad_norm": 1.203125, "learning_rate": 4.768750908299197e-05, "loss": 0.3793, "step": 11425 }, { "epoch": 2.7515647568608568, "grad_norm": 1.15625, "learning_rate": 4.7665252179550576e-05, "loss": 0.4044, "step": 11430 }, { "epoch": 2.752768415984593, "grad_norm": 1.2109375, "learning_rate": 4.7643100297045155e-05, "loss": 0.4044, "step": 11435 }, { "epoch": 2.7539720751083294, "grad_norm": 1.125, "learning_rate": 4.7621053470868734e-05, "loss": 0.3788, "step": 11440 }, { "epoch": 2.7551757342320657, "grad_norm": 1.1953125, "learning_rate": 4.759911173624649e-05, "loss": 0.3904, "step": 11445 }, { "epoch": 2.7563793933558016, "grad_norm": 1.1484375, "learning_rate": 4.757727512823569e-05, "loss": 0.4006, "step": 11450 }, { "epoch": 2.757583052479538, "grad_norm": 1.15625, "learning_rate": 4.7555543681725604e-05, "loss": 0.3898, "step": 11455 }, { "epoch": 2.758786711603274, "grad_norm": 1.09375, "learning_rate": 4.7533917431437576e-05, "loss": 0.3678, "step": 11460 }, { "epoch": 2.75999037072701, "grad_norm": 1.171875, "learning_rate": 4.751239641192476e-05, "loss": 0.3722, "step": 11465 }, { "epoch": 2.7611940298507465, "grad_norm": 1.1640625, "learning_rate": 4.749098065757225e-05, "loss": 0.3969, "step": 11470 }, { "epoch": 2.7623976889744823, "grad_norm": 1.1015625, "learning_rate": 4.746967020259693e-05, "loss": 0.394, "step": 11475 }, { "epoch": 2.7636013480982187, "grad_norm": 1.046875, "learning_rate": 4.744846508104744e-05, "loss": 0.3847, "step": 11480 }, { "epoch": 2.7648050072219545, "grad_norm": 1.125, "learning_rate": 4.7427365326804126e-05, "loss": 0.3887, "step": 11485 }, { "epoch": 2.766008666345691, "grad_norm": 1.203125, "learning_rate": 4.740637097357898e-05, "loss": 0.4225, "step": 11490 }, { "epoch": 2.767212325469427, "grad_norm": 1.1171875, "learning_rate": 4.738548205491562e-05, "loss": 0.3742, "step": 11495 }, { "epoch": 2.768415984593163, "grad_norm": 1.078125, "learning_rate": 4.7364698604189155e-05, "loss": 0.3791, "step": 11500 }, { "epoch": 2.768415984593163, "eval_loss": 0.3576022684574127, "eval_runtime": 2.3931, "eval_samples_per_second": 83.574, "eval_steps_per_second": 83.574, "step": 11500 }, { "epoch": 2.7696196437168994, "grad_norm": 1.1875, "learning_rate": 4.7344020654606226e-05, "loss": 0.3636, "step": 11505 }, { "epoch": 2.7708233028406353, "grad_norm": 1.1953125, "learning_rate": 4.7323448239204896e-05, "loss": 0.3912, "step": 11510 }, { "epoch": 2.7720269619643716, "grad_norm": 1.1953125, "learning_rate": 4.7302981390854604e-05, "loss": 0.3732, "step": 11515 }, { "epoch": 2.773230621088108, "grad_norm": 1.0703125, "learning_rate": 4.728262014225614e-05, "loss": 0.3888, "step": 11520 }, { "epoch": 2.7744342802118442, "grad_norm": 1.0625, "learning_rate": 4.726236452594155e-05, "loss": 0.3842, "step": 11525 }, { "epoch": 2.77563793933558, "grad_norm": 1.046875, "learning_rate": 4.724221457427411e-05, "loss": 0.382, "step": 11530 }, { "epoch": 2.7768415984593164, "grad_norm": 1.1328125, "learning_rate": 4.7222170319448306e-05, "loss": 0.3981, "step": 11535 }, { "epoch": 2.7780452575830523, "grad_norm": 1.078125, "learning_rate": 4.720223179348968e-05, "loss": 0.3983, "step": 11540 }, { "epoch": 2.7792489167067886, "grad_norm": 1.1875, "learning_rate": 4.718239902825491e-05, "loss": 0.3847, "step": 11545 }, { "epoch": 2.780452575830525, "grad_norm": 1.1015625, "learning_rate": 4.716267205543166e-05, "loss": 0.4063, "step": 11550 }, { "epoch": 2.781656234954261, "grad_norm": 1.3515625, "learning_rate": 4.714305090653857e-05, "loss": 0.3949, "step": 11555 }, { "epoch": 2.782859894077997, "grad_norm": 1.1484375, "learning_rate": 4.712353561292523e-05, "loss": 0.4026, "step": 11560 }, { "epoch": 2.784063553201733, "grad_norm": 1.015625, "learning_rate": 4.710412620577205e-05, "loss": 0.3894, "step": 11565 }, { "epoch": 2.7852672123254694, "grad_norm": 1.109375, "learning_rate": 4.708482271609032e-05, "loss": 0.382, "step": 11570 }, { "epoch": 2.7864708714492057, "grad_norm": 1.21875, "learning_rate": 4.706562517472203e-05, "loss": 0.3983, "step": 11575 }, { "epoch": 2.787674530572942, "grad_norm": 1.1640625, "learning_rate": 4.704653361233995e-05, "loss": 0.3772, "step": 11580 }, { "epoch": 2.788878189696678, "grad_norm": 1.1953125, "learning_rate": 4.70275480594475e-05, "loss": 0.4026, "step": 11585 }, { "epoch": 2.790081848820414, "grad_norm": 1.1015625, "learning_rate": 4.7008668546378726e-05, "loss": 0.3918, "step": 11590 }, { "epoch": 2.79128550794415, "grad_norm": 1.3046875, "learning_rate": 4.6989895103298245e-05, "loss": 0.404, "step": 11595 }, { "epoch": 2.7924891670678864, "grad_norm": 1.0234375, "learning_rate": 4.697122776020121e-05, "loss": 0.3989, "step": 11600 }, { "epoch": 2.7936928261916227, "grad_norm": 1.265625, "learning_rate": 4.695266654691326e-05, "loss": 0.3904, "step": 11605 }, { "epoch": 2.7948964853153586, "grad_norm": 1.1796875, "learning_rate": 4.693421149309044e-05, "loss": 0.3853, "step": 11610 }, { "epoch": 2.796100144439095, "grad_norm": 1.0703125, "learning_rate": 4.6915862628219196e-05, "loss": 0.3757, "step": 11615 }, { "epoch": 2.797303803562831, "grad_norm": 1.078125, "learning_rate": 4.689761998161634e-05, "loss": 0.3983, "step": 11620 }, { "epoch": 2.798507462686567, "grad_norm": 1.25, "learning_rate": 4.6879483582428885e-05, "loss": 0.4047, "step": 11625 }, { "epoch": 2.7997111218103035, "grad_norm": 1.2109375, "learning_rate": 4.6861453459634206e-05, "loss": 0.3826, "step": 11630 }, { "epoch": 2.8009147809340393, "grad_norm": 1.1171875, "learning_rate": 4.684352964203979e-05, "loss": 0.3798, "step": 11635 }, { "epoch": 2.8021184400577757, "grad_norm": 1.1640625, "learning_rate": 4.682571215828331e-05, "loss": 0.3932, "step": 11640 }, { "epoch": 2.8033220991815115, "grad_norm": 1.1484375, "learning_rate": 4.680800103683254e-05, "loss": 0.3664, "step": 11645 }, { "epoch": 2.804525758305248, "grad_norm": 1.09375, "learning_rate": 4.679039630598529e-05, "loss": 0.3953, "step": 11650 }, { "epoch": 2.805729417428984, "grad_norm": 1.1484375, "learning_rate": 4.6772897993869435e-05, "loss": 0.3942, "step": 11655 }, { "epoch": 2.8069330765527205, "grad_norm": 1.1796875, "learning_rate": 4.6755506128442775e-05, "loss": 0.383, "step": 11660 }, { "epoch": 2.8081367356764564, "grad_norm": 1.125, "learning_rate": 4.673822073749306e-05, "loss": 0.3847, "step": 11665 }, { "epoch": 2.8093403948001927, "grad_norm": 1.0703125, "learning_rate": 4.672104184863791e-05, "loss": 0.411, "step": 11670 }, { "epoch": 2.8105440539239286, "grad_norm": 1.125, "learning_rate": 4.670396948932479e-05, "loss": 0.4019, "step": 11675 }, { "epoch": 2.811747713047665, "grad_norm": 1.1171875, "learning_rate": 4.6687003686830947e-05, "loss": 0.395, "step": 11680 }, { "epoch": 2.8129513721714012, "grad_norm": 1.0390625, "learning_rate": 4.667014446826338e-05, "loss": 0.4029, "step": 11685 }, { "epoch": 2.814155031295137, "grad_norm": 1.1640625, "learning_rate": 4.6653391860558815e-05, "loss": 0.4214, "step": 11690 }, { "epoch": 2.8153586904188734, "grad_norm": 1.1484375, "learning_rate": 4.6636745890483633e-05, "loss": 0.3709, "step": 11695 }, { "epoch": 2.8165623495426093, "grad_norm": 1.1796875, "learning_rate": 4.662020658463378e-05, "loss": 0.4045, "step": 11700 }, { "epoch": 2.8177660086663456, "grad_norm": 1.046875, "learning_rate": 4.6603773969434875e-05, "loss": 0.3849, "step": 11705 }, { "epoch": 2.818969667790082, "grad_norm": 1.109375, "learning_rate": 4.6587448071142e-05, "loss": 0.4063, "step": 11710 }, { "epoch": 2.8201733269138183, "grad_norm": 1.1640625, "learning_rate": 4.657122891583977e-05, "loss": 0.3738, "step": 11715 }, { "epoch": 2.821376986037554, "grad_norm": 1.109375, "learning_rate": 4.655511652944221e-05, "loss": 0.3776, "step": 11720 }, { "epoch": 2.8225806451612905, "grad_norm": 1.1640625, "learning_rate": 4.65391109376928e-05, "loss": 0.3899, "step": 11725 }, { "epoch": 2.8237843042850264, "grad_norm": 1.09375, "learning_rate": 4.65232121661644e-05, "loss": 0.3891, "step": 11730 }, { "epoch": 2.8249879634087627, "grad_norm": 1.109375, "learning_rate": 4.650742024025912e-05, "loss": 0.4112, "step": 11735 }, { "epoch": 2.826191622532499, "grad_norm": 1.1953125, "learning_rate": 4.649173518520845e-05, "loss": 0.3939, "step": 11740 }, { "epoch": 2.827395281656235, "grad_norm": 1.1015625, "learning_rate": 4.647615702607307e-05, "loss": 0.3799, "step": 11745 }, { "epoch": 2.828598940779971, "grad_norm": 1.1328125, "learning_rate": 4.646068578774288e-05, "loss": 0.3861, "step": 11750 }, { "epoch": 2.829802599903707, "grad_norm": 1.15625, "learning_rate": 4.644532149493697e-05, "loss": 0.3719, "step": 11755 }, { "epoch": 2.8310062590274434, "grad_norm": 1.1171875, "learning_rate": 4.6430064172203515e-05, "loss": 0.3909, "step": 11760 }, { "epoch": 2.8322099181511797, "grad_norm": 1.203125, "learning_rate": 4.6414913843919834e-05, "loss": 0.405, "step": 11765 }, { "epoch": 2.8334135772749156, "grad_norm": 1.140625, "learning_rate": 4.639987053429225e-05, "loss": 0.4022, "step": 11770 }, { "epoch": 2.834617236398652, "grad_norm": 1.203125, "learning_rate": 4.638493426735612e-05, "loss": 0.3874, "step": 11775 }, { "epoch": 2.835820895522388, "grad_norm": 1.0859375, "learning_rate": 4.6370105066975775e-05, "loss": 0.3734, "step": 11780 }, { "epoch": 2.837024554646124, "grad_norm": 1.125, "learning_rate": 4.635538295684446e-05, "loss": 0.3928, "step": 11785 }, { "epoch": 2.8382282137698605, "grad_norm": 1.1875, "learning_rate": 4.634076796048435e-05, "loss": 0.394, "step": 11790 }, { "epoch": 2.8394318728935968, "grad_norm": 1.1953125, "learning_rate": 4.632626010124645e-05, "loss": 0.406, "step": 11795 }, { "epoch": 2.8406355320173327, "grad_norm": 1.1875, "learning_rate": 4.631185940231062e-05, "loss": 0.3848, "step": 11800 }, { "epoch": 2.841839191141069, "grad_norm": 1.046875, "learning_rate": 4.6297565886685466e-05, "loss": 0.3838, "step": 11805 }, { "epoch": 2.843042850264805, "grad_norm": 1.140625, "learning_rate": 4.628337957720835e-05, "loss": 0.3926, "step": 11810 }, { "epoch": 2.844246509388541, "grad_norm": 1.1171875, "learning_rate": 4.62693004965454e-05, "loss": 0.3827, "step": 11815 }, { "epoch": 2.8454501685122775, "grad_norm": 1.1640625, "learning_rate": 4.6255328667191346e-05, "loss": 0.4085, "step": 11820 }, { "epoch": 2.8466538276360134, "grad_norm": 1.171875, "learning_rate": 4.6241464111469586e-05, "loss": 0.4502, "step": 11825 }, { "epoch": 2.8478574867597497, "grad_norm": 1.140625, "learning_rate": 4.622770685153214e-05, "loss": 0.3902, "step": 11830 }, { "epoch": 2.8490611458834856, "grad_norm": 1.1953125, "learning_rate": 4.6214056909359585e-05, "loss": 0.3776, "step": 11835 }, { "epoch": 2.850264805007222, "grad_norm": 1.1796875, "learning_rate": 4.6200514306761016e-05, "loss": 0.3854, "step": 11840 }, { "epoch": 2.8514684641309582, "grad_norm": 1.0859375, "learning_rate": 4.618707906537404e-05, "loss": 0.3774, "step": 11845 }, { "epoch": 2.8526721232546945, "grad_norm": 1.296875, "learning_rate": 4.6173751206664764e-05, "loss": 0.3915, "step": 11850 }, { "epoch": 2.8538757823784304, "grad_norm": 1.2578125, "learning_rate": 4.616053075192764e-05, "loss": 0.4132, "step": 11855 }, { "epoch": 2.8550794415021667, "grad_norm": 1.15625, "learning_rate": 4.61474177222856e-05, "loss": 0.3768, "step": 11860 }, { "epoch": 2.8562831006259026, "grad_norm": 1.1015625, "learning_rate": 4.613441213868988e-05, "loss": 0.3863, "step": 11865 }, { "epoch": 2.857486759749639, "grad_norm": 1.1953125, "learning_rate": 4.6121514021920075e-05, "loss": 0.3987, "step": 11870 }, { "epoch": 2.8586904188733753, "grad_norm": 1.1484375, "learning_rate": 4.6108723392584075e-05, "loss": 0.3995, "step": 11875 }, { "epoch": 2.859894077997111, "grad_norm": 1.0859375, "learning_rate": 4.609604027111803e-05, "loss": 0.4005, "step": 11880 }, { "epoch": 2.8610977371208475, "grad_norm": 1.234375, "learning_rate": 4.608346467778631e-05, "loss": 0.4342, "step": 11885 }, { "epoch": 2.8623013962445834, "grad_norm": 1.1015625, "learning_rate": 4.60709966326815e-05, "loss": 0.405, "step": 11890 }, { "epoch": 2.8635050553683197, "grad_norm": 1.234375, "learning_rate": 4.605863615572432e-05, "loss": 0.3938, "step": 11895 }, { "epoch": 2.864708714492056, "grad_norm": 1.1640625, "learning_rate": 4.6046383266663665e-05, "loss": 0.4093, "step": 11900 }, { "epoch": 2.865912373615792, "grad_norm": 1.0859375, "learning_rate": 4.6034237985076504e-05, "loss": 0.3839, "step": 11905 }, { "epoch": 2.867116032739528, "grad_norm": 1.09375, "learning_rate": 4.602220033036787e-05, "loss": 0.3766, "step": 11910 }, { "epoch": 2.868319691863264, "grad_norm": 1.109375, "learning_rate": 4.601027032177086e-05, "loss": 0.3646, "step": 11915 }, { "epoch": 2.8695233509870004, "grad_norm": 1.265625, "learning_rate": 4.5998447978346566e-05, "loss": 0.3933, "step": 11920 }, { "epoch": 2.8707270101107367, "grad_norm": 1.140625, "learning_rate": 4.598673331898408e-05, "loss": 0.3661, "step": 11925 }, { "epoch": 2.871930669234473, "grad_norm": 1.203125, "learning_rate": 4.59751263624004e-05, "loss": 0.3885, "step": 11930 }, { "epoch": 2.873134328358209, "grad_norm": 1.1796875, "learning_rate": 4.596362712714046e-05, "loss": 0.3985, "step": 11935 }, { "epoch": 2.8743379874819452, "grad_norm": 1.0546875, "learning_rate": 4.595223563157714e-05, "loss": 0.3615, "step": 11940 }, { "epoch": 2.875541646605681, "grad_norm": 1.125, "learning_rate": 4.5940951893911076e-05, "loss": 0.3869, "step": 11945 }, { "epoch": 2.8767453057294174, "grad_norm": 1.1015625, "learning_rate": 4.592977593217082e-05, "loss": 0.3752, "step": 11950 }, { "epoch": 2.8779489648531538, "grad_norm": 1.171875, "learning_rate": 4.5918707764212665e-05, "loss": 0.3578, "step": 11955 }, { "epoch": 2.8791526239768896, "grad_norm": 1.234375, "learning_rate": 4.590774740772073e-05, "loss": 0.4142, "step": 11960 }, { "epoch": 2.880356283100626, "grad_norm": 1.15625, "learning_rate": 4.5896894880206865e-05, "loss": 0.3769, "step": 11965 }, { "epoch": 2.881559942224362, "grad_norm": 1.203125, "learning_rate": 4.5886150199010585e-05, "loss": 0.4187, "step": 11970 }, { "epoch": 2.882763601348098, "grad_norm": 1.1953125, "learning_rate": 4.587551338129918e-05, "loss": 0.3809, "step": 11975 }, { "epoch": 2.8839672604718345, "grad_norm": 1.203125, "learning_rate": 4.586498444406754e-05, "loss": 0.4233, "step": 11980 }, { "epoch": 2.8851709195955704, "grad_norm": 1.0546875, "learning_rate": 4.5854563404138195e-05, "loss": 0.3566, "step": 11985 }, { "epoch": 2.8863745787193067, "grad_norm": 1.1484375, "learning_rate": 4.584425027816132e-05, "loss": 0.3931, "step": 11990 }, { "epoch": 2.887578237843043, "grad_norm": 1.21875, "learning_rate": 4.583404508261461e-05, "loss": 0.3689, "step": 11995 }, { "epoch": 2.888781896966779, "grad_norm": 1.1796875, "learning_rate": 4.58239478338034e-05, "loss": 0.369, "step": 12000 }, { "epoch": 2.888781896966779, "eval_loss": 0.3563074767589569, "eval_runtime": 2.4086, "eval_samples_per_second": 83.036, "eval_steps_per_second": 83.036, "step": 12000 }, { "epoch": 2.889985556090515, "grad_norm": 1.078125, "learning_rate": 4.5813958547860474e-05, "loss": 0.3634, "step": 12005 }, { "epoch": 2.8911892152142515, "grad_norm": 1.1953125, "learning_rate": 4.580407724074616e-05, "loss": 0.4074, "step": 12010 }, { "epoch": 2.8923928743379874, "grad_norm": 1.125, "learning_rate": 4.5794303928248246e-05, "loss": 0.395, "step": 12015 }, { "epoch": 2.8935965334617237, "grad_norm": 1.1640625, "learning_rate": 4.578463862598198e-05, "loss": 0.3752, "step": 12020 }, { "epoch": 2.8948001925854596, "grad_norm": 1.1484375, "learning_rate": 4.577508134939005e-05, "loss": 0.3828, "step": 12025 }, { "epoch": 2.896003851709196, "grad_norm": 1.1484375, "learning_rate": 4.576563211374252e-05, "loss": 0.4007, "step": 12030 }, { "epoch": 2.8972075108329323, "grad_norm": 1.0703125, "learning_rate": 4.5756290934136856e-05, "loss": 0.3892, "step": 12035 }, { "epoch": 2.898411169956668, "grad_norm": 1.1015625, "learning_rate": 4.574705782549786e-05, "loss": 0.3934, "step": 12040 }, { "epoch": 2.8996148290804045, "grad_norm": 1.234375, "learning_rate": 4.573793280257768e-05, "loss": 0.3848, "step": 12045 }, { "epoch": 2.9008184882041403, "grad_norm": 1.109375, "learning_rate": 4.572891587995576e-05, "loss": 0.4045, "step": 12050 }, { "epoch": 2.9020221473278767, "grad_norm": 1.171875, "learning_rate": 4.572000707203883e-05, "loss": 0.4103, "step": 12055 }, { "epoch": 2.903225806451613, "grad_norm": 1.1171875, "learning_rate": 4.5711206393060876e-05, "loss": 0.3755, "step": 12060 }, { "epoch": 2.9044294655753493, "grad_norm": 1.2734375, "learning_rate": 4.570251385708314e-05, "loss": 0.3847, "step": 12065 }, { "epoch": 2.905633124699085, "grad_norm": 1.1484375, "learning_rate": 4.569392947799406e-05, "loss": 0.3923, "step": 12070 }, { "epoch": 2.9068367838228215, "grad_norm": 1.171875, "learning_rate": 4.568545326950928e-05, "loss": 0.3719, "step": 12075 }, { "epoch": 2.9080404429465574, "grad_norm": 1.203125, "learning_rate": 4.567708524517159e-05, "loss": 0.3855, "step": 12080 }, { "epoch": 2.9092441020702937, "grad_norm": 1.28125, "learning_rate": 4.566882541835097e-05, "loss": 0.371, "step": 12085 }, { "epoch": 2.91044776119403, "grad_norm": 1.203125, "learning_rate": 4.566067380224448e-05, "loss": 0.413, "step": 12090 }, { "epoch": 2.911651420317766, "grad_norm": 1.21875, "learning_rate": 4.565263040987633e-05, "loss": 0.3755, "step": 12095 }, { "epoch": 2.9128550794415022, "grad_norm": 1.1953125, "learning_rate": 4.56446952540978e-05, "loss": 0.3831, "step": 12100 }, { "epoch": 2.914058738565238, "grad_norm": 1.1640625, "learning_rate": 4.563686834758722e-05, "loss": 0.4041, "step": 12105 }, { "epoch": 2.9152623976889744, "grad_norm": 1.0859375, "learning_rate": 4.5629149702850003e-05, "loss": 0.3839, "step": 12110 }, { "epoch": 2.9164660568127108, "grad_norm": 1.1015625, "learning_rate": 4.5621539332218563e-05, "loss": 0.3736, "step": 12115 }, { "epoch": 2.9176697159364466, "grad_norm": 1.1796875, "learning_rate": 4.56140372478523e-05, "loss": 0.4246, "step": 12120 }, { "epoch": 2.918873375060183, "grad_norm": 1.203125, "learning_rate": 4.560664346173763e-05, "loss": 0.3869, "step": 12125 }, { "epoch": 2.920077034183919, "grad_norm": 1.3125, "learning_rate": 4.5599357985687936e-05, "loss": 0.4291, "step": 12130 }, { "epoch": 2.921280693307655, "grad_norm": 1.171875, "learning_rate": 4.559218083134353e-05, "loss": 0.3501, "step": 12135 }, { "epoch": 2.9224843524313915, "grad_norm": 1.2578125, "learning_rate": 4.558511201017168e-05, "loss": 0.4301, "step": 12140 }, { "epoch": 2.923688011555128, "grad_norm": 1.1015625, "learning_rate": 4.557815153346653e-05, "loss": 0.388, "step": 12145 }, { "epoch": 2.9248916706788637, "grad_norm": 1.2890625, "learning_rate": 4.557129941234914e-05, "loss": 0.4119, "step": 12150 }, { "epoch": 2.9260953298026, "grad_norm": 1.15625, "learning_rate": 4.5564555657767465e-05, "loss": 0.3856, "step": 12155 }, { "epoch": 2.927298988926336, "grad_norm": 1.25, "learning_rate": 4.555792028049628e-05, "loss": 0.3855, "step": 12160 }, { "epoch": 2.928502648050072, "grad_norm": 1.203125, "learning_rate": 4.555139329113723e-05, "loss": 0.3782, "step": 12165 }, { "epoch": 2.9297063071738085, "grad_norm": 1.1171875, "learning_rate": 4.554497470011874e-05, "loss": 0.3901, "step": 12170 }, { "epoch": 2.9309099662975444, "grad_norm": 1.15625, "learning_rate": 4.55386645176961e-05, "loss": 0.4041, "step": 12175 }, { "epoch": 2.9321136254212807, "grad_norm": 1.15625, "learning_rate": 4.5532462753951366e-05, "loss": 0.4107, "step": 12180 }, { "epoch": 2.9333172845450166, "grad_norm": 1.171875, "learning_rate": 4.552636941879335e-05, "loss": 0.3928, "step": 12185 }, { "epoch": 2.934520943668753, "grad_norm": 1.0546875, "learning_rate": 4.552038452195764e-05, "loss": 0.4072, "step": 12190 }, { "epoch": 2.9357246027924893, "grad_norm": 1.046875, "learning_rate": 4.551450807300657e-05, "loss": 0.3735, "step": 12195 }, { "epoch": 2.9369282619162256, "grad_norm": 1.2265625, "learning_rate": 4.550874008132922e-05, "loss": 0.3951, "step": 12200 }, { "epoch": 2.9381319210399615, "grad_norm": 1.2734375, "learning_rate": 4.550308055614134e-05, "loss": 0.3922, "step": 12205 }, { "epoch": 2.939335580163698, "grad_norm": 1.2109375, "learning_rate": 4.54975295064854e-05, "loss": 0.4063, "step": 12210 }, { "epoch": 2.9405392392874337, "grad_norm": 1.125, "learning_rate": 4.5492086941230566e-05, "loss": 0.3764, "step": 12215 }, { "epoch": 2.94174289841117, "grad_norm": 1.046875, "learning_rate": 4.548675286907265e-05, "loss": 0.3877, "step": 12220 }, { "epoch": 2.9429465575349063, "grad_norm": 1.078125, "learning_rate": 4.5481527298534135e-05, "loss": 0.4026, "step": 12225 }, { "epoch": 2.944150216658642, "grad_norm": 1.1171875, "learning_rate": 4.547641023796415e-05, "loss": 0.3824, "step": 12230 }, { "epoch": 2.9453538757823785, "grad_norm": 1.0546875, "learning_rate": 4.547140169553844e-05, "loss": 0.4216, "step": 12235 }, { "epoch": 2.9465575349061144, "grad_norm": 1.1953125, "learning_rate": 4.546650167925937e-05, "loss": 0.3813, "step": 12240 }, { "epoch": 2.9477611940298507, "grad_norm": 1.125, "learning_rate": 4.546171019695592e-05, "loss": 0.3591, "step": 12245 }, { "epoch": 2.948964853153587, "grad_norm": 1.09375, "learning_rate": 4.5457027256283644e-05, "loss": 0.3884, "step": 12250 }, { "epoch": 2.950168512277323, "grad_norm": 1.0625, "learning_rate": 4.545245286472468e-05, "loss": 0.3715, "step": 12255 }, { "epoch": 2.9513721714010592, "grad_norm": 1.2421875, "learning_rate": 4.544798702958773e-05, "loss": 0.4098, "step": 12260 }, { "epoch": 2.952575830524795, "grad_norm": 1.125, "learning_rate": 4.5443629758008056e-05, "loss": 0.38, "step": 12265 }, { "epoch": 2.9537794896485314, "grad_norm": 1.15625, "learning_rate": 4.543938105694746e-05, "loss": 0.3903, "step": 12270 }, { "epoch": 2.9549831487722678, "grad_norm": 1.2578125, "learning_rate": 4.543524093319429e-05, "loss": 0.3801, "step": 12275 }, { "epoch": 2.956186807896004, "grad_norm": 1.1875, "learning_rate": 4.5431209393363386e-05, "loss": 0.4105, "step": 12280 }, { "epoch": 2.95739046701974, "grad_norm": 1.2265625, "learning_rate": 4.542728644389611e-05, "loss": 0.3941, "step": 12285 }, { "epoch": 2.9585941261434763, "grad_norm": 1.125, "learning_rate": 4.542347209106036e-05, "loss": 0.3654, "step": 12290 }, { "epoch": 2.959797785267212, "grad_norm": 1.2734375, "learning_rate": 4.541976634095046e-05, "loss": 0.3835, "step": 12295 }, { "epoch": 2.9610014443909485, "grad_norm": 1.09375, "learning_rate": 4.541616919948728e-05, "loss": 0.3821, "step": 12300 }, { "epoch": 2.962205103514685, "grad_norm": 1.203125, "learning_rate": 4.54126806724181e-05, "loss": 0.3633, "step": 12305 }, { "epoch": 2.9634087626384207, "grad_norm": 1.0546875, "learning_rate": 4.54093007653167e-05, "loss": 0.3833, "step": 12310 }, { "epoch": 2.964612421762157, "grad_norm": 1.109375, "learning_rate": 4.5406029483583325e-05, "loss": 0.3849, "step": 12315 }, { "epoch": 2.965816080885893, "grad_norm": 1.046875, "learning_rate": 4.540286683244462e-05, "loss": 0.4046, "step": 12320 }, { "epoch": 2.967019740009629, "grad_norm": 1.15625, "learning_rate": 4.53998128169537e-05, "loss": 0.367, "step": 12325 }, { "epoch": 2.9682233991333655, "grad_norm": 1.140625, "learning_rate": 4.539686744199011e-05, "loss": 0.3825, "step": 12330 }, { "epoch": 2.969427058257102, "grad_norm": 1.0, "learning_rate": 4.5394030712259775e-05, "loss": 0.399, "step": 12335 }, { "epoch": 2.9706307173808377, "grad_norm": 1.1875, "learning_rate": 4.53913026322951e-05, "loss": 0.4124, "step": 12340 }, { "epoch": 2.971834376504574, "grad_norm": 1.171875, "learning_rate": 4.538868320645483e-05, "loss": 0.3857, "step": 12345 }, { "epoch": 2.97303803562831, "grad_norm": 1.0703125, "learning_rate": 4.538617243892414e-05, "loss": 0.3928, "step": 12350 }, { "epoch": 2.9742416947520463, "grad_norm": 1.15625, "learning_rate": 4.5383770333714605e-05, "loss": 0.3823, "step": 12355 }, { "epoch": 2.9754453538757826, "grad_norm": 1.1796875, "learning_rate": 4.538147689466415e-05, "loss": 0.4059, "step": 12360 }, { "epoch": 2.9766490129995185, "grad_norm": 1.1640625, "learning_rate": 4.537929212543713e-05, "loss": 0.3888, "step": 12365 }, { "epoch": 2.9778526721232548, "grad_norm": 1.0546875, "learning_rate": 4.5377216029524235e-05, "loss": 0.3633, "step": 12370 }, { "epoch": 2.9790563312469907, "grad_norm": 1.3515625, "learning_rate": 4.5375248610242525e-05, "loss": 0.3799, "step": 12375 }, { "epoch": 2.980259990370727, "grad_norm": 1.25, "learning_rate": 4.537338987073546e-05, "loss": 0.3851, "step": 12380 }, { "epoch": 2.9814636494944633, "grad_norm": 1.1328125, "learning_rate": 4.537163981397279e-05, "loss": 0.3979, "step": 12385 }, { "epoch": 2.982667308618199, "grad_norm": 0.9921875, "learning_rate": 4.5369998442750676e-05, "loss": 0.3847, "step": 12390 }, { "epoch": 2.9838709677419355, "grad_norm": 1.1171875, "learning_rate": 4.5368465759691614e-05, "loss": 0.4103, "step": 12395 }, { "epoch": 2.9850746268656714, "grad_norm": 1.1171875, "learning_rate": 4.536704176724443e-05, "loss": 0.4009, "step": 12400 }, { "epoch": 2.9862782859894077, "grad_norm": 1.1328125, "learning_rate": 4.5365726467684305e-05, "loss": 0.3771, "step": 12405 }, { "epoch": 2.987481945113144, "grad_norm": 1.1640625, "learning_rate": 4.536451986311275e-05, "loss": 0.4039, "step": 12410 }, { "epoch": 2.9886856042368803, "grad_norm": 1.1171875, "learning_rate": 4.5363421955457594e-05, "loss": 0.3889, "step": 12415 }, { "epoch": 2.9898892633606162, "grad_norm": 1.1484375, "learning_rate": 4.536243274647304e-05, "loss": 0.3977, "step": 12420 }, { "epoch": 2.9910929224843525, "grad_norm": 1.1328125, "learning_rate": 4.536155223773957e-05, "loss": 0.4013, "step": 12425 }, { "epoch": 2.9922965816080884, "grad_norm": 1.0703125, "learning_rate": 4.5360780430664015e-05, "loss": 0.3995, "step": 12430 }, { "epoch": 2.9935002407318247, "grad_norm": 1.078125, "learning_rate": 4.536011732647953e-05, "loss": 0.3835, "step": 12435 }, { "epoch": 2.994703899855561, "grad_norm": 1.1640625, "learning_rate": 4.535956292624558e-05, "loss": 0.4118, "step": 12440 }, { "epoch": 2.995907558979297, "grad_norm": 1.140625, "learning_rate": 4.535911723084795e-05, "loss": 0.39, "step": 12445 }, { "epoch": 2.9971112181030333, "grad_norm": 1.296875, "learning_rate": 4.535878024099877e-05, "loss": 0.4116, "step": 12450 }, { "epoch": 2.998314877226769, "grad_norm": 1.078125, "learning_rate": 4.5358551957236436e-05, "loss": 0.3725, "step": 12455 }, { "epoch": 2.999277804525758, "eval_loss": 0.35562843084335327, "eval_runtime": 2.3589, "eval_samples_per_second": 84.786, "eval_steps_per_second": 84.786, "step": 12459 } ], "logging_steps": 5, "max_steps": 12462, "num_input_tokens_seen": 0, "num_train_epochs": 3, "save_steps": 500, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 3.12509450944512e+17, "train_batch_size": 48, "trial_name": null, "trial_params": null }