{ "best_global_step": 48000, "best_metric": 4.555622100830078, "best_model_checkpoint": "./model_save/checkpoint-48000", "epoch": 0.9829418630843897, "eval_steps": 2000, "global_step": 48000, "is_hyper_param_search": false, "is_local_process_zero": true, "is_world_process_zero": true, "log_history": [ { "epoch": 0.00020477955480924785, "grad_norm": 6.629329204559326, "learning_rate": 7.2e-07, "loss": 11.4041, "step": 10 }, { "epoch": 0.0004095591096184957, "grad_norm": 6.498970985412598, "learning_rate": 1.52e-06, "loss": 11.3063, "step": 20 }, { "epoch": 0.0006143386644277435, "grad_norm": 6.277300834655762, "learning_rate": 2.32e-06, "loss": 11.0971, "step": 30 }, { "epoch": 0.0008191182192369914, "grad_norm": 5.17841911315918, "learning_rate": 3.12e-06, "loss": 10.8001, "step": 40 }, { "epoch": 0.0010238977740462392, "grad_norm": 3.385416269302368, "learning_rate": 3.92e-06, "loss": 10.4549, "step": 50 }, { "epoch": 0.001228677328855487, "grad_norm": 2.175173044204712, "learning_rate": 4.72e-06, "loss": 10.202, "step": 60 }, { "epoch": 0.001433456883664735, "grad_norm": 1.7192872762680054, "learning_rate": 5.5200000000000005e-06, "loss": 10.0135, "step": 70 }, { "epoch": 0.0016382364384739828, "grad_norm": 1.3520652055740356, "learning_rate": 6.320000000000001e-06, "loss": 9.8705, "step": 80 }, { "epoch": 0.0018430159932832306, "grad_norm": 1.2889796495437622, "learning_rate": 7.1200000000000004e-06, "loss": 9.7625, "step": 90 }, { "epoch": 0.0020477955480924783, "grad_norm": 1.2184574604034424, "learning_rate": 7.92e-06, "loss": 9.6682, "step": 100 }, { "epoch": 0.0022525751029017264, "grad_norm": 1.253600001335144, "learning_rate": 8.720000000000001e-06, "loss": 9.5655, "step": 110 }, { "epoch": 0.002457354657710974, "grad_norm": 1.5453048944473267, "learning_rate": 9.52e-06, "loss": 9.462, "step": 120 }, { "epoch": 0.002662134212520222, "grad_norm": 1.1592094898223877, "learning_rate": 1.0320000000000001e-05, "loss": 9.3696, "step": 130 }, { "epoch": 0.00286691376732947, "grad_norm": 1.1517513990402222, "learning_rate": 1.112e-05, "loss": 9.2833, "step": 140 }, { "epoch": 0.0030716933221387175, "grad_norm": 1.1195098161697388, "learning_rate": 1.1920000000000001e-05, "loss": 9.2143, "step": 150 }, { "epoch": 0.0032764728769479655, "grad_norm": 1.066959261894226, "learning_rate": 1.2720000000000002e-05, "loss": 9.1244, "step": 160 }, { "epoch": 0.0034812524317572135, "grad_norm": 1.2716888189315796, "learning_rate": 1.352e-05, "loss": 9.0943, "step": 170 }, { "epoch": 0.003686031986566461, "grad_norm": 1.0497322082519531, "learning_rate": 1.432e-05, "loss": 8.974, "step": 180 }, { "epoch": 0.003890811541375709, "grad_norm": 0.9636754393577576, "learning_rate": 1.5120000000000001e-05, "loss": 8.903, "step": 190 }, { "epoch": 0.004095591096184957, "grad_norm": 0.962215781211853, "learning_rate": 1.592e-05, "loss": 8.8231, "step": 200 }, { "epoch": 0.004300370650994205, "grad_norm": 1.1262612342834473, "learning_rate": 1.672e-05, "loss": 8.7727, "step": 210 }, { "epoch": 0.004505150205803453, "grad_norm": 0.8915610909461975, "learning_rate": 1.752e-05, "loss": 8.6879, "step": 220 }, { "epoch": 0.0047099297606127, "grad_norm": 0.9334535598754883, "learning_rate": 1.832e-05, "loss": 8.6236, "step": 230 }, { "epoch": 0.004914709315421948, "grad_norm": 1.2129932641983032, "learning_rate": 1.9120000000000003e-05, "loss": 8.572, "step": 240 }, { "epoch": 0.005119488870231196, "grad_norm": 2.270386219024658, "learning_rate": 1.992e-05, "loss": 8.4909, "step": 250 }, { "epoch": 0.005324268425040444, "grad_norm": 0.9250027537345886, "learning_rate": 2.072e-05, "loss": 8.4783, "step": 260 }, { "epoch": 0.0055290479798496914, "grad_norm": 0.9208921194076538, "learning_rate": 2.152e-05, "loss": 8.3651, "step": 270 }, { "epoch": 0.00573382753465894, "grad_norm": 0.8561158776283264, "learning_rate": 2.2320000000000003e-05, "loss": 8.2926, "step": 280 }, { "epoch": 0.0059386070894681875, "grad_norm": 0.9620749950408936, "learning_rate": 2.312e-05, "loss": 8.2353, "step": 290 }, { "epoch": 0.006143386644277435, "grad_norm": 0.834102213382721, "learning_rate": 2.392e-05, "loss": 8.1975, "step": 300 }, { "epoch": 0.0063481661990866835, "grad_norm": 0.7779641151428223, "learning_rate": 2.472e-05, "loss": 8.1244, "step": 310 }, { "epoch": 0.006552945753895931, "grad_norm": 0.8238368034362793, "learning_rate": 2.552e-05, "loss": 8.0652, "step": 320 }, { "epoch": 0.006757725308705179, "grad_norm": 0.8252228498458862, "learning_rate": 2.632e-05, "loss": 8.0452, "step": 330 }, { "epoch": 0.006962504863514427, "grad_norm": 0.7772361636161804, "learning_rate": 2.712e-05, "loss": 8.0057, "step": 340 }, { "epoch": 0.007167284418323675, "grad_norm": 0.9958386421203613, "learning_rate": 2.792e-05, "loss": 7.8924, "step": 350 }, { "epoch": 0.007372063973132922, "grad_norm": 0.7524231672286987, "learning_rate": 2.8720000000000003e-05, "loss": 7.8663, "step": 360 }, { "epoch": 0.007576843527942171, "grad_norm": 0.7005847096443176, "learning_rate": 2.9520000000000002e-05, "loss": 7.8309, "step": 370 }, { "epoch": 0.007781623082751418, "grad_norm": 0.844709038734436, "learning_rate": 3.0320000000000004e-05, "loss": 7.8128, "step": 380 }, { "epoch": 0.007986402637560666, "grad_norm": 0.6282876133918762, "learning_rate": 3.112e-05, "loss": 7.7429, "step": 390 }, { "epoch": 0.008191182192369913, "grad_norm": 2.5666661262512207, "learning_rate": 3.192e-05, "loss": 7.7099, "step": 400 }, { "epoch": 0.008395961747179161, "grad_norm": 0.791608989238739, "learning_rate": 3.272e-05, "loss": 7.7298, "step": 410 }, { "epoch": 0.00860074130198841, "grad_norm": 0.7391555905342102, "learning_rate": 3.3520000000000004e-05, "loss": 7.6316, "step": 420 }, { "epoch": 0.008805520856797658, "grad_norm": 0.7813675999641418, "learning_rate": 3.4320000000000003e-05, "loss": 7.6225, "step": 430 }, { "epoch": 0.009010300411606905, "grad_norm": 0.7512689828872681, "learning_rate": 3.512e-05, "loss": 7.59, "step": 440 }, { "epoch": 0.009215079966416153, "grad_norm": 0.7102991342544556, "learning_rate": 3.592e-05, "loss": 7.5673, "step": 450 }, { "epoch": 0.0094198595212254, "grad_norm": 0.8545809388160706, "learning_rate": 3.672000000000001e-05, "loss": 7.5217, "step": 460 }, { "epoch": 0.009624639076034648, "grad_norm": 0.9673335552215576, "learning_rate": 3.752e-05, "loss": 7.5043, "step": 470 }, { "epoch": 0.009829418630843896, "grad_norm": 1.040452003479004, "learning_rate": 3.832e-05, "loss": 7.5176, "step": 480 }, { "epoch": 0.010034198185653145, "grad_norm": 1.3076322078704834, "learning_rate": 3.912e-05, "loss": 7.4432, "step": 490 }, { "epoch": 0.010238977740462393, "grad_norm": 0.9396184682846069, "learning_rate": 3.9920000000000004e-05, "loss": 7.4431, "step": 500 }, { "epoch": 0.01044375729527164, "grad_norm": 0.7993614673614502, "learning_rate": 4.072e-05, "loss": 7.3762, "step": 510 }, { "epoch": 0.010648536850080888, "grad_norm": 1.016692876815796, "learning_rate": 4.152e-05, "loss": 7.3908, "step": 520 }, { "epoch": 0.010853316404890135, "grad_norm": 1.1701678037643433, "learning_rate": 4.232e-05, "loss": 7.3692, "step": 530 }, { "epoch": 0.011058095959699383, "grad_norm": 0.9889584183692932, "learning_rate": 4.312000000000001e-05, "loss": 7.3655, "step": 540 }, { "epoch": 0.011262875514508632, "grad_norm": 1.624703049659729, "learning_rate": 4.392e-05, "loss": 7.3559, "step": 550 }, { "epoch": 0.01146765506931788, "grad_norm": 1.0079149007797241, "learning_rate": 4.472e-05, "loss": 7.3154, "step": 560 }, { "epoch": 0.011672434624127127, "grad_norm": 0.9229303002357483, "learning_rate": 4.5520000000000005e-05, "loss": 7.2466, "step": 570 }, { "epoch": 0.011877214178936375, "grad_norm": 1.0020170211791992, "learning_rate": 4.6320000000000004e-05, "loss": 7.2919, "step": 580 }, { "epoch": 0.012081993733745622, "grad_norm": 1.0504316091537476, "learning_rate": 4.712e-05, "loss": 7.2288, "step": 590 }, { "epoch": 0.01228677328855487, "grad_norm": 1.2128161191940308, "learning_rate": 4.792e-05, "loss": 7.229, "step": 600 }, { "epoch": 0.01249155284336412, "grad_norm": 0.9260151386260986, "learning_rate": 4.872000000000001e-05, "loss": 7.1875, "step": 610 }, { "epoch": 0.012696332398173367, "grad_norm": 1.083197832107544, "learning_rate": 4.952e-05, "loss": 7.2263, "step": 620 }, { "epoch": 0.012901111952982615, "grad_norm": 1.748647689819336, "learning_rate": 5.032e-05, "loss": 7.1888, "step": 630 }, { "epoch": 0.013105891507791862, "grad_norm": 1.4586197137832642, "learning_rate": 5.112e-05, "loss": 7.1513, "step": 640 }, { "epoch": 0.01331067106260111, "grad_norm": 1.279341459274292, "learning_rate": 5.1920000000000004e-05, "loss": 7.1123, "step": 650 }, { "epoch": 0.013515450617410357, "grad_norm": 1.140655755996704, "learning_rate": 5.2720000000000003e-05, "loss": 7.0848, "step": 660 }, { "epoch": 0.013720230172219605, "grad_norm": 1.1404409408569336, "learning_rate": 5.352e-05, "loss": 7.1542, "step": 670 }, { "epoch": 0.013925009727028854, "grad_norm": 1.4052735567092896, "learning_rate": 5.432e-05, "loss": 7.0715, "step": 680 }, { "epoch": 0.014129789281838102, "grad_norm": 1.4490317106246948, "learning_rate": 5.512000000000001e-05, "loss": 7.1085, "step": 690 }, { "epoch": 0.01433456883664735, "grad_norm": 0.9407364130020142, "learning_rate": 5.592000000000001e-05, "loss": 7.0504, "step": 700 }, { "epoch": 0.014539348391456597, "grad_norm": 1.1838454008102417, "learning_rate": 5.6720000000000006e-05, "loss": 7.0176, "step": 710 }, { "epoch": 0.014744127946265844, "grad_norm": 1.223021149635315, "learning_rate": 5.7520000000000005e-05, "loss": 6.9734, "step": 720 }, { "epoch": 0.014948907501075092, "grad_norm": 1.185410737991333, "learning_rate": 5.832000000000001e-05, "loss": 6.9838, "step": 730 }, { "epoch": 0.015153687055884341, "grad_norm": 1.4732284545898438, "learning_rate": 5.9119999999999996e-05, "loss": 6.9732, "step": 740 }, { "epoch": 0.015358466610693589, "grad_norm": 1.0380061864852905, "learning_rate": 5.9919999999999996e-05, "loss": 7.0131, "step": 750 }, { "epoch": 0.015563246165502836, "grad_norm": 1.2068144083023071, "learning_rate": 6.072e-05, "loss": 6.9211, "step": 760 }, { "epoch": 0.015768025720312086, "grad_norm": 1.0961358547210693, "learning_rate": 6.152e-05, "loss": 6.9088, "step": 770 }, { "epoch": 0.01597280527512133, "grad_norm": 1.4672534465789795, "learning_rate": 6.232e-05, "loss": 6.9141, "step": 780 }, { "epoch": 0.01617758482993058, "grad_norm": 1.2274079322814941, "learning_rate": 6.312e-05, "loss": 6.888, "step": 790 }, { "epoch": 0.016382364384739827, "grad_norm": 1.1605645418167114, "learning_rate": 6.392e-05, "loss": 6.9103, "step": 800 }, { "epoch": 0.016587143939549076, "grad_norm": 1.0360841751098633, "learning_rate": 6.472e-05, "loss": 6.8925, "step": 810 }, { "epoch": 0.016791923494358322, "grad_norm": 1.1419285535812378, "learning_rate": 6.552000000000001e-05, "loss": 6.8396, "step": 820 }, { "epoch": 0.01699670304916757, "grad_norm": 1.2122325897216797, "learning_rate": 6.632e-05, "loss": 6.8401, "step": 830 }, { "epoch": 0.01720148260397682, "grad_norm": 1.0237847566604614, "learning_rate": 6.712000000000001e-05, "loss": 6.8111, "step": 840 }, { "epoch": 0.017406262158786066, "grad_norm": 1.3663208484649658, "learning_rate": 6.792e-05, "loss": 6.842, "step": 850 }, { "epoch": 0.017611041713595316, "grad_norm": 1.6236571073532104, "learning_rate": 6.872e-05, "loss": 6.8099, "step": 860 }, { "epoch": 0.01781582126840456, "grad_norm": 1.5716248750686646, "learning_rate": 6.952000000000001e-05, "loss": 6.8146, "step": 870 }, { "epoch": 0.01802060082321381, "grad_norm": 1.4074079990386963, "learning_rate": 7.032e-05, "loss": 6.7876, "step": 880 }, { "epoch": 0.018225380378023057, "grad_norm": 1.0738171339035034, "learning_rate": 7.112000000000001e-05, "loss": 6.8033, "step": 890 }, { "epoch": 0.018430159932832306, "grad_norm": 1.4551218748092651, "learning_rate": 7.192e-05, "loss": 6.7627, "step": 900 }, { "epoch": 0.018634939487641555, "grad_norm": 1.0792328119277954, "learning_rate": 7.272e-05, "loss": 6.7315, "step": 910 }, { "epoch": 0.0188397190424508, "grad_norm": 1.079581379890442, "learning_rate": 7.352e-05, "loss": 6.7201, "step": 920 }, { "epoch": 0.01904449859726005, "grad_norm": 1.7889859676361084, "learning_rate": 7.432e-05, "loss": 6.7276, "step": 930 }, { "epoch": 0.019249278152069296, "grad_norm": 1.1572691202163696, "learning_rate": 7.512e-05, "loss": 6.6996, "step": 940 }, { "epoch": 0.019454057706878546, "grad_norm": 0.9270385503768921, "learning_rate": 7.592e-05, "loss": 6.7369, "step": 950 }, { "epoch": 0.01965883726168779, "grad_norm": 1.431523084640503, "learning_rate": 7.672e-05, "loss": 6.6738, "step": 960 }, { "epoch": 0.01986361681649704, "grad_norm": 1.3019062280654907, "learning_rate": 7.752e-05, "loss": 6.7316, "step": 970 }, { "epoch": 0.02006839637130629, "grad_norm": 0.9801391363143921, "learning_rate": 7.832000000000001e-05, "loss": 6.6684, "step": 980 }, { "epoch": 0.020273175926115536, "grad_norm": 1.3378275632858276, "learning_rate": 7.912e-05, "loss": 6.6285, "step": 990 }, { "epoch": 0.020477955480924785, "grad_norm": 1.1381784677505493, "learning_rate": 7.992000000000001e-05, "loss": 6.6235, "step": 1000 }, { "epoch": 0.02068273503573403, "grad_norm": 1.0569130182266235, "learning_rate": 8.072000000000001e-05, "loss": 6.61, "step": 1010 }, { "epoch": 0.02088751459054328, "grad_norm": 1.4086412191390991, "learning_rate": 8.152e-05, "loss": 6.6142, "step": 1020 }, { "epoch": 0.02109229414535253, "grad_norm": 1.226737141609192, "learning_rate": 8.232000000000001e-05, "loss": 6.6019, "step": 1030 }, { "epoch": 0.021297073700161775, "grad_norm": 1.1184046268463135, "learning_rate": 8.312e-05, "loss": 6.5787, "step": 1040 }, { "epoch": 0.021501853254971025, "grad_norm": 1.263479232788086, "learning_rate": 8.392e-05, "loss": 6.6065, "step": 1050 }, { "epoch": 0.02170663280978027, "grad_norm": 1.3066644668579102, "learning_rate": 8.472e-05, "loss": 6.5692, "step": 1060 }, { "epoch": 0.02191141236458952, "grad_norm": 1.0745807886123657, "learning_rate": 8.552e-05, "loss": 6.5497, "step": 1070 }, { "epoch": 0.022116191919398766, "grad_norm": 1.042258381843567, "learning_rate": 8.632e-05, "loss": 6.5626, "step": 1080 }, { "epoch": 0.022320971474208015, "grad_norm": 1.2225663661956787, "learning_rate": 8.712e-05, "loss": 6.5448, "step": 1090 }, { "epoch": 0.022525751029017264, "grad_norm": 1.6050411462783813, "learning_rate": 8.792e-05, "loss": 6.5282, "step": 1100 }, { "epoch": 0.02273053058382651, "grad_norm": 1.11445951461792, "learning_rate": 8.872e-05, "loss": 6.5315, "step": 1110 }, { "epoch": 0.02293531013863576, "grad_norm": 1.0947785377502441, "learning_rate": 8.952000000000001e-05, "loss": 6.5531, "step": 1120 }, { "epoch": 0.023140089693445005, "grad_norm": 1.2313827276229858, "learning_rate": 9.032e-05, "loss": 6.4887, "step": 1130 }, { "epoch": 0.023344869248254255, "grad_norm": 1.2121127843856812, "learning_rate": 9.112000000000001e-05, "loss": 6.4987, "step": 1140 }, { "epoch": 0.0235496488030635, "grad_norm": 1.3728684186935425, "learning_rate": 9.192e-05, "loss": 6.4936, "step": 1150 }, { "epoch": 0.02375442835787275, "grad_norm": 1.3086248636245728, "learning_rate": 9.272e-05, "loss": 6.5013, "step": 1160 }, { "epoch": 0.023959207912682, "grad_norm": 1.3813849687576294, "learning_rate": 9.352000000000001e-05, "loss": 6.4949, "step": 1170 }, { "epoch": 0.024163987467491245, "grad_norm": 1.3366695642471313, "learning_rate": 9.432e-05, "loss": 6.4448, "step": 1180 }, { "epoch": 0.024368767022300494, "grad_norm": 1.1331437826156616, "learning_rate": 9.512000000000001e-05, "loss": 6.5141, "step": 1190 }, { "epoch": 0.02457354657710974, "grad_norm": 1.0496031045913696, "learning_rate": 9.592e-05, "loss": 6.4705, "step": 1200 }, { "epoch": 0.02477832613191899, "grad_norm": 1.246792197227478, "learning_rate": 9.672e-05, "loss": 6.4496, "step": 1210 }, { "epoch": 0.02498310568672824, "grad_norm": 1.4758825302124023, "learning_rate": 9.752e-05, "loss": 6.4878, "step": 1220 }, { "epoch": 0.025187885241537485, "grad_norm": 1.1107237339019775, "learning_rate": 9.832000000000001e-05, "loss": 6.419, "step": 1230 }, { "epoch": 0.025392664796346734, "grad_norm": 0.9504449367523193, "learning_rate": 9.912e-05, "loss": 6.4278, "step": 1240 }, { "epoch": 0.02559744435115598, "grad_norm": 1.2253371477127075, "learning_rate": 9.992e-05, "loss": 6.3827, "step": 1250 }, { "epoch": 0.02580222390596523, "grad_norm": 0.9804422855377197, "learning_rate": 0.00010072000000000001, "loss": 6.381, "step": 1260 }, { "epoch": 0.026007003460774475, "grad_norm": 2.4996328353881836, "learning_rate": 0.00010152000000000002, "loss": 6.3951, "step": 1270 }, { "epoch": 0.026211783015583724, "grad_norm": 1.1646678447723389, "learning_rate": 0.00010232000000000001, "loss": 6.4003, "step": 1280 }, { "epoch": 0.026416562570392974, "grad_norm": 1.1945327520370483, "learning_rate": 0.00010311999999999999, "loss": 6.3521, "step": 1290 }, { "epoch": 0.02662134212520222, "grad_norm": 1.2656117677688599, "learning_rate": 0.00010392, "loss": 6.3738, "step": 1300 }, { "epoch": 0.02682612168001147, "grad_norm": 1.1956391334533691, "learning_rate": 0.00010472, "loss": 6.3543, "step": 1310 }, { "epoch": 0.027030901234820714, "grad_norm": 1.0304642915725708, "learning_rate": 0.00010551999999999999, "loss": 6.3451, "step": 1320 }, { "epoch": 0.027235680789629964, "grad_norm": 1.1612216234207153, "learning_rate": 0.00010632, "loss": 6.3446, "step": 1330 }, { "epoch": 0.02744046034443921, "grad_norm": 1.0887846946716309, "learning_rate": 0.00010712, "loss": 6.3251, "step": 1340 }, { "epoch": 0.02764523989924846, "grad_norm": 1.2382864952087402, "learning_rate": 0.00010792, "loss": 6.3444, "step": 1350 }, { "epoch": 0.027850019454057708, "grad_norm": 1.3389307260513306, "learning_rate": 0.00010872, "loss": 6.3271, "step": 1360 }, { "epoch": 0.028054799008866954, "grad_norm": 0.9926040768623352, "learning_rate": 0.00010952, "loss": 6.2983, "step": 1370 }, { "epoch": 0.028259578563676203, "grad_norm": 1.138631820678711, "learning_rate": 0.00011032, "loss": 6.3132, "step": 1380 }, { "epoch": 0.02846435811848545, "grad_norm": 1.2368875741958618, "learning_rate": 0.00011112, "loss": 6.3244, "step": 1390 }, { "epoch": 0.0286691376732947, "grad_norm": 0.8522224426269531, "learning_rate": 0.00011192, "loss": 6.3035, "step": 1400 }, { "epoch": 0.028873917228103944, "grad_norm": 1.553653597831726, "learning_rate": 0.00011272, "loss": 6.2799, "step": 1410 }, { "epoch": 0.029078696782913194, "grad_norm": 1.0834499597549438, "learning_rate": 0.00011352000000000001, "loss": 6.3157, "step": 1420 }, { "epoch": 0.029283476337722443, "grad_norm": 0.9788590669631958, "learning_rate": 0.00011432, "loss": 6.2857, "step": 1430 }, { "epoch": 0.02948825589253169, "grad_norm": 1.1679108142852783, "learning_rate": 0.00011512000000000001, "loss": 6.2885, "step": 1440 }, { "epoch": 0.029693035447340938, "grad_norm": 1.140278697013855, "learning_rate": 0.00011592, "loss": 6.272, "step": 1450 }, { "epoch": 0.029897815002150184, "grad_norm": 1.0994911193847656, "learning_rate": 0.00011672000000000001, "loss": 6.2571, "step": 1460 }, { "epoch": 0.030102594556959433, "grad_norm": 1.0351468324661255, "learning_rate": 0.00011752000000000001, "loss": 6.2471, "step": 1470 }, { "epoch": 0.030307374111768683, "grad_norm": 1.0071357488632202, "learning_rate": 0.00011832, "loss": 6.2214, "step": 1480 }, { "epoch": 0.03051215366657793, "grad_norm": 0.8304218649864197, "learning_rate": 0.00011912000000000001, "loss": 6.2174, "step": 1490 }, { "epoch": 0.030716933221387178, "grad_norm": 0.8604931831359863, "learning_rate": 0.00011992, "loss": 6.2274, "step": 1500 }, { "epoch": 0.030921712776196424, "grad_norm": 0.8300955295562744, "learning_rate": 0.00012072000000000001, "loss": 6.2381, "step": 1510 }, { "epoch": 0.031126492331005673, "grad_norm": 1.0048444271087646, "learning_rate": 0.00012152000000000002, "loss": 6.1938, "step": 1520 }, { "epoch": 0.03133127188581492, "grad_norm": 0.9349138736724854, "learning_rate": 0.00012232, "loss": 6.1735, "step": 1530 }, { "epoch": 0.03153605144062417, "grad_norm": 0.8391649127006531, "learning_rate": 0.00012312, "loss": 6.1767, "step": 1540 }, { "epoch": 0.031740830995433414, "grad_norm": 0.8740177750587463, "learning_rate": 0.00012392000000000002, "loss": 6.1987, "step": 1550 }, { "epoch": 0.03194561055024266, "grad_norm": 1.0528550148010254, "learning_rate": 0.00012472, "loss": 6.185, "step": 1560 }, { "epoch": 0.03215039010505191, "grad_norm": 0.8692805767059326, "learning_rate": 0.00012552, "loss": 6.2316, "step": 1570 }, { "epoch": 0.03235516965986116, "grad_norm": 1.4903044700622559, "learning_rate": 0.00012632000000000002, "loss": 6.1422, "step": 1580 }, { "epoch": 0.032559949214670404, "grad_norm": 0.8822526335716248, "learning_rate": 0.00012712000000000002, "loss": 6.1607, "step": 1590 }, { "epoch": 0.032764728769479654, "grad_norm": 0.8137292265892029, "learning_rate": 0.00012792, "loss": 6.1814, "step": 1600 }, { "epoch": 0.0329695083242889, "grad_norm": 0.8272304534912109, "learning_rate": 0.00012872, "loss": 6.1417, "step": 1610 }, { "epoch": 0.03317428787909815, "grad_norm": 1.164596438407898, "learning_rate": 0.00012952, "loss": 6.1647, "step": 1620 }, { "epoch": 0.0333790674339074, "grad_norm": 3.4363532066345215, "learning_rate": 0.00013031999999999999, "loss": 6.1314, "step": 1630 }, { "epoch": 0.033583846988716644, "grad_norm": 0.964524507522583, "learning_rate": 0.00013112, "loss": 6.1275, "step": 1640 }, { "epoch": 0.03378862654352589, "grad_norm": 0.7767365574836731, "learning_rate": 0.00013192, "loss": 6.1244, "step": 1650 }, { "epoch": 0.03399340609833514, "grad_norm": 0.9796528220176697, "learning_rate": 0.00013272, "loss": 6.1024, "step": 1660 }, { "epoch": 0.03419818565314439, "grad_norm": 0.9104465842247009, "learning_rate": 0.00013352, "loss": 6.0983, "step": 1670 }, { "epoch": 0.03440296520795364, "grad_norm": 0.8891839981079102, "learning_rate": 0.00013432, "loss": 6.1659, "step": 1680 }, { "epoch": 0.03460774476276288, "grad_norm": 0.9721062183380127, "learning_rate": 0.00013512, "loss": 6.1095, "step": 1690 }, { "epoch": 0.03481252431757213, "grad_norm": 0.9213342070579529, "learning_rate": 0.00013592, "loss": 6.1123, "step": 1700 }, { "epoch": 0.03501730387238138, "grad_norm": 0.7819191813468933, "learning_rate": 0.00013672, "loss": 6.0874, "step": 1710 }, { "epoch": 0.03522208342719063, "grad_norm": 0.9591174721717834, "learning_rate": 0.00013752, "loss": 6.0865, "step": 1720 }, { "epoch": 0.035426862981999874, "grad_norm": 1.9827978610992432, "learning_rate": 0.00013832000000000002, "loss": 6.1134, "step": 1730 }, { "epoch": 0.03563164253680912, "grad_norm": 0.8233531713485718, "learning_rate": 0.00013912, "loss": 6.1046, "step": 1740 }, { "epoch": 0.03583642209161837, "grad_norm": 0.8631060123443604, "learning_rate": 0.00013992, "loss": 6.0755, "step": 1750 }, { "epoch": 0.03604120164642762, "grad_norm": 0.8472694754600525, "learning_rate": 0.00014072, "loss": 6.0528, "step": 1760 }, { "epoch": 0.03624598120123687, "grad_norm": 0.865189790725708, "learning_rate": 0.00014152000000000001, "loss": 6.0325, "step": 1770 }, { "epoch": 0.03645076075604611, "grad_norm": 0.7865936160087585, "learning_rate": 0.00014232, "loss": 6.0146, "step": 1780 }, { "epoch": 0.03665554031085536, "grad_norm": 0.7830580472946167, "learning_rate": 0.00014312, "loss": 6.0466, "step": 1790 }, { "epoch": 0.03686031986566461, "grad_norm": 0.7788213491439819, "learning_rate": 0.00014392000000000002, "loss": 6.0575, "step": 1800 }, { "epoch": 0.03706509942047386, "grad_norm": 0.7438129782676697, "learning_rate": 0.00014472, "loss": 6.0617, "step": 1810 }, { "epoch": 0.03726987897528311, "grad_norm": 0.8572232127189636, "learning_rate": 0.00014552, "loss": 6.005, "step": 1820 }, { "epoch": 0.03747465853009235, "grad_norm": 3.089932680130005, "learning_rate": 0.00014632000000000002, "loss": 6.0865, "step": 1830 }, { "epoch": 0.0376794380849016, "grad_norm": 0.8924036026000977, "learning_rate": 0.00014712000000000001, "loss": 6.0572, "step": 1840 }, { "epoch": 0.03788421763971085, "grad_norm": 0.9803346991539001, "learning_rate": 0.00014792, "loss": 6.0735, "step": 1850 }, { "epoch": 0.0380889971945201, "grad_norm": 0.7436922788619995, "learning_rate": 0.00014872000000000003, "loss": 6.001, "step": 1860 }, { "epoch": 0.03829377674932935, "grad_norm": 0.8128496408462524, "learning_rate": 0.00014952000000000002, "loss": 5.9787, "step": 1870 }, { "epoch": 0.03849855630413859, "grad_norm": 0.772544801235199, "learning_rate": 0.00015032, "loss": 5.9811, "step": 1880 }, { "epoch": 0.03870333585894784, "grad_norm": 0.7379671931266785, "learning_rate": 0.00015112000000000003, "loss": 6.0148, "step": 1890 }, { "epoch": 0.03890811541375709, "grad_norm": 0.6884296536445618, "learning_rate": 0.00015192000000000002, "loss": 5.9905, "step": 1900 }, { "epoch": 0.03911289496856634, "grad_norm": 0.774961531162262, "learning_rate": 0.00015272, "loss": 5.9748, "step": 1910 }, { "epoch": 0.03931767452337558, "grad_norm": 0.79966139793396, "learning_rate": 0.00015352, "loss": 6.0226, "step": 1920 }, { "epoch": 0.03952245407818483, "grad_norm": 0.783399224281311, "learning_rate": 0.00015432, "loss": 5.9568, "step": 1930 }, { "epoch": 0.03972723363299408, "grad_norm": 0.694803774356842, "learning_rate": 0.00015512, "loss": 5.9702, "step": 1940 }, { "epoch": 0.03993201318780333, "grad_norm": 0.7792879939079285, "learning_rate": 0.00015592, "loss": 5.9541, "step": 1950 }, { "epoch": 0.04013679274261258, "grad_norm": 0.7265322208404541, "learning_rate": 0.00015672, "loss": 5.9542, "step": 1960 }, { "epoch": 0.04034157229742182, "grad_norm": 0.8625994324684143, "learning_rate": 0.00015752, "loss": 5.9561, "step": 1970 }, { "epoch": 0.04054635185223107, "grad_norm": 0.7278620004653931, "learning_rate": 0.00015832, "loss": 5.9455, "step": 1980 }, { "epoch": 0.04075113140704032, "grad_norm": 0.793200671672821, "learning_rate": 0.00015912, "loss": 5.966, "step": 1990 }, { "epoch": 0.04095591096184957, "grad_norm": 1.2044384479522705, "learning_rate": 0.00015992, "loss": 5.9557, "step": 2000 }, { "epoch": 0.04095591096184957, "eval_loss": 5.943562984466553, "eval_runtime": 56.2969, "eval_samples_per_second": 20.712, "eval_steps_per_second": 2.593, "step": 2000 }, { "epoch": 0.04116069051665882, "grad_norm": 0.7351077795028687, "learning_rate": 0.00016072, "loss": 5.9482, "step": 2010 }, { "epoch": 0.04136547007146806, "grad_norm": 0.8293958902359009, "learning_rate": 0.00016152, "loss": 5.9143, "step": 2020 }, { "epoch": 0.04157024962627731, "grad_norm": 0.8600688576698303, "learning_rate": 0.00016232, "loss": 5.9089, "step": 2030 }, { "epoch": 0.04177502918108656, "grad_norm": 0.7429504990577698, "learning_rate": 0.00016312, "loss": 5.8794, "step": 2040 }, { "epoch": 0.04197980873589581, "grad_norm": 0.7266488671302795, "learning_rate": 0.00016392000000000002, "loss": 5.8786, "step": 2050 }, { "epoch": 0.04218458829070506, "grad_norm": 0.6896419525146484, "learning_rate": 0.00016472, "loss": 5.8846, "step": 2060 }, { "epoch": 0.0423893678455143, "grad_norm": 0.6630206108093262, "learning_rate": 0.00016552, "loss": 5.8895, "step": 2070 }, { "epoch": 0.04259414740032355, "grad_norm": 0.7611812949180603, "learning_rate": 0.00016632000000000002, "loss": 5.8851, "step": 2080 }, { "epoch": 0.0427989269551328, "grad_norm": 0.7763521671295166, "learning_rate": 0.00016712, "loss": 5.8954, "step": 2090 }, { "epoch": 0.04300370650994205, "grad_norm": 1.1050055027008057, "learning_rate": 0.00016792, "loss": 5.9114, "step": 2100 }, { "epoch": 0.04320848606475129, "grad_norm": 0.7318460941314697, "learning_rate": 0.00016872000000000002, "loss": 5.8902, "step": 2110 }, { "epoch": 0.04341326561956054, "grad_norm": 0.9724841713905334, "learning_rate": 0.00016952000000000002, "loss": 5.8692, "step": 2120 }, { "epoch": 0.04361804517436979, "grad_norm": 0.7182177901268005, "learning_rate": 0.00017032, "loss": 5.8422, "step": 2130 }, { "epoch": 0.04382282472917904, "grad_norm": 0.7816573977470398, "learning_rate": 0.00017112, "loss": 5.877, "step": 2140 }, { "epoch": 0.04402760428398829, "grad_norm": 0.6064931750297546, "learning_rate": 0.00017192000000000002, "loss": 5.8499, "step": 2150 }, { "epoch": 0.04423238383879753, "grad_norm": 0.7619910836219788, "learning_rate": 0.00017272, "loss": 5.8362, "step": 2160 }, { "epoch": 0.04443716339360678, "grad_norm": 0.7827998399734497, "learning_rate": 0.00017352, "loss": 5.8615, "step": 2170 }, { "epoch": 0.04464194294841603, "grad_norm": 0.8352086544036865, "learning_rate": 0.00017432000000000003, "loss": 5.8117, "step": 2180 }, { "epoch": 0.04484672250322528, "grad_norm": 0.7334675788879395, "learning_rate": 0.00017512000000000002, "loss": 5.8298, "step": 2190 }, { "epoch": 0.04505150205803453, "grad_norm": 0.7153773307800293, "learning_rate": 0.00017592, "loss": 5.8426, "step": 2200 }, { "epoch": 0.04525628161284377, "grad_norm": 0.6356229186058044, "learning_rate": 0.00017672000000000003, "loss": 5.802, "step": 2210 }, { "epoch": 0.04546106116765302, "grad_norm": 0.8019924759864807, "learning_rate": 0.00017752, "loss": 5.8023, "step": 2220 }, { "epoch": 0.04566584072246227, "grad_norm": 0.6777639389038086, "learning_rate": 0.00017832, "loss": 5.795, "step": 2230 }, { "epoch": 0.04587062027727152, "grad_norm": 0.7413334846496582, "learning_rate": 0.00017912, "loss": 5.8084, "step": 2240 }, { "epoch": 0.04607539983208077, "grad_norm": 0.6552919745445251, "learning_rate": 0.00017992, "loss": 5.7766, "step": 2250 }, { "epoch": 0.04628017938689001, "grad_norm": 0.713080108165741, "learning_rate": 0.00018072, "loss": 5.7853, "step": 2260 }, { "epoch": 0.04648495894169926, "grad_norm": 0.6190316081047058, "learning_rate": 0.00018152, "loss": 5.767, "step": 2270 }, { "epoch": 0.04668973849650851, "grad_norm": 1.1991640329360962, "learning_rate": 0.00018232, "loss": 5.7482, "step": 2280 }, { "epoch": 0.04689451805131776, "grad_norm": 0.7628166079521179, "learning_rate": 0.00018312, "loss": 5.7666, "step": 2290 }, { "epoch": 0.047099297606127, "grad_norm": 0.6202970743179321, "learning_rate": 0.00018392000000000001, "loss": 5.7852, "step": 2300 }, { "epoch": 0.04730407716093625, "grad_norm": 0.6502846479415894, "learning_rate": 0.00018472, "loss": 5.7699, "step": 2310 }, { "epoch": 0.0475088567157455, "grad_norm": 0.7047054171562195, "learning_rate": 0.00018552, "loss": 5.7713, "step": 2320 }, { "epoch": 0.04771363627055475, "grad_norm": 0.6129122972488403, "learning_rate": 0.00018632000000000002, "loss": 5.7456, "step": 2330 }, { "epoch": 0.047918415825364, "grad_norm": 0.6389017701148987, "learning_rate": 0.00018712, "loss": 5.7337, "step": 2340 }, { "epoch": 0.04812319538017324, "grad_norm": 0.7061182260513306, "learning_rate": 0.00018792, "loss": 5.7949, "step": 2350 }, { "epoch": 0.04832797493498249, "grad_norm": 0.6337869167327881, "learning_rate": 0.00018872, "loss": 5.762, "step": 2360 }, { "epoch": 0.04853275448979174, "grad_norm": 0.6508966684341431, "learning_rate": 0.00018952000000000002, "loss": 5.7238, "step": 2370 }, { "epoch": 0.04873753404460099, "grad_norm": 0.6663705706596375, "learning_rate": 0.00019032, "loss": 5.7431, "step": 2380 }, { "epoch": 0.04894231359941024, "grad_norm": 1.348676085472107, "learning_rate": 0.00019112, "loss": 5.6784, "step": 2390 }, { "epoch": 0.04914709315421948, "grad_norm": 0.9762350916862488, "learning_rate": 0.00019192000000000002, "loss": 5.6957, "step": 2400 }, { "epoch": 0.04935187270902873, "grad_norm": 0.8128533959388733, "learning_rate": 0.00019272, "loss": 5.7294, "step": 2410 }, { "epoch": 0.04955665226383798, "grad_norm": 0.6565982699394226, "learning_rate": 0.00019352, "loss": 5.7049, "step": 2420 }, { "epoch": 0.04976143181864723, "grad_norm": 0.680180549621582, "learning_rate": 0.00019432000000000002, "loss": 5.7156, "step": 2430 }, { "epoch": 0.04996621137345648, "grad_norm": 0.6522653698921204, "learning_rate": 0.00019512000000000002, "loss": 5.73, "step": 2440 }, { "epoch": 0.05017099092826572, "grad_norm": 0.6685560941696167, "learning_rate": 0.00019592, "loss": 5.7061, "step": 2450 }, { "epoch": 0.05037577048307497, "grad_norm": 0.5945119261741638, "learning_rate": 0.00019672000000000003, "loss": 5.7045, "step": 2460 }, { "epoch": 0.05058055003788422, "grad_norm": 0.645095944404602, "learning_rate": 0.00019752000000000002, "loss": 5.6842, "step": 2470 }, { "epoch": 0.05078532959269347, "grad_norm": 0.666318953037262, "learning_rate": 0.00019832, "loss": 5.7015, "step": 2480 }, { "epoch": 0.05099010914750271, "grad_norm": 0.5758560299873352, "learning_rate": 0.00019912, "loss": 5.6524, "step": 2490 }, { "epoch": 0.05119488870231196, "grad_norm": 0.5842757225036621, "learning_rate": 0.00019992000000000002, "loss": 5.664, "step": 2500 }, { "epoch": 0.05139966825712121, "grad_norm": 0.6742421388626099, "learning_rate": 0.00020072000000000002, "loss": 5.6619, "step": 2510 }, { "epoch": 0.05160444781193046, "grad_norm": 0.6421381235122681, "learning_rate": 0.00020152, "loss": 5.6534, "step": 2520 }, { "epoch": 0.05180922736673971, "grad_norm": 1.3708223104476929, "learning_rate": 0.00020232000000000003, "loss": 5.6575, "step": 2530 }, { "epoch": 0.05201400692154895, "grad_norm": 0.655500054359436, "learning_rate": 0.00020312000000000002, "loss": 5.6502, "step": 2540 }, { "epoch": 0.0522187864763582, "grad_norm": 0.6765331625938416, "learning_rate": 0.00020392, "loss": 5.6581, "step": 2550 }, { "epoch": 0.05242356603116745, "grad_norm": 0.6311305165290833, "learning_rate": 0.00020472000000000003, "loss": 5.6377, "step": 2560 }, { "epoch": 0.0526283455859767, "grad_norm": 0.5618401169776917, "learning_rate": 0.00020552000000000002, "loss": 5.6092, "step": 2570 }, { "epoch": 0.05283312514078595, "grad_norm": 0.5681129693984985, "learning_rate": 0.00020632000000000002, "loss": 5.6378, "step": 2580 }, { "epoch": 0.05303790469559519, "grad_norm": 0.6466996073722839, "learning_rate": 0.00020712000000000004, "loss": 5.6487, "step": 2590 }, { "epoch": 0.05324268425040444, "grad_norm": 0.5487588047981262, "learning_rate": 0.00020792000000000003, "loss": 5.6589, "step": 2600 }, { "epoch": 0.05344746380521369, "grad_norm": 2.6463348865509033, "learning_rate": 0.00020872000000000002, "loss": 5.5857, "step": 2610 }, { "epoch": 0.05365224336002294, "grad_norm": 0.6164767146110535, "learning_rate": 0.00020952000000000004, "loss": 5.6318, "step": 2620 }, { "epoch": 0.05385702291483219, "grad_norm": 0.5371426343917847, "learning_rate": 0.00021032000000000003, "loss": 5.6176, "step": 2630 }, { "epoch": 0.05406180246964143, "grad_norm": 0.8848806619644165, "learning_rate": 0.00021112000000000003, "loss": 5.643, "step": 2640 }, { "epoch": 0.05426658202445068, "grad_norm": 0.5556992888450623, "learning_rate": 0.00021192000000000002, "loss": 5.6202, "step": 2650 }, { "epoch": 0.05447136157925993, "grad_norm": 0.6171382069587708, "learning_rate": 0.00021272000000000004, "loss": 5.6159, "step": 2660 }, { "epoch": 0.05467614113406918, "grad_norm": 0.5811936259269714, "learning_rate": 0.00021352000000000003, "loss": 5.6064, "step": 2670 }, { "epoch": 0.05488092068887842, "grad_norm": 0.5863711833953857, "learning_rate": 0.00021432000000000002, "loss": 5.5979, "step": 2680 }, { "epoch": 0.05508570024368767, "grad_norm": 0.7278106808662415, "learning_rate": 0.00021512, "loss": 5.6075, "step": 2690 }, { "epoch": 0.05529047979849692, "grad_norm": 0.5531920790672302, "learning_rate": 0.00021591999999999998, "loss": 5.5884, "step": 2700 }, { "epoch": 0.05549525935330617, "grad_norm": 0.5820106863975525, "learning_rate": 0.00021672, "loss": 5.6176, "step": 2710 }, { "epoch": 0.055700038908115417, "grad_norm": 0.5418875217437744, "learning_rate": 0.00021752, "loss": 5.5852, "step": 2720 }, { "epoch": 0.05590481846292466, "grad_norm": 0.6495028734207153, "learning_rate": 0.00021831999999999998, "loss": 5.5635, "step": 2730 }, { "epoch": 0.05610959801773391, "grad_norm": 0.5675711035728455, "learning_rate": 0.00021912, "loss": 5.5842, "step": 2740 }, { "epoch": 0.05631437757254316, "grad_norm": 0.5400895476341248, "learning_rate": 0.00021992, "loss": 5.591, "step": 2750 }, { "epoch": 0.05651915712735241, "grad_norm": 0.5396080613136292, "learning_rate": 0.00022072, "loss": 5.5988, "step": 2760 }, { "epoch": 0.056723936682161656, "grad_norm": 0.5995794534683228, "learning_rate": 0.00022152, "loss": 5.5976, "step": 2770 }, { "epoch": 0.0569287162369709, "grad_norm": 0.7403063774108887, "learning_rate": 0.00022232, "loss": 5.6153, "step": 2780 }, { "epoch": 0.05713349579178015, "grad_norm": 0.4941953718662262, "learning_rate": 0.00022312, "loss": 5.5713, "step": 2790 }, { "epoch": 0.0573382753465894, "grad_norm": 0.5359123945236206, "learning_rate": 0.00022391999999999998, "loss": 5.5815, "step": 2800 }, { "epoch": 0.057543054901398646, "grad_norm": 0.5588081479072571, "learning_rate": 0.00022472, "loss": 5.5127, "step": 2810 }, { "epoch": 0.05774783445620789, "grad_norm": 0.5440400242805481, "learning_rate": 0.00022552, "loss": 5.5131, "step": 2820 }, { "epoch": 0.05795261401101714, "grad_norm": 0.5617867112159729, "learning_rate": 0.00022632, "loss": 5.559, "step": 2830 }, { "epoch": 0.05815739356582639, "grad_norm": 0.5289216637611389, "learning_rate": 0.00022712, "loss": 5.5476, "step": 2840 }, { "epoch": 0.05836217312063564, "grad_norm": 0.5092751383781433, "learning_rate": 0.00022792, "loss": 5.5619, "step": 2850 }, { "epoch": 0.058566952675444886, "grad_norm": 0.49928438663482666, "learning_rate": 0.00022872, "loss": 5.5425, "step": 2860 }, { "epoch": 0.05877173223025413, "grad_norm": 0.5320514440536499, "learning_rate": 0.00022952, "loss": 5.5353, "step": 2870 }, { "epoch": 0.05897651178506338, "grad_norm": 0.7999703288078308, "learning_rate": 0.00023032, "loss": 5.5285, "step": 2880 }, { "epoch": 0.05918129133987263, "grad_norm": 0.5325676798820496, "learning_rate": 0.00023112, "loss": 5.5579, "step": 2890 }, { "epoch": 0.059386070894681876, "grad_norm": 0.5199950337409973, "learning_rate": 0.00023192000000000002, "loss": 5.5152, "step": 2900 }, { "epoch": 0.059590850449491126, "grad_norm": 0.5351914763450623, "learning_rate": 0.00023272, "loss": 5.5453, "step": 2910 }, { "epoch": 0.05979563000430037, "grad_norm": 1.4484779834747314, "learning_rate": 0.00023352, "loss": 5.5814, "step": 2920 }, { "epoch": 0.06000040955910962, "grad_norm": 0.6165844202041626, "learning_rate": 0.00023432, "loss": 5.5068, "step": 2930 }, { "epoch": 0.06020518911391887, "grad_norm": 0.5223903656005859, "learning_rate": 0.00023512, "loss": 5.5456, "step": 2940 }, { "epoch": 0.060409968668728116, "grad_norm": 0.45100030303001404, "learning_rate": 0.00023592, "loss": 5.4977, "step": 2950 }, { "epoch": 0.060614748223537365, "grad_norm": 0.5048749446868896, "learning_rate": 0.00023672, "loss": 5.503, "step": 2960 }, { "epoch": 0.06081952777834661, "grad_norm": 0.47033455967903137, "learning_rate": 0.00023752000000000002, "loss": 5.5039, "step": 2970 }, { "epoch": 0.06102430733315586, "grad_norm": 0.5028645396232605, "learning_rate": 0.00023832, "loss": 5.4865, "step": 2980 }, { "epoch": 0.061229086887965106, "grad_norm": 0.516358494758606, "learning_rate": 0.00023912, "loss": 5.5025, "step": 2990 }, { "epoch": 0.061433866442774356, "grad_norm": 0.49913960695266724, "learning_rate": 0.00023992000000000002, "loss": 5.4662, "step": 3000 }, { "epoch": 0.0616386459975836, "grad_norm": 0.5260842442512512, "learning_rate": 0.00024072, "loss": 5.5041, "step": 3010 }, { "epoch": 0.06184342555239285, "grad_norm": 0.5125996470451355, "learning_rate": 0.00024152, "loss": 5.4991, "step": 3020 }, { "epoch": 0.0620482051072021, "grad_norm": 0.5200065970420837, "learning_rate": 0.00024232000000000002, "loss": 5.5116, "step": 3030 }, { "epoch": 0.062252984662011346, "grad_norm": 0.4517768919467926, "learning_rate": 0.00024312000000000002, "loss": 5.4968, "step": 3040 }, { "epoch": 0.062457764216820595, "grad_norm": 0.8140433430671692, "learning_rate": 0.00024392, "loss": 5.4922, "step": 3050 }, { "epoch": 0.06266254377162984, "grad_norm": 0.509969174861908, "learning_rate": 0.00024472000000000003, "loss": 5.4785, "step": 3060 }, { "epoch": 0.06286732332643909, "grad_norm": 0.4542042016983032, "learning_rate": 0.00024552, "loss": 5.5198, "step": 3070 }, { "epoch": 0.06307210288124834, "grad_norm": 0.46822962164878845, "learning_rate": 0.00024632, "loss": 5.4635, "step": 3080 }, { "epoch": 0.06327688243605759, "grad_norm": 0.47385135293006897, "learning_rate": 0.00024712000000000003, "loss": 5.4718, "step": 3090 }, { "epoch": 0.06348166199086683, "grad_norm": 0.5590242743492126, "learning_rate": 0.00024792, "loss": 5.4687, "step": 3100 }, { "epoch": 0.06368644154567608, "grad_norm": 0.579818844795227, "learning_rate": 0.00024872, "loss": 5.4881, "step": 3110 }, { "epoch": 0.06389122110048533, "grad_norm": 0.5298537611961365, "learning_rate": 0.00024952000000000004, "loss": 5.4568, "step": 3120 }, { "epoch": 0.06409600065529457, "grad_norm": 0.4780171513557434, "learning_rate": 0.00025032, "loss": 5.472, "step": 3130 }, { "epoch": 0.06430078021010383, "grad_norm": 1.5736125707626343, "learning_rate": 0.00025112, "loss": 5.5136, "step": 3140 }, { "epoch": 0.06450555976491307, "grad_norm": 0.531993567943573, "learning_rate": 0.00025192000000000004, "loss": 5.4582, "step": 3150 }, { "epoch": 0.06471033931972232, "grad_norm": 0.49365732073783875, "learning_rate": 0.00025272, "loss": 5.4815, "step": 3160 }, { "epoch": 0.06491511887453157, "grad_norm": 0.4865797162055969, "learning_rate": 0.00025352, "loss": 5.4585, "step": 3170 }, { "epoch": 0.06511989842934081, "grad_norm": 0.5386083722114563, "learning_rate": 0.00025432000000000005, "loss": 5.4386, "step": 3180 }, { "epoch": 0.06532467798415006, "grad_norm": 0.5513334274291992, "learning_rate": 0.00025512, "loss": 5.4632, "step": 3190 }, { "epoch": 0.06552945753895931, "grad_norm": 0.46924030780792236, "learning_rate": 0.00025592000000000003, "loss": 5.475, "step": 3200 }, { "epoch": 0.06573423709376856, "grad_norm": 0.49195021390914917, "learning_rate": 0.00025672000000000005, "loss": 5.458, "step": 3210 }, { "epoch": 0.0659390166485778, "grad_norm": 0.7491171956062317, "learning_rate": 0.00025752, "loss": 5.4291, "step": 3220 }, { "epoch": 0.06614379620338705, "grad_norm": 0.4588443636894226, "learning_rate": 0.00025832000000000003, "loss": 5.4352, "step": 3230 }, { "epoch": 0.0663485757581963, "grad_norm": 0.48641809821128845, "learning_rate": 0.00025912000000000005, "loss": 5.4437, "step": 3240 }, { "epoch": 0.06655335531300555, "grad_norm": 0.4803517460823059, "learning_rate": 0.00025992, "loss": 5.4641, "step": 3250 }, { "epoch": 0.0667581348678148, "grad_norm": 0.5579696297645569, "learning_rate": 0.00026072000000000004, "loss": 5.4292, "step": 3260 }, { "epoch": 0.06696291442262405, "grad_norm": 0.4361009895801544, "learning_rate": 0.00026152000000000006, "loss": 5.4202, "step": 3270 }, { "epoch": 0.06716769397743329, "grad_norm": 0.45630156993865967, "learning_rate": 0.00026232, "loss": 5.4358, "step": 3280 }, { "epoch": 0.06737247353224254, "grad_norm": 0.5145776271820068, "learning_rate": 0.00026312000000000004, "loss": 5.468, "step": 3290 }, { "epoch": 0.06757725308705179, "grad_norm": 0.457546204328537, "learning_rate": 0.00026392, "loss": 5.4354, "step": 3300 }, { "epoch": 0.06778203264186104, "grad_norm": 0.4389740526676178, "learning_rate": 0.00026472, "loss": 5.4383, "step": 3310 }, { "epoch": 0.06798681219667028, "grad_norm": 0.44789761304855347, "learning_rate": 0.00026552, "loss": 5.448, "step": 3320 }, { "epoch": 0.06819159175147953, "grad_norm": 0.45958659052848816, "learning_rate": 0.00026632, "loss": 5.4228, "step": 3330 }, { "epoch": 0.06839637130628878, "grad_norm": 0.6689921617507935, "learning_rate": 0.00026712, "loss": 5.4684, "step": 3340 }, { "epoch": 0.06860115086109803, "grad_norm": 0.49806907773017883, "learning_rate": 0.00026792, "loss": 5.4264, "step": 3350 }, { "epoch": 0.06880593041590728, "grad_norm": 0.524211049079895, "learning_rate": 0.00026872, "loss": 5.4247, "step": 3360 }, { "epoch": 0.06901070997071652, "grad_norm": 0.5470885038375854, "learning_rate": 0.00026952, "loss": 5.436, "step": 3370 }, { "epoch": 0.06921548952552577, "grad_norm": 0.4457398056983948, "learning_rate": 0.00027032, "loss": 5.4005, "step": 3380 }, { "epoch": 0.06942026908033502, "grad_norm": 0.4752626121044159, "learning_rate": 0.00027112, "loss": 5.4081, "step": 3390 }, { "epoch": 0.06962504863514427, "grad_norm": 0.4663235545158386, "learning_rate": 0.00027192, "loss": 5.3891, "step": 3400 }, { "epoch": 0.06982982818995352, "grad_norm": 0.4726623296737671, "learning_rate": 0.00027272, "loss": 5.4364, "step": 3410 }, { "epoch": 0.07003460774476276, "grad_norm": 0.4584830105304718, "learning_rate": 0.00027352, "loss": 5.3909, "step": 3420 }, { "epoch": 0.070239387299572, "grad_norm": 0.5055278539657593, "learning_rate": 0.00027432, "loss": 5.3835, "step": 3430 }, { "epoch": 0.07044416685438126, "grad_norm": 0.45341160893440247, "learning_rate": 0.00027512, "loss": 5.4318, "step": 3440 }, { "epoch": 0.0706489464091905, "grad_norm": 0.412478506565094, "learning_rate": 0.00027592, "loss": 5.3912, "step": 3450 }, { "epoch": 0.07085372596399975, "grad_norm": 0.7927995920181274, "learning_rate": 0.00027672, "loss": 5.4522, "step": 3460 }, { "epoch": 0.071058505518809, "grad_norm": 0.4442445635795593, "learning_rate": 0.00027752, "loss": 5.3683, "step": 3470 }, { "epoch": 0.07126328507361825, "grad_norm": 0.4521198570728302, "learning_rate": 0.00027832, "loss": 5.4093, "step": 3480 }, { "epoch": 0.0714680646284275, "grad_norm": 0.5341065526008606, "learning_rate": 0.00027912, "loss": 5.3989, "step": 3490 }, { "epoch": 0.07167284418323674, "grad_norm": 0.4413537383079529, "learning_rate": 0.00027992, "loss": 5.4228, "step": 3500 }, { "epoch": 0.07187762373804599, "grad_norm": 0.46327200531959534, "learning_rate": 0.00028072, "loss": 5.3654, "step": 3510 }, { "epoch": 0.07208240329285524, "grad_norm": 0.3957655131816864, "learning_rate": 0.00028152, "loss": 5.4269, "step": 3520 }, { "epoch": 0.07228718284766449, "grad_norm": 0.42554226517677307, "learning_rate": 0.00028232, "loss": 5.345, "step": 3530 }, { "epoch": 0.07249196240247374, "grad_norm": 0.506805419921875, "learning_rate": 0.00028312, "loss": 5.3619, "step": 3540 }, { "epoch": 0.07269674195728298, "grad_norm": 0.4607420265674591, "learning_rate": 0.00028392, "loss": 5.35, "step": 3550 }, { "epoch": 0.07290152151209223, "grad_norm": 0.47505366802215576, "learning_rate": 0.00028472, "loss": 5.399, "step": 3560 }, { "epoch": 0.07310630106690148, "grad_norm": 0.45166629552841187, "learning_rate": 0.00028552, "loss": 5.4003, "step": 3570 }, { "epoch": 0.07331108062171073, "grad_norm": 0.41713911294937134, "learning_rate": 0.00028632, "loss": 5.3659, "step": 3580 }, { "epoch": 0.07351586017651998, "grad_norm": 0.5334256291389465, "learning_rate": 0.00028712000000000003, "loss": 5.4177, "step": 3590 }, { "epoch": 0.07372063973132922, "grad_norm": 0.4191318154335022, "learning_rate": 0.00028792, "loss": 5.3886, "step": 3600 }, { "epoch": 0.07392541928613847, "grad_norm": 0.46192094683647156, "learning_rate": 0.00028872, "loss": 5.3861, "step": 3610 }, { "epoch": 0.07413019884094772, "grad_norm": 0.4636607766151428, "learning_rate": 0.00028952000000000003, "loss": 5.3859, "step": 3620 }, { "epoch": 0.07433497839575696, "grad_norm": 0.4274023175239563, "learning_rate": 0.00029032, "loss": 5.3749, "step": 3630 }, { "epoch": 0.07453975795056622, "grad_norm": 0.4159563481807709, "learning_rate": 0.00029112, "loss": 5.3433, "step": 3640 }, { "epoch": 0.07474453750537546, "grad_norm": 0.4604257047176361, "learning_rate": 0.00029192000000000004, "loss": 5.3618, "step": 3650 }, { "epoch": 0.0749493170601847, "grad_norm": 0.44258370995521545, "learning_rate": 0.00029272, "loss": 5.3936, "step": 3660 }, { "epoch": 0.07515409661499396, "grad_norm": 0.5245718359947205, "learning_rate": 0.00029352, "loss": 5.3848, "step": 3670 }, { "epoch": 0.0753588761698032, "grad_norm": 0.5492002964019775, "learning_rate": 0.00029432000000000004, "loss": 5.3355, "step": 3680 }, { "epoch": 0.07556365572461246, "grad_norm": 0.41918817162513733, "learning_rate": 0.00029512, "loss": 5.3354, "step": 3690 }, { "epoch": 0.0757684352794217, "grad_norm": 0.4255083501338959, "learning_rate": 0.00029592, "loss": 5.3575, "step": 3700 }, { "epoch": 0.07597321483423095, "grad_norm": 0.4720117449760437, "learning_rate": 0.00029672000000000005, "loss": 5.3405, "step": 3710 }, { "epoch": 0.0761779943890402, "grad_norm": 0.47471868991851807, "learning_rate": 0.00029752, "loss": 5.3518, "step": 3720 }, { "epoch": 0.07638277394384944, "grad_norm": 0.4269292652606964, "learning_rate": 0.00029832000000000003, "loss": 5.3825, "step": 3730 }, { "epoch": 0.0765875534986587, "grad_norm": 2.8523495197296143, "learning_rate": 0.00029912, "loss": 5.3626, "step": 3740 }, { "epoch": 0.07679233305346794, "grad_norm": 0.4189223647117615, "learning_rate": 0.00029992, "loss": 5.3676, "step": 3750 }, { "epoch": 0.07699711260827719, "grad_norm": 1.7700881958007812, "learning_rate": 0.00030072000000000003, "loss": 5.3279, "step": 3760 }, { "epoch": 0.07720189216308644, "grad_norm": 0.39521193504333496, "learning_rate": 0.00030152, "loss": 5.3587, "step": 3770 }, { "epoch": 0.07740667171789568, "grad_norm": 0.4292346239089966, "learning_rate": 0.00030232, "loss": 5.3058, "step": 3780 }, { "epoch": 0.07761145127270494, "grad_norm": 0.42175960540771484, "learning_rate": 0.00030312000000000004, "loss": 5.3168, "step": 3790 }, { "epoch": 0.07781623082751418, "grad_norm": 0.3920210599899292, "learning_rate": 0.00030392, "loss": 5.3741, "step": 3800 }, { "epoch": 0.07802101038232342, "grad_norm": 0.43849965929985046, "learning_rate": 0.00030472, "loss": 5.3406, "step": 3810 }, { "epoch": 0.07822578993713268, "grad_norm": 0.42803141474723816, "learning_rate": 0.00030552000000000004, "loss": 5.3347, "step": 3820 }, { "epoch": 0.07843056949194192, "grad_norm": 0.42235347628593445, "learning_rate": 0.00030632, "loss": 5.3508, "step": 3830 }, { "epoch": 0.07863534904675117, "grad_norm": 0.4283842146396637, "learning_rate": 0.00030712000000000003, "loss": 5.3453, "step": 3840 }, { "epoch": 0.07884012860156042, "grad_norm": 0.44694867730140686, "learning_rate": 0.00030792000000000005, "loss": 5.3724, "step": 3850 }, { "epoch": 0.07904490815636966, "grad_norm": 0.5036173462867737, "learning_rate": 0.00030872, "loss": 5.3517, "step": 3860 }, { "epoch": 0.07924968771117892, "grad_norm": 0.5203410387039185, "learning_rate": 0.00030952000000000003, "loss": 5.3277, "step": 3870 }, { "epoch": 0.07945446726598816, "grad_norm": 0.4823491871356964, "learning_rate": 0.00031032000000000005, "loss": 5.3418, "step": 3880 }, { "epoch": 0.0796592468207974, "grad_norm": 0.39805030822753906, "learning_rate": 0.00031112, "loss": 5.3395, "step": 3890 }, { "epoch": 0.07986402637560666, "grad_norm": 0.3759382367134094, "learning_rate": 0.00031192000000000004, "loss": 5.2811, "step": 3900 }, { "epoch": 0.0800688059304159, "grad_norm": 0.3856908977031708, "learning_rate": 0.00031272000000000006, "loss": 5.3364, "step": 3910 }, { "epoch": 0.08027358548522516, "grad_norm": 0.5101864337921143, "learning_rate": 0.00031352, "loss": 5.3166, "step": 3920 }, { "epoch": 0.0804783650400344, "grad_norm": 0.4429487884044647, "learning_rate": 0.00031432000000000004, "loss": 5.3429, "step": 3930 }, { "epoch": 0.08068314459484364, "grad_norm": 0.36321690678596497, "learning_rate": 0.00031512, "loss": 5.3095, "step": 3940 }, { "epoch": 0.0808879241496529, "grad_norm": 0.4004923701286316, "learning_rate": 0.00031591999999999997, "loss": 5.3063, "step": 3950 }, { "epoch": 0.08109270370446214, "grad_norm": 0.4149259030818939, "learning_rate": 0.00031672, "loss": 5.2982, "step": 3960 }, { "epoch": 0.0812974832592714, "grad_norm": 0.635617733001709, "learning_rate": 0.00031752, "loss": 5.3013, "step": 3970 }, { "epoch": 0.08150226281408064, "grad_norm": 0.3996580243110657, "learning_rate": 0.00031832, "loss": 5.3785, "step": 3980 }, { "epoch": 0.08170704236888988, "grad_norm": 0.37409737706184387, "learning_rate": 0.00031912, "loss": 5.3497, "step": 3990 }, { "epoch": 0.08191182192369914, "grad_norm": 0.3825286030769348, "learning_rate": 0.00031992, "loss": 5.3066, "step": 4000 }, { "epoch": 0.08191182192369914, "eval_loss": 5.318787097930908, "eval_runtime": 4.3875, "eval_samples_per_second": 265.753, "eval_steps_per_second": 33.276, "step": 4000 }, { "epoch": 0.08211660147850838, "grad_norm": 0.3755367398262024, "learning_rate": 0.00032072, "loss": 5.2894, "step": 4010 }, { "epoch": 0.08232138103331764, "grad_norm": 0.40809503197669983, "learning_rate": 0.00032152, "loss": 5.2955, "step": 4020 }, { "epoch": 0.08252616058812688, "grad_norm": 0.41797196865081787, "learning_rate": 0.00032232, "loss": 5.3023, "step": 4030 }, { "epoch": 0.08273094014293612, "grad_norm": 0.3694484531879425, "learning_rate": 0.00032312, "loss": 5.3087, "step": 4040 }, { "epoch": 0.08293571969774538, "grad_norm": 0.38349223136901855, "learning_rate": 0.00032392, "loss": 5.334, "step": 4050 }, { "epoch": 0.08314049925255462, "grad_norm": 0.44905778765678406, "learning_rate": 0.00032472, "loss": 5.3084, "step": 4060 }, { "epoch": 0.08334527880736388, "grad_norm": 0.37099722027778625, "learning_rate": 0.00032552, "loss": 5.2817, "step": 4070 }, { "epoch": 0.08355005836217312, "grad_norm": 0.41185104846954346, "learning_rate": 0.00032632, "loss": 5.2967, "step": 4080 }, { "epoch": 0.08375483791698236, "grad_norm": 0.3768998980522156, "learning_rate": 0.00032712, "loss": 5.3252, "step": 4090 }, { "epoch": 0.08395961747179162, "grad_norm": 0.49293774366378784, "learning_rate": 0.00032792, "loss": 5.2841, "step": 4100 }, { "epoch": 0.08416439702660086, "grad_norm": 0.3644670248031616, "learning_rate": 0.00032872, "loss": 5.2919, "step": 4110 }, { "epoch": 0.08436917658141012, "grad_norm": 0.4202214181423187, "learning_rate": 0.00032952000000000003, "loss": 5.3057, "step": 4120 }, { "epoch": 0.08457395613621936, "grad_norm": 0.39260733127593994, "learning_rate": 0.00033032, "loss": 5.3047, "step": 4130 }, { "epoch": 0.0847787356910286, "grad_norm": 0.3847532570362091, "learning_rate": 0.00033112, "loss": 5.2819, "step": 4140 }, { "epoch": 0.08498351524583786, "grad_norm": 0.41340094804763794, "learning_rate": 0.00033192000000000003, "loss": 5.2916, "step": 4150 }, { "epoch": 0.0851882948006471, "grad_norm": 0.3760600984096527, "learning_rate": 0.00033272, "loss": 5.2889, "step": 4160 }, { "epoch": 0.08539307435545636, "grad_norm": 0.3887149393558502, "learning_rate": 0.00033352, "loss": 5.2597, "step": 4170 }, { "epoch": 0.0855978539102656, "grad_norm": 0.4354073107242584, "learning_rate": 0.00033432, "loss": 5.2719, "step": 4180 }, { "epoch": 0.08580263346507484, "grad_norm": 0.4749308228492737, "learning_rate": 0.00033512, "loss": 5.2916, "step": 4190 }, { "epoch": 0.0860074130198841, "grad_norm": 0.41358011960983276, "learning_rate": 0.00033592, "loss": 5.2996, "step": 4200 }, { "epoch": 0.08621219257469334, "grad_norm": 0.3583889901638031, "learning_rate": 0.00033672, "loss": 5.2626, "step": 4210 }, { "epoch": 0.08641697212950258, "grad_norm": 0.4384559094905853, "learning_rate": 0.00033752, "loss": 5.2314, "step": 4220 }, { "epoch": 0.08662175168431184, "grad_norm": 0.37336528301239014, "learning_rate": 0.00033832, "loss": 5.2923, "step": 4230 }, { "epoch": 0.08682653123912108, "grad_norm": 0.39408349990844727, "learning_rate": 0.00033912, "loss": 5.259, "step": 4240 }, { "epoch": 0.08703131079393034, "grad_norm": 0.32332339882850647, "learning_rate": 0.00033992, "loss": 5.2699, "step": 4250 }, { "epoch": 0.08723609034873958, "grad_norm": 0.46973833441734314, "learning_rate": 0.00034072000000000003, "loss": 5.3307, "step": 4260 }, { "epoch": 0.08744086990354882, "grad_norm": 0.3726697564125061, "learning_rate": 0.00034152, "loss": 5.2893, "step": 4270 }, { "epoch": 0.08764564945835808, "grad_norm": 0.3420617878437042, "learning_rate": 0.00034232, "loss": 5.3053, "step": 4280 }, { "epoch": 0.08785042901316732, "grad_norm": 0.3470669984817505, "learning_rate": 0.00034312000000000004, "loss": 5.2869, "step": 4290 }, { "epoch": 0.08805520856797658, "grad_norm": 0.3572559952735901, "learning_rate": 0.00034392, "loss": 5.2383, "step": 4300 }, { "epoch": 0.08825998812278582, "grad_norm": 0.3921760320663452, "learning_rate": 0.00034472, "loss": 5.2638, "step": 4310 }, { "epoch": 0.08846476767759506, "grad_norm": 0.37090209126472473, "learning_rate": 0.00034552000000000004, "loss": 5.2768, "step": 4320 }, { "epoch": 0.08866954723240432, "grad_norm": 0.34295281767845154, "learning_rate": 0.00034632, "loss": 5.2567, "step": 4330 }, { "epoch": 0.08887432678721356, "grad_norm": 0.379252552986145, "learning_rate": 0.00034712, "loss": 5.2565, "step": 4340 }, { "epoch": 0.08907910634202282, "grad_norm": 0.35600078105926514, "learning_rate": 0.00034792000000000004, "loss": 5.2404, "step": 4350 }, { "epoch": 0.08928388589683206, "grad_norm": 0.37798258662223816, "learning_rate": 0.00034872, "loss": 5.2591, "step": 4360 }, { "epoch": 0.0894886654516413, "grad_norm": 0.4064588248729706, "learning_rate": 0.00034952000000000003, "loss": 5.2677, "step": 4370 }, { "epoch": 0.08969344500645056, "grad_norm": 0.4060820937156677, "learning_rate": 0.00035032000000000005, "loss": 5.2776, "step": 4380 }, { "epoch": 0.0898982245612598, "grad_norm": 0.41651928424835205, "learning_rate": 0.00035112, "loss": 5.2984, "step": 4390 }, { "epoch": 0.09010300411606906, "grad_norm": 0.3946080505847931, "learning_rate": 0.00035192000000000003, "loss": 5.2596, "step": 4400 }, { "epoch": 0.0903077836708783, "grad_norm": 0.3811205327510834, "learning_rate": 0.00035272000000000005, "loss": 5.2489, "step": 4410 }, { "epoch": 0.09051256322568754, "grad_norm": 0.36853381991386414, "learning_rate": 0.00035352, "loss": 5.2474, "step": 4420 }, { "epoch": 0.0907173427804968, "grad_norm": 0.41594240069389343, "learning_rate": 0.00035432000000000004, "loss": 5.2584, "step": 4430 }, { "epoch": 0.09092212233530604, "grad_norm": 0.35103175044059753, "learning_rate": 0.00035512000000000006, "loss": 5.2674, "step": 4440 }, { "epoch": 0.0911269018901153, "grad_norm": 0.3328525424003601, "learning_rate": 0.00035592, "loss": 5.2757, "step": 4450 }, { "epoch": 0.09133168144492454, "grad_norm": 0.36880525946617126, "learning_rate": 0.00035672000000000004, "loss": 5.2261, "step": 4460 }, { "epoch": 0.09153646099973378, "grad_norm": 0.35445812344551086, "learning_rate": 0.00035752, "loss": 5.2195, "step": 4470 }, { "epoch": 0.09174124055454304, "grad_norm": 0.36946800351142883, "learning_rate": 0.00035832, "loss": 5.2225, "step": 4480 }, { "epoch": 0.09194602010935228, "grad_norm": 0.375847727060318, "learning_rate": 0.00035912000000000004, "loss": 5.2446, "step": 4490 }, { "epoch": 0.09215079966416154, "grad_norm": 0.3652852773666382, "learning_rate": 0.00035992, "loss": 5.2209, "step": 4500 }, { "epoch": 0.09235557921897078, "grad_norm": 0.36818650364875793, "learning_rate": 0.00036072000000000003, "loss": 5.2382, "step": 4510 }, { "epoch": 0.09256035877378002, "grad_norm": 0.3429717421531677, "learning_rate": 0.00036152000000000005, "loss": 5.2163, "step": 4520 }, { "epoch": 0.09276513832858928, "grad_norm": 0.3466830551624298, "learning_rate": 0.00036232, "loss": 5.2521, "step": 4530 }, { "epoch": 0.09296991788339852, "grad_norm": 0.40972962975502014, "learning_rate": 0.00036312000000000003, "loss": 5.2236, "step": 4540 }, { "epoch": 0.09317469743820776, "grad_norm": 0.39865970611572266, "learning_rate": 0.00036392000000000005, "loss": 5.2733, "step": 4550 }, { "epoch": 0.09337947699301702, "grad_norm": 0.3470962643623352, "learning_rate": 0.00036472, "loss": 5.2474, "step": 4560 }, { "epoch": 0.09358425654782626, "grad_norm": 0.36036545038223267, "learning_rate": 0.00036552, "loss": 5.2573, "step": 4570 }, { "epoch": 0.09378903610263552, "grad_norm": 0.41582751274108887, "learning_rate": 0.00036632, "loss": 5.2342, "step": 4580 }, { "epoch": 0.09399381565744476, "grad_norm": 0.3146849274635315, "learning_rate": 0.00036712, "loss": 5.2397, "step": 4590 }, { "epoch": 0.094198595212254, "grad_norm": 0.3361073434352875, "learning_rate": 0.00036792, "loss": 5.2354, "step": 4600 }, { "epoch": 0.09440337476706326, "grad_norm": 0.36320430040359497, "learning_rate": 0.00036872, "loss": 5.2333, "step": 4610 }, { "epoch": 0.0946081543218725, "grad_norm": 0.3914796710014343, "learning_rate": 0.00036952, "loss": 5.2271, "step": 4620 }, { "epoch": 0.09481293387668176, "grad_norm": 0.38062742352485657, "learning_rate": 0.00037032, "loss": 5.2512, "step": 4630 }, { "epoch": 0.095017713431491, "grad_norm": 0.35448241233825684, "learning_rate": 0.00037112, "loss": 5.2494, "step": 4640 }, { "epoch": 0.09522249298630024, "grad_norm": 0.3220553994178772, "learning_rate": 0.00037192, "loss": 5.2291, "step": 4650 }, { "epoch": 0.0954272725411095, "grad_norm": 0.5660349726676941, "learning_rate": 0.00037272, "loss": 5.2063, "step": 4660 }, { "epoch": 0.09563205209591874, "grad_norm": 0.3554455041885376, "learning_rate": 0.00037352, "loss": 5.21, "step": 4670 }, { "epoch": 0.095836831650728, "grad_norm": 0.33699843287467957, "learning_rate": 0.00037432, "loss": 5.2057, "step": 4680 }, { "epoch": 0.09604161120553724, "grad_norm": 0.4169362485408783, "learning_rate": 0.00037512, "loss": 5.1735, "step": 4690 }, { "epoch": 0.09624639076034648, "grad_norm": 0.34874531626701355, "learning_rate": 0.00037592, "loss": 5.2358, "step": 4700 }, { "epoch": 0.09645117031515574, "grad_norm": 0.3854043483734131, "learning_rate": 0.00037672, "loss": 5.2211, "step": 4710 }, { "epoch": 0.09665594986996498, "grad_norm": 0.32648953795433044, "learning_rate": 0.00037752, "loss": 5.1953, "step": 4720 }, { "epoch": 0.09686072942477424, "grad_norm": 0.3376765847206116, "learning_rate": 0.00037832, "loss": 5.2278, "step": 4730 }, { "epoch": 0.09706550897958348, "grad_norm": 0.33131176233291626, "learning_rate": 0.00037912, "loss": 5.2227, "step": 4740 }, { "epoch": 0.09727028853439272, "grad_norm": 0.33954697847366333, "learning_rate": 0.00037992, "loss": 5.2328, "step": 4750 }, { "epoch": 0.09747506808920198, "grad_norm": 0.3093273937702179, "learning_rate": 0.00038072000000000003, "loss": 5.2087, "step": 4760 }, { "epoch": 0.09767984764401122, "grad_norm": 0.3387199640274048, "learning_rate": 0.00038152, "loss": 5.2066, "step": 4770 }, { "epoch": 0.09788462719882048, "grad_norm": 0.3928283452987671, "learning_rate": 0.00038232, "loss": 5.2122, "step": 4780 }, { "epoch": 0.09808940675362972, "grad_norm": 0.31002166867256165, "learning_rate": 0.00038312000000000003, "loss": 5.197, "step": 4790 }, { "epoch": 0.09829418630843896, "grad_norm": 0.3293517827987671, "learning_rate": 0.00038392, "loss": 5.1674, "step": 4800 }, { "epoch": 0.09849896586324822, "grad_norm": 0.3640994131565094, "learning_rate": 0.00038472, "loss": 5.2214, "step": 4810 }, { "epoch": 0.09870374541805746, "grad_norm": 0.33961260318756104, "learning_rate": 0.00038552000000000004, "loss": 5.2219, "step": 4820 }, { "epoch": 0.09890852497286672, "grad_norm": 0.4387868046760559, "learning_rate": 0.00038632, "loss": 5.2487, "step": 4830 }, { "epoch": 0.09911330452767596, "grad_norm": 0.3797737956047058, "learning_rate": 0.00038712, "loss": 5.2065, "step": 4840 }, { "epoch": 0.0993180840824852, "grad_norm": 0.39219486713409424, "learning_rate": 0.00038792000000000004, "loss": 5.2036, "step": 4850 }, { "epoch": 0.09952286363729446, "grad_norm": 0.3058601915836334, "learning_rate": 0.00038872, "loss": 5.2291, "step": 4860 }, { "epoch": 0.0997276431921037, "grad_norm": 0.35596054792404175, "learning_rate": 0.00038952, "loss": 5.1957, "step": 4870 }, { "epoch": 0.09993242274691296, "grad_norm": 0.3059404790401459, "learning_rate": 0.00039032000000000004, "loss": 5.1907, "step": 4880 }, { "epoch": 0.1001372023017222, "grad_norm": 0.3631693422794342, "learning_rate": 0.00039112, "loss": 5.1968, "step": 4890 }, { "epoch": 0.10034198185653144, "grad_norm": 0.34312257170677185, "learning_rate": 0.00039192000000000003, "loss": 5.2482, "step": 4900 }, { "epoch": 0.1005467614113407, "grad_norm": 0.3389482796192169, "learning_rate": 0.00039272000000000005, "loss": 5.1609, "step": 4910 }, { "epoch": 0.10075154096614994, "grad_norm": 0.3607906997203827, "learning_rate": 0.00039352, "loss": 5.2572, "step": 4920 }, { "epoch": 0.10095632052095918, "grad_norm": 0.36973777413368225, "learning_rate": 0.00039432000000000003, "loss": 5.2087, "step": 4930 }, { "epoch": 0.10116110007576844, "grad_norm": 0.35736119747161865, "learning_rate": 0.00039512, "loss": 5.2319, "step": 4940 }, { "epoch": 0.10136587963057768, "grad_norm": 0.33283448219299316, "learning_rate": 0.00039592, "loss": 5.1959, "step": 4950 }, { "epoch": 0.10157065918538694, "grad_norm": 0.329438716173172, "learning_rate": 0.00039672000000000004, "loss": 5.2262, "step": 4960 }, { "epoch": 0.10177543874019618, "grad_norm": 0.37530723214149475, "learning_rate": 0.00039752, "loss": 5.2094, "step": 4970 }, { "epoch": 0.10198021829500542, "grad_norm": 0.3056955337524414, "learning_rate": 0.00039832, "loss": 5.2204, "step": 4980 }, { "epoch": 0.10218499784981468, "grad_norm": 0.3704677224159241, "learning_rate": 0.00039912000000000004, "loss": 5.1631, "step": 4990 }, { "epoch": 0.10238977740462392, "grad_norm": 0.31214451789855957, "learning_rate": 0.00039992, "loss": 5.2125, "step": 5000 }, { "epoch": 0.10259455695943318, "grad_norm": 0.313644140958786, "learning_rate": 0.0004, "loss": 5.1836, "step": 5010 }, { "epoch": 0.10279933651424242, "grad_norm": 0.33864766359329224, "learning_rate": 0.0004, "loss": 5.1933, "step": 5020 }, { "epoch": 0.10300411606905166, "grad_norm": 0.3204938769340515, "learning_rate": 0.0004, "loss": 5.2015, "step": 5030 }, { "epoch": 0.10320889562386092, "grad_norm": 0.30994290113449097, "learning_rate": 0.0004, "loss": 5.184, "step": 5040 }, { "epoch": 0.10341367517867016, "grad_norm": 0.33178046345710754, "learning_rate": 0.0004, "loss": 5.1727, "step": 5050 }, { "epoch": 0.10361845473347941, "grad_norm": 0.32041501998901367, "learning_rate": 0.0004, "loss": 5.1822, "step": 5060 }, { "epoch": 0.10382323428828866, "grad_norm": 0.3314732313156128, "learning_rate": 0.0004, "loss": 5.1814, "step": 5070 }, { "epoch": 0.1040280138430979, "grad_norm": 0.39618968963623047, "learning_rate": 0.0004, "loss": 5.2003, "step": 5080 }, { "epoch": 0.10423279339790716, "grad_norm": 0.3051014542579651, "learning_rate": 0.0004, "loss": 5.1695, "step": 5090 }, { "epoch": 0.1044375729527164, "grad_norm": 0.34999150037765503, "learning_rate": 0.0004, "loss": 5.1811, "step": 5100 }, { "epoch": 0.10464235250752565, "grad_norm": 0.309879869222641, "learning_rate": 0.0004, "loss": 5.1595, "step": 5110 }, { "epoch": 0.1048471320623349, "grad_norm": 0.3067319989204407, "learning_rate": 0.0004, "loss": 5.2001, "step": 5120 }, { "epoch": 0.10505191161714414, "grad_norm": 0.32993996143341064, "learning_rate": 0.0004, "loss": 5.1486, "step": 5130 }, { "epoch": 0.1052566911719534, "grad_norm": 0.34084463119506836, "learning_rate": 0.0004, "loss": 5.1787, "step": 5140 }, { "epoch": 0.10546147072676264, "grad_norm": 0.3445000946521759, "learning_rate": 0.0004, "loss": 5.1714, "step": 5150 }, { "epoch": 0.1056662502815719, "grad_norm": 0.3222779631614685, "learning_rate": 0.0004, "loss": 5.2008, "step": 5160 }, { "epoch": 0.10587102983638114, "grad_norm": 0.3041796088218689, "learning_rate": 0.0004, "loss": 5.1863, "step": 5170 }, { "epoch": 0.10607580939119038, "grad_norm": 0.3209103047847748, "learning_rate": 0.0004, "loss": 5.1932, "step": 5180 }, { "epoch": 0.10628058894599964, "grad_norm": 0.3323422372341156, "learning_rate": 0.0004, "loss": 5.1952, "step": 5190 }, { "epoch": 0.10648536850080888, "grad_norm": 0.325749933719635, "learning_rate": 0.0004, "loss": 5.1658, "step": 5200 }, { "epoch": 0.10669014805561813, "grad_norm": 0.30505135655403137, "learning_rate": 0.0004, "loss": 5.1855, "step": 5210 }, { "epoch": 0.10689492761042738, "grad_norm": 0.306336909532547, "learning_rate": 0.0004, "loss": 5.1555, "step": 5220 }, { "epoch": 0.10709970716523662, "grad_norm": 0.3228214979171753, "learning_rate": 0.0004, "loss": 5.1525, "step": 5230 }, { "epoch": 0.10730448672004587, "grad_norm": 0.38324278593063354, "learning_rate": 0.0004, "loss": 5.166, "step": 5240 }, { "epoch": 0.10750926627485512, "grad_norm": 0.30399709939956665, "learning_rate": 0.0004, "loss": 5.1625, "step": 5250 }, { "epoch": 0.10771404582966437, "grad_norm": 0.2954542636871338, "learning_rate": 0.0004, "loss": 5.1716, "step": 5260 }, { "epoch": 0.10791882538447362, "grad_norm": 0.3116077780723572, "learning_rate": 0.0004, "loss": 5.1651, "step": 5270 }, { "epoch": 0.10812360493928286, "grad_norm": 0.30015772581100464, "learning_rate": 0.0004, "loss": 5.1664, "step": 5280 }, { "epoch": 0.10832838449409211, "grad_norm": 0.3169012665748596, "learning_rate": 0.0004, "loss": 5.1646, "step": 5290 }, { "epoch": 0.10853316404890136, "grad_norm": 0.3500293493270874, "learning_rate": 0.0004, "loss": 5.1535, "step": 5300 }, { "epoch": 0.1087379436037106, "grad_norm": 0.3506503701210022, "learning_rate": 0.0004, "loss": 5.1542, "step": 5310 }, { "epoch": 0.10894272315851986, "grad_norm": 0.32760336995124817, "learning_rate": 0.0004, "loss": 5.1302, "step": 5320 }, { "epoch": 0.1091475027133291, "grad_norm": 0.2881554663181305, "learning_rate": 0.0004, "loss": 5.1459, "step": 5330 }, { "epoch": 0.10935228226813835, "grad_norm": 0.30306053161621094, "learning_rate": 0.0004, "loss": 5.1472, "step": 5340 }, { "epoch": 0.1095570618229476, "grad_norm": 0.31296682357788086, "learning_rate": 0.0004, "loss": 5.1584, "step": 5350 }, { "epoch": 0.10976184137775684, "grad_norm": 0.29192468523979187, "learning_rate": 0.0004, "loss": 5.1401, "step": 5360 }, { "epoch": 0.1099666209325661, "grad_norm": 0.28675010800361633, "learning_rate": 0.0004, "loss": 5.1557, "step": 5370 }, { "epoch": 0.11017140048737534, "grad_norm": 0.27933311462402344, "learning_rate": 0.0004, "loss": 5.1469, "step": 5380 }, { "epoch": 0.1103761800421846, "grad_norm": 0.29722484946250916, "learning_rate": 0.0004, "loss": 5.1144, "step": 5390 }, { "epoch": 0.11058095959699384, "grad_norm": 0.2667056918144226, "learning_rate": 0.0004, "loss": 5.1626, "step": 5400 }, { "epoch": 0.11078573915180308, "grad_norm": 0.3032090961933136, "learning_rate": 0.0004, "loss": 5.1395, "step": 5410 }, { "epoch": 0.11099051870661233, "grad_norm": 0.309581995010376, "learning_rate": 0.0004, "loss": 5.1377, "step": 5420 }, { "epoch": 0.11119529826142158, "grad_norm": 0.32929155230522156, "learning_rate": 0.0004, "loss": 5.1539, "step": 5430 }, { "epoch": 0.11140007781623083, "grad_norm": 0.2987646758556366, "learning_rate": 0.0004, "loss": 5.1414, "step": 5440 }, { "epoch": 0.11160485737104008, "grad_norm": 0.30965328216552734, "learning_rate": 0.0004, "loss": 5.1647, "step": 5450 }, { "epoch": 0.11180963692584932, "grad_norm": 0.28466081619262695, "learning_rate": 0.0004, "loss": 5.1423, "step": 5460 }, { "epoch": 0.11201441648065857, "grad_norm": 0.30453965067863464, "learning_rate": 0.0004, "loss": 5.1451, "step": 5470 }, { "epoch": 0.11221919603546782, "grad_norm": 0.2841033339500427, "learning_rate": 0.0004, "loss": 5.1354, "step": 5480 }, { "epoch": 0.11242397559027707, "grad_norm": 0.2860296070575714, "learning_rate": 0.0004, "loss": 5.1409, "step": 5490 }, { "epoch": 0.11262875514508632, "grad_norm": 0.30634403228759766, "learning_rate": 0.0004, "loss": 5.1375, "step": 5500 }, { "epoch": 0.11283353469989556, "grad_norm": 0.32455432415008545, "learning_rate": 0.0004, "loss": 5.1184, "step": 5510 }, { "epoch": 0.11303831425470481, "grad_norm": 0.2667500078678131, "learning_rate": 0.0004, "loss": 5.1514, "step": 5520 }, { "epoch": 0.11324309380951406, "grad_norm": 0.3181164264678955, "learning_rate": 0.0004, "loss": 5.1479, "step": 5530 }, { "epoch": 0.11344787336432331, "grad_norm": 0.2994334399700165, "learning_rate": 0.0004, "loss": 5.1251, "step": 5540 }, { "epoch": 0.11365265291913255, "grad_norm": 0.3418193459510803, "learning_rate": 0.0004, "loss": 5.1177, "step": 5550 }, { "epoch": 0.1138574324739418, "grad_norm": 0.31572362780570984, "learning_rate": 0.0004, "loss": 5.1733, "step": 5560 }, { "epoch": 0.11406221202875105, "grad_norm": 0.2817554771900177, "learning_rate": 0.0004, "loss": 5.1609, "step": 5570 }, { "epoch": 0.1142669915835603, "grad_norm": 0.2846536338329315, "learning_rate": 0.0004, "loss": 5.1243, "step": 5580 }, { "epoch": 0.11447177113836955, "grad_norm": 0.28663066029548645, "learning_rate": 0.0004, "loss": 5.1329, "step": 5590 }, { "epoch": 0.1146765506931788, "grad_norm": 0.31789955496788025, "learning_rate": 0.0004, "loss": 5.1209, "step": 5600 }, { "epoch": 0.11488133024798804, "grad_norm": 0.29089292883872986, "learning_rate": 0.0004, "loss": 5.1083, "step": 5610 }, { "epoch": 0.11508610980279729, "grad_norm": 0.28004202246665955, "learning_rate": 0.0004, "loss": 5.1544, "step": 5620 }, { "epoch": 0.11529088935760654, "grad_norm": 0.32260385155677795, "learning_rate": 0.0004, "loss": 5.1685, "step": 5630 }, { "epoch": 0.11549566891241578, "grad_norm": 0.30765828490257263, "learning_rate": 0.0004, "loss": 5.1183, "step": 5640 }, { "epoch": 0.11570044846722503, "grad_norm": 0.2876403331756592, "learning_rate": 0.0004, "loss": 5.1507, "step": 5650 }, { "epoch": 0.11590522802203428, "grad_norm": 0.2831974923610687, "learning_rate": 0.0004, "loss": 5.1111, "step": 5660 }, { "epoch": 0.11611000757684353, "grad_norm": 0.3007749617099762, "learning_rate": 0.0004, "loss": 5.1042, "step": 5670 }, { "epoch": 0.11631478713165277, "grad_norm": 0.29860612750053406, "learning_rate": 0.0004, "loss": 5.1098, "step": 5680 }, { "epoch": 0.11651956668646202, "grad_norm": 0.2871202528476715, "learning_rate": 0.0004, "loss": 5.119, "step": 5690 }, { "epoch": 0.11672434624127127, "grad_norm": 0.3583846390247345, "learning_rate": 0.0004, "loss": 5.0853, "step": 5700 }, { "epoch": 0.11692912579608052, "grad_norm": 0.3196072578430176, "learning_rate": 0.0004, "loss": 5.1183, "step": 5710 }, { "epoch": 0.11713390535088977, "grad_norm": 0.3659673035144806, "learning_rate": 0.0004, "loss": 5.1341, "step": 5720 }, { "epoch": 0.11733868490569901, "grad_norm": 0.34671393036842346, "learning_rate": 0.0004, "loss": 5.1032, "step": 5730 }, { "epoch": 0.11754346446050826, "grad_norm": 0.6136424541473389, "learning_rate": 0.0004, "loss": 5.1141, "step": 5740 }, { "epoch": 0.11774824401531751, "grad_norm": 0.29831385612487793, "learning_rate": 0.0004, "loss": 5.1164, "step": 5750 }, { "epoch": 0.11795302357012676, "grad_norm": 0.2739589214324951, "learning_rate": 0.0004, "loss": 5.1285, "step": 5760 }, { "epoch": 0.11815780312493601, "grad_norm": 0.3025692105293274, "learning_rate": 0.0004, "loss": 5.0998, "step": 5770 }, { "epoch": 0.11836258267974525, "grad_norm": 0.30093103647232056, "learning_rate": 0.0004, "loss": 5.1416, "step": 5780 }, { "epoch": 0.1185673622345545, "grad_norm": 0.28240787982940674, "learning_rate": 0.0004, "loss": 5.0816, "step": 5790 }, { "epoch": 0.11877214178936375, "grad_norm": 0.3148246705532074, "learning_rate": 0.0004, "loss": 5.0887, "step": 5800 }, { "epoch": 0.118976921344173, "grad_norm": 0.2791157364845276, "learning_rate": 0.0004, "loss": 5.1228, "step": 5810 }, { "epoch": 0.11918170089898225, "grad_norm": 0.31769630312919617, "learning_rate": 0.0004, "loss": 5.1121, "step": 5820 }, { "epoch": 0.1193864804537915, "grad_norm": 3.882388114929199, "learning_rate": 0.0004, "loss": 5.1327, "step": 5830 }, { "epoch": 0.11959126000860074, "grad_norm": 0.27645838260650635, "learning_rate": 0.0004, "loss": 5.1025, "step": 5840 }, { "epoch": 0.11979603956340999, "grad_norm": 0.2904749810695648, "learning_rate": 0.0004, "loss": 5.1297, "step": 5850 }, { "epoch": 0.12000081911821923, "grad_norm": 0.32768234610557556, "learning_rate": 0.0004, "loss": 5.0841, "step": 5860 }, { "epoch": 0.12020559867302849, "grad_norm": 0.3125588595867157, "learning_rate": 0.0004, "loss": 5.1031, "step": 5870 }, { "epoch": 0.12041037822783773, "grad_norm": 0.29457178711891174, "learning_rate": 0.0004, "loss": 5.1234, "step": 5880 }, { "epoch": 0.12061515778264698, "grad_norm": 0.33318033814430237, "learning_rate": 0.0004, "loss": 5.0914, "step": 5890 }, { "epoch": 0.12081993733745623, "grad_norm": 0.32637280225753784, "learning_rate": 0.0004, "loss": 5.0767, "step": 5900 }, { "epoch": 0.12102471689226547, "grad_norm": 0.3051298260688782, "learning_rate": 0.0004, "loss": 5.0964, "step": 5910 }, { "epoch": 0.12122949644707473, "grad_norm": 0.31057214736938477, "learning_rate": 0.0004, "loss": 5.0921, "step": 5920 }, { "epoch": 0.12143427600188397, "grad_norm": 0.31014057993888855, "learning_rate": 0.0004, "loss": 5.0711, "step": 5930 }, { "epoch": 0.12163905555669322, "grad_norm": 0.28628087043762207, "learning_rate": 0.0004, "loss": 5.0847, "step": 5940 }, { "epoch": 0.12184383511150247, "grad_norm": 0.280165433883667, "learning_rate": 0.0004, "loss": 5.1159, "step": 5950 }, { "epoch": 0.12204861466631171, "grad_norm": 0.32177525758743286, "learning_rate": 0.0004, "loss": 5.0868, "step": 5960 }, { "epoch": 0.12225339422112097, "grad_norm": 0.2712959349155426, "learning_rate": 0.0004, "loss": 5.106, "step": 5970 }, { "epoch": 0.12245817377593021, "grad_norm": 0.3010253310203552, "learning_rate": 0.0004, "loss": 5.1076, "step": 5980 }, { "epoch": 0.12266295333073945, "grad_norm": 0.28395840525627136, "learning_rate": 0.0004, "loss": 5.0702, "step": 5990 }, { "epoch": 0.12286773288554871, "grad_norm": 0.3164820373058319, "learning_rate": 0.0004, "loss": 5.0433, "step": 6000 }, { "epoch": 0.12286773288554871, "eval_loss": 5.105718612670898, "eval_runtime": 4.4155, "eval_samples_per_second": 264.072, "eval_steps_per_second": 33.066, "step": 6000 }, { "epoch": 0.12307251244035795, "grad_norm": 0.2929603159427643, "learning_rate": 0.0004, "loss": 5.0877, "step": 6010 }, { "epoch": 0.1232772919951672, "grad_norm": 0.2534056603908539, "learning_rate": 0.0004, "loss": 5.0613, "step": 6020 }, { "epoch": 0.12348207154997645, "grad_norm": 0.2689981758594513, "learning_rate": 0.0004, "loss": 5.0937, "step": 6030 }, { "epoch": 0.1236868511047857, "grad_norm": 0.2826198935508728, "learning_rate": 0.0004, "loss": 5.0865, "step": 6040 }, { "epoch": 0.12389163065959495, "grad_norm": 0.3296550512313843, "learning_rate": 0.0004, "loss": 5.0676, "step": 6050 }, { "epoch": 0.1240964102144042, "grad_norm": 0.2820937931537628, "learning_rate": 0.0004, "loss": 5.059, "step": 6060 }, { "epoch": 0.12430118976921344, "grad_norm": 0.2708949148654938, "learning_rate": 0.0004, "loss": 5.1074, "step": 6070 }, { "epoch": 0.12450596932402269, "grad_norm": 0.28163817524909973, "learning_rate": 0.0004, "loss": 5.0812, "step": 6080 }, { "epoch": 0.12471074887883193, "grad_norm": 0.2787400782108307, "learning_rate": 0.0004, "loss": 5.0943, "step": 6090 }, { "epoch": 0.12491552843364119, "grad_norm": 0.29888108372688293, "learning_rate": 0.0004, "loss": 5.0569, "step": 6100 }, { "epoch": 0.12512030798845045, "grad_norm": 0.31003960967063904, "learning_rate": 0.0004, "loss": 5.0997, "step": 6110 }, { "epoch": 0.1253250875432597, "grad_norm": 0.3110891878604889, "learning_rate": 0.0004, "loss": 5.1029, "step": 6120 }, { "epoch": 0.12552986709806893, "grad_norm": 0.27945610880851746, "learning_rate": 0.0004, "loss": 5.0837, "step": 6130 }, { "epoch": 0.12573464665287817, "grad_norm": 0.3528691232204437, "learning_rate": 0.0004, "loss": 5.1196, "step": 6140 }, { "epoch": 0.12593942620768742, "grad_norm": 0.34919026494026184, "learning_rate": 0.0004, "loss": 5.0934, "step": 6150 }, { "epoch": 0.1261442057624967, "grad_norm": 0.3034595549106598, "learning_rate": 0.0004, "loss": 5.0804, "step": 6160 }, { "epoch": 0.12634898531730593, "grad_norm": 0.29646971821784973, "learning_rate": 0.0004, "loss": 5.1081, "step": 6170 }, { "epoch": 0.12655376487211517, "grad_norm": 0.3521294593811035, "learning_rate": 0.0004, "loss": 5.0633, "step": 6180 }, { "epoch": 0.1267585444269244, "grad_norm": 0.2630900740623474, "learning_rate": 0.0004, "loss": 5.0545, "step": 6190 }, { "epoch": 0.12696332398173366, "grad_norm": 0.27108538150787354, "learning_rate": 0.0004, "loss": 5.0702, "step": 6200 }, { "epoch": 0.1271681035365429, "grad_norm": 0.28313174843788147, "learning_rate": 0.0004, "loss": 5.0878, "step": 6210 }, { "epoch": 0.12737288309135217, "grad_norm": 0.2536354064941406, "learning_rate": 0.0004, "loss": 5.0829, "step": 6220 }, { "epoch": 0.1275776626461614, "grad_norm": 0.3007798194885254, "learning_rate": 0.0004, "loss": 5.1176, "step": 6230 }, { "epoch": 0.12778244220097065, "grad_norm": 0.290460467338562, "learning_rate": 0.0004, "loss": 5.0617, "step": 6240 }, { "epoch": 0.1279872217557799, "grad_norm": 0.27415731549263, "learning_rate": 0.0004, "loss": 5.0808, "step": 6250 }, { "epoch": 0.12819200131058914, "grad_norm": 0.28427478671073914, "learning_rate": 0.0004, "loss": 5.101, "step": 6260 }, { "epoch": 0.1283967808653984, "grad_norm": 0.7649832963943481, "learning_rate": 0.0004, "loss": 5.0595, "step": 6270 }, { "epoch": 0.12860156042020765, "grad_norm": 0.28299009799957275, "learning_rate": 0.0004, "loss": 5.0721, "step": 6280 }, { "epoch": 0.1288063399750169, "grad_norm": 0.27343279123306274, "learning_rate": 0.0004, "loss": 5.0611, "step": 6290 }, { "epoch": 0.12901111952982613, "grad_norm": 0.27850478887557983, "learning_rate": 0.0004, "loss": 5.059, "step": 6300 }, { "epoch": 0.12921589908463538, "grad_norm": 0.2900664806365967, "learning_rate": 0.0004, "loss": 5.0783, "step": 6310 }, { "epoch": 0.12942067863944465, "grad_norm": 0.2805745005607605, "learning_rate": 0.0004, "loss": 5.048, "step": 6320 }, { "epoch": 0.1296254581942539, "grad_norm": 0.2737388610839844, "learning_rate": 0.0004, "loss": 5.0706, "step": 6330 }, { "epoch": 0.12983023774906313, "grad_norm": 0.2718786895275116, "learning_rate": 0.0004, "loss": 5.0774, "step": 6340 }, { "epoch": 0.13003501730387237, "grad_norm": 0.304569810628891, "learning_rate": 0.0004, "loss": 5.0472, "step": 6350 }, { "epoch": 0.13023979685868162, "grad_norm": 0.2766701877117157, "learning_rate": 0.0004, "loss": 5.0896, "step": 6360 }, { "epoch": 0.1304445764134909, "grad_norm": 0.30049368739128113, "learning_rate": 0.0004, "loss": 5.055, "step": 6370 }, { "epoch": 0.13064935596830013, "grad_norm": 0.2772940993309021, "learning_rate": 0.0004, "loss": 5.0653, "step": 6380 }, { "epoch": 0.13085413552310937, "grad_norm": 0.26545464992523193, "learning_rate": 0.0004, "loss": 5.0381, "step": 6390 }, { "epoch": 0.13105891507791861, "grad_norm": 0.28183674812316895, "learning_rate": 0.0004, "loss": 5.0518, "step": 6400 }, { "epoch": 0.13126369463272786, "grad_norm": 0.2801971733570099, "learning_rate": 0.0004, "loss": 5.0005, "step": 6410 }, { "epoch": 0.13146847418753713, "grad_norm": 0.28805556893348694, "learning_rate": 0.0004, "loss": 5.0386, "step": 6420 }, { "epoch": 0.13167325374234637, "grad_norm": 0.32158318161964417, "learning_rate": 0.0004, "loss": 5.0674, "step": 6430 }, { "epoch": 0.1318780332971556, "grad_norm": 0.28521737456321716, "learning_rate": 0.0004, "loss": 5.0649, "step": 6440 }, { "epoch": 0.13208281285196485, "grad_norm": 0.34175628423690796, "learning_rate": 0.0004, "loss": 5.0622, "step": 6450 }, { "epoch": 0.1322875924067741, "grad_norm": 0.5137848258018494, "learning_rate": 0.0004, "loss": 5.0687, "step": 6460 }, { "epoch": 0.13249237196158337, "grad_norm": 0.2890058755874634, "learning_rate": 0.0004, "loss": 5.0341, "step": 6470 }, { "epoch": 0.1326971515163926, "grad_norm": 0.2562602162361145, "learning_rate": 0.0004, "loss": 5.0585, "step": 6480 }, { "epoch": 0.13290193107120185, "grad_norm": 0.2638840973377228, "learning_rate": 0.0004, "loss": 5.0344, "step": 6490 }, { "epoch": 0.1331067106260111, "grad_norm": 0.25415486097335815, "learning_rate": 0.0004, "loss": 5.0503, "step": 6500 }, { "epoch": 0.13331149018082034, "grad_norm": 0.27743104100227356, "learning_rate": 0.0004, "loss": 5.0592, "step": 6510 }, { "epoch": 0.1335162697356296, "grad_norm": 0.2603805363178253, "learning_rate": 0.0004, "loss": 5.0396, "step": 6520 }, { "epoch": 0.13372104929043885, "grad_norm": 0.28930333256721497, "learning_rate": 0.0004, "loss": 5.0449, "step": 6530 }, { "epoch": 0.1339258288452481, "grad_norm": 0.33630481362342834, "learning_rate": 0.0004, "loss": 5.1177, "step": 6540 }, { "epoch": 0.13413060840005733, "grad_norm": 0.334529846906662, "learning_rate": 0.0004, "loss": 5.0603, "step": 6550 }, { "epoch": 0.13433538795486658, "grad_norm": 0.27738118171691895, "learning_rate": 0.0004, "loss": 5.0817, "step": 6560 }, { "epoch": 0.13454016750967585, "grad_norm": 0.28533002734184265, "learning_rate": 0.0004, "loss": 5.0557, "step": 6570 }, { "epoch": 0.1347449470644851, "grad_norm": 0.28852105140686035, "learning_rate": 0.0004, "loss": 5.0284, "step": 6580 }, { "epoch": 0.13494972661929433, "grad_norm": 0.24459971487522125, "learning_rate": 0.0004, "loss": 5.0455, "step": 6590 }, { "epoch": 0.13515450617410357, "grad_norm": 0.25609290599823, "learning_rate": 0.0004, "loss": 5.0603, "step": 6600 }, { "epoch": 0.13535928572891281, "grad_norm": 0.3024841248989105, "learning_rate": 0.0004, "loss": 5.0508, "step": 6610 }, { "epoch": 0.13556406528372208, "grad_norm": 0.2652977705001831, "learning_rate": 0.0004, "loss": 5.0262, "step": 6620 }, { "epoch": 0.13576884483853133, "grad_norm": 0.28360605239868164, "learning_rate": 0.0004, "loss": 5.0439, "step": 6630 }, { "epoch": 0.13597362439334057, "grad_norm": 0.28990018367767334, "learning_rate": 0.0004, "loss": 5.0549, "step": 6640 }, { "epoch": 0.1361784039481498, "grad_norm": 0.2533547878265381, "learning_rate": 0.0004, "loss": 5.018, "step": 6650 }, { "epoch": 0.13638318350295905, "grad_norm": 0.2740333676338196, "learning_rate": 0.0004, "loss": 5.0678, "step": 6660 }, { "epoch": 0.13658796305776832, "grad_norm": 0.29817625880241394, "learning_rate": 0.0004, "loss": 5.0712, "step": 6670 }, { "epoch": 0.13679274261257757, "grad_norm": 0.31441426277160645, "learning_rate": 0.0004, "loss": 5.0656, "step": 6680 }, { "epoch": 0.1369975221673868, "grad_norm": 0.3231841027736664, "learning_rate": 0.0004, "loss": 5.0736, "step": 6690 }, { "epoch": 0.13720230172219605, "grad_norm": 0.30328667163848877, "learning_rate": 0.0004, "loss": 5.0546, "step": 6700 }, { "epoch": 0.1374070812770053, "grad_norm": 0.29146936535835266, "learning_rate": 0.0004, "loss": 5.0231, "step": 6710 }, { "epoch": 0.13761186083181456, "grad_norm": 0.26964816451072693, "learning_rate": 0.0004, "loss": 5.0198, "step": 6720 }, { "epoch": 0.1378166403866238, "grad_norm": 0.29934099316596985, "learning_rate": 0.0004, "loss": 5.0214, "step": 6730 }, { "epoch": 0.13802141994143305, "grad_norm": 0.31749799847602844, "learning_rate": 0.0004, "loss": 5.024, "step": 6740 }, { "epoch": 0.1382261994962423, "grad_norm": 0.27449607849121094, "learning_rate": 0.0004, "loss": 5.0448, "step": 6750 }, { "epoch": 0.13843097905105153, "grad_norm": 0.2636013627052307, "learning_rate": 0.0004, "loss": 5.0019, "step": 6760 }, { "epoch": 0.1386357586058608, "grad_norm": 0.2735285758972168, "learning_rate": 0.0004, "loss": 5.0317, "step": 6770 }, { "epoch": 0.13884053816067005, "grad_norm": 0.27161097526550293, "learning_rate": 0.0004, "loss": 5.0621, "step": 6780 }, { "epoch": 0.1390453177154793, "grad_norm": 0.2726326882839203, "learning_rate": 0.0004, "loss": 5.0115, "step": 6790 }, { "epoch": 0.13925009727028853, "grad_norm": 0.2701372802257538, "learning_rate": 0.0004, "loss": 5.0728, "step": 6800 }, { "epoch": 0.13945487682509777, "grad_norm": 0.28725504875183105, "learning_rate": 0.0004, "loss": 5.0328, "step": 6810 }, { "epoch": 0.13965965637990704, "grad_norm": 0.2574363946914673, "learning_rate": 0.0004, "loss": 5.0259, "step": 6820 }, { "epoch": 0.13986443593471629, "grad_norm": 0.3175942003726959, "learning_rate": 0.0004, "loss": 5.0614, "step": 6830 }, { "epoch": 0.14006921548952553, "grad_norm": 0.27931883931159973, "learning_rate": 0.0004, "loss": 5.0172, "step": 6840 }, { "epoch": 0.14027399504433477, "grad_norm": 0.2712056040763855, "learning_rate": 0.0004, "loss": 5.0233, "step": 6850 }, { "epoch": 0.140478774599144, "grad_norm": 0.2554517090320587, "learning_rate": 0.0004, "loss": 5.0289, "step": 6860 }, { "epoch": 0.14068355415395328, "grad_norm": 0.27673497796058655, "learning_rate": 0.0004, "loss": 5.0249, "step": 6870 }, { "epoch": 0.14088833370876253, "grad_norm": 0.28359177708625793, "learning_rate": 0.0004, "loss": 5.055, "step": 6880 }, { "epoch": 0.14109311326357177, "grad_norm": 0.9358149766921997, "learning_rate": 0.0004, "loss": 5.0159, "step": 6890 }, { "epoch": 0.141297892818381, "grad_norm": 0.29279693961143494, "learning_rate": 0.0004, "loss": 5.0301, "step": 6900 }, { "epoch": 0.14150267237319025, "grad_norm": 0.2918263077735901, "learning_rate": 0.0004, "loss": 5.03, "step": 6910 }, { "epoch": 0.1417074519279995, "grad_norm": 0.2591414749622345, "learning_rate": 0.0004, "loss": 4.9956, "step": 6920 }, { "epoch": 0.14191223148280876, "grad_norm": 0.2563314139842987, "learning_rate": 0.0004, "loss": 5.0245, "step": 6930 }, { "epoch": 0.142117011037618, "grad_norm": 0.29098746180534363, "learning_rate": 0.0004, "loss": 5.0174, "step": 6940 }, { "epoch": 0.14232179059242725, "grad_norm": 0.3085611164569855, "learning_rate": 0.0004, "loss": 5.0243, "step": 6950 }, { "epoch": 0.1425265701472365, "grad_norm": 0.26494288444519043, "learning_rate": 0.0004, "loss": 5.0631, "step": 6960 }, { "epoch": 0.14273134970204573, "grad_norm": 0.24861690402030945, "learning_rate": 0.0004, "loss": 5.036, "step": 6970 }, { "epoch": 0.142936129256855, "grad_norm": 0.2928735613822937, "learning_rate": 0.0004, "loss": 5.06, "step": 6980 }, { "epoch": 0.14314090881166425, "grad_norm": 0.3489916920661926, "learning_rate": 0.0004, "loss": 5.0146, "step": 6990 }, { "epoch": 0.1433456883664735, "grad_norm": 0.29149192571640015, "learning_rate": 0.0004, "loss": 4.9889, "step": 7000 }, { "epoch": 0.14355046792128273, "grad_norm": 0.26062437891960144, "learning_rate": 0.0004, "loss": 5.0303, "step": 7010 }, { "epoch": 0.14375524747609197, "grad_norm": 0.3030029237270355, "learning_rate": 0.0004, "loss": 4.9966, "step": 7020 }, { "epoch": 0.14396002703090124, "grad_norm": 0.2680526375770569, "learning_rate": 0.0004, "loss": 5.0185, "step": 7030 }, { "epoch": 0.1441648065857105, "grad_norm": 0.31157636642456055, "learning_rate": 0.0004, "loss": 5.0236, "step": 7040 }, { "epoch": 0.14436958614051973, "grad_norm": 0.2734978497028351, "learning_rate": 0.0004, "loss": 5.0232, "step": 7050 }, { "epoch": 0.14457436569532897, "grad_norm": 0.2717677354812622, "learning_rate": 0.0004, "loss": 4.9828, "step": 7060 }, { "epoch": 0.1447791452501382, "grad_norm": 0.2949996590614319, "learning_rate": 0.0004, "loss": 5.0299, "step": 7070 }, { "epoch": 0.14498392480494748, "grad_norm": 0.27881181240081787, "learning_rate": 0.0004, "loss": 5.0718, "step": 7080 }, { "epoch": 0.14518870435975673, "grad_norm": 0.2645169198513031, "learning_rate": 0.0004, "loss": 5.0287, "step": 7090 }, { "epoch": 0.14539348391456597, "grad_norm": 0.258129358291626, "learning_rate": 0.0004, "loss": 4.9836, "step": 7100 }, { "epoch": 0.1455982634693752, "grad_norm": 0.2882113754749298, "learning_rate": 0.0004, "loss": 5.0085, "step": 7110 }, { "epoch": 0.14580304302418445, "grad_norm": 0.27890050411224365, "learning_rate": 0.0004, "loss": 5.0173, "step": 7120 }, { "epoch": 0.14600782257899372, "grad_norm": 0.2726353704929352, "learning_rate": 0.0004, "loss": 4.9976, "step": 7130 }, { "epoch": 0.14621260213380297, "grad_norm": 0.26558926701545715, "learning_rate": 0.0004, "loss": 5.0114, "step": 7140 }, { "epoch": 0.1464173816886122, "grad_norm": 0.26955997943878174, "learning_rate": 0.0004, "loss": 5.0349, "step": 7150 }, { "epoch": 0.14662216124342145, "grad_norm": 0.31367459893226624, "learning_rate": 0.0004, "loss": 4.9823, "step": 7160 }, { "epoch": 0.1468269407982307, "grad_norm": 0.2597455084323883, "learning_rate": 0.0004, "loss": 5.034, "step": 7170 }, { "epoch": 0.14703172035303996, "grad_norm": 0.3116661012172699, "learning_rate": 0.0004, "loss": 5.0626, "step": 7180 }, { "epoch": 0.1472364999078492, "grad_norm": 0.28304263949394226, "learning_rate": 0.0004, "loss": 5.0162, "step": 7190 }, { "epoch": 0.14744127946265845, "grad_norm": 0.2740379571914673, "learning_rate": 0.0004, "loss": 4.9967, "step": 7200 }, { "epoch": 0.1476460590174677, "grad_norm": 0.297842413187027, "learning_rate": 0.0004, "loss": 5.0032, "step": 7210 }, { "epoch": 0.14785083857227693, "grad_norm": 0.29578113555908203, "learning_rate": 0.0004, "loss": 5.0335, "step": 7220 }, { "epoch": 0.1480556181270862, "grad_norm": 0.2504656910896301, "learning_rate": 0.0004, "loss": 5.0501, "step": 7230 }, { "epoch": 0.14826039768189544, "grad_norm": 0.2642020285129547, "learning_rate": 0.0004, "loss": 5.0067, "step": 7240 }, { "epoch": 0.1484651772367047, "grad_norm": 0.2889758348464966, "learning_rate": 0.0004, "loss": 5.0189, "step": 7250 }, { "epoch": 0.14866995679151393, "grad_norm": 0.25152787566185, "learning_rate": 0.0004, "loss": 5.0003, "step": 7260 }, { "epoch": 0.14887473634632317, "grad_norm": 0.26755887269973755, "learning_rate": 0.0004, "loss": 5.0276, "step": 7270 }, { "epoch": 0.14907951590113244, "grad_norm": 0.27270573377609253, "learning_rate": 0.0004, "loss": 5.0269, "step": 7280 }, { "epoch": 0.14928429545594168, "grad_norm": 0.28820136189460754, "learning_rate": 0.0004, "loss": 5.0243, "step": 7290 }, { "epoch": 0.14948907501075093, "grad_norm": 0.26135873794555664, "learning_rate": 0.0004, "loss": 5.0132, "step": 7300 }, { "epoch": 0.14969385456556017, "grad_norm": 0.2627226710319519, "learning_rate": 0.0004, "loss": 5.0009, "step": 7310 }, { "epoch": 0.1498986341203694, "grad_norm": 0.2626127004623413, "learning_rate": 0.0004, "loss": 5.0199, "step": 7320 }, { "epoch": 0.15010341367517868, "grad_norm": 0.2863093912601471, "learning_rate": 0.0004, "loss": 4.9827, "step": 7330 }, { "epoch": 0.15030819322998792, "grad_norm": 0.2922162711620331, "learning_rate": 0.0004, "loss": 4.9776, "step": 7340 }, { "epoch": 0.15051297278479717, "grad_norm": 0.2657029330730438, "learning_rate": 0.0004, "loss": 5.0001, "step": 7350 }, { "epoch": 0.1507177523396064, "grad_norm": 0.2926842272281647, "learning_rate": 0.0004, "loss": 4.9807, "step": 7360 }, { "epoch": 0.15092253189441565, "grad_norm": 0.2773357033729553, "learning_rate": 0.0004, "loss": 5.0257, "step": 7370 }, { "epoch": 0.15112731144922492, "grad_norm": 0.277850866317749, "learning_rate": 0.0004, "loss": 5.0355, "step": 7380 }, { "epoch": 0.15133209100403416, "grad_norm": 0.2935318648815155, "learning_rate": 0.0004, "loss": 5.0019, "step": 7390 }, { "epoch": 0.1515368705588434, "grad_norm": 0.25857090950012207, "learning_rate": 0.0004, "loss": 5.0039, "step": 7400 }, { "epoch": 0.15174165011365265, "grad_norm": 0.26904380321502686, "learning_rate": 0.0004, "loss": 4.9963, "step": 7410 }, { "epoch": 0.1519464296684619, "grad_norm": 0.29226887226104736, "learning_rate": 0.0004, "loss": 4.9976, "step": 7420 }, { "epoch": 0.15215120922327116, "grad_norm": 0.2924584448337555, "learning_rate": 0.0004, "loss": 5.0066, "step": 7430 }, { "epoch": 0.1523559887780804, "grad_norm": 0.3263062536716461, "learning_rate": 0.0004, "loss": 5.0121, "step": 7440 }, { "epoch": 0.15256076833288965, "grad_norm": 0.2633025348186493, "learning_rate": 0.0004, "loss": 5.0047, "step": 7450 }, { "epoch": 0.1527655478876989, "grad_norm": 0.2768848240375519, "learning_rate": 0.0004, "loss": 4.9915, "step": 7460 }, { "epoch": 0.15297032744250813, "grad_norm": 0.2501979172229767, "learning_rate": 0.0004, "loss": 5.0219, "step": 7470 }, { "epoch": 0.1531751069973174, "grad_norm": 0.28026828169822693, "learning_rate": 0.0004, "loss": 5.0037, "step": 7480 }, { "epoch": 0.15337988655212664, "grad_norm": 0.2698480784893036, "learning_rate": 0.0004, "loss": 4.9814, "step": 7490 }, { "epoch": 0.15358466610693589, "grad_norm": 0.239173024892807, "learning_rate": 0.0004, "loss": 5.0149, "step": 7500 }, { "epoch": 0.15378944566174513, "grad_norm": 0.27128395438194275, "learning_rate": 0.0004, "loss": 5.0123, "step": 7510 }, { "epoch": 0.15399422521655437, "grad_norm": 0.26015204191207886, "learning_rate": 0.0004, "loss": 5.0152, "step": 7520 }, { "epoch": 0.15419900477136364, "grad_norm": 0.27965638041496277, "learning_rate": 0.0004, "loss": 4.9905, "step": 7530 }, { "epoch": 0.15440378432617288, "grad_norm": 0.2805701494216919, "learning_rate": 0.0004, "loss": 4.978, "step": 7540 }, { "epoch": 0.15460856388098212, "grad_norm": 0.3172330856323242, "learning_rate": 0.0004, "loss": 4.996, "step": 7550 }, { "epoch": 0.15481334343579137, "grad_norm": 0.25200405716896057, "learning_rate": 0.0004, "loss": 4.9709, "step": 7560 }, { "epoch": 0.1550181229906006, "grad_norm": 0.2770288586616516, "learning_rate": 0.0004, "loss": 5.0222, "step": 7570 }, { "epoch": 0.15522290254540988, "grad_norm": 0.2640692889690399, "learning_rate": 0.0004, "loss": 5.007, "step": 7580 }, { "epoch": 0.15542768210021912, "grad_norm": 0.2653588354587555, "learning_rate": 0.0004, "loss": 4.9661, "step": 7590 }, { "epoch": 0.15563246165502836, "grad_norm": 0.2635646462440491, "learning_rate": 0.0004, "loss": 4.9884, "step": 7600 }, { "epoch": 0.1558372412098376, "grad_norm": 0.2831110954284668, "learning_rate": 0.0004, "loss": 4.9915, "step": 7610 }, { "epoch": 0.15604202076464685, "grad_norm": 0.2644828259944916, "learning_rate": 0.0004, "loss": 4.9793, "step": 7620 }, { "epoch": 0.1562468003194561, "grad_norm": 0.2570164203643799, "learning_rate": 0.0004, "loss": 4.9477, "step": 7630 }, { "epoch": 0.15645157987426536, "grad_norm": 0.27692949771881104, "learning_rate": 0.0004, "loss": 4.9855, "step": 7640 }, { "epoch": 0.1566563594290746, "grad_norm": 0.32910946011543274, "learning_rate": 0.0004, "loss": 5.003, "step": 7650 }, { "epoch": 0.15686113898388385, "grad_norm": 0.2637607753276825, "learning_rate": 0.0004, "loss": 4.971, "step": 7660 }, { "epoch": 0.1570659185386931, "grad_norm": 0.3097701966762543, "learning_rate": 0.0004, "loss": 5.0238, "step": 7670 }, { "epoch": 0.15727069809350233, "grad_norm": 0.30924656987190247, "learning_rate": 0.0004, "loss": 5.0226, "step": 7680 }, { "epoch": 0.1574754776483116, "grad_norm": 0.2889367938041687, "learning_rate": 0.0004, "loss": 5.023, "step": 7690 }, { "epoch": 0.15768025720312084, "grad_norm": 0.2582051455974579, "learning_rate": 0.0004, "loss": 4.9851, "step": 7700 }, { "epoch": 0.1578850367579301, "grad_norm": 0.4065515398979187, "learning_rate": 0.0004, "loss": 4.998, "step": 7710 }, { "epoch": 0.15808981631273933, "grad_norm": 0.2995768189430237, "learning_rate": 0.0004, "loss": 5.0239, "step": 7720 }, { "epoch": 0.15829459586754857, "grad_norm": 0.2907010614871979, "learning_rate": 0.0004, "loss": 5.0208, "step": 7730 }, { "epoch": 0.15849937542235784, "grad_norm": 0.2729869782924652, "learning_rate": 0.0004, "loss": 4.9835, "step": 7740 }, { "epoch": 0.15870415497716708, "grad_norm": 2.6075010299682617, "learning_rate": 0.0004, "loss": 4.985, "step": 7750 }, { "epoch": 0.15890893453197633, "grad_norm": 0.27993106842041016, "learning_rate": 0.0004, "loss": 5.0196, "step": 7760 }, { "epoch": 0.15911371408678557, "grad_norm": 0.27130773663520813, "learning_rate": 0.0004, "loss": 4.9265, "step": 7770 }, { "epoch": 0.1593184936415948, "grad_norm": 0.3074718713760376, "learning_rate": 0.0004, "loss": 4.9919, "step": 7780 }, { "epoch": 0.15952327319640408, "grad_norm": 0.3180055022239685, "learning_rate": 0.0004, "loss": 4.9662, "step": 7790 }, { "epoch": 0.15972805275121332, "grad_norm": 0.25885850191116333, "learning_rate": 0.0004, "loss": 4.9778, "step": 7800 }, { "epoch": 0.15993283230602257, "grad_norm": 0.2722429037094116, "learning_rate": 0.0004, "loss": 4.9624, "step": 7810 }, { "epoch": 0.1601376118608318, "grad_norm": 0.2777957618236542, "learning_rate": 0.0004, "loss": 4.9673, "step": 7820 }, { "epoch": 0.16034239141564105, "grad_norm": 0.24738921225070953, "learning_rate": 0.0004, "loss": 4.9967, "step": 7830 }, { "epoch": 0.16054717097045032, "grad_norm": 0.2691292464733124, "learning_rate": 0.0004, "loss": 4.9649, "step": 7840 }, { "epoch": 0.16075195052525956, "grad_norm": 0.25139743089675903, "learning_rate": 0.0004, "loss": 4.9643, "step": 7850 }, { "epoch": 0.1609567300800688, "grad_norm": 0.24456483125686646, "learning_rate": 0.0004, "loss": 4.9518, "step": 7860 }, { "epoch": 0.16116150963487805, "grad_norm": 0.2848648428916931, "learning_rate": 0.0004, "loss": 4.9543, "step": 7870 }, { "epoch": 0.1613662891896873, "grad_norm": 0.28265640139579773, "learning_rate": 0.0004, "loss": 4.9814, "step": 7880 }, { "epoch": 0.16157106874449656, "grad_norm": 0.25388023257255554, "learning_rate": 0.0004, "loss": 4.9623, "step": 7890 }, { "epoch": 0.1617758482993058, "grad_norm": 0.277907133102417, "learning_rate": 0.0004, "loss": 5.0045, "step": 7900 }, { "epoch": 0.16198062785411504, "grad_norm": 0.33783209323883057, "learning_rate": 0.0004, "loss": 5.014, "step": 7910 }, { "epoch": 0.1621854074089243, "grad_norm": 0.4126969277858734, "learning_rate": 0.0004, "loss": 4.9839, "step": 7920 }, { "epoch": 0.16239018696373353, "grad_norm": 0.2501499652862549, "learning_rate": 0.0004, "loss": 4.997, "step": 7930 }, { "epoch": 0.1625949665185428, "grad_norm": 0.31211671233177185, "learning_rate": 0.0004, "loss": 4.9882, "step": 7940 }, { "epoch": 0.16279974607335204, "grad_norm": 0.28681033849716187, "learning_rate": 0.0004, "loss": 4.9808, "step": 7950 }, { "epoch": 0.16300452562816128, "grad_norm": 0.30036285519599915, "learning_rate": 0.0004, "loss": 4.9682, "step": 7960 }, { "epoch": 0.16320930518297053, "grad_norm": 0.2959885001182556, "learning_rate": 0.0004, "loss": 4.9675, "step": 7970 }, { "epoch": 0.16341408473777977, "grad_norm": 0.2631414532661438, "learning_rate": 0.0004, "loss": 4.932, "step": 7980 }, { "epoch": 0.16361886429258904, "grad_norm": 0.2474392056465149, "learning_rate": 0.0004, "loss": 4.9942, "step": 7990 }, { "epoch": 0.16382364384739828, "grad_norm": 0.3146931827068329, "learning_rate": 0.0004, "loss": 4.9568, "step": 8000 }, { "epoch": 0.16382364384739828, "eval_loss": 4.985713005065918, "eval_runtime": 4.385, "eval_samples_per_second": 265.906, "eval_steps_per_second": 33.295, "step": 8000 }, { "epoch": 0.16402842340220752, "grad_norm": 0.27640268206596375, "learning_rate": 0.0004, "loss": 4.8895, "step": 8010 }, { "epoch": 0.16423320295701677, "grad_norm": 0.26192760467529297, "learning_rate": 0.0004, "loss": 4.9724, "step": 8020 }, { "epoch": 0.164437982511826, "grad_norm": 0.29010656476020813, "learning_rate": 0.0004, "loss": 4.9561, "step": 8030 }, { "epoch": 0.16464276206663528, "grad_norm": 0.2687608599662781, "learning_rate": 0.0004, "loss": 4.9903, "step": 8040 }, { "epoch": 0.16484754162144452, "grad_norm": 0.38182979822158813, "learning_rate": 0.0004, "loss": 4.9518, "step": 8050 }, { "epoch": 0.16505232117625376, "grad_norm": 0.253173291683197, "learning_rate": 0.0004, "loss": 4.9921, "step": 8060 }, { "epoch": 0.165257100731063, "grad_norm": 0.2626391053199768, "learning_rate": 0.0004, "loss": 4.9634, "step": 8070 }, { "epoch": 0.16546188028587225, "grad_norm": 0.28041955828666687, "learning_rate": 0.0004, "loss": 4.9432, "step": 8080 }, { "epoch": 0.16566665984068152, "grad_norm": 0.2790217399597168, "learning_rate": 0.0004, "loss": 4.9541, "step": 8090 }, { "epoch": 0.16587143939549076, "grad_norm": 0.2618565857410431, "learning_rate": 0.0004, "loss": 4.9578, "step": 8100 }, { "epoch": 0.1660762189503, "grad_norm": 0.316425085067749, "learning_rate": 0.0004, "loss": 4.9673, "step": 8110 }, { "epoch": 0.16628099850510925, "grad_norm": 0.30675795674324036, "learning_rate": 0.0004, "loss": 4.9583, "step": 8120 }, { "epoch": 0.1664857780599185, "grad_norm": 0.2543947696685791, "learning_rate": 0.0004, "loss": 4.9544, "step": 8130 }, { "epoch": 0.16669055761472776, "grad_norm": 0.2793639302253723, "learning_rate": 0.0004, "loss": 4.9822, "step": 8140 }, { "epoch": 0.166895337169537, "grad_norm": 0.25796473026275635, "learning_rate": 0.0004, "loss": 5.0105, "step": 8150 }, { "epoch": 0.16710011672434624, "grad_norm": 0.25799325108528137, "learning_rate": 0.0004, "loss": 4.9849, "step": 8160 }, { "epoch": 0.16730489627915549, "grad_norm": 0.27757272124290466, "learning_rate": 0.0004, "loss": 4.9951, "step": 8170 }, { "epoch": 0.16750967583396473, "grad_norm": 0.26644569635391235, "learning_rate": 0.0004, "loss": 4.9706, "step": 8180 }, { "epoch": 0.167714455388774, "grad_norm": 0.25647637248039246, "learning_rate": 0.0004, "loss": 4.9509, "step": 8190 }, { "epoch": 0.16791923494358324, "grad_norm": 0.28371259570121765, "learning_rate": 0.0004, "loss": 4.9445, "step": 8200 }, { "epoch": 0.16812401449839248, "grad_norm": 0.2952437698841095, "learning_rate": 0.0004, "loss": 4.9562, "step": 8210 }, { "epoch": 0.16832879405320172, "grad_norm": 0.39984285831451416, "learning_rate": 0.0004, "loss": 4.9866, "step": 8220 }, { "epoch": 0.16853357360801097, "grad_norm": 0.2723577320575714, "learning_rate": 0.0004, "loss": 4.9398, "step": 8230 }, { "epoch": 0.16873835316282024, "grad_norm": 0.300861656665802, "learning_rate": 0.0004, "loss": 4.9795, "step": 8240 }, { "epoch": 0.16894313271762948, "grad_norm": 0.2561473250389099, "learning_rate": 0.0004, "loss": 4.941, "step": 8250 }, { "epoch": 0.16914791227243872, "grad_norm": 0.2530801594257355, "learning_rate": 0.0004, "loss": 4.9801, "step": 8260 }, { "epoch": 0.16935269182724796, "grad_norm": 0.2439454346895218, "learning_rate": 0.0004, "loss": 4.942, "step": 8270 }, { "epoch": 0.1695574713820572, "grad_norm": 0.26756080985069275, "learning_rate": 0.0004, "loss": 4.9709, "step": 8280 }, { "epoch": 0.16976225093686648, "grad_norm": 0.26916906237602234, "learning_rate": 0.0004, "loss": 4.9696, "step": 8290 }, { "epoch": 0.16996703049167572, "grad_norm": 0.26618748903274536, "learning_rate": 0.0004, "loss": 4.9541, "step": 8300 }, { "epoch": 0.17017181004648496, "grad_norm": 0.25927209854125977, "learning_rate": 0.0004, "loss": 4.9438, "step": 8310 }, { "epoch": 0.1703765896012942, "grad_norm": 0.28418412804603577, "learning_rate": 0.0004, "loss": 4.956, "step": 8320 }, { "epoch": 0.17058136915610345, "grad_norm": 0.275880366563797, "learning_rate": 0.0004, "loss": 4.9916, "step": 8330 }, { "epoch": 0.17078614871091272, "grad_norm": 0.2647210955619812, "learning_rate": 0.0004, "loss": 4.9571, "step": 8340 }, { "epoch": 0.17099092826572196, "grad_norm": 0.2847307324409485, "learning_rate": 0.0004, "loss": 4.9439, "step": 8350 }, { "epoch": 0.1711957078205312, "grad_norm": 0.2591257691383362, "learning_rate": 0.0004, "loss": 4.9707, "step": 8360 }, { "epoch": 0.17140048737534044, "grad_norm": 0.27943095564842224, "learning_rate": 0.0004, "loss": 4.9864, "step": 8370 }, { "epoch": 0.17160526693014969, "grad_norm": 0.27202245593070984, "learning_rate": 0.0004, "loss": 4.9512, "step": 8380 }, { "epoch": 0.17181004648495893, "grad_norm": 0.2789130210876465, "learning_rate": 0.0004, "loss": 4.9389, "step": 8390 }, { "epoch": 0.1720148260397682, "grad_norm": 0.2560030519962311, "learning_rate": 0.0004, "loss": 4.9418, "step": 8400 }, { "epoch": 0.17221960559457744, "grad_norm": 0.2689448595046997, "learning_rate": 0.0004, "loss": 4.961, "step": 8410 }, { "epoch": 0.17242438514938668, "grad_norm": 0.265110582113266, "learning_rate": 0.0004, "loss": 4.9237, "step": 8420 }, { "epoch": 0.17262916470419593, "grad_norm": 0.259001761674881, "learning_rate": 0.0004, "loss": 4.9322, "step": 8430 }, { "epoch": 0.17283394425900517, "grad_norm": 0.2653099298477173, "learning_rate": 0.0004, "loss": 4.9607, "step": 8440 }, { "epoch": 0.17303872381381444, "grad_norm": 0.2665369510650635, "learning_rate": 0.0004, "loss": 4.9522, "step": 8450 }, { "epoch": 0.17324350336862368, "grad_norm": 0.2588624954223633, "learning_rate": 0.0004, "loss": 4.9419, "step": 8460 }, { "epoch": 0.17344828292343292, "grad_norm": 0.2519038915634155, "learning_rate": 0.0004, "loss": 4.946, "step": 8470 }, { "epoch": 0.17365306247824217, "grad_norm": 0.2395952045917511, "learning_rate": 0.0004, "loss": 4.915, "step": 8480 }, { "epoch": 0.1738578420330514, "grad_norm": 0.5889842510223389, "learning_rate": 0.0004, "loss": 4.9647, "step": 8490 }, { "epoch": 0.17406262158786068, "grad_norm": 0.27600687742233276, "learning_rate": 0.0004, "loss": 4.9693, "step": 8500 }, { "epoch": 0.17426740114266992, "grad_norm": 0.28736770153045654, "learning_rate": 0.0004, "loss": 4.9688, "step": 8510 }, { "epoch": 0.17447218069747916, "grad_norm": 0.2757735550403595, "learning_rate": 0.0004, "loss": 4.9578, "step": 8520 }, { "epoch": 0.1746769602522884, "grad_norm": 0.2744831442832947, "learning_rate": 0.0004, "loss": 4.9767, "step": 8530 }, { "epoch": 0.17488173980709765, "grad_norm": 0.33410757780075073, "learning_rate": 0.0004, "loss": 4.9444, "step": 8540 }, { "epoch": 0.17508651936190692, "grad_norm": 0.27433595061302185, "learning_rate": 0.0004, "loss": 4.9834, "step": 8550 }, { "epoch": 0.17529129891671616, "grad_norm": 0.27666985988616943, "learning_rate": 0.0004, "loss": 4.9854, "step": 8560 }, { "epoch": 0.1754960784715254, "grad_norm": 0.2830606997013092, "learning_rate": 0.0004, "loss": 4.9928, "step": 8570 }, { "epoch": 0.17570085802633464, "grad_norm": 0.7617194652557373, "learning_rate": 0.0004, "loss": 4.9465, "step": 8580 }, { "epoch": 0.1759056375811439, "grad_norm": 0.25556811690330505, "learning_rate": 0.0004, "loss": 4.9076, "step": 8590 }, { "epoch": 0.17611041713595316, "grad_norm": 0.2798091769218445, "learning_rate": 0.0004, "loss": 4.9809, "step": 8600 }, { "epoch": 0.1763151966907624, "grad_norm": 0.252435564994812, "learning_rate": 0.0004, "loss": 4.9489, "step": 8610 }, { "epoch": 0.17651997624557164, "grad_norm": 0.2565086781978607, "learning_rate": 0.0004, "loss": 4.9689, "step": 8620 }, { "epoch": 0.17672475580038088, "grad_norm": 0.25653916597366333, "learning_rate": 0.0004, "loss": 4.9555, "step": 8630 }, { "epoch": 0.17692953535519013, "grad_norm": 0.24862580001354218, "learning_rate": 0.0004, "loss": 4.9307, "step": 8640 }, { "epoch": 0.1771343149099994, "grad_norm": 0.26300904154777527, "learning_rate": 0.0004, "loss": 4.9638, "step": 8650 }, { "epoch": 0.17733909446480864, "grad_norm": 0.2595764696598053, "learning_rate": 0.0004, "loss": 4.9342, "step": 8660 }, { "epoch": 0.17754387401961788, "grad_norm": 0.2754005789756775, "learning_rate": 0.0004, "loss": 4.938, "step": 8670 }, { "epoch": 0.17774865357442712, "grad_norm": 0.2422814816236496, "learning_rate": 0.0004, "loss": 4.9392, "step": 8680 }, { "epoch": 0.17795343312923637, "grad_norm": 0.2505927085876465, "learning_rate": 0.0004, "loss": 4.9132, "step": 8690 }, { "epoch": 0.17815821268404564, "grad_norm": 0.3159048855304718, "learning_rate": 0.0004, "loss": 4.8876, "step": 8700 }, { "epoch": 0.17836299223885488, "grad_norm": 0.268097460269928, "learning_rate": 0.0004, "loss": 4.9008, "step": 8710 }, { "epoch": 0.17856777179366412, "grad_norm": 0.25193169713020325, "learning_rate": 0.0004, "loss": 4.9669, "step": 8720 }, { "epoch": 0.17877255134847336, "grad_norm": 0.2685261368751526, "learning_rate": 0.0004, "loss": 4.9527, "step": 8730 }, { "epoch": 0.1789773309032826, "grad_norm": 0.2474873661994934, "learning_rate": 0.0004, "loss": 4.9591, "step": 8740 }, { "epoch": 0.17918211045809188, "grad_norm": 0.2831394374370575, "learning_rate": 0.0004, "loss": 4.9445, "step": 8750 }, { "epoch": 0.17938689001290112, "grad_norm": 0.25589999556541443, "learning_rate": 0.0004, "loss": 4.9701, "step": 8760 }, { "epoch": 0.17959166956771036, "grad_norm": 0.2762419879436493, "learning_rate": 0.0004, "loss": 4.9525, "step": 8770 }, { "epoch": 0.1797964491225196, "grad_norm": 0.3691735565662384, "learning_rate": 0.0004, "loss": 4.9013, "step": 8780 }, { "epoch": 0.18000122867732885, "grad_norm": 0.27381083369255066, "learning_rate": 0.0004, "loss": 4.929, "step": 8790 }, { "epoch": 0.18020600823213812, "grad_norm": 0.2597827911376953, "learning_rate": 0.0004, "loss": 4.9431, "step": 8800 }, { "epoch": 0.18041078778694736, "grad_norm": 0.34209340810775757, "learning_rate": 0.0004, "loss": 4.9308, "step": 8810 }, { "epoch": 0.1806155673417566, "grad_norm": 0.26386550068855286, "learning_rate": 0.0004, "loss": 4.9484, "step": 8820 }, { "epoch": 0.18082034689656584, "grad_norm": 0.25239115953445435, "learning_rate": 0.0004, "loss": 4.9614, "step": 8830 }, { "epoch": 0.18102512645137508, "grad_norm": 0.2644425630569458, "learning_rate": 0.0004, "loss": 4.9233, "step": 8840 }, { "epoch": 0.18122990600618435, "grad_norm": 0.27113065123558044, "learning_rate": 0.0004, "loss": 4.9295, "step": 8850 }, { "epoch": 0.1814346855609936, "grad_norm": 0.24539048969745636, "learning_rate": 0.0004, "loss": 4.9393, "step": 8860 }, { "epoch": 0.18163946511580284, "grad_norm": 0.2639126181602478, "learning_rate": 0.0004, "loss": 4.9368, "step": 8870 }, { "epoch": 0.18184424467061208, "grad_norm": 0.2735327184200287, "learning_rate": 0.0004, "loss": 4.934, "step": 8880 }, { "epoch": 0.18204902422542132, "grad_norm": 0.27094408869743347, "learning_rate": 0.0004, "loss": 4.9357, "step": 8890 }, { "epoch": 0.1822538037802306, "grad_norm": 0.2365713119506836, "learning_rate": 0.0004, "loss": 4.9588, "step": 8900 }, { "epoch": 0.18245858333503984, "grad_norm": 0.24846386909484863, "learning_rate": 0.0004, "loss": 4.9726, "step": 8910 }, { "epoch": 0.18266336288984908, "grad_norm": 0.47635194659233093, "learning_rate": 0.0004, "loss": 4.9204, "step": 8920 }, { "epoch": 0.18286814244465832, "grad_norm": 0.23480106890201569, "learning_rate": 0.0004, "loss": 4.9501, "step": 8930 }, { "epoch": 0.18307292199946756, "grad_norm": 0.2801901698112488, "learning_rate": 0.0004, "loss": 4.9348, "step": 8940 }, { "epoch": 0.18327770155427683, "grad_norm": 0.25215595960617065, "learning_rate": 0.0004, "loss": 4.9502, "step": 8950 }, { "epoch": 0.18348248110908608, "grad_norm": 0.2644833028316498, "learning_rate": 0.0004, "loss": 4.9308, "step": 8960 }, { "epoch": 0.18368726066389532, "grad_norm": 0.27154913544654846, "learning_rate": 0.0004, "loss": 4.9412, "step": 8970 }, { "epoch": 0.18389204021870456, "grad_norm": 0.24921545386314392, "learning_rate": 0.0004, "loss": 4.9457, "step": 8980 }, { "epoch": 0.1840968197735138, "grad_norm": 0.2780075967311859, "learning_rate": 0.0004, "loss": 4.8964, "step": 8990 }, { "epoch": 0.18430159932832307, "grad_norm": 0.24482417106628418, "learning_rate": 0.0004, "loss": 4.9115, "step": 9000 }, { "epoch": 0.18450637888313232, "grad_norm": 0.2918320894241333, "learning_rate": 0.0004, "loss": 4.9544, "step": 9010 }, { "epoch": 0.18471115843794156, "grad_norm": 0.27354565262794495, "learning_rate": 0.0004, "loss": 4.9285, "step": 9020 }, { "epoch": 0.1849159379927508, "grad_norm": 0.2596583962440491, "learning_rate": 0.0004, "loss": 4.9105, "step": 9030 }, { "epoch": 0.18512071754756004, "grad_norm": 0.27231499552726746, "learning_rate": 0.0004, "loss": 4.923, "step": 9040 }, { "epoch": 0.1853254971023693, "grad_norm": 0.2770763039588928, "learning_rate": 0.0004, "loss": 4.9517, "step": 9050 }, { "epoch": 0.18553027665717856, "grad_norm": 0.2580932378768921, "learning_rate": 0.0004, "loss": 4.9304, "step": 9060 }, { "epoch": 0.1857350562119878, "grad_norm": 0.24377305805683136, "learning_rate": 0.0004, "loss": 4.93, "step": 9070 }, { "epoch": 0.18593983576679704, "grad_norm": 0.26057571172714233, "learning_rate": 0.0004, "loss": 4.9098, "step": 9080 }, { "epoch": 0.18614461532160628, "grad_norm": 0.27135148644447327, "learning_rate": 0.0004, "loss": 4.9042, "step": 9090 }, { "epoch": 0.18634939487641553, "grad_norm": 0.2588081359863281, "learning_rate": 0.0004, "loss": 4.906, "step": 9100 }, { "epoch": 0.1865541744312248, "grad_norm": 0.2858315110206604, "learning_rate": 0.0004, "loss": 4.9154, "step": 9110 }, { "epoch": 0.18675895398603404, "grad_norm": 0.26252084970474243, "learning_rate": 0.0004, "loss": 4.9141, "step": 9120 }, { "epoch": 0.18696373354084328, "grad_norm": 0.2764761745929718, "learning_rate": 0.0004, "loss": 4.8958, "step": 9130 }, { "epoch": 0.18716851309565252, "grad_norm": 0.29110220074653625, "learning_rate": 0.0004, "loss": 4.9123, "step": 9140 }, { "epoch": 0.18737329265046176, "grad_norm": 0.2575053870677948, "learning_rate": 0.0004, "loss": 4.9099, "step": 9150 }, { "epoch": 0.18757807220527103, "grad_norm": 0.29032933712005615, "learning_rate": 0.0004, "loss": 4.9262, "step": 9160 }, { "epoch": 0.18778285176008028, "grad_norm": 0.3194615840911865, "learning_rate": 0.0004, "loss": 4.9179, "step": 9170 }, { "epoch": 0.18798763131488952, "grad_norm": 0.26347652077674866, "learning_rate": 0.0004, "loss": 4.9459, "step": 9180 }, { "epoch": 0.18819241086969876, "grad_norm": 0.2674354314804077, "learning_rate": 0.0004, "loss": 4.9468, "step": 9190 }, { "epoch": 0.188397190424508, "grad_norm": 0.2521820068359375, "learning_rate": 0.0004, "loss": 4.9324, "step": 9200 }, { "epoch": 0.18860196997931727, "grad_norm": 0.24796158075332642, "learning_rate": 0.0004, "loss": 4.9299, "step": 9210 }, { "epoch": 0.18880674953412652, "grad_norm": 0.2503640949726105, "learning_rate": 0.0004, "loss": 4.9388, "step": 9220 }, { "epoch": 0.18901152908893576, "grad_norm": 0.2608177363872528, "learning_rate": 0.0004, "loss": 4.9164, "step": 9230 }, { "epoch": 0.189216308643745, "grad_norm": 0.2645958960056305, "learning_rate": 0.0004, "loss": 4.8984, "step": 9240 }, { "epoch": 0.18942108819855424, "grad_norm": 0.26529330015182495, "learning_rate": 0.0004, "loss": 4.9369, "step": 9250 }, { "epoch": 0.18962586775336351, "grad_norm": 0.25383853912353516, "learning_rate": 0.0004, "loss": 4.9377, "step": 9260 }, { "epoch": 0.18983064730817276, "grad_norm": 0.25946077704429626, "learning_rate": 0.0004, "loss": 4.9082, "step": 9270 }, { "epoch": 0.190035426862982, "grad_norm": 0.2694838345050812, "learning_rate": 0.0004, "loss": 4.8979, "step": 9280 }, { "epoch": 0.19024020641779124, "grad_norm": 0.24734584987163544, "learning_rate": 0.0004, "loss": 4.9369, "step": 9290 }, { "epoch": 0.19044498597260048, "grad_norm": 0.9209773540496826, "learning_rate": 0.0004, "loss": 4.929, "step": 9300 }, { "epoch": 0.19064976552740975, "grad_norm": 0.26379844546318054, "learning_rate": 0.0004, "loss": 4.9336, "step": 9310 }, { "epoch": 0.190854545082219, "grad_norm": 0.2513795793056488, "learning_rate": 0.0004, "loss": 4.9192, "step": 9320 }, { "epoch": 0.19105932463702824, "grad_norm": 0.2901628613471985, "learning_rate": 0.0004, "loss": 4.9422, "step": 9330 }, { "epoch": 0.19126410419183748, "grad_norm": 0.25602996349334717, "learning_rate": 0.0004, "loss": 4.8811, "step": 9340 }, { "epoch": 0.19146888374664672, "grad_norm": 0.2577371895313263, "learning_rate": 0.0004, "loss": 4.9184, "step": 9350 }, { "epoch": 0.191673663301456, "grad_norm": 0.2465871423482895, "learning_rate": 0.0004, "loss": 4.9836, "step": 9360 }, { "epoch": 0.19187844285626524, "grad_norm": 0.2691284120082855, "learning_rate": 0.0004, "loss": 4.9593, "step": 9370 }, { "epoch": 0.19208322241107448, "grad_norm": 0.28360679745674133, "learning_rate": 0.0004, "loss": 4.9323, "step": 9380 }, { "epoch": 0.19228800196588372, "grad_norm": 0.2639329135417938, "learning_rate": 0.0004, "loss": 4.8867, "step": 9390 }, { "epoch": 0.19249278152069296, "grad_norm": 0.26145270466804504, "learning_rate": 0.0004, "loss": 4.932, "step": 9400 }, { "epoch": 0.19269756107550223, "grad_norm": 0.24542520940303802, "learning_rate": 0.0004, "loss": 4.9079, "step": 9410 }, { "epoch": 0.19290234063031148, "grad_norm": 0.3000343441963196, "learning_rate": 0.0004, "loss": 4.9274, "step": 9420 }, { "epoch": 0.19310712018512072, "grad_norm": 0.2719687521457672, "learning_rate": 0.0004, "loss": 4.8972, "step": 9430 }, { "epoch": 0.19331189973992996, "grad_norm": 0.2948462665081024, "learning_rate": 0.0004, "loss": 4.9418, "step": 9440 }, { "epoch": 0.1935166792947392, "grad_norm": 0.266860693693161, "learning_rate": 0.0004, "loss": 4.9422, "step": 9450 }, { "epoch": 0.19372145884954847, "grad_norm": 0.2978024184703827, "learning_rate": 0.0004, "loss": 4.8979, "step": 9460 }, { "epoch": 0.19392623840435771, "grad_norm": 0.24917840957641602, "learning_rate": 0.0004, "loss": 4.9035, "step": 9470 }, { "epoch": 0.19413101795916696, "grad_norm": 0.2922961115837097, "learning_rate": 0.0004, "loss": 4.9164, "step": 9480 }, { "epoch": 0.1943357975139762, "grad_norm": 0.29190903902053833, "learning_rate": 0.0004, "loss": 4.8901, "step": 9490 }, { "epoch": 0.19454057706878544, "grad_norm": 0.24561089277267456, "learning_rate": 0.0004, "loss": 4.934, "step": 9500 }, { "epoch": 0.1947453566235947, "grad_norm": 0.2607162296772003, "learning_rate": 0.0004, "loss": 4.902, "step": 9510 }, { "epoch": 0.19495013617840395, "grad_norm": 0.26801997423171997, "learning_rate": 0.0004, "loss": 4.9062, "step": 9520 }, { "epoch": 0.1951549157332132, "grad_norm": 0.2481057345867157, "learning_rate": 0.0004, "loss": 4.9288, "step": 9530 }, { "epoch": 0.19535969528802244, "grad_norm": 0.31173771619796753, "learning_rate": 0.0004, "loss": 4.9154, "step": 9540 }, { "epoch": 0.19556447484283168, "grad_norm": 0.2514681816101074, "learning_rate": 0.0004, "loss": 4.9363, "step": 9550 }, { "epoch": 0.19576925439764095, "grad_norm": 0.2622108459472656, "learning_rate": 0.0004, "loss": 4.9236, "step": 9560 }, { "epoch": 0.1959740339524502, "grad_norm": 0.25495150685310364, "learning_rate": 0.0004, "loss": 4.908, "step": 9570 }, { "epoch": 0.19617881350725944, "grad_norm": 0.26596617698669434, "learning_rate": 0.0004, "loss": 4.8762, "step": 9580 }, { "epoch": 0.19638359306206868, "grad_norm": 0.2579491436481476, "learning_rate": 0.0004, "loss": 4.9194, "step": 9590 }, { "epoch": 0.19658837261687792, "grad_norm": 0.2557850480079651, "learning_rate": 0.0004, "loss": 4.9023, "step": 9600 }, { "epoch": 0.1967931521716872, "grad_norm": 0.24434374272823334, "learning_rate": 0.0004, "loss": 4.9162, "step": 9610 }, { "epoch": 0.19699793172649643, "grad_norm": 0.25301647186279297, "learning_rate": 0.0004, "loss": 4.8545, "step": 9620 }, { "epoch": 0.19720271128130568, "grad_norm": 0.2586268186569214, "learning_rate": 0.0004, "loss": 4.9286, "step": 9630 }, { "epoch": 0.19740749083611492, "grad_norm": 0.2630811631679535, "learning_rate": 0.0004, "loss": 4.9074, "step": 9640 }, { "epoch": 0.19761227039092416, "grad_norm": 0.28417789936065674, "learning_rate": 0.0004, "loss": 4.9275, "step": 9650 }, { "epoch": 0.19781704994573343, "grad_norm": 0.2516314685344696, "learning_rate": 0.0004, "loss": 4.9021, "step": 9660 }, { "epoch": 0.19802182950054267, "grad_norm": 0.2682010531425476, "learning_rate": 0.0004, "loss": 4.9054, "step": 9670 }, { "epoch": 0.19822660905535192, "grad_norm": 0.25043565034866333, "learning_rate": 0.0004, "loss": 4.8953, "step": 9680 }, { "epoch": 0.19843138861016116, "grad_norm": 0.2632971704006195, "learning_rate": 0.0004, "loss": 4.9153, "step": 9690 }, { "epoch": 0.1986361681649704, "grad_norm": 0.26531434059143066, "learning_rate": 0.0004, "loss": 4.8796, "step": 9700 }, { "epoch": 0.19884094771977967, "grad_norm": 0.26835283637046814, "learning_rate": 0.0004, "loss": 4.9055, "step": 9710 }, { "epoch": 0.1990457272745889, "grad_norm": 0.26542049646377563, "learning_rate": 0.0004, "loss": 4.9236, "step": 9720 }, { "epoch": 0.19925050682939816, "grad_norm": 0.28944799304008484, "learning_rate": 0.0004, "loss": 4.8907, "step": 9730 }, { "epoch": 0.1994552863842074, "grad_norm": 0.2508513629436493, "learning_rate": 0.0004, "loss": 4.9092, "step": 9740 }, { "epoch": 0.19966006593901664, "grad_norm": 0.24191100895404816, "learning_rate": 0.0004, "loss": 4.8827, "step": 9750 }, { "epoch": 0.1998648454938259, "grad_norm": 0.26619407534599304, "learning_rate": 0.0004, "loss": 4.8478, "step": 9760 }, { "epoch": 0.20006962504863515, "grad_norm": 0.2426200807094574, "learning_rate": 0.0004, "loss": 4.9073, "step": 9770 }, { "epoch": 0.2002744046034444, "grad_norm": 0.26734215021133423, "learning_rate": 0.0004, "loss": 4.895, "step": 9780 }, { "epoch": 0.20047918415825364, "grad_norm": 0.2878583073616028, "learning_rate": 0.0004, "loss": 4.9005, "step": 9790 }, { "epoch": 0.20068396371306288, "grad_norm": 0.23906348645687103, "learning_rate": 0.0004, "loss": 4.9572, "step": 9800 }, { "epoch": 0.20088874326787212, "grad_norm": 0.28374752402305603, "learning_rate": 0.0004, "loss": 4.916, "step": 9810 }, { "epoch": 0.2010935228226814, "grad_norm": 0.2827334403991699, "learning_rate": 0.0004, "loss": 4.9005, "step": 9820 }, { "epoch": 0.20129830237749063, "grad_norm": 0.3524470925331116, "learning_rate": 0.0004, "loss": 4.8969, "step": 9830 }, { "epoch": 0.20150308193229988, "grad_norm": 0.2859930992126465, "learning_rate": 0.0004, "loss": 4.9207, "step": 9840 }, { "epoch": 0.20170786148710912, "grad_norm": 0.2604099214076996, "learning_rate": 0.0004, "loss": 4.8868, "step": 9850 }, { "epoch": 0.20191264104191836, "grad_norm": 0.24747997522354126, "learning_rate": 0.0004, "loss": 4.9379, "step": 9860 }, { "epoch": 0.20211742059672763, "grad_norm": 0.27505552768707275, "learning_rate": 0.0004, "loss": 4.8898, "step": 9870 }, { "epoch": 0.20232220015153687, "grad_norm": 0.25134706497192383, "learning_rate": 0.0004, "loss": 4.9083, "step": 9880 }, { "epoch": 0.20252697970634612, "grad_norm": 0.2687472999095917, "learning_rate": 0.0004, "loss": 4.9199, "step": 9890 }, { "epoch": 0.20273175926115536, "grad_norm": 0.2637028992176056, "learning_rate": 0.0004, "loss": 4.9064, "step": 9900 }, { "epoch": 0.2029365388159646, "grad_norm": 0.34653931856155396, "learning_rate": 0.0004, "loss": 4.9107, "step": 9910 }, { "epoch": 0.20314131837077387, "grad_norm": 0.2523519992828369, "learning_rate": 0.0004, "loss": 4.9172, "step": 9920 }, { "epoch": 0.2033460979255831, "grad_norm": 0.2544276714324951, "learning_rate": 0.0004, "loss": 4.928, "step": 9930 }, { "epoch": 0.20355087748039236, "grad_norm": 0.2527393698692322, "learning_rate": 0.0004, "loss": 4.9027, "step": 9940 }, { "epoch": 0.2037556570352016, "grad_norm": 0.2704786956310272, "learning_rate": 0.0004, "loss": 4.8868, "step": 9950 }, { "epoch": 0.20396043659001084, "grad_norm": 0.24265450239181519, "learning_rate": 0.0004, "loss": 4.9287, "step": 9960 }, { "epoch": 0.2041652161448201, "grad_norm": 0.26238685846328735, "learning_rate": 0.0004, "loss": 4.882, "step": 9970 }, { "epoch": 0.20436999569962935, "grad_norm": 0.27310532331466675, "learning_rate": 0.0004, "loss": 4.8909, "step": 9980 }, { "epoch": 0.2045747752544386, "grad_norm": 0.2510082721710205, "learning_rate": 0.0004, "loss": 4.8839, "step": 9990 }, { "epoch": 0.20477955480924784, "grad_norm": 0.2608467638492584, "learning_rate": 0.0004, "loss": 4.9269, "step": 10000 }, { "epoch": 0.20477955480924784, "eval_loss": 4.915364742279053, "eval_runtime": 4.3939, "eval_samples_per_second": 265.37, "eval_steps_per_second": 33.228, "step": 10000 }, { "epoch": 0.20498433436405708, "grad_norm": 0.29385286569595337, "learning_rate": 0.0004, "loss": 4.8996, "step": 10010 }, { "epoch": 0.20518911391886635, "grad_norm": 0.26717716455459595, "learning_rate": 0.0004, "loss": 4.9464, "step": 10020 }, { "epoch": 0.2053938934736756, "grad_norm": 0.24872258305549622, "learning_rate": 0.0004, "loss": 4.927, "step": 10030 }, { "epoch": 0.20559867302848484, "grad_norm": 0.3139999508857727, "learning_rate": 0.0004, "loss": 4.9129, "step": 10040 }, { "epoch": 0.20580345258329408, "grad_norm": 0.2630196511745453, "learning_rate": 0.0004, "loss": 4.9052, "step": 10050 }, { "epoch": 0.20600823213810332, "grad_norm": 0.2669818699359894, "learning_rate": 0.0004, "loss": 4.9042, "step": 10060 }, { "epoch": 0.2062130116929126, "grad_norm": 0.23856590688228607, "learning_rate": 0.0004, "loss": 4.9258, "step": 10070 }, { "epoch": 0.20641779124772183, "grad_norm": 0.269837886095047, "learning_rate": 0.0004, "loss": 4.8969, "step": 10080 }, { "epoch": 0.20662257080253107, "grad_norm": 0.2798609733581543, "learning_rate": 0.0004, "loss": 4.9189, "step": 10090 }, { "epoch": 0.20682735035734032, "grad_norm": 0.2599700391292572, "learning_rate": 0.0004, "loss": 4.9153, "step": 10100 }, { "epoch": 0.20703212991214956, "grad_norm": 0.24514122307300568, "learning_rate": 0.0004, "loss": 4.9016, "step": 10110 }, { "epoch": 0.20723690946695883, "grad_norm": 0.25042402744293213, "learning_rate": 0.0004, "loss": 4.9002, "step": 10120 }, { "epoch": 0.20744168902176807, "grad_norm": 0.32178834080696106, "learning_rate": 0.0004, "loss": 4.8801, "step": 10130 }, { "epoch": 0.20764646857657731, "grad_norm": 0.2308470606803894, "learning_rate": 0.0004, "loss": 4.8786, "step": 10140 }, { "epoch": 0.20785124813138656, "grad_norm": 0.24936145544052124, "learning_rate": 0.0004, "loss": 4.8801, "step": 10150 }, { "epoch": 0.2080560276861958, "grad_norm": 0.26884016394615173, "learning_rate": 0.0004, "loss": 4.8996, "step": 10160 }, { "epoch": 0.20826080724100507, "grad_norm": 0.24865688383579254, "learning_rate": 0.0004, "loss": 4.9331, "step": 10170 }, { "epoch": 0.2084655867958143, "grad_norm": 0.23579901456832886, "learning_rate": 0.0004, "loss": 4.8948, "step": 10180 }, { "epoch": 0.20867036635062355, "grad_norm": 0.2458990514278412, "learning_rate": 0.0004, "loss": 4.8782, "step": 10190 }, { "epoch": 0.2088751459054328, "grad_norm": 0.2518019378185272, "learning_rate": 0.0004, "loss": 4.9196, "step": 10200 }, { "epoch": 0.20907992546024204, "grad_norm": 0.25939616560935974, "learning_rate": 0.0004, "loss": 4.9116, "step": 10210 }, { "epoch": 0.2092847050150513, "grad_norm": 0.31639620661735535, "learning_rate": 0.0004, "loss": 4.8841, "step": 10220 }, { "epoch": 0.20948948456986055, "grad_norm": 0.27148041129112244, "learning_rate": 0.0004, "loss": 4.8721, "step": 10230 }, { "epoch": 0.2096942641246698, "grad_norm": 0.23913012444972992, "learning_rate": 0.0004, "loss": 4.8817, "step": 10240 }, { "epoch": 0.20989904367947904, "grad_norm": 0.27129408717155457, "learning_rate": 0.0004, "loss": 4.8802, "step": 10250 }, { "epoch": 0.21010382323428828, "grad_norm": 0.26640304923057556, "learning_rate": 0.0004, "loss": 4.8998, "step": 10260 }, { "epoch": 0.21030860278909755, "grad_norm": 0.8138028979301453, "learning_rate": 0.0004, "loss": 4.9031, "step": 10270 }, { "epoch": 0.2105133823439068, "grad_norm": 0.26214420795440674, "learning_rate": 0.0004, "loss": 4.9146, "step": 10280 }, { "epoch": 0.21071816189871603, "grad_norm": 0.2588198184967041, "learning_rate": 0.0004, "loss": 4.8749, "step": 10290 }, { "epoch": 0.21092294145352528, "grad_norm": 0.26211073994636536, "learning_rate": 0.0004, "loss": 4.8954, "step": 10300 }, { "epoch": 0.21112772100833452, "grad_norm": 0.254951149225235, "learning_rate": 0.0004, "loss": 4.8531, "step": 10310 }, { "epoch": 0.2113325005631438, "grad_norm": 0.27760276198387146, "learning_rate": 0.0004, "loss": 4.8912, "step": 10320 }, { "epoch": 0.21153728011795303, "grad_norm": 0.27546319365501404, "learning_rate": 0.0004, "loss": 4.9153, "step": 10330 }, { "epoch": 0.21174205967276227, "grad_norm": 0.248246431350708, "learning_rate": 0.0004, "loss": 4.9059, "step": 10340 }, { "epoch": 0.21194683922757152, "grad_norm": 0.2779110372066498, "learning_rate": 0.0004, "loss": 4.8846, "step": 10350 }, { "epoch": 0.21215161878238076, "grad_norm": 0.25909432768821716, "learning_rate": 0.0004, "loss": 4.9028, "step": 10360 }, { "epoch": 0.21235639833719003, "grad_norm": 0.2711811661720276, "learning_rate": 0.0004, "loss": 4.903, "step": 10370 }, { "epoch": 0.21256117789199927, "grad_norm": 0.262154221534729, "learning_rate": 0.0004, "loss": 4.8841, "step": 10380 }, { "epoch": 0.2127659574468085, "grad_norm": 0.2557555139064789, "learning_rate": 0.0004, "loss": 4.9125, "step": 10390 }, { "epoch": 0.21297073700161775, "grad_norm": 0.3354072868824005, "learning_rate": 0.0004, "loss": 4.8713, "step": 10400 }, { "epoch": 0.213175516556427, "grad_norm": 0.26146626472473145, "learning_rate": 0.0004, "loss": 4.8713, "step": 10410 }, { "epoch": 0.21338029611123627, "grad_norm": 0.2696409523487091, "learning_rate": 0.0004, "loss": 4.8715, "step": 10420 }, { "epoch": 0.2135850756660455, "grad_norm": 0.29851871728897095, "learning_rate": 0.0004, "loss": 4.8633, "step": 10430 }, { "epoch": 0.21378985522085475, "grad_norm": 0.4111309051513672, "learning_rate": 0.0004, "loss": 4.8891, "step": 10440 }, { "epoch": 0.213994634775664, "grad_norm": 0.2570556700229645, "learning_rate": 0.0004, "loss": 4.8538, "step": 10450 }, { "epoch": 0.21419941433047324, "grad_norm": 0.27073532342910767, "learning_rate": 0.0004, "loss": 4.89, "step": 10460 }, { "epoch": 0.2144041938852825, "grad_norm": 0.2688610851764679, "learning_rate": 0.0004, "loss": 4.8796, "step": 10470 }, { "epoch": 0.21460897344009175, "grad_norm": 0.27269771695137024, "learning_rate": 0.0004, "loss": 4.8793, "step": 10480 }, { "epoch": 0.214813752994901, "grad_norm": 0.2495245784521103, "learning_rate": 0.0004, "loss": 4.8698, "step": 10490 }, { "epoch": 0.21501853254971023, "grad_norm": 0.2419559359550476, "learning_rate": 0.0004, "loss": 4.8997, "step": 10500 }, { "epoch": 0.21522331210451948, "grad_norm": 0.2923683226108551, "learning_rate": 0.0004, "loss": 4.8748, "step": 10510 }, { "epoch": 0.21542809165932875, "grad_norm": 0.2532537579536438, "learning_rate": 0.0004, "loss": 4.901, "step": 10520 }, { "epoch": 0.215632871214138, "grad_norm": 0.2619161605834961, "learning_rate": 0.0004, "loss": 4.8878, "step": 10530 }, { "epoch": 0.21583765076894723, "grad_norm": 0.2929726839065552, "learning_rate": 0.0004, "loss": 4.9189, "step": 10540 }, { "epoch": 0.21604243032375647, "grad_norm": 0.26524537801742554, "learning_rate": 0.0004, "loss": 4.895, "step": 10550 }, { "epoch": 0.21624720987856572, "grad_norm": 0.2544373571872711, "learning_rate": 0.0004, "loss": 4.8792, "step": 10560 }, { "epoch": 0.21645198943337496, "grad_norm": 0.25470906496047974, "learning_rate": 0.0004, "loss": 4.8639, "step": 10570 }, { "epoch": 0.21665676898818423, "grad_norm": 0.27853649854660034, "learning_rate": 0.0004, "loss": 4.9067, "step": 10580 }, { "epoch": 0.21686154854299347, "grad_norm": 0.26324573159217834, "learning_rate": 0.0004, "loss": 4.8631, "step": 10590 }, { "epoch": 0.2170663280978027, "grad_norm": 0.2501038908958435, "learning_rate": 0.0004, "loss": 4.9081, "step": 10600 }, { "epoch": 0.21727110765261196, "grad_norm": 0.25858011841773987, "learning_rate": 0.0004, "loss": 4.8712, "step": 10610 }, { "epoch": 0.2174758872074212, "grad_norm": 0.30494385957717896, "learning_rate": 0.0004, "loss": 4.896, "step": 10620 }, { "epoch": 0.21768066676223047, "grad_norm": 0.2731782793998718, "learning_rate": 0.0004, "loss": 4.8612, "step": 10630 }, { "epoch": 0.2178854463170397, "grad_norm": 0.27870678901672363, "learning_rate": 0.0004, "loss": 4.9169, "step": 10640 }, { "epoch": 0.21809022587184895, "grad_norm": 0.2358781397342682, "learning_rate": 0.0004, "loss": 4.8979, "step": 10650 }, { "epoch": 0.2182950054266582, "grad_norm": 0.27755194902420044, "learning_rate": 0.0004, "loss": 4.8713, "step": 10660 }, { "epoch": 0.21849978498146744, "grad_norm": 0.24838866293430328, "learning_rate": 0.0004, "loss": 4.8469, "step": 10670 }, { "epoch": 0.2187045645362767, "grad_norm": 0.46490806341171265, "learning_rate": 0.0004, "loss": 4.8521, "step": 10680 }, { "epoch": 0.21890934409108595, "grad_norm": 0.29437142610549927, "learning_rate": 0.0004, "loss": 4.867, "step": 10690 }, { "epoch": 0.2191141236458952, "grad_norm": 0.31970828771591187, "learning_rate": 0.0004, "loss": 4.8879, "step": 10700 }, { "epoch": 0.21931890320070443, "grad_norm": 0.25731003284454346, "learning_rate": 0.0004, "loss": 4.8562, "step": 10710 }, { "epoch": 0.21952368275551368, "grad_norm": 0.2647119462490082, "learning_rate": 0.0004, "loss": 4.8804, "step": 10720 }, { "epoch": 0.21972846231032295, "grad_norm": 0.3134158253669739, "learning_rate": 0.0004, "loss": 4.8631, "step": 10730 }, { "epoch": 0.2199332418651322, "grad_norm": 0.2509680986404419, "learning_rate": 0.0004, "loss": 4.8757, "step": 10740 }, { "epoch": 0.22013802141994143, "grad_norm": 0.26775500178337097, "learning_rate": 0.0004, "loss": 4.8928, "step": 10750 }, { "epoch": 0.22034280097475067, "grad_norm": 0.2678174674510956, "learning_rate": 0.0004, "loss": 4.844, "step": 10760 }, { "epoch": 0.22054758052955992, "grad_norm": 0.24718265235424042, "learning_rate": 0.0004, "loss": 4.884, "step": 10770 }, { "epoch": 0.2207523600843692, "grad_norm": 0.2549584209918976, "learning_rate": 0.0004, "loss": 4.8441, "step": 10780 }, { "epoch": 0.22095713963917843, "grad_norm": 0.2551555931568146, "learning_rate": 0.0004, "loss": 4.8953, "step": 10790 }, { "epoch": 0.22116191919398767, "grad_norm": 0.25722163915634155, "learning_rate": 0.0004, "loss": 4.872, "step": 10800 }, { "epoch": 0.22136669874879691, "grad_norm": 0.25247806310653687, "learning_rate": 0.0004, "loss": 4.8753, "step": 10810 }, { "epoch": 0.22157147830360616, "grad_norm": 0.2747890055179596, "learning_rate": 0.0004, "loss": 4.8829, "step": 10820 }, { "epoch": 0.22177625785841543, "grad_norm": 0.27008911967277527, "learning_rate": 0.0004, "loss": 4.8588, "step": 10830 }, { "epoch": 0.22198103741322467, "grad_norm": 0.302053302526474, "learning_rate": 0.0004, "loss": 4.8748, "step": 10840 }, { "epoch": 0.2221858169680339, "grad_norm": 0.26521286368370056, "learning_rate": 0.0004, "loss": 4.9043, "step": 10850 }, { "epoch": 0.22239059652284315, "grad_norm": 0.24146080017089844, "learning_rate": 0.0004, "loss": 4.901, "step": 10860 }, { "epoch": 0.2225953760776524, "grad_norm": 0.24695172905921936, "learning_rate": 0.0004, "loss": 4.8892, "step": 10870 }, { "epoch": 0.22280015563246167, "grad_norm": 0.2761094868183136, "learning_rate": 0.0004, "loss": 4.8787, "step": 10880 }, { "epoch": 0.2230049351872709, "grad_norm": 0.25380679965019226, "learning_rate": 0.0004, "loss": 4.866, "step": 10890 }, { "epoch": 0.22320971474208015, "grad_norm": 0.2534860372543335, "learning_rate": 0.0004, "loss": 4.8636, "step": 10900 }, { "epoch": 0.2234144942968894, "grad_norm": 0.26680171489715576, "learning_rate": 0.0004, "loss": 4.8631, "step": 10910 }, { "epoch": 0.22361927385169864, "grad_norm": 0.25146228075027466, "learning_rate": 0.0004, "loss": 4.8753, "step": 10920 }, { "epoch": 0.2238240534065079, "grad_norm": 0.2974937856197357, "learning_rate": 0.0004, "loss": 4.8866, "step": 10930 }, { "epoch": 0.22402883296131715, "grad_norm": 0.2717348337173462, "learning_rate": 0.0004, "loss": 4.8891, "step": 10940 }, { "epoch": 0.2242336125161264, "grad_norm": 0.2560259699821472, "learning_rate": 0.0004, "loss": 4.8772, "step": 10950 }, { "epoch": 0.22443839207093563, "grad_norm": 0.27203595638275146, "learning_rate": 0.0004, "loss": 4.8685, "step": 10960 }, { "epoch": 0.22464317162574488, "grad_norm": 0.26731157302856445, "learning_rate": 0.0004, "loss": 4.8656, "step": 10970 }, { "epoch": 0.22484795118055415, "grad_norm": 0.2695630192756653, "learning_rate": 0.0004, "loss": 4.8945, "step": 10980 }, { "epoch": 0.2250527307353634, "grad_norm": 0.28685614466667175, "learning_rate": 0.0004, "loss": 4.8796, "step": 10990 }, { "epoch": 0.22525751029017263, "grad_norm": 0.26128077507019043, "learning_rate": 0.0004, "loss": 4.8915, "step": 11000 }, { "epoch": 0.22546228984498187, "grad_norm": 0.24580059945583344, "learning_rate": 0.0004, "loss": 4.8352, "step": 11010 }, { "epoch": 0.22566706939979111, "grad_norm": 0.2452968955039978, "learning_rate": 0.0004, "loss": 4.9106, "step": 11020 }, { "epoch": 0.22587184895460038, "grad_norm": 0.26197394728660583, "learning_rate": 0.0004, "loss": 4.8995, "step": 11030 }, { "epoch": 0.22607662850940963, "grad_norm": 0.3471392095088959, "learning_rate": 0.0004, "loss": 4.8674, "step": 11040 }, { "epoch": 0.22628140806421887, "grad_norm": 0.37223684787750244, "learning_rate": 0.0004, "loss": 4.8858, "step": 11050 }, { "epoch": 0.2264861876190281, "grad_norm": 0.2760046124458313, "learning_rate": 0.0004, "loss": 4.8761, "step": 11060 }, { "epoch": 0.22669096717383735, "grad_norm": 0.25126567482948303, "learning_rate": 0.0004, "loss": 4.8696, "step": 11070 }, { "epoch": 0.22689574672864662, "grad_norm": 0.2638050317764282, "learning_rate": 0.0004, "loss": 4.8597, "step": 11080 }, { "epoch": 0.22710052628345587, "grad_norm": 0.24045559763908386, "learning_rate": 0.0004, "loss": 4.8482, "step": 11090 }, { "epoch": 0.2273053058382651, "grad_norm": 0.3191254734992981, "learning_rate": 0.0004, "loss": 4.8928, "step": 11100 }, { "epoch": 0.22751008539307435, "grad_norm": 0.2639184892177582, "learning_rate": 0.0004, "loss": 4.8888, "step": 11110 }, { "epoch": 0.2277148649478836, "grad_norm": 0.2678391635417938, "learning_rate": 0.0004, "loss": 4.8455, "step": 11120 }, { "epoch": 0.22791964450269286, "grad_norm": 0.2614695131778717, "learning_rate": 0.0004, "loss": 4.8596, "step": 11130 }, { "epoch": 0.2281244240575021, "grad_norm": 0.24117058515548706, "learning_rate": 0.0004, "loss": 4.8544, "step": 11140 }, { "epoch": 0.22832920361231135, "grad_norm": 0.27370381355285645, "learning_rate": 0.0004, "loss": 4.9183, "step": 11150 }, { "epoch": 0.2285339831671206, "grad_norm": 0.2558698356151581, "learning_rate": 0.0004, "loss": 4.8651, "step": 11160 }, { "epoch": 0.22873876272192983, "grad_norm": 0.24600282311439514, "learning_rate": 0.0004, "loss": 4.8372, "step": 11170 }, { "epoch": 0.2289435422767391, "grad_norm": 0.3030242323875427, "learning_rate": 0.0004, "loss": 4.8691, "step": 11180 }, { "epoch": 0.22914832183154835, "grad_norm": 0.24733074009418488, "learning_rate": 0.0004, "loss": 4.8805, "step": 11190 }, { "epoch": 0.2293531013863576, "grad_norm": 0.27617985010147095, "learning_rate": 0.0004, "loss": 4.8768, "step": 11200 }, { "epoch": 0.22955788094116683, "grad_norm": 0.2529560327529907, "learning_rate": 0.0004, "loss": 4.8199, "step": 11210 }, { "epoch": 0.22976266049597607, "grad_norm": 0.239194855093956, "learning_rate": 0.0004, "loss": 4.8392, "step": 11220 }, { "epoch": 0.22996744005078534, "grad_norm": 0.29008886218070984, "learning_rate": 0.0004, "loss": 4.8582, "step": 11230 }, { "epoch": 0.23017221960559459, "grad_norm": 0.2568531036376953, "learning_rate": 0.0004, "loss": 4.8622, "step": 11240 }, { "epoch": 0.23037699916040383, "grad_norm": 0.25861501693725586, "learning_rate": 0.0004, "loss": 4.8598, "step": 11250 }, { "epoch": 0.23058177871521307, "grad_norm": 0.280758798122406, "learning_rate": 0.0004, "loss": 4.8872, "step": 11260 }, { "epoch": 0.2307865582700223, "grad_norm": 0.2618058919906616, "learning_rate": 0.0004, "loss": 4.8539, "step": 11270 }, { "epoch": 0.23099133782483156, "grad_norm": 0.2842026650905609, "learning_rate": 0.0004, "loss": 4.8277, "step": 11280 }, { "epoch": 0.23119611737964083, "grad_norm": 0.25333431363105774, "learning_rate": 0.0004, "loss": 4.8711, "step": 11290 }, { "epoch": 0.23140089693445007, "grad_norm": 0.22926181554794312, "learning_rate": 0.0004, "loss": 4.8375, "step": 11300 }, { "epoch": 0.2316056764892593, "grad_norm": 0.27335697412490845, "learning_rate": 0.0004, "loss": 4.8778, "step": 11310 }, { "epoch": 0.23181045604406855, "grad_norm": 0.2511597275733948, "learning_rate": 0.0004, "loss": 4.891, "step": 11320 }, { "epoch": 0.2320152355988778, "grad_norm": 0.2607283592224121, "learning_rate": 0.0004, "loss": 4.8494, "step": 11330 }, { "epoch": 0.23222001515368706, "grad_norm": 0.24405378103256226, "learning_rate": 0.0004, "loss": 4.8579, "step": 11340 }, { "epoch": 0.2324247947084963, "grad_norm": 0.3528309166431427, "learning_rate": 0.0004, "loss": 4.8861, "step": 11350 }, { "epoch": 0.23262957426330555, "grad_norm": 0.30794113874435425, "learning_rate": 0.0004, "loss": 4.8548, "step": 11360 }, { "epoch": 0.2328343538181148, "grad_norm": 0.24918349087238312, "learning_rate": 0.0004, "loss": 4.8867, "step": 11370 }, { "epoch": 0.23303913337292403, "grad_norm": 0.28293827176094055, "learning_rate": 0.0004, "loss": 4.8692, "step": 11380 }, { "epoch": 0.2332439129277333, "grad_norm": 0.27161991596221924, "learning_rate": 0.0004, "loss": 4.8822, "step": 11390 }, { "epoch": 0.23344869248254255, "grad_norm": 0.2860964834690094, "learning_rate": 0.0004, "loss": 4.8204, "step": 11400 }, { "epoch": 0.2336534720373518, "grad_norm": 0.24801798164844513, "learning_rate": 0.0004, "loss": 4.8735, "step": 11410 }, { "epoch": 0.23385825159216103, "grad_norm": 0.26853352785110474, "learning_rate": 0.0004, "loss": 4.8847, "step": 11420 }, { "epoch": 0.23406303114697027, "grad_norm": 0.27251923084259033, "learning_rate": 0.0004, "loss": 4.8839, "step": 11430 }, { "epoch": 0.23426781070177954, "grad_norm": 0.2775683104991913, "learning_rate": 0.0004, "loss": 4.9025, "step": 11440 }, { "epoch": 0.2344725902565888, "grad_norm": 0.23779477179050446, "learning_rate": 0.0004, "loss": 4.855, "step": 11450 }, { "epoch": 0.23467736981139803, "grad_norm": 0.327796071767807, "learning_rate": 0.0004, "loss": 4.8983, "step": 11460 }, { "epoch": 0.23488214936620727, "grad_norm": 0.25727519392967224, "learning_rate": 0.0004, "loss": 4.9148, "step": 11470 }, { "epoch": 0.2350869289210165, "grad_norm": 0.2634661793708801, "learning_rate": 0.0004, "loss": 4.8479, "step": 11480 }, { "epoch": 0.23529170847582578, "grad_norm": 0.24151748418807983, "learning_rate": 0.0004, "loss": 4.8902, "step": 11490 }, { "epoch": 0.23549648803063503, "grad_norm": 0.27254611253738403, "learning_rate": 0.0004, "loss": 4.848, "step": 11500 }, { "epoch": 0.23570126758544427, "grad_norm": 0.3315676748752594, "learning_rate": 0.0004, "loss": 4.8379, "step": 11510 }, { "epoch": 0.2359060471402535, "grad_norm": 0.2895018756389618, "learning_rate": 0.0004, "loss": 4.9049, "step": 11520 }, { "epoch": 0.23611082669506275, "grad_norm": 0.28068599104881287, "learning_rate": 0.0004, "loss": 4.8904, "step": 11530 }, { "epoch": 0.23631560624987202, "grad_norm": 0.27318912744522095, "learning_rate": 0.0004, "loss": 4.8829, "step": 11540 }, { "epoch": 0.23652038580468127, "grad_norm": 0.2426539659500122, "learning_rate": 0.0004, "loss": 4.8632, "step": 11550 }, { "epoch": 0.2367251653594905, "grad_norm": 0.2518153488636017, "learning_rate": 0.0004, "loss": 4.8686, "step": 11560 }, { "epoch": 0.23692994491429975, "grad_norm": 0.2427929937839508, "learning_rate": 0.0004, "loss": 4.8466, "step": 11570 }, { "epoch": 0.237134724469109, "grad_norm": 0.252975195646286, "learning_rate": 0.0004, "loss": 4.8835, "step": 11580 }, { "epoch": 0.23733950402391826, "grad_norm": 0.24528580904006958, "learning_rate": 0.0004, "loss": 4.8544, "step": 11590 }, { "epoch": 0.2375442835787275, "grad_norm": 0.24127419292926788, "learning_rate": 0.0004, "loss": 4.8496, "step": 11600 }, { "epoch": 0.23774906313353675, "grad_norm": 0.2564906179904938, "learning_rate": 0.0004, "loss": 4.844, "step": 11610 }, { "epoch": 0.237953842688346, "grad_norm": 0.24771487712860107, "learning_rate": 0.0004, "loss": 4.8822, "step": 11620 }, { "epoch": 0.23815862224315523, "grad_norm": 0.2635604739189148, "learning_rate": 0.0004, "loss": 4.862, "step": 11630 }, { "epoch": 0.2383634017979645, "grad_norm": 0.2611459493637085, "learning_rate": 0.0004, "loss": 4.8637, "step": 11640 }, { "epoch": 0.23856818135277374, "grad_norm": 0.2843973636627197, "learning_rate": 0.0004, "loss": 4.881, "step": 11650 }, { "epoch": 0.238772960907583, "grad_norm": 0.2378414124250412, "learning_rate": 0.0004, "loss": 4.8555, "step": 11660 }, { "epoch": 0.23897774046239223, "grad_norm": 0.2471654862165451, "learning_rate": 0.0004, "loss": 4.8853, "step": 11670 }, { "epoch": 0.23918252001720147, "grad_norm": 0.23446987569332123, "learning_rate": 0.0004, "loss": 4.8558, "step": 11680 }, { "epoch": 0.23938729957201074, "grad_norm": 0.2632286250591278, "learning_rate": 0.0004, "loss": 4.8768, "step": 11690 }, { "epoch": 0.23959207912681998, "grad_norm": 0.23805192112922668, "learning_rate": 0.0004, "loss": 4.8559, "step": 11700 }, { "epoch": 0.23979685868162923, "grad_norm": 0.24758028984069824, "learning_rate": 0.0004, "loss": 4.8264, "step": 11710 }, { "epoch": 0.24000163823643847, "grad_norm": 0.25618085265159607, "learning_rate": 0.0004, "loss": 4.8964, "step": 11720 }, { "epoch": 0.2402064177912477, "grad_norm": 0.27674999833106995, "learning_rate": 0.0004, "loss": 4.8612, "step": 11730 }, { "epoch": 0.24041119734605698, "grad_norm": 0.27882149815559387, "learning_rate": 0.0004, "loss": 4.879, "step": 11740 }, { "epoch": 0.24061597690086622, "grad_norm": 0.2586473226547241, "learning_rate": 0.0004, "loss": 4.8955, "step": 11750 }, { "epoch": 0.24082075645567547, "grad_norm": 0.26056963205337524, "learning_rate": 0.0004, "loss": 4.8392, "step": 11760 }, { "epoch": 0.2410255360104847, "grad_norm": 0.32773515582084656, "learning_rate": 0.0004, "loss": 4.8796, "step": 11770 }, { "epoch": 0.24123031556529395, "grad_norm": 0.4262612760066986, "learning_rate": 0.0004, "loss": 4.8597, "step": 11780 }, { "epoch": 0.24143509512010322, "grad_norm": 0.2506944537162781, "learning_rate": 0.0004, "loss": 4.845, "step": 11790 }, { "epoch": 0.24163987467491246, "grad_norm": 0.23852330446243286, "learning_rate": 0.0004, "loss": 4.8575, "step": 11800 }, { "epoch": 0.2418446542297217, "grad_norm": 0.23398999869823456, "learning_rate": 0.0004, "loss": 4.8889, "step": 11810 }, { "epoch": 0.24204943378453095, "grad_norm": 1.147854208946228, "learning_rate": 0.0004, "loss": 4.8906, "step": 11820 }, { "epoch": 0.2422542133393402, "grad_norm": 0.2868060767650604, "learning_rate": 0.0004, "loss": 4.8874, "step": 11830 }, { "epoch": 0.24245899289414946, "grad_norm": 0.26699528098106384, "learning_rate": 0.0004, "loss": 4.8525, "step": 11840 }, { "epoch": 0.2426637724489587, "grad_norm": 0.2768281102180481, "learning_rate": 0.0004, "loss": 4.8584, "step": 11850 }, { "epoch": 0.24286855200376795, "grad_norm": 0.2719232141971588, "learning_rate": 0.0004, "loss": 4.8487, "step": 11860 }, { "epoch": 0.2430733315585772, "grad_norm": 0.2644749581813812, "learning_rate": 0.0004, "loss": 4.8635, "step": 11870 }, { "epoch": 0.24327811111338643, "grad_norm": 0.27209368348121643, "learning_rate": 0.0004, "loss": 4.8635, "step": 11880 }, { "epoch": 0.2434828906681957, "grad_norm": 0.248479425907135, "learning_rate": 0.0004, "loss": 4.8584, "step": 11890 }, { "epoch": 0.24368767022300494, "grad_norm": 0.3290988504886627, "learning_rate": 0.0004, "loss": 4.8623, "step": 11900 }, { "epoch": 0.24389244977781419, "grad_norm": 0.2829190492630005, "learning_rate": 0.0004, "loss": 4.8083, "step": 11910 }, { "epoch": 0.24409722933262343, "grad_norm": 0.2845098376274109, "learning_rate": 0.0004, "loss": 4.8614, "step": 11920 }, { "epoch": 0.24430200888743267, "grad_norm": 0.2518509328365326, "learning_rate": 0.0004, "loss": 4.8812, "step": 11930 }, { "epoch": 0.24450678844224194, "grad_norm": 0.24854156374931335, "learning_rate": 0.0004, "loss": 4.8624, "step": 11940 }, { "epoch": 0.24471156799705118, "grad_norm": 0.29206472635269165, "learning_rate": 0.0004, "loss": 4.8661, "step": 11950 }, { "epoch": 0.24491634755186042, "grad_norm": 0.24712790548801422, "learning_rate": 0.0004, "loss": 4.8687, "step": 11960 }, { "epoch": 0.24512112710666967, "grad_norm": 0.25610557198524475, "learning_rate": 0.0004, "loss": 4.8644, "step": 11970 }, { "epoch": 0.2453259066614789, "grad_norm": 0.24736113846302032, "learning_rate": 0.0004, "loss": 4.862, "step": 11980 }, { "epoch": 0.24553068621628815, "grad_norm": 0.2579999268054962, "learning_rate": 0.0004, "loss": 4.8854, "step": 11990 }, { "epoch": 0.24573546577109742, "grad_norm": 0.2651893198490143, "learning_rate": 0.0004, "loss": 4.8444, "step": 12000 }, { "epoch": 0.24573546577109742, "eval_loss": 4.867031574249268, "eval_runtime": 4.2851, "eval_samples_per_second": 272.103, "eval_steps_per_second": 34.071, "step": 12000 }, { "epoch": 0.24594024532590666, "grad_norm": 0.2691151797771454, "learning_rate": 0.0004, "loss": 4.8714, "step": 12010 }, { "epoch": 0.2461450248807159, "grad_norm": 0.3087450861930847, "learning_rate": 0.0004, "loss": 4.8911, "step": 12020 }, { "epoch": 0.24634980443552515, "grad_norm": 0.5508211851119995, "learning_rate": 0.0004, "loss": 4.829, "step": 12030 }, { "epoch": 0.2465545839903344, "grad_norm": 0.2314455211162567, "learning_rate": 0.0004, "loss": 4.8503, "step": 12040 }, { "epoch": 0.24675936354514366, "grad_norm": 0.23762647807598114, "learning_rate": 0.0004, "loss": 4.8723, "step": 12050 }, { "epoch": 0.2469641430999529, "grad_norm": 0.24536122381687164, "learning_rate": 0.0004, "loss": 4.8459, "step": 12060 }, { "epoch": 0.24716892265476215, "grad_norm": 0.24700681865215302, "learning_rate": 0.0004, "loss": 4.8566, "step": 12070 }, { "epoch": 0.2473737022095714, "grad_norm": 0.34846436977386475, "learning_rate": 0.0004, "loss": 4.8286, "step": 12080 }, { "epoch": 0.24757848176438063, "grad_norm": 0.3168438971042633, "learning_rate": 0.0004, "loss": 4.8503, "step": 12090 }, { "epoch": 0.2477832613191899, "grad_norm": 0.2919858396053314, "learning_rate": 0.0004, "loss": 4.8207, "step": 12100 }, { "epoch": 0.24798804087399914, "grad_norm": 0.3359951674938202, "learning_rate": 0.0004, "loss": 4.8341, "step": 12110 }, { "epoch": 0.2481928204288084, "grad_norm": 0.4131545126438141, "learning_rate": 0.0004, "loss": 4.8733, "step": 12120 }, { "epoch": 0.24839759998361763, "grad_norm": 0.2484077513217926, "learning_rate": 0.0004, "loss": 4.8611, "step": 12130 }, { "epoch": 0.24860237953842687, "grad_norm": 0.2616230845451355, "learning_rate": 0.0004, "loss": 4.8953, "step": 12140 }, { "epoch": 0.24880715909323614, "grad_norm": 0.2490583062171936, "learning_rate": 0.0004, "loss": 4.8732, "step": 12150 }, { "epoch": 0.24901193864804538, "grad_norm": 0.2691861093044281, "learning_rate": 0.0004, "loss": 4.8612, "step": 12160 }, { "epoch": 0.24921671820285463, "grad_norm": 0.2695310413837433, "learning_rate": 0.0004, "loss": 4.8501, "step": 12170 }, { "epoch": 0.24942149775766387, "grad_norm": 0.2886080741882324, "learning_rate": 0.0004, "loss": 4.8829, "step": 12180 }, { "epoch": 0.2496262773124731, "grad_norm": 0.2912190854549408, "learning_rate": 0.0004, "loss": 4.828, "step": 12190 }, { "epoch": 0.24983105686728238, "grad_norm": 0.27903711795806885, "learning_rate": 0.0004, "loss": 4.8104, "step": 12200 }, { "epoch": 0.2500358364220916, "grad_norm": 0.2646547853946686, "learning_rate": 0.0004, "loss": 4.838, "step": 12210 }, { "epoch": 0.2502406159769009, "grad_norm": 0.2762759029865265, "learning_rate": 0.0004, "loss": 4.8467, "step": 12220 }, { "epoch": 0.25044539553171014, "grad_norm": 0.23205457627773285, "learning_rate": 0.0004, "loss": 4.8558, "step": 12230 }, { "epoch": 0.2506501750865194, "grad_norm": 0.2654586732387543, "learning_rate": 0.0004, "loss": 4.8856, "step": 12240 }, { "epoch": 0.2508549546413286, "grad_norm": 0.26850640773773193, "learning_rate": 0.0004, "loss": 4.8427, "step": 12250 }, { "epoch": 0.25105973419613786, "grad_norm": 0.27965614199638367, "learning_rate": 0.0004, "loss": 4.8214, "step": 12260 }, { "epoch": 0.2512645137509471, "grad_norm": 0.24927881360054016, "learning_rate": 0.0004, "loss": 4.867, "step": 12270 }, { "epoch": 0.25146929330575635, "grad_norm": 0.2590121924877167, "learning_rate": 0.0004, "loss": 4.8543, "step": 12280 }, { "epoch": 0.2516740728605656, "grad_norm": 0.2736886739730835, "learning_rate": 0.0004, "loss": 4.8594, "step": 12290 }, { "epoch": 0.25187885241537483, "grad_norm": 0.27250829339027405, "learning_rate": 0.0004, "loss": 4.8308, "step": 12300 }, { "epoch": 0.2520836319701841, "grad_norm": 0.25367042422294617, "learning_rate": 0.0004, "loss": 4.8708, "step": 12310 }, { "epoch": 0.2522884115249934, "grad_norm": 0.2685873806476593, "learning_rate": 0.0004, "loss": 4.8263, "step": 12320 }, { "epoch": 0.2524931910798026, "grad_norm": 0.24380801618099213, "learning_rate": 0.0004, "loss": 4.8282, "step": 12330 }, { "epoch": 0.25269797063461186, "grad_norm": 0.25229397416114807, "learning_rate": 0.0004, "loss": 4.8737, "step": 12340 }, { "epoch": 0.2529027501894211, "grad_norm": 0.2535671293735504, "learning_rate": 0.0004, "loss": 4.8529, "step": 12350 }, { "epoch": 0.25310752974423034, "grad_norm": 0.25247514247894287, "learning_rate": 0.0004, "loss": 4.8421, "step": 12360 }, { "epoch": 0.2533123092990396, "grad_norm": 0.27063506841659546, "learning_rate": 0.0004, "loss": 4.8642, "step": 12370 }, { "epoch": 0.2535170888538488, "grad_norm": 0.2680909335613251, "learning_rate": 0.0004, "loss": 4.8485, "step": 12380 }, { "epoch": 0.25372186840865807, "grad_norm": 0.24620793759822845, "learning_rate": 0.0004, "loss": 4.8643, "step": 12390 }, { "epoch": 0.2539266479634673, "grad_norm": 0.47949567437171936, "learning_rate": 0.0004, "loss": 4.8472, "step": 12400 }, { "epoch": 0.25413142751827655, "grad_norm": 0.2958216071128845, "learning_rate": 0.0004, "loss": 4.8684, "step": 12410 }, { "epoch": 0.2543362070730858, "grad_norm": 0.2521269917488098, "learning_rate": 0.0004, "loss": 4.8361, "step": 12420 }, { "epoch": 0.2545409866278951, "grad_norm": 0.2519783079624176, "learning_rate": 0.0004, "loss": 4.8443, "step": 12430 }, { "epoch": 0.25474576618270434, "grad_norm": 0.2367222160100937, "learning_rate": 0.0004, "loss": 4.8228, "step": 12440 }, { "epoch": 0.2549505457375136, "grad_norm": 0.26444026827812195, "learning_rate": 0.0004, "loss": 4.8343, "step": 12450 }, { "epoch": 0.2551553252923228, "grad_norm": 0.27004826068878174, "learning_rate": 0.0004, "loss": 4.8437, "step": 12460 }, { "epoch": 0.25536010484713206, "grad_norm": 0.25829994678497314, "learning_rate": 0.0004, "loss": 4.8329, "step": 12470 }, { "epoch": 0.2555648844019413, "grad_norm": 0.24384234845638275, "learning_rate": 0.0004, "loss": 4.8512, "step": 12480 }, { "epoch": 0.25576966395675055, "grad_norm": 0.2575257420539856, "learning_rate": 0.0004, "loss": 4.8562, "step": 12490 }, { "epoch": 0.2559744435115598, "grad_norm": 0.26241886615753174, "learning_rate": 0.0004, "loss": 4.8509, "step": 12500 }, { "epoch": 0.25617922306636903, "grad_norm": 0.25610288977622986, "learning_rate": 0.0004, "loss": 4.8747, "step": 12510 }, { "epoch": 0.2563840026211783, "grad_norm": 0.26983582973480225, "learning_rate": 0.0004, "loss": 4.8247, "step": 12520 }, { "epoch": 0.2565887821759876, "grad_norm": 0.24773332476615906, "learning_rate": 0.0004, "loss": 4.8414, "step": 12530 }, { "epoch": 0.2567935617307968, "grad_norm": 0.3391345143318176, "learning_rate": 0.0004, "loss": 4.8386, "step": 12540 }, { "epoch": 0.25699834128560606, "grad_norm": 0.2731577754020691, "learning_rate": 0.0004, "loss": 4.8222, "step": 12550 }, { "epoch": 0.2572031208404153, "grad_norm": 0.24546635150909424, "learning_rate": 0.0004, "loss": 4.8283, "step": 12560 }, { "epoch": 0.25740790039522454, "grad_norm": 0.2784971296787262, "learning_rate": 0.0004, "loss": 4.8522, "step": 12570 }, { "epoch": 0.2576126799500338, "grad_norm": 0.25216221809387207, "learning_rate": 0.0004, "loss": 4.8627, "step": 12580 }, { "epoch": 0.257817459504843, "grad_norm": 0.24354113638401031, "learning_rate": 0.0004, "loss": 4.8624, "step": 12590 }, { "epoch": 0.25802223905965227, "grad_norm": 0.2683369815349579, "learning_rate": 0.0004, "loss": 4.8449, "step": 12600 }, { "epoch": 0.2582270186144615, "grad_norm": 0.24807274341583252, "learning_rate": 0.0004, "loss": 4.8605, "step": 12610 }, { "epoch": 0.25843179816927075, "grad_norm": 0.2839908301830292, "learning_rate": 0.0004, "loss": 4.8308, "step": 12620 }, { "epoch": 0.25863657772408005, "grad_norm": 0.27576714754104614, "learning_rate": 0.0004, "loss": 4.8367, "step": 12630 }, { "epoch": 0.2588413572788893, "grad_norm": 0.2834664583206177, "learning_rate": 0.0004, "loss": 4.8193, "step": 12640 }, { "epoch": 0.25904613683369854, "grad_norm": 0.26658859848976135, "learning_rate": 0.0004, "loss": 4.8781, "step": 12650 }, { "epoch": 0.2592509163885078, "grad_norm": 0.2482862025499344, "learning_rate": 0.0004, "loss": 4.8382, "step": 12660 }, { "epoch": 0.259455695943317, "grad_norm": 0.2739425599575043, "learning_rate": 0.0004, "loss": 4.8662, "step": 12670 }, { "epoch": 0.25966047549812626, "grad_norm": 0.2573910653591156, "learning_rate": 0.0004, "loss": 4.8293, "step": 12680 }, { "epoch": 0.2598652550529355, "grad_norm": 0.24086612462997437, "learning_rate": 0.0004, "loss": 4.8442, "step": 12690 }, { "epoch": 0.26007003460774475, "grad_norm": 0.28721368312835693, "learning_rate": 0.0004, "loss": 4.8473, "step": 12700 }, { "epoch": 0.260274814162554, "grad_norm": 0.28750866651535034, "learning_rate": 0.0004, "loss": 4.8133, "step": 12710 }, { "epoch": 0.26047959371736323, "grad_norm": 0.25055843591690063, "learning_rate": 0.0004, "loss": 4.8315, "step": 12720 }, { "epoch": 0.26068437327217253, "grad_norm": 0.2917591333389282, "learning_rate": 0.0004, "loss": 4.8668, "step": 12730 }, { "epoch": 0.2608891528269818, "grad_norm": 0.2694515287876129, "learning_rate": 0.0004, "loss": 4.8818, "step": 12740 }, { "epoch": 0.261093932381791, "grad_norm": 0.3116830289363861, "learning_rate": 0.0004, "loss": 4.8163, "step": 12750 }, { "epoch": 0.26129871193660026, "grad_norm": 0.28766903281211853, "learning_rate": 0.0004, "loss": 4.8063, "step": 12760 }, { "epoch": 0.2615034914914095, "grad_norm": 0.2460690587759018, "learning_rate": 0.0004, "loss": 4.8273, "step": 12770 }, { "epoch": 0.26170827104621874, "grad_norm": 0.2663492262363434, "learning_rate": 0.0004, "loss": 4.8314, "step": 12780 }, { "epoch": 0.261913050601028, "grad_norm": 0.32743126153945923, "learning_rate": 0.0004, "loss": 4.844, "step": 12790 }, { "epoch": 0.26211783015583723, "grad_norm": 0.26597467064857483, "learning_rate": 0.0004, "loss": 4.8344, "step": 12800 }, { "epoch": 0.26232260971064647, "grad_norm": 0.2584017515182495, "learning_rate": 0.0004, "loss": 4.7981, "step": 12810 }, { "epoch": 0.2625273892654557, "grad_norm": 0.26215118169784546, "learning_rate": 0.0004, "loss": 4.8678, "step": 12820 }, { "epoch": 0.262732168820265, "grad_norm": 0.3123892545700073, "learning_rate": 0.0004, "loss": 4.8201, "step": 12830 }, { "epoch": 0.26293694837507425, "grad_norm": 0.25252947211265564, "learning_rate": 0.0004, "loss": 4.8045, "step": 12840 }, { "epoch": 0.2631417279298835, "grad_norm": 0.3082652688026428, "learning_rate": 0.0004, "loss": 4.857, "step": 12850 }, { "epoch": 0.26334650748469274, "grad_norm": 0.2548525333404541, "learning_rate": 0.0004, "loss": 4.8549, "step": 12860 }, { "epoch": 0.263551287039502, "grad_norm": 0.3002852499485016, "learning_rate": 0.0004, "loss": 4.835, "step": 12870 }, { "epoch": 0.2637560665943112, "grad_norm": 0.2531322240829468, "learning_rate": 0.0004, "loss": 4.8445, "step": 12880 }, { "epoch": 0.26396084614912046, "grad_norm": 0.25382447242736816, "learning_rate": 0.0004, "loss": 4.8509, "step": 12890 }, { "epoch": 0.2641656257039297, "grad_norm": 0.24311015009880066, "learning_rate": 0.0004, "loss": 4.8653, "step": 12900 }, { "epoch": 0.26437040525873895, "grad_norm": 0.28109368681907654, "learning_rate": 0.0004, "loss": 4.8036, "step": 12910 }, { "epoch": 0.2645751848135482, "grad_norm": 0.3440668284893036, "learning_rate": 0.0004, "loss": 4.8436, "step": 12920 }, { "epoch": 0.2647799643683575, "grad_norm": 0.2777434289455414, "learning_rate": 0.0004, "loss": 4.8644, "step": 12930 }, { "epoch": 0.26498474392316673, "grad_norm": 0.2744606137275696, "learning_rate": 0.0004, "loss": 4.8402, "step": 12940 }, { "epoch": 0.265189523477976, "grad_norm": 0.27684512734413147, "learning_rate": 0.0004, "loss": 4.8427, "step": 12950 }, { "epoch": 0.2653943030327852, "grad_norm": 0.2579965889453888, "learning_rate": 0.0004, "loss": 4.8605, "step": 12960 }, { "epoch": 0.26559908258759446, "grad_norm": 0.2631833851337433, "learning_rate": 0.0004, "loss": 4.8222, "step": 12970 }, { "epoch": 0.2658038621424037, "grad_norm": 0.2552182674407959, "learning_rate": 0.0004, "loss": 4.8645, "step": 12980 }, { "epoch": 0.26600864169721294, "grad_norm": 0.29956743121147156, "learning_rate": 0.0004, "loss": 4.8358, "step": 12990 }, { "epoch": 0.2662134212520222, "grad_norm": 0.28119608759880066, "learning_rate": 0.0004, "loss": 4.8141, "step": 13000 }, { "epoch": 0.26641820080683143, "grad_norm": 0.26227396726608276, "learning_rate": 0.0004, "loss": 4.8162, "step": 13010 }, { "epoch": 0.26662298036164067, "grad_norm": 0.25072818994522095, "learning_rate": 0.0004, "loss": 4.8293, "step": 13020 }, { "epoch": 0.26682775991644997, "grad_norm": 0.2739667296409607, "learning_rate": 0.0004, "loss": 4.8323, "step": 13030 }, { "epoch": 0.2670325394712592, "grad_norm": 0.2423141598701477, "learning_rate": 0.0004, "loss": 4.8708, "step": 13040 }, { "epoch": 0.26723731902606845, "grad_norm": 0.2485387921333313, "learning_rate": 0.0004, "loss": 4.8406, "step": 13050 }, { "epoch": 0.2674420985808777, "grad_norm": 0.25066694617271423, "learning_rate": 0.0004, "loss": 4.8422, "step": 13060 }, { "epoch": 0.26764687813568694, "grad_norm": 0.3027220070362091, "learning_rate": 0.0004, "loss": 4.8095, "step": 13070 }, { "epoch": 0.2678516576904962, "grad_norm": 0.2532387673854828, "learning_rate": 0.0004, "loss": 4.8387, "step": 13080 }, { "epoch": 0.2680564372453054, "grad_norm": 0.2525840699672699, "learning_rate": 0.0004, "loss": 4.8176, "step": 13090 }, { "epoch": 0.26826121680011467, "grad_norm": 0.38074547052383423, "learning_rate": 0.0004, "loss": 4.8854, "step": 13100 }, { "epoch": 0.2684659963549239, "grad_norm": 0.2759318947792053, "learning_rate": 0.0004, "loss": 4.8315, "step": 13110 }, { "epoch": 0.26867077590973315, "grad_norm": 0.2519468069076538, "learning_rate": 0.0004, "loss": 4.8259, "step": 13120 }, { "epoch": 0.2688755554645424, "grad_norm": 0.269398033618927, "learning_rate": 0.0004, "loss": 4.8357, "step": 13130 }, { "epoch": 0.2690803350193517, "grad_norm": 0.2784993052482605, "learning_rate": 0.0004, "loss": 4.8182, "step": 13140 }, { "epoch": 0.26928511457416093, "grad_norm": 0.2523784935474396, "learning_rate": 0.0004, "loss": 4.8344, "step": 13150 }, { "epoch": 0.2694898941289702, "grad_norm": 0.2615765631198883, "learning_rate": 0.0004, "loss": 4.8556, "step": 13160 }, { "epoch": 0.2696946736837794, "grad_norm": 0.2687089443206787, "learning_rate": 0.0004, "loss": 4.8339, "step": 13170 }, { "epoch": 0.26989945323858866, "grad_norm": 0.2403348684310913, "learning_rate": 0.0004, "loss": 4.8512, "step": 13180 }, { "epoch": 0.2701042327933979, "grad_norm": 0.26277419924736023, "learning_rate": 0.0004, "loss": 4.845, "step": 13190 }, { "epoch": 0.27030901234820714, "grad_norm": 0.2699856162071228, "learning_rate": 0.0004, "loss": 4.8305, "step": 13200 }, { "epoch": 0.2705137919030164, "grad_norm": 0.2603802978992462, "learning_rate": 0.0004, "loss": 4.8595, "step": 13210 }, { "epoch": 0.27071857145782563, "grad_norm": 0.24295683205127716, "learning_rate": 0.0004, "loss": 4.8334, "step": 13220 }, { "epoch": 0.27092335101263487, "grad_norm": 0.26295214891433716, "learning_rate": 0.0004, "loss": 4.8032, "step": 13230 }, { "epoch": 0.27112813056744417, "grad_norm": 0.27498677372932434, "learning_rate": 0.0004, "loss": 4.8624, "step": 13240 }, { "epoch": 0.2713329101222534, "grad_norm": 0.27875176072120667, "learning_rate": 0.0004, "loss": 4.7901, "step": 13250 }, { "epoch": 0.27153768967706265, "grad_norm": 0.28385552763938904, "learning_rate": 0.0004, "loss": 4.8203, "step": 13260 }, { "epoch": 0.2717424692318719, "grad_norm": 0.2722179889678955, "learning_rate": 0.0004, "loss": 4.8618, "step": 13270 }, { "epoch": 0.27194724878668114, "grad_norm": 0.32632526755332947, "learning_rate": 0.0004, "loss": 4.8253, "step": 13280 }, { "epoch": 0.2721520283414904, "grad_norm": 0.33001449704170227, "learning_rate": 0.0004, "loss": 4.8079, "step": 13290 }, { "epoch": 0.2723568078962996, "grad_norm": 0.2521580755710602, "learning_rate": 0.0004, "loss": 4.8104, "step": 13300 }, { "epoch": 0.27256158745110887, "grad_norm": 0.25663989782333374, "learning_rate": 0.0004, "loss": 4.8222, "step": 13310 }, { "epoch": 0.2727663670059181, "grad_norm": 0.30672553181648254, "learning_rate": 0.0004, "loss": 4.8358, "step": 13320 }, { "epoch": 0.27297114656072735, "grad_norm": 0.2527655363082886, "learning_rate": 0.0004, "loss": 4.8398, "step": 13330 }, { "epoch": 0.27317592611553665, "grad_norm": 0.2610364258289337, "learning_rate": 0.0004, "loss": 4.8168, "step": 13340 }, { "epoch": 0.2733807056703459, "grad_norm": 0.26267045736312866, "learning_rate": 0.0004, "loss": 4.7884, "step": 13350 }, { "epoch": 0.27358548522515513, "grad_norm": 0.29419398307800293, "learning_rate": 0.0004, "loss": 4.8012, "step": 13360 }, { "epoch": 0.2737902647799644, "grad_norm": 0.30731770396232605, "learning_rate": 0.0004, "loss": 4.8163, "step": 13370 }, { "epoch": 0.2739950443347736, "grad_norm": 0.29380232095718384, "learning_rate": 0.0004, "loss": 4.8384, "step": 13380 }, { "epoch": 0.27419982388958286, "grad_norm": 0.2595800459384918, "learning_rate": 0.0004, "loss": 4.8695, "step": 13390 }, { "epoch": 0.2744046034443921, "grad_norm": 0.2919449508190155, "learning_rate": 0.0004, "loss": 4.8487, "step": 13400 }, { "epoch": 0.27460938299920135, "grad_norm": 0.34679651260375977, "learning_rate": 0.0004, "loss": 4.866, "step": 13410 }, { "epoch": 0.2748141625540106, "grad_norm": 0.26811766624450684, "learning_rate": 0.0004, "loss": 4.7992, "step": 13420 }, { "epoch": 0.27501894210881983, "grad_norm": 0.2668153941631317, "learning_rate": 0.0004, "loss": 4.795, "step": 13430 }, { "epoch": 0.27522372166362913, "grad_norm": 0.2654266953468323, "learning_rate": 0.0004, "loss": 4.7881, "step": 13440 }, { "epoch": 0.27542850121843837, "grad_norm": 0.27033060789108276, "learning_rate": 0.0004, "loss": 4.848, "step": 13450 }, { "epoch": 0.2756332807732476, "grad_norm": 0.27064186334609985, "learning_rate": 0.0004, "loss": 4.8293, "step": 13460 }, { "epoch": 0.27583806032805686, "grad_norm": 0.2640422582626343, "learning_rate": 0.0004, "loss": 4.8109, "step": 13470 }, { "epoch": 0.2760428398828661, "grad_norm": 0.2696080505847931, "learning_rate": 0.0004, "loss": 4.8407, "step": 13480 }, { "epoch": 0.27624761943767534, "grad_norm": 0.27713286876678467, "learning_rate": 0.0004, "loss": 4.8246, "step": 13490 }, { "epoch": 0.2764523989924846, "grad_norm": 0.27920326590538025, "learning_rate": 0.0004, "loss": 4.8669, "step": 13500 }, { "epoch": 0.2766571785472938, "grad_norm": 0.26664379239082336, "learning_rate": 0.0004, "loss": 4.7622, "step": 13510 }, { "epoch": 0.27686195810210307, "grad_norm": 0.25868895649909973, "learning_rate": 0.0004, "loss": 4.8356, "step": 13520 }, { "epoch": 0.2770667376569123, "grad_norm": 0.28665560483932495, "learning_rate": 0.0004, "loss": 4.8663, "step": 13530 }, { "epoch": 0.2772715172117216, "grad_norm": 0.24480880796909332, "learning_rate": 0.0004, "loss": 4.8462, "step": 13540 }, { "epoch": 0.27747629676653085, "grad_norm": 0.26968759298324585, "learning_rate": 0.0004, "loss": 4.8339, "step": 13550 }, { "epoch": 0.2776810763213401, "grad_norm": 0.24893125891685486, "learning_rate": 0.0004, "loss": 4.8065, "step": 13560 }, { "epoch": 0.27788585587614933, "grad_norm": 0.3047983646392822, "learning_rate": 0.0004, "loss": 4.7798, "step": 13570 }, { "epoch": 0.2780906354309586, "grad_norm": 0.2820693254470825, "learning_rate": 0.0004, "loss": 4.8267, "step": 13580 }, { "epoch": 0.2782954149857678, "grad_norm": 0.26072901487350464, "learning_rate": 0.0004, "loss": 4.8529, "step": 13590 }, { "epoch": 0.27850019454057706, "grad_norm": 0.4209215044975281, "learning_rate": 0.0004, "loss": 4.8032, "step": 13600 }, { "epoch": 0.2787049740953863, "grad_norm": 0.26476553082466125, "learning_rate": 0.0004, "loss": 4.8548, "step": 13610 }, { "epoch": 0.27890975365019555, "grad_norm": 0.28009968996047974, "learning_rate": 0.0004, "loss": 4.8097, "step": 13620 }, { "epoch": 0.2791145332050048, "grad_norm": 0.2644807994365692, "learning_rate": 0.0004, "loss": 4.8619, "step": 13630 }, { "epoch": 0.2793193127598141, "grad_norm": 0.2874789535999298, "learning_rate": 0.0004, "loss": 4.8178, "step": 13640 }, { "epoch": 0.27952409231462333, "grad_norm": 0.29766467213630676, "learning_rate": 0.0004, "loss": 4.8629, "step": 13650 }, { "epoch": 0.27972887186943257, "grad_norm": 0.2470841407775879, "learning_rate": 0.0004, "loss": 4.8008, "step": 13660 }, { "epoch": 0.2799336514242418, "grad_norm": 0.29466772079467773, "learning_rate": 0.0004, "loss": 4.8343, "step": 13670 }, { "epoch": 0.28013843097905106, "grad_norm": 0.24835602939128876, "learning_rate": 0.0004, "loss": 4.7946, "step": 13680 }, { "epoch": 0.2803432105338603, "grad_norm": 0.24725662171840668, "learning_rate": 0.0004, "loss": 4.8112, "step": 13690 }, { "epoch": 0.28054799008866954, "grad_norm": 0.34971049427986145, "learning_rate": 0.0004, "loss": 4.807, "step": 13700 }, { "epoch": 0.2807527696434788, "grad_norm": 0.2836594581604004, "learning_rate": 0.0004, "loss": 4.8198, "step": 13710 }, { "epoch": 0.280957549198288, "grad_norm": 0.25764623284339905, "learning_rate": 0.0004, "loss": 4.8185, "step": 13720 }, { "epoch": 0.28116232875309727, "grad_norm": 0.2625957131385803, "learning_rate": 0.0004, "loss": 4.8503, "step": 13730 }, { "epoch": 0.28136710830790657, "grad_norm": 0.27676263451576233, "learning_rate": 0.0004, "loss": 4.8225, "step": 13740 }, { "epoch": 0.2815718878627158, "grad_norm": 0.26145434379577637, "learning_rate": 0.0004, "loss": 4.8447, "step": 13750 }, { "epoch": 0.28177666741752505, "grad_norm": 0.269905686378479, "learning_rate": 0.0004, "loss": 4.8192, "step": 13760 }, { "epoch": 0.2819814469723343, "grad_norm": 0.27324187755584717, "learning_rate": 0.0004, "loss": 4.8376, "step": 13770 }, { "epoch": 0.28218622652714354, "grad_norm": 0.24123696982860565, "learning_rate": 0.0004, "loss": 4.8201, "step": 13780 }, { "epoch": 0.2823910060819528, "grad_norm": 0.29068803787231445, "learning_rate": 0.0004, "loss": 4.8178, "step": 13790 }, { "epoch": 0.282595785636762, "grad_norm": 0.31595665216445923, "learning_rate": 0.0004, "loss": 4.8576, "step": 13800 }, { "epoch": 0.28280056519157126, "grad_norm": 0.2802872955799103, "learning_rate": 0.0004, "loss": 4.8264, "step": 13810 }, { "epoch": 0.2830053447463805, "grad_norm": 0.26241886615753174, "learning_rate": 0.0004, "loss": 4.827, "step": 13820 }, { "epoch": 0.28321012430118975, "grad_norm": 0.2513846158981323, "learning_rate": 0.0004, "loss": 4.8288, "step": 13830 }, { "epoch": 0.283414903855999, "grad_norm": 0.28127583861351013, "learning_rate": 0.0004, "loss": 4.8314, "step": 13840 }, { "epoch": 0.2836196834108083, "grad_norm": 0.3425098657608032, "learning_rate": 0.0004, "loss": 4.8583, "step": 13850 }, { "epoch": 0.28382446296561753, "grad_norm": 0.2766761779785156, "learning_rate": 0.0004, "loss": 4.8232, "step": 13860 }, { "epoch": 0.2840292425204268, "grad_norm": 0.252049058675766, "learning_rate": 0.0004, "loss": 4.8189, "step": 13870 }, { "epoch": 0.284234022075236, "grad_norm": 0.26199719309806824, "learning_rate": 0.0004, "loss": 4.8369, "step": 13880 }, { "epoch": 0.28443880163004526, "grad_norm": 0.2577882707118988, "learning_rate": 0.0004, "loss": 4.8116, "step": 13890 }, { "epoch": 0.2846435811848545, "grad_norm": 0.2483375519514084, "learning_rate": 0.0004, "loss": 4.8511, "step": 13900 }, { "epoch": 0.28484836073966374, "grad_norm": 0.2529613673686981, "learning_rate": 0.0004, "loss": 4.8162, "step": 13910 }, { "epoch": 0.285053140294473, "grad_norm": 0.26325222849845886, "learning_rate": 0.0004, "loss": 4.8128, "step": 13920 }, { "epoch": 0.2852579198492822, "grad_norm": 0.25958263874053955, "learning_rate": 0.0004, "loss": 4.8054, "step": 13930 }, { "epoch": 0.28546269940409147, "grad_norm": 0.2949371933937073, "learning_rate": 0.0004, "loss": 4.8334, "step": 13940 }, { "epoch": 0.28566747895890077, "grad_norm": 0.27521225810050964, "learning_rate": 0.0004, "loss": 4.819, "step": 13950 }, { "epoch": 0.28587225851371, "grad_norm": 0.2950633764266968, "learning_rate": 0.0004, "loss": 4.8464, "step": 13960 }, { "epoch": 0.28607703806851925, "grad_norm": 0.29058393836021423, "learning_rate": 0.0004, "loss": 4.8675, "step": 13970 }, { "epoch": 0.2862818176233285, "grad_norm": 0.24924442172050476, "learning_rate": 0.0004, "loss": 4.8347, "step": 13980 }, { "epoch": 0.28648659717813774, "grad_norm": 0.25537240505218506, "learning_rate": 0.0004, "loss": 4.8463, "step": 13990 }, { "epoch": 0.286691376732947, "grad_norm": 0.2523159980773926, "learning_rate": 0.0004, "loss": 4.7991, "step": 14000 }, { "epoch": 0.286691376732947, "eval_loss": 4.831753253936768, "eval_runtime": 4.2725, "eval_samples_per_second": 272.908, "eval_steps_per_second": 34.172, "step": 14000 }, { "epoch": 0.2868961562877562, "grad_norm": 0.23810714483261108, "learning_rate": 0.0004, "loss": 4.8013, "step": 14010 }, { "epoch": 0.28710093584256546, "grad_norm": 0.2689630389213562, "learning_rate": 0.0004, "loss": 4.8474, "step": 14020 }, { "epoch": 0.2873057153973747, "grad_norm": 0.2767544686794281, "learning_rate": 0.0004, "loss": 4.8345, "step": 14030 }, { "epoch": 0.28751049495218395, "grad_norm": 0.687874972820282, "learning_rate": 0.0004, "loss": 4.8139, "step": 14040 }, { "epoch": 0.28771527450699325, "grad_norm": 0.24040289223194122, "learning_rate": 0.0004, "loss": 4.8437, "step": 14050 }, { "epoch": 0.2879200540618025, "grad_norm": 0.250606507062912, "learning_rate": 0.0004, "loss": 4.8557, "step": 14060 }, { "epoch": 0.28812483361661173, "grad_norm": 0.29520824551582336, "learning_rate": 0.0004, "loss": 4.8032, "step": 14070 }, { "epoch": 0.288329613171421, "grad_norm": 0.27100345492362976, "learning_rate": 0.0004, "loss": 4.8219, "step": 14080 }, { "epoch": 0.2885343927262302, "grad_norm": 0.2497226595878601, "learning_rate": 0.0004, "loss": 4.802, "step": 14090 }, { "epoch": 0.28873917228103946, "grad_norm": 0.2438780963420868, "learning_rate": 0.0004, "loss": 4.8118, "step": 14100 }, { "epoch": 0.2889439518358487, "grad_norm": 0.2748144268989563, "learning_rate": 0.0004, "loss": 4.8313, "step": 14110 }, { "epoch": 0.28914873139065794, "grad_norm": 0.2574600279331207, "learning_rate": 0.0004, "loss": 4.8441, "step": 14120 }, { "epoch": 0.2893535109454672, "grad_norm": 0.27852386236190796, "learning_rate": 0.0004, "loss": 4.8072, "step": 14130 }, { "epoch": 0.2895582905002764, "grad_norm": 0.26286157965660095, "learning_rate": 0.0004, "loss": 4.806, "step": 14140 }, { "epoch": 0.2897630700550857, "grad_norm": 0.26902690529823303, "learning_rate": 0.0004, "loss": 4.8018, "step": 14150 }, { "epoch": 0.28996784960989497, "grad_norm": 0.2873010039329529, "learning_rate": 0.0004, "loss": 4.829, "step": 14160 }, { "epoch": 0.2901726291647042, "grad_norm": 0.3127599358558655, "learning_rate": 0.0004, "loss": 4.8351, "step": 14170 }, { "epoch": 0.29037740871951345, "grad_norm": 0.2716214060783386, "learning_rate": 0.0004, "loss": 4.8234, "step": 14180 }, { "epoch": 0.2905821882743227, "grad_norm": 0.24700114130973816, "learning_rate": 0.0004, "loss": 4.8161, "step": 14190 }, { "epoch": 0.29078696782913194, "grad_norm": 0.23971880972385406, "learning_rate": 0.0004, "loss": 4.815, "step": 14200 }, { "epoch": 0.2909917473839412, "grad_norm": 0.25367897748947144, "learning_rate": 0.0004, "loss": 4.8097, "step": 14210 }, { "epoch": 0.2911965269387504, "grad_norm": 0.282354474067688, "learning_rate": 0.0004, "loss": 4.8113, "step": 14220 }, { "epoch": 0.29140130649355966, "grad_norm": 0.2775696814060211, "learning_rate": 0.0004, "loss": 4.7921, "step": 14230 }, { "epoch": 0.2916060860483689, "grad_norm": 0.26994824409484863, "learning_rate": 0.0004, "loss": 4.8218, "step": 14240 }, { "epoch": 0.2918108656031782, "grad_norm": 0.2710299491882324, "learning_rate": 0.0004, "loss": 4.8288, "step": 14250 }, { "epoch": 0.29201564515798745, "grad_norm": 0.27665871381759644, "learning_rate": 0.0004, "loss": 4.7902, "step": 14260 }, { "epoch": 0.2922204247127967, "grad_norm": 0.2653272747993469, "learning_rate": 0.0004, "loss": 4.8285, "step": 14270 }, { "epoch": 0.29242520426760593, "grad_norm": 0.2883239686489105, "learning_rate": 0.0004, "loss": 4.8096, "step": 14280 }, { "epoch": 0.2926299838224152, "grad_norm": 0.2962378263473511, "learning_rate": 0.0004, "loss": 4.8441, "step": 14290 }, { "epoch": 0.2928347633772244, "grad_norm": 0.7176985144615173, "learning_rate": 0.0004, "loss": 4.839, "step": 14300 }, { "epoch": 0.29303954293203366, "grad_norm": 0.2909333407878876, "learning_rate": 0.0004, "loss": 4.7834, "step": 14310 }, { "epoch": 0.2932443224868429, "grad_norm": 0.27297136187553406, "learning_rate": 0.0004, "loss": 4.8204, "step": 14320 }, { "epoch": 0.29344910204165214, "grad_norm": 0.2654514014720917, "learning_rate": 0.0004, "loss": 4.8241, "step": 14330 }, { "epoch": 0.2936538815964614, "grad_norm": 0.26377516984939575, "learning_rate": 0.0004, "loss": 4.8032, "step": 14340 }, { "epoch": 0.2938586611512707, "grad_norm": 0.2605004608631134, "learning_rate": 0.0004, "loss": 4.8014, "step": 14350 }, { "epoch": 0.2940634407060799, "grad_norm": 0.2723621129989624, "learning_rate": 0.0004, "loss": 4.7801, "step": 14360 }, { "epoch": 0.29426822026088917, "grad_norm": 0.29298287630081177, "learning_rate": 0.0004, "loss": 4.7942, "step": 14370 }, { "epoch": 0.2944729998156984, "grad_norm": 0.286304771900177, "learning_rate": 0.0004, "loss": 4.848, "step": 14380 }, { "epoch": 0.29467777937050765, "grad_norm": 0.2744176387786865, "learning_rate": 0.0004, "loss": 4.8144, "step": 14390 }, { "epoch": 0.2948825589253169, "grad_norm": 0.2628437876701355, "learning_rate": 0.0004, "loss": 4.7881, "step": 14400 }, { "epoch": 0.29508733848012614, "grad_norm": 0.24791161715984344, "learning_rate": 0.0004, "loss": 4.7821, "step": 14410 }, { "epoch": 0.2952921180349354, "grad_norm": 0.277873694896698, "learning_rate": 0.0004, "loss": 4.82, "step": 14420 }, { "epoch": 0.2954968975897446, "grad_norm": 0.378300279378891, "learning_rate": 0.0004, "loss": 4.8421, "step": 14430 }, { "epoch": 0.29570167714455386, "grad_norm": 0.2474977821111679, "learning_rate": 0.0004, "loss": 4.8351, "step": 14440 }, { "epoch": 0.29590645669936316, "grad_norm": 0.263503760099411, "learning_rate": 0.0004, "loss": 4.8063, "step": 14450 }, { "epoch": 0.2961112362541724, "grad_norm": 0.24829213321208954, "learning_rate": 0.0004, "loss": 4.798, "step": 14460 }, { "epoch": 0.29631601580898165, "grad_norm": 0.24435283243656158, "learning_rate": 0.0004, "loss": 4.8187, "step": 14470 }, { "epoch": 0.2965207953637909, "grad_norm": 0.25660616159439087, "learning_rate": 0.0004, "loss": 4.7885, "step": 14480 }, { "epoch": 0.29672557491860013, "grad_norm": 0.25041988492012024, "learning_rate": 0.0004, "loss": 4.8173, "step": 14490 }, { "epoch": 0.2969303544734094, "grad_norm": 0.26193779706954956, "learning_rate": 0.0004, "loss": 4.7869, "step": 14500 }, { "epoch": 0.2971351340282186, "grad_norm": 0.2585957646369934, "learning_rate": 0.0004, "loss": 4.805, "step": 14510 }, { "epoch": 0.29733991358302786, "grad_norm": 0.2570123076438904, "learning_rate": 0.0004, "loss": 4.8001, "step": 14520 }, { "epoch": 0.2975446931378371, "grad_norm": 0.2526560425758362, "learning_rate": 0.0004, "loss": 4.8152, "step": 14530 }, { "epoch": 0.29774947269264634, "grad_norm": 0.27037957310676575, "learning_rate": 0.0004, "loss": 4.8123, "step": 14540 }, { "epoch": 0.2979542522474556, "grad_norm": 0.2811831533908844, "learning_rate": 0.0004, "loss": 4.8307, "step": 14550 }, { "epoch": 0.2981590318022649, "grad_norm": 0.3060413599014282, "learning_rate": 0.0004, "loss": 4.8277, "step": 14560 }, { "epoch": 0.2983638113570741, "grad_norm": 0.25229233503341675, "learning_rate": 0.0004, "loss": 4.8506, "step": 14570 }, { "epoch": 0.29856859091188337, "grad_norm": 0.25974372029304504, "learning_rate": 0.0004, "loss": 4.7903, "step": 14580 }, { "epoch": 0.2987733704666926, "grad_norm": 0.30049508810043335, "learning_rate": 0.0004, "loss": 4.7895, "step": 14590 }, { "epoch": 0.29897815002150185, "grad_norm": 0.26262393593788147, "learning_rate": 0.0004, "loss": 4.7998, "step": 14600 }, { "epoch": 0.2991829295763111, "grad_norm": 0.27901288866996765, "learning_rate": 0.0004, "loss": 4.828, "step": 14610 }, { "epoch": 0.29938770913112034, "grad_norm": 0.2929536998271942, "learning_rate": 0.0004, "loss": 4.8018, "step": 14620 }, { "epoch": 0.2995924886859296, "grad_norm": 0.2543736696243286, "learning_rate": 0.0004, "loss": 4.808, "step": 14630 }, { "epoch": 0.2997972682407388, "grad_norm": 0.2771528959274292, "learning_rate": 0.0004, "loss": 4.8139, "step": 14640 }, { "epoch": 0.30000204779554807, "grad_norm": 0.26029521226882935, "learning_rate": 0.0004, "loss": 4.8371, "step": 14650 }, { "epoch": 0.30020682735035736, "grad_norm": 0.2559875547885895, "learning_rate": 0.0004, "loss": 4.8341, "step": 14660 }, { "epoch": 0.3004116069051666, "grad_norm": 0.2704298496246338, "learning_rate": 0.0004, "loss": 4.826, "step": 14670 }, { "epoch": 0.30061638645997585, "grad_norm": 0.2680201530456543, "learning_rate": 0.0004, "loss": 4.796, "step": 14680 }, { "epoch": 0.3008211660147851, "grad_norm": 0.3763655424118042, "learning_rate": 0.0004, "loss": 4.7792, "step": 14690 }, { "epoch": 0.30102594556959433, "grad_norm": 0.2631003260612488, "learning_rate": 0.0004, "loss": 4.8051, "step": 14700 }, { "epoch": 0.3012307251244036, "grad_norm": 0.27197256684303284, "learning_rate": 0.0004, "loss": 4.8303, "step": 14710 }, { "epoch": 0.3014355046792128, "grad_norm": 0.2803315818309784, "learning_rate": 0.0004, "loss": 4.7848, "step": 14720 }, { "epoch": 0.30164028423402206, "grad_norm": 0.25601300597190857, "learning_rate": 0.0004, "loss": 4.818, "step": 14730 }, { "epoch": 0.3018450637888313, "grad_norm": 0.2942025065422058, "learning_rate": 0.0004, "loss": 4.8216, "step": 14740 }, { "epoch": 0.30204984334364054, "grad_norm": 0.2761382758617401, "learning_rate": 0.0004, "loss": 4.8019, "step": 14750 }, { "epoch": 0.30225462289844984, "grad_norm": 0.25395074486732483, "learning_rate": 0.0004, "loss": 4.8356, "step": 14760 }, { "epoch": 0.3024594024532591, "grad_norm": 0.2864772379398346, "learning_rate": 0.0003999999923462465, "loss": 4.8108, "step": 14770 }, { "epoch": 0.3026641820080683, "grad_norm": 0.25308045744895935, "learning_rate": 0.00039999985627953394, "loss": 4.7768, "step": 14780 }, { "epoch": 0.30286896156287757, "grad_norm": 0.269564688205719, "learning_rate": 0.0003999995501295434, "loss": 4.8086, "step": 14790 }, { "epoch": 0.3030737411176868, "grad_norm": 0.25448814034461975, "learning_rate": 0.0003999990738965353, "loss": 4.7895, "step": 14800 }, { "epoch": 0.30327852067249605, "grad_norm": 0.33111077547073364, "learning_rate": 0.0003999984275809148, "loss": 4.8142, "step": 14810 }, { "epoch": 0.3034833002273053, "grad_norm": 0.357445627450943, "learning_rate": 0.00039999761118323117, "loss": 4.8057, "step": 14820 }, { "epoch": 0.30368807978211454, "grad_norm": 0.26140981912612915, "learning_rate": 0.000399996624704179, "loss": 4.8357, "step": 14830 }, { "epoch": 0.3038928593369238, "grad_norm": 0.2446521371603012, "learning_rate": 0.0003999954681445971, "loss": 4.7919, "step": 14840 }, { "epoch": 0.304097638891733, "grad_norm": 0.2549498379230499, "learning_rate": 0.000399994141505469, "loss": 4.7776, "step": 14850 }, { "epoch": 0.3043024184465423, "grad_norm": 0.29111766815185547, "learning_rate": 0.0003999926447879229, "loss": 4.8121, "step": 14860 }, { "epoch": 0.30450719800135156, "grad_norm": 0.2740670442581177, "learning_rate": 0.0003999909779932317, "loss": 4.7997, "step": 14870 }, { "epoch": 0.3047119775561608, "grad_norm": 0.2761414647102356, "learning_rate": 0.0003999891411228129, "loss": 4.8195, "step": 14880 }, { "epoch": 0.30491675711097005, "grad_norm": 0.4468514025211334, "learning_rate": 0.0003999871341782284, "loss": 4.796, "step": 14890 }, { "epoch": 0.3051215366657793, "grad_norm": 0.28377726674079895, "learning_rate": 0.00039998495716118515, "loss": 4.7818, "step": 14900 }, { "epoch": 0.30532631622058853, "grad_norm": 0.27489548921585083, "learning_rate": 0.00039998261007353445, "loss": 4.7957, "step": 14910 }, { "epoch": 0.3055310957753978, "grad_norm": 0.24948351085186005, "learning_rate": 0.0003999800929172723, "loss": 4.8064, "step": 14920 }, { "epoch": 0.305735875330207, "grad_norm": 0.3283112049102783, "learning_rate": 0.00039997740569453936, "loss": 4.7847, "step": 14930 }, { "epoch": 0.30594065488501626, "grad_norm": 0.28321975469589233, "learning_rate": 0.00039997454840762085, "loss": 4.7876, "step": 14940 }, { "epoch": 0.3061454344398255, "grad_norm": 0.2677721679210663, "learning_rate": 0.00039997152105894664, "loss": 4.8247, "step": 14950 }, { "epoch": 0.3063502139946348, "grad_norm": 0.26569730043411255, "learning_rate": 0.00039996832365109123, "loss": 4.8238, "step": 14960 }, { "epoch": 0.30655499354944404, "grad_norm": 0.25424227118492126, "learning_rate": 0.0003999649561867739, "loss": 4.8004, "step": 14970 }, { "epoch": 0.3067597731042533, "grad_norm": 0.25932741165161133, "learning_rate": 0.0003999614186688582, "loss": 4.8324, "step": 14980 }, { "epoch": 0.30696455265906253, "grad_norm": 0.2988094091415405, "learning_rate": 0.00039995771110035264, "loss": 4.796, "step": 14990 }, { "epoch": 0.30716933221387177, "grad_norm": 0.28092488646507263, "learning_rate": 0.0003999538334844101, "loss": 4.7976, "step": 15000 }, { "epoch": 0.307374111768681, "grad_norm": 0.24174724519252777, "learning_rate": 0.00039994978582432827, "loss": 4.8073, "step": 15010 }, { "epoch": 0.30757889132349026, "grad_norm": 0.26243990659713745, "learning_rate": 0.00039994556812354926, "loss": 4.8095, "step": 15020 }, { "epoch": 0.3077836708782995, "grad_norm": 0.2617025375366211, "learning_rate": 0.00039994118038565987, "loss": 4.8197, "step": 15030 }, { "epoch": 0.30798845043310874, "grad_norm": 0.28900280594825745, "learning_rate": 0.0003999366226143916, "loss": 4.7884, "step": 15040 }, { "epoch": 0.308193229987918, "grad_norm": 0.3000846207141876, "learning_rate": 0.0003999318948136204, "loss": 4.8049, "step": 15050 }, { "epoch": 0.3083980095427273, "grad_norm": 0.2622714340686798, "learning_rate": 0.00039992699698736684, "loss": 4.7619, "step": 15060 }, { "epoch": 0.3086027890975365, "grad_norm": 0.2860952913761139, "learning_rate": 0.0003999219291397961, "loss": 4.7925, "step": 15070 }, { "epoch": 0.30880756865234577, "grad_norm": 0.28790974617004395, "learning_rate": 0.0003999166912752181, "loss": 4.7417, "step": 15080 }, { "epoch": 0.309012348207155, "grad_norm": 0.25071680545806885, "learning_rate": 0.0003999112833980871, "loss": 4.8159, "step": 15090 }, { "epoch": 0.30921712776196425, "grad_norm": 0.27083414793014526, "learning_rate": 0.00039990570551300206, "loss": 4.7773, "step": 15100 }, { "epoch": 0.3094219073167735, "grad_norm": 0.29703348875045776, "learning_rate": 0.0003998999576247065, "loss": 4.812, "step": 15110 }, { "epoch": 0.30962668687158273, "grad_norm": 0.3592798411846161, "learning_rate": 0.0003998940397380886, "loss": 4.7812, "step": 15120 }, { "epoch": 0.309831466426392, "grad_norm": 0.267671138048172, "learning_rate": 0.0003998879518581809, "loss": 4.8375, "step": 15130 }, { "epoch": 0.3100362459812012, "grad_norm": 0.25858592987060547, "learning_rate": 0.00039988169399016074, "loss": 4.85, "step": 15140 }, { "epoch": 0.31024102553601046, "grad_norm": 0.2685125470161438, "learning_rate": 0.00039987526613934987, "loss": 4.8146, "step": 15150 }, { "epoch": 0.31044580509081976, "grad_norm": 0.47396326065063477, "learning_rate": 0.0003998686683112147, "loss": 4.7731, "step": 15160 }, { "epoch": 0.310650584645629, "grad_norm": 0.26775190234184265, "learning_rate": 0.00039986190051136605, "loss": 4.8311, "step": 15170 }, { "epoch": 0.31085536420043824, "grad_norm": 0.24957019090652466, "learning_rate": 0.0003998549627455594, "loss": 4.8156, "step": 15180 }, { "epoch": 0.3110601437552475, "grad_norm": 0.2548575699329376, "learning_rate": 0.0003998478550196948, "loss": 4.7477, "step": 15190 }, { "epoch": 0.31126492331005673, "grad_norm": 0.26530712842941284, "learning_rate": 0.00039984057733981674, "loss": 4.7858, "step": 15200 }, { "epoch": 0.31146970286486597, "grad_norm": 0.2542381286621094, "learning_rate": 0.0003998331297121143, "loss": 4.7727, "step": 15210 }, { "epoch": 0.3116744824196752, "grad_norm": 0.26388663053512573, "learning_rate": 0.00039982551214292104, "loss": 4.82, "step": 15220 }, { "epoch": 0.31187926197448446, "grad_norm": 0.2462894469499588, "learning_rate": 0.00039981772463871504, "loss": 4.8186, "step": 15230 }, { "epoch": 0.3120840415292937, "grad_norm": 0.28837886452674866, "learning_rate": 0.000399809767206119, "loss": 4.8143, "step": 15240 }, { "epoch": 0.31228882108410294, "grad_norm": 0.26220229268074036, "learning_rate": 0.00039980163985190006, "loss": 4.8026, "step": 15250 }, { "epoch": 0.3124936006389122, "grad_norm": 0.25457707047462463, "learning_rate": 0.00039979334258296976, "loss": 4.8067, "step": 15260 }, { "epoch": 0.3126983801937215, "grad_norm": 0.2709813117980957, "learning_rate": 0.0003997848754063844, "loss": 4.8274, "step": 15270 }, { "epoch": 0.3129031597485307, "grad_norm": 0.2797374427318573, "learning_rate": 0.0003997762383293444, "loss": 4.8108, "step": 15280 }, { "epoch": 0.31310793930333997, "grad_norm": 0.2793888449668884, "learning_rate": 0.0003997674313591951, "loss": 4.823, "step": 15290 }, { "epoch": 0.3133127188581492, "grad_norm": 0.2516084611415863, "learning_rate": 0.0003997584545034259, "loss": 4.7992, "step": 15300 }, { "epoch": 0.31351749841295845, "grad_norm": 0.25258857011795044, "learning_rate": 0.000399749307769671, "loss": 4.8223, "step": 15310 }, { "epoch": 0.3137222779677677, "grad_norm": 0.34943556785583496, "learning_rate": 0.00039973999116570895, "loss": 4.808, "step": 15320 }, { "epoch": 0.31392705752257694, "grad_norm": 0.25606784224510193, "learning_rate": 0.0003997305046994626, "loss": 4.7744, "step": 15330 }, { "epoch": 0.3141318370773862, "grad_norm": 0.27452796697616577, "learning_rate": 0.00039972084837899954, "loss": 4.7913, "step": 15340 }, { "epoch": 0.3143366166321954, "grad_norm": 0.2905919849872589, "learning_rate": 0.00039971102221253164, "loss": 4.7796, "step": 15350 }, { "epoch": 0.31454139618700466, "grad_norm": 0.2494192123413086, "learning_rate": 0.00039970102620841524, "loss": 4.7946, "step": 15360 }, { "epoch": 0.31474617574181396, "grad_norm": 0.25385144352912903, "learning_rate": 0.0003996908603751511, "loss": 4.8419, "step": 15370 }, { "epoch": 0.3149509552966232, "grad_norm": 0.2682260274887085, "learning_rate": 0.00039968052472138437, "loss": 4.7929, "step": 15380 }, { "epoch": 0.31515573485143245, "grad_norm": 0.9018281102180481, "learning_rate": 0.0003996700192559047, "loss": 4.8231, "step": 15390 }, { "epoch": 0.3153605144062417, "grad_norm": 0.27546435594558716, "learning_rate": 0.00039965934398764616, "loss": 4.8068, "step": 15400 }, { "epoch": 0.31556529396105093, "grad_norm": 0.27842995524406433, "learning_rate": 0.0003996484989256871, "loss": 4.836, "step": 15410 }, { "epoch": 0.3157700735158602, "grad_norm": 0.2818785309791565, "learning_rate": 0.0003996374840792505, "loss": 4.8011, "step": 15420 }, { "epoch": 0.3159748530706694, "grad_norm": 0.27837562561035156, "learning_rate": 0.0003996262994577034, "loss": 4.7473, "step": 15430 }, { "epoch": 0.31617963262547866, "grad_norm": 0.29244813323020935, "learning_rate": 0.00039961494507055747, "loss": 4.8349, "step": 15440 }, { "epoch": 0.3163844121802879, "grad_norm": 0.27803468704223633, "learning_rate": 0.00039960342092746866, "loss": 4.7763, "step": 15450 }, { "epoch": 0.31658919173509714, "grad_norm": 0.27822449803352356, "learning_rate": 0.0003995917270382373, "loss": 4.8026, "step": 15460 }, { "epoch": 0.31679397128990644, "grad_norm": 0.25251296162605286, "learning_rate": 0.00039957986341280805, "loss": 4.7876, "step": 15470 }, { "epoch": 0.3169987508447157, "grad_norm": 0.28681665658950806, "learning_rate": 0.00039956783006127, "loss": 4.7721, "step": 15480 }, { "epoch": 0.3172035303995249, "grad_norm": 0.27361565828323364, "learning_rate": 0.00039955562699385647, "loss": 4.7894, "step": 15490 }, { "epoch": 0.31740830995433417, "grad_norm": 0.264676570892334, "learning_rate": 0.00039954325422094516, "loss": 4.8182, "step": 15500 }, { "epoch": 0.3176130895091434, "grad_norm": 0.26645535230636597, "learning_rate": 0.00039953071175305807, "loss": 4.7915, "step": 15510 }, { "epoch": 0.31781786906395265, "grad_norm": 0.2505214810371399, "learning_rate": 0.00039951799960086156, "loss": 4.8, "step": 15520 }, { "epoch": 0.3180226486187619, "grad_norm": 0.2482193261384964, "learning_rate": 0.0003995051177751662, "loss": 4.8003, "step": 15530 }, { "epoch": 0.31822742817357114, "grad_norm": 0.2835414707660675, "learning_rate": 0.00039949206628692705, "loss": 4.813, "step": 15540 }, { "epoch": 0.3184322077283804, "grad_norm": 0.2924768030643463, "learning_rate": 0.0003994788451472432, "loss": 4.7809, "step": 15550 }, { "epoch": 0.3186369872831896, "grad_norm": 0.25978416204452515, "learning_rate": 0.0003994654543673581, "loss": 4.8088, "step": 15560 }, { "epoch": 0.3188417668379989, "grad_norm": 0.28737881779670715, "learning_rate": 0.0003994518939586596, "loss": 4.7877, "step": 15570 }, { "epoch": 0.31904654639280816, "grad_norm": 0.27220314741134644, "learning_rate": 0.00039943816393267967, "loss": 4.8257, "step": 15580 }, { "epoch": 0.3192513259476174, "grad_norm": 0.262319952249527, "learning_rate": 0.0003994242643010945, "loss": 4.7999, "step": 15590 }, { "epoch": 0.31945610550242665, "grad_norm": 0.24199426174163818, "learning_rate": 0.0003994101950757247, "loss": 4.7977, "step": 15600 }, { "epoch": 0.3196608850572359, "grad_norm": 0.32901597023010254, "learning_rate": 0.0003993959562685349, "loss": 4.8114, "step": 15610 }, { "epoch": 0.31986566461204513, "grad_norm": 0.24459275603294373, "learning_rate": 0.00039938154789163395, "loss": 4.7623, "step": 15620 }, { "epoch": 0.3200704441668544, "grad_norm": 0.26827767491340637, "learning_rate": 0.00039936696995727505, "loss": 4.7552, "step": 15630 }, { "epoch": 0.3202752237216636, "grad_norm": 0.254894882440567, "learning_rate": 0.00039935222247785554, "loss": 4.7585, "step": 15640 }, { "epoch": 0.32048000327647286, "grad_norm": 0.46504759788513184, "learning_rate": 0.0003993373054659169, "loss": 4.8143, "step": 15650 }, { "epoch": 0.3206847828312821, "grad_norm": 0.28007370233535767, "learning_rate": 0.00039932221893414487, "loss": 4.7931, "step": 15660 }, { "epoch": 0.3208895623860914, "grad_norm": 0.24671123921871185, "learning_rate": 0.0003993069628953692, "loss": 4.7946, "step": 15670 }, { "epoch": 0.32109434194090064, "grad_norm": 0.28686684370040894, "learning_rate": 0.000399291537362564, "loss": 4.84, "step": 15680 }, { "epoch": 0.3212991214957099, "grad_norm": 0.2491459995508194, "learning_rate": 0.0003992759423488472, "loss": 4.7923, "step": 15690 }, { "epoch": 0.3215039010505191, "grad_norm": 0.24470731616020203, "learning_rate": 0.00039926017786748135, "loss": 4.8088, "step": 15700 }, { "epoch": 0.32170868060532837, "grad_norm": 0.2530973553657532, "learning_rate": 0.00039924424393187263, "loss": 4.8138, "step": 15710 }, { "epoch": 0.3219134601601376, "grad_norm": 0.2529483437538147, "learning_rate": 0.0003992281405555716, "loss": 4.7946, "step": 15720 }, { "epoch": 0.32211823971494685, "grad_norm": 0.25579866766929626, "learning_rate": 0.00039921186775227277, "loss": 4.7957, "step": 15730 }, { "epoch": 0.3223230192697561, "grad_norm": 0.26418569684028625, "learning_rate": 0.0003991954255358149, "loss": 4.7991, "step": 15740 }, { "epoch": 0.32252779882456534, "grad_norm": 0.2771753966808319, "learning_rate": 0.00039917881392018064, "loss": 4.8127, "step": 15750 }, { "epoch": 0.3227325783793746, "grad_norm": 0.25980281829833984, "learning_rate": 0.0003991620329194969, "loss": 4.816, "step": 15760 }, { "epoch": 0.3229373579341839, "grad_norm": 0.27328717708587646, "learning_rate": 0.0003991450825480345, "loss": 4.8168, "step": 15770 }, { "epoch": 0.3231421374889931, "grad_norm": 0.2446042001247406, "learning_rate": 0.0003991279628202082, "loss": 4.7819, "step": 15780 }, { "epoch": 0.32334691704380236, "grad_norm": 0.3084605634212494, "learning_rate": 0.00039911067375057713, "loss": 4.8135, "step": 15790 }, { "epoch": 0.3235516965986116, "grad_norm": 0.2617594599723816, "learning_rate": 0.00039909321535384397, "loss": 4.8365, "step": 15800 }, { "epoch": 0.32375647615342085, "grad_norm": 0.24815472960472107, "learning_rate": 0.00039907558764485583, "loss": 4.7804, "step": 15810 }, { "epoch": 0.3239612557082301, "grad_norm": 0.2749323546886444, "learning_rate": 0.0003990577906386036, "loss": 4.8013, "step": 15820 }, { "epoch": 0.32416603526303933, "grad_norm": 0.2664775252342224, "learning_rate": 0.00039903982435022194, "loss": 4.8169, "step": 15830 }, { "epoch": 0.3243708148178486, "grad_norm": 0.28417569398880005, "learning_rate": 0.00039902168879498996, "loss": 4.777, "step": 15840 }, { "epoch": 0.3245755943726578, "grad_norm": 0.26321133971214294, "learning_rate": 0.00039900338398833027, "loss": 4.8145, "step": 15850 }, { "epoch": 0.32478037392746706, "grad_norm": 0.2757444977760315, "learning_rate": 0.00039898490994580963, "loss": 4.7887, "step": 15860 }, { "epoch": 0.32498515348227636, "grad_norm": 0.2671668231487274, "learning_rate": 0.00039896626668313874, "loss": 4.762, "step": 15870 }, { "epoch": 0.3251899330370856, "grad_norm": 0.279275506734848, "learning_rate": 0.0003989474542161721, "loss": 4.8086, "step": 15880 }, { "epoch": 0.32539471259189484, "grad_norm": 0.2854190170764923, "learning_rate": 0.0003989284725609081, "loss": 4.7784, "step": 15890 }, { "epoch": 0.3255994921467041, "grad_norm": 0.24825824797153473, "learning_rate": 0.0003989093217334891, "loss": 4.7646, "step": 15900 }, { "epoch": 0.3258042717015133, "grad_norm": 0.2487155646085739, "learning_rate": 0.0003988900017502013, "loss": 4.8313, "step": 15910 }, { "epoch": 0.32600905125632257, "grad_norm": 0.49503177404403687, "learning_rate": 0.0003988705126274749, "loss": 4.7791, "step": 15920 }, { "epoch": 0.3262138308111318, "grad_norm": 0.2913387715816498, "learning_rate": 0.0003988508543818835, "loss": 4.804, "step": 15930 }, { "epoch": 0.32641861036594105, "grad_norm": 0.2623058259487152, "learning_rate": 0.00039883102703014493, "loss": 4.7857, "step": 15940 }, { "epoch": 0.3266233899207503, "grad_norm": 0.25395411252975464, "learning_rate": 0.00039881103058912076, "loss": 4.7938, "step": 15950 }, { "epoch": 0.32682816947555954, "grad_norm": 0.2600069046020508, "learning_rate": 0.00039879086507581615, "loss": 4.7651, "step": 15960 }, { "epoch": 0.3270329490303688, "grad_norm": 0.287539005279541, "learning_rate": 0.0003987705305073804, "loss": 4.7746, "step": 15970 }, { "epoch": 0.3272377285851781, "grad_norm": 0.2536559998989105, "learning_rate": 0.00039875002690110625, "loss": 4.8204, "step": 15980 }, { "epoch": 0.3274425081399873, "grad_norm": 0.2836781144142151, "learning_rate": 0.00039872935427443033, "loss": 4.8302, "step": 15990 }, { "epoch": 0.32764728769479656, "grad_norm": 0.2694326341152191, "learning_rate": 0.00039870851264493297, "loss": 4.8092, "step": 16000 }, { "epoch": 0.32764728769479656, "eval_loss": 4.804255485534668, "eval_runtime": 4.3883, "eval_samples_per_second": 265.704, "eval_steps_per_second": 33.27, "step": 16000 }, { "epoch": 0.3278520672496058, "grad_norm": 0.2952719032764435, "learning_rate": 0.00039868750203033836, "loss": 4.7977, "step": 16010 }, { "epoch": 0.32805684680441505, "grad_norm": 0.2660292685031891, "learning_rate": 0.0003986663224485142, "loss": 4.8145, "step": 16020 }, { "epoch": 0.3282616263592243, "grad_norm": 0.2490932047367096, "learning_rate": 0.0003986449739174718, "loss": 4.8062, "step": 16030 }, { "epoch": 0.32846640591403353, "grad_norm": 0.2510133385658264, "learning_rate": 0.0003986234564553667, "loss": 4.7892, "step": 16040 }, { "epoch": 0.3286711854688428, "grad_norm": 0.2521056532859802, "learning_rate": 0.00039860177008049736, "loss": 4.8038, "step": 16050 }, { "epoch": 0.328875965023652, "grad_norm": 0.2704649567604065, "learning_rate": 0.00039857991481130644, "loss": 4.7656, "step": 16060 }, { "epoch": 0.32908074457846126, "grad_norm": 0.25223037600517273, "learning_rate": 0.0003985578906663799, "loss": 4.7934, "step": 16070 }, { "epoch": 0.32928552413327056, "grad_norm": 0.2978556752204895, "learning_rate": 0.0003985356976644476, "loss": 4.8232, "step": 16080 }, { "epoch": 0.3294903036880798, "grad_norm": 0.31600844860076904, "learning_rate": 0.0003985133358243827, "loss": 4.7498, "step": 16090 }, { "epoch": 0.32969508324288904, "grad_norm": 0.3390003442764282, "learning_rate": 0.0003984908051652022, "loss": 4.7813, "step": 16100 }, { "epoch": 0.3298998627976983, "grad_norm": 0.2710893154144287, "learning_rate": 0.0003984681057060665, "loss": 4.788, "step": 16110 }, { "epoch": 0.3301046423525075, "grad_norm": 0.27168989181518555, "learning_rate": 0.0003984452374662796, "loss": 4.804, "step": 16120 }, { "epoch": 0.33030942190731677, "grad_norm": 0.27508488297462463, "learning_rate": 0.00039842220046528906, "loss": 4.8179, "step": 16130 }, { "epoch": 0.330514201462126, "grad_norm": 0.3193860352039337, "learning_rate": 0.0003983989947226859, "loss": 4.8144, "step": 16140 }, { "epoch": 0.33071898101693525, "grad_norm": 0.24990639090538025, "learning_rate": 0.00039837562025820474, "loss": 4.7533, "step": 16150 }, { "epoch": 0.3309237605717445, "grad_norm": 0.25338494777679443, "learning_rate": 0.0003983520770917236, "loss": 4.7864, "step": 16160 }, { "epoch": 0.33112854012655374, "grad_norm": 0.27676770091056824, "learning_rate": 0.000398328365243264, "loss": 4.7785, "step": 16170 }, { "epoch": 0.33133331968136304, "grad_norm": 0.31176072359085083, "learning_rate": 0.0003983044847329908, "loss": 4.7863, "step": 16180 }, { "epoch": 0.3315380992361723, "grad_norm": 0.37212496995925903, "learning_rate": 0.0003982804355812125, "loss": 4.802, "step": 16190 }, { "epoch": 0.3317428787909815, "grad_norm": 0.2535504996776581, "learning_rate": 0.0003982562178083809, "loss": 4.8146, "step": 16200 }, { "epoch": 0.33194765834579076, "grad_norm": 0.26328185200691223, "learning_rate": 0.0003982318314350912, "loss": 4.758, "step": 16210 }, { "epoch": 0.3321524379006, "grad_norm": 0.26491978764533997, "learning_rate": 0.000398207276482082, "loss": 4.7593, "step": 16220 }, { "epoch": 0.33235721745540925, "grad_norm": 0.2541959285736084, "learning_rate": 0.0003981825529702351, "loss": 4.7874, "step": 16230 }, { "epoch": 0.3325619970102185, "grad_norm": 0.2653116285800934, "learning_rate": 0.00039815766092057603, "loss": 4.7881, "step": 16240 }, { "epoch": 0.33276677656502773, "grad_norm": 0.2704290449619293, "learning_rate": 0.0003981326003542733, "loss": 4.8138, "step": 16250 }, { "epoch": 0.332971556119837, "grad_norm": 0.25799617171287537, "learning_rate": 0.00039810737129263875, "loss": 4.7731, "step": 16260 }, { "epoch": 0.3331763356746462, "grad_norm": 0.28069770336151123, "learning_rate": 0.0003980819737571278, "loss": 4.7964, "step": 16270 }, { "epoch": 0.3333811152294555, "grad_norm": 0.32808852195739746, "learning_rate": 0.0003980564077693388, "loss": 4.7703, "step": 16280 }, { "epoch": 0.33358589478426476, "grad_norm": 0.28964656591415405, "learning_rate": 0.00039803067335101347, "loss": 4.799, "step": 16290 }, { "epoch": 0.333790674339074, "grad_norm": 0.2691846787929535, "learning_rate": 0.0003980047705240369, "loss": 4.7887, "step": 16300 }, { "epoch": 0.33399545389388324, "grad_norm": 0.2500579059123993, "learning_rate": 0.0003979786993104373, "loss": 4.7978, "step": 16310 }, { "epoch": 0.3342002334486925, "grad_norm": 0.25928881764411926, "learning_rate": 0.00039795245973238603, "loss": 4.7758, "step": 16320 }, { "epoch": 0.3344050130035017, "grad_norm": 0.28237369656562805, "learning_rate": 0.00039792605181219765, "loss": 4.7988, "step": 16330 }, { "epoch": 0.33460979255831097, "grad_norm": 0.2945230305194855, "learning_rate": 0.00039789947557232995, "loss": 4.7976, "step": 16340 }, { "epoch": 0.3348145721131202, "grad_norm": 0.24285341799259186, "learning_rate": 0.00039787273103538377, "loss": 4.8141, "step": 16350 }, { "epoch": 0.33501935166792945, "grad_norm": 0.3118240535259247, "learning_rate": 0.0003978458182241032, "loss": 4.7789, "step": 16360 }, { "epoch": 0.3352241312227387, "grad_norm": 0.285800963640213, "learning_rate": 0.0003978187371613752, "loss": 4.7804, "step": 16370 }, { "epoch": 0.335428910777548, "grad_norm": 0.25658172369003296, "learning_rate": 0.00039779148787023015, "loss": 4.7692, "step": 16380 }, { "epoch": 0.33563369033235724, "grad_norm": 0.2750941812992096, "learning_rate": 0.0003977640703738412, "loss": 4.78, "step": 16390 }, { "epoch": 0.3358384698871665, "grad_norm": 0.26658308506011963, "learning_rate": 0.00039773648469552464, "loss": 4.7697, "step": 16400 }, { "epoch": 0.3360432494419757, "grad_norm": 0.36863094568252563, "learning_rate": 0.00039770873085873994, "loss": 4.7782, "step": 16410 }, { "epoch": 0.33624802899678496, "grad_norm": 0.2577030062675476, "learning_rate": 0.00039768080888708923, "loss": 4.8157, "step": 16420 }, { "epoch": 0.3364528085515942, "grad_norm": 0.2842894494533539, "learning_rate": 0.000397652718804318, "loss": 4.785, "step": 16430 }, { "epoch": 0.33665758810640345, "grad_norm": 0.2669421434402466, "learning_rate": 0.0003976244606343144, "loss": 4.7567, "step": 16440 }, { "epoch": 0.3368623676612127, "grad_norm": 0.282748281955719, "learning_rate": 0.00039759603440110984, "loss": 4.777, "step": 16450 }, { "epoch": 0.33706714721602193, "grad_norm": 0.26537439227104187, "learning_rate": 0.00039756744012887826, "loss": 4.7889, "step": 16460 }, { "epoch": 0.3372719267708312, "grad_norm": 0.2713695168495178, "learning_rate": 0.00039753867784193684, "loss": 4.7887, "step": 16470 }, { "epoch": 0.3374767063256405, "grad_norm": 0.2605438530445099, "learning_rate": 0.0003975097475647455, "loss": 4.8369, "step": 16480 }, { "epoch": 0.3376814858804497, "grad_norm": 0.26819923520088196, "learning_rate": 0.00039748064932190703, "loss": 4.7778, "step": 16490 }, { "epoch": 0.33788626543525896, "grad_norm": 0.28676170110702515, "learning_rate": 0.000397451383138167, "loss": 4.8162, "step": 16500 }, { "epoch": 0.3380910449900682, "grad_norm": 0.2624913156032562, "learning_rate": 0.00039742194903841406, "loss": 4.809, "step": 16510 }, { "epoch": 0.33829582454487744, "grad_norm": 0.2781206965446472, "learning_rate": 0.00039739234704767927, "loss": 4.7931, "step": 16520 }, { "epoch": 0.3385006040996867, "grad_norm": 0.2613312005996704, "learning_rate": 0.0003973625771911368, "loss": 4.8285, "step": 16530 }, { "epoch": 0.33870538365449593, "grad_norm": 0.2740297317504883, "learning_rate": 0.0003973326394941034, "loss": 4.7699, "step": 16540 }, { "epoch": 0.33891016320930517, "grad_norm": 0.2648509740829468, "learning_rate": 0.00039730253398203853, "loss": 4.7953, "step": 16550 }, { "epoch": 0.3391149427641144, "grad_norm": 0.2660149931907654, "learning_rate": 0.0003972722606805445, "loss": 4.8125, "step": 16560 }, { "epoch": 0.33931972231892366, "grad_norm": 0.25253239274024963, "learning_rate": 0.0003972418196153662, "loss": 4.7355, "step": 16570 }, { "epoch": 0.33952450187373295, "grad_norm": 0.2715434730052948, "learning_rate": 0.00039721121081239127, "loss": 4.7798, "step": 16580 }, { "epoch": 0.3397292814285422, "grad_norm": 0.2549162209033966, "learning_rate": 0.00039718043429764994, "loss": 4.781, "step": 16590 }, { "epoch": 0.33993406098335144, "grad_norm": 0.38355880975723267, "learning_rate": 0.0003971494900973151, "loss": 4.8202, "step": 16600 }, { "epoch": 0.3401388405381607, "grad_norm": 0.28966614603996277, "learning_rate": 0.0003971183782377022, "loss": 4.8154, "step": 16610 }, { "epoch": 0.3403436200929699, "grad_norm": 0.2820665240287781, "learning_rate": 0.0003970870987452693, "loss": 4.7849, "step": 16620 }, { "epoch": 0.34054839964777917, "grad_norm": 0.2683079242706299, "learning_rate": 0.00039705565164661704, "loss": 4.7598, "step": 16630 }, { "epoch": 0.3407531792025884, "grad_norm": 0.2876993417739868, "learning_rate": 0.0003970240369684885, "loss": 4.7977, "step": 16640 }, { "epoch": 0.34095795875739765, "grad_norm": 0.2624385952949524, "learning_rate": 0.0003969922547377694, "loss": 4.7746, "step": 16650 }, { "epoch": 0.3411627383122069, "grad_norm": 0.28317347168922424, "learning_rate": 0.00039696030498148784, "loss": 4.7449, "step": 16660 }, { "epoch": 0.34136751786701613, "grad_norm": 0.27427518367767334, "learning_rate": 0.0003969281877268145, "loss": 4.7897, "step": 16670 }, { "epoch": 0.34157229742182543, "grad_norm": 0.26816168427467346, "learning_rate": 0.00039689590300106235, "loss": 4.7957, "step": 16680 }, { "epoch": 0.3417770769766347, "grad_norm": 0.27925974130630493, "learning_rate": 0.00039686345083168694, "loss": 4.77, "step": 16690 }, { "epoch": 0.3419818565314439, "grad_norm": 0.26638373732566833, "learning_rate": 0.00039683083124628614, "loss": 4.7762, "step": 16700 }, { "epoch": 0.34218663608625316, "grad_norm": 1.233842372894287, "learning_rate": 0.00039679804427260025, "loss": 4.8333, "step": 16710 }, { "epoch": 0.3423914156410624, "grad_norm": 0.2703392505645752, "learning_rate": 0.00039676508993851176, "loss": 4.7451, "step": 16720 }, { "epoch": 0.34259619519587164, "grad_norm": 0.34831905364990234, "learning_rate": 0.0003967319682720456, "loss": 4.7567, "step": 16730 }, { "epoch": 0.3428009747506809, "grad_norm": 0.28535258769989014, "learning_rate": 0.0003966986793013691, "loss": 4.7805, "step": 16740 }, { "epoch": 0.34300575430549013, "grad_norm": 0.3032117784023285, "learning_rate": 0.0003966652230547917, "loss": 4.7826, "step": 16750 }, { "epoch": 0.34321053386029937, "grad_norm": 0.30235326290130615, "learning_rate": 0.0003966315995607651, "loss": 4.815, "step": 16760 }, { "epoch": 0.3434153134151086, "grad_norm": 0.25587034225463867, "learning_rate": 0.00039659780884788347, "loss": 4.8103, "step": 16770 }, { "epoch": 0.34362009296991786, "grad_norm": 0.29329124093055725, "learning_rate": 0.00039656385094488287, "loss": 4.8138, "step": 16780 }, { "epoch": 0.34382487252472715, "grad_norm": 0.26573923230171204, "learning_rate": 0.0003965297258806417, "loss": 4.7919, "step": 16790 }, { "epoch": 0.3440296520795364, "grad_norm": 0.2600584924221039, "learning_rate": 0.0003964954336841805, "loss": 4.79, "step": 16800 }, { "epoch": 0.34423443163434564, "grad_norm": 0.2669816017150879, "learning_rate": 0.00039646097438466194, "loss": 4.775, "step": 16810 }, { "epoch": 0.3444392111891549, "grad_norm": 0.24775204062461853, "learning_rate": 0.00039642634801139077, "loss": 4.7953, "step": 16820 }, { "epoch": 0.3446439907439641, "grad_norm": 0.25428342819213867, "learning_rate": 0.00039639155459381397, "loss": 4.7679, "step": 16830 }, { "epoch": 0.34484877029877337, "grad_norm": 0.26264557242393494, "learning_rate": 0.0003963565941615202, "loss": 4.794, "step": 16840 }, { "epoch": 0.3450535498535826, "grad_norm": 0.26654911041259766, "learning_rate": 0.00039632146674424063, "loss": 4.7953, "step": 16850 }, { "epoch": 0.34525832940839185, "grad_norm": 0.3195827901363373, "learning_rate": 0.00039628617237184817, "loss": 4.758, "step": 16860 }, { "epoch": 0.3454631089632011, "grad_norm": 0.24894453585147858, "learning_rate": 0.00039625071107435766, "loss": 4.7523, "step": 16870 }, { "epoch": 0.34566788851801034, "grad_norm": 0.29483887553215027, "learning_rate": 0.00039621508288192615, "loss": 4.7372, "step": 16880 }, { "epoch": 0.34587266807281963, "grad_norm": 0.2702201008796692, "learning_rate": 0.0003961792878248523, "loss": 4.7731, "step": 16890 }, { "epoch": 0.3460774476276289, "grad_norm": 0.2855082154273987, "learning_rate": 0.0003961433259335769, "loss": 4.7888, "step": 16900 }, { "epoch": 0.3462822271824381, "grad_norm": 0.3111724257469177, "learning_rate": 0.0003961071972386826, "loss": 4.7917, "step": 16910 }, { "epoch": 0.34648700673724736, "grad_norm": 0.27766406536102295, "learning_rate": 0.00039607090177089374, "loss": 4.7401, "step": 16920 }, { "epoch": 0.3466917862920566, "grad_norm": 0.2639821171760559, "learning_rate": 0.00039603443956107674, "loss": 4.7449, "step": 16930 }, { "epoch": 0.34689656584686585, "grad_norm": 0.264573872089386, "learning_rate": 0.0003959978106402396, "loss": 4.7777, "step": 16940 }, { "epoch": 0.3471013454016751, "grad_norm": 0.25831058621406555, "learning_rate": 0.0003959610150395322, "loss": 4.7678, "step": 16950 }, { "epoch": 0.34730612495648433, "grad_norm": 0.27648264169692993, "learning_rate": 0.0003959240527902461, "loss": 4.7753, "step": 16960 }, { "epoch": 0.3475109045112936, "grad_norm": 0.23673586547374725, "learning_rate": 0.0003958869239238147, "loss": 4.7935, "step": 16970 }, { "epoch": 0.3477156840661028, "grad_norm": 0.27137354016304016, "learning_rate": 0.00039584962847181293, "loss": 4.768, "step": 16980 }, { "epoch": 0.3479204636209121, "grad_norm": 0.30974748730659485, "learning_rate": 0.0003958121664659576, "loss": 4.7705, "step": 16990 }, { "epoch": 0.34812524317572136, "grad_norm": 0.29479289054870605, "learning_rate": 0.000395774537938107, "loss": 4.7628, "step": 17000 }, { "epoch": 0.3483300227305306, "grad_norm": 0.2980476915836334, "learning_rate": 0.00039573674292026095, "loss": 4.7691, "step": 17010 }, { "epoch": 0.34853480228533984, "grad_norm": 0.26816049218177795, "learning_rate": 0.0003956987814445611, "loss": 4.7873, "step": 17020 }, { "epoch": 0.3487395818401491, "grad_norm": 0.29303935170173645, "learning_rate": 0.0003956606535432905, "loss": 4.7955, "step": 17030 }, { "epoch": 0.3489443613949583, "grad_norm": 0.27338966727256775, "learning_rate": 0.0003956223592488738, "loss": 4.7842, "step": 17040 }, { "epoch": 0.34914914094976757, "grad_norm": 0.2604229748249054, "learning_rate": 0.000395583898593877, "loss": 4.7882, "step": 17050 }, { "epoch": 0.3493539205045768, "grad_norm": 0.2727995216846466, "learning_rate": 0.00039554527161100785, "loss": 4.801, "step": 17060 }, { "epoch": 0.34955870005938605, "grad_norm": 0.26669710874557495, "learning_rate": 0.0003955064783331152, "loss": 4.7601, "step": 17070 }, { "epoch": 0.3497634796141953, "grad_norm": 0.2862171530723572, "learning_rate": 0.00039546751879318976, "loss": 4.8028, "step": 17080 }, { "epoch": 0.3499682591690046, "grad_norm": 0.25458213686943054, "learning_rate": 0.0003954283930243632, "loss": 4.7981, "step": 17090 }, { "epoch": 0.35017303872381383, "grad_norm": 0.2625376582145691, "learning_rate": 0.0003953891010599088, "loss": 4.7393, "step": 17100 }, { "epoch": 0.3503778182786231, "grad_norm": 0.25130653381347656, "learning_rate": 0.0003953496429332411, "loss": 4.8091, "step": 17110 }, { "epoch": 0.3505825978334323, "grad_norm": 0.3048926591873169, "learning_rate": 0.00039531001867791605, "loss": 4.7784, "step": 17120 }, { "epoch": 0.35078737738824156, "grad_norm": 0.2529113292694092, "learning_rate": 0.0003952702283276306, "loss": 4.764, "step": 17130 }, { "epoch": 0.3509921569430508, "grad_norm": 0.3067050576210022, "learning_rate": 0.00039523027191622337, "loss": 4.7556, "step": 17140 }, { "epoch": 0.35119693649786005, "grad_norm": 0.26279208064079285, "learning_rate": 0.00039519014947767374, "loss": 4.7482, "step": 17150 }, { "epoch": 0.3514017160526693, "grad_norm": 0.26440808176994324, "learning_rate": 0.00039514986104610266, "loss": 4.7705, "step": 17160 }, { "epoch": 0.35160649560747853, "grad_norm": 0.2666628360748291, "learning_rate": 0.00039510940665577214, "loss": 4.7793, "step": 17170 }, { "epoch": 0.3518112751622878, "grad_norm": 0.256744384765625, "learning_rate": 0.0003950687863410851, "loss": 4.7938, "step": 17180 }, { "epoch": 0.35201605471709707, "grad_norm": 0.2572229504585266, "learning_rate": 0.00039502800013658596, "loss": 4.7568, "step": 17190 }, { "epoch": 0.3522208342719063, "grad_norm": 0.28228095173835754, "learning_rate": 0.00039498704807695986, "loss": 4.7961, "step": 17200 }, { "epoch": 0.35242561382671556, "grad_norm": 0.3019843101501465, "learning_rate": 0.00039494593019703307, "loss": 4.7828, "step": 17210 }, { "epoch": 0.3526303933815248, "grad_norm": 0.2625148296356201, "learning_rate": 0.0003949046465317731, "loss": 4.756, "step": 17220 }, { "epoch": 0.35283517293633404, "grad_norm": 0.2715732455253601, "learning_rate": 0.00039486319711628826, "loss": 4.7641, "step": 17230 }, { "epoch": 0.3530399524911433, "grad_norm": 0.26373767852783203, "learning_rate": 0.0003948215819858277, "loss": 4.7837, "step": 17240 }, { "epoch": 0.3532447320459525, "grad_norm": 0.26473498344421387, "learning_rate": 0.00039477980117578175, "loss": 4.7717, "step": 17250 }, { "epoch": 0.35344951160076177, "grad_norm": 0.30615073442459106, "learning_rate": 0.0003947378547216815, "loss": 4.7593, "step": 17260 }, { "epoch": 0.353654291155571, "grad_norm": 0.2714005410671234, "learning_rate": 0.0003946957426591989, "loss": 4.7721, "step": 17270 }, { "epoch": 0.35385907071038025, "grad_norm": 0.2753409743309021, "learning_rate": 0.0003946534650241468, "loss": 4.7746, "step": 17280 }, { "epoch": 0.35406385026518955, "grad_norm": 0.280316025018692, "learning_rate": 0.00039461102185247876, "loss": 4.7368, "step": 17290 }, { "epoch": 0.3542686298199988, "grad_norm": 0.2765102684497833, "learning_rate": 0.00039456841318028933, "loss": 4.7986, "step": 17300 }, { "epoch": 0.35447340937480804, "grad_norm": 0.2677052617073059, "learning_rate": 0.0003945256390438134, "loss": 4.7974, "step": 17310 }, { "epoch": 0.3546781889296173, "grad_norm": 0.262820839881897, "learning_rate": 0.000394482699479427, "loss": 4.7682, "step": 17320 }, { "epoch": 0.3548829684844265, "grad_norm": 0.25526392459869385, "learning_rate": 0.00039443959452364674, "loss": 4.7675, "step": 17330 }, { "epoch": 0.35508774803923576, "grad_norm": 0.30862829089164734, "learning_rate": 0.0003943963242131296, "loss": 4.7875, "step": 17340 }, { "epoch": 0.355292527594045, "grad_norm": 0.2543276846408844, "learning_rate": 0.0003943528885846734, "loss": 4.7652, "step": 17350 }, { "epoch": 0.35549730714885425, "grad_norm": 0.24953249096870422, "learning_rate": 0.00039430928767521673, "loss": 4.7657, "step": 17360 }, { "epoch": 0.3557020867036635, "grad_norm": 0.26887795329093933, "learning_rate": 0.00039426552152183834, "loss": 4.7724, "step": 17370 }, { "epoch": 0.35590686625847273, "grad_norm": 0.27610909938812256, "learning_rate": 0.0003942215901617579, "loss": 4.7987, "step": 17380 }, { "epoch": 0.35611164581328203, "grad_norm": 0.26154568791389465, "learning_rate": 0.0003941774936323352, "loss": 4.7506, "step": 17390 }, { "epoch": 0.35631642536809127, "grad_norm": 0.456100732088089, "learning_rate": 0.0003941332319710708, "loss": 4.7871, "step": 17400 }, { "epoch": 0.3565212049229005, "grad_norm": 0.24634867906570435, "learning_rate": 0.0003940888052156056, "loss": 4.7905, "step": 17410 }, { "epoch": 0.35672598447770976, "grad_norm": 0.29051482677459717, "learning_rate": 0.00039404421340372066, "loss": 4.7968, "step": 17420 }, { "epoch": 0.356930764032519, "grad_norm": 0.2819485068321228, "learning_rate": 0.0003939994565733379, "loss": 4.7491, "step": 17430 }, { "epoch": 0.35713554358732824, "grad_norm": 0.26910969614982605, "learning_rate": 0.00039395453476251906, "loss": 4.7611, "step": 17440 }, { "epoch": 0.3573403231421375, "grad_norm": 0.2533196806907654, "learning_rate": 0.00039390944800946656, "loss": 4.7859, "step": 17450 }, { "epoch": 0.3575451026969467, "grad_norm": 0.3193216323852539, "learning_rate": 0.00039386419635252285, "loss": 4.7623, "step": 17460 }, { "epoch": 0.35774988225175597, "grad_norm": 0.2615881562232971, "learning_rate": 0.0003938187798301707, "loss": 4.7673, "step": 17470 }, { "epoch": 0.3579546618065652, "grad_norm": 0.26223790645599365, "learning_rate": 0.00039377319848103314, "loss": 4.769, "step": 17480 }, { "epoch": 0.35815944136137445, "grad_norm": 0.2690642476081848, "learning_rate": 0.0003937274523438733, "loss": 4.7683, "step": 17490 }, { "epoch": 0.35836422091618375, "grad_norm": 0.47632238268852234, "learning_rate": 0.00039368154145759453, "loss": 4.7411, "step": 17500 }, { "epoch": 0.358569000470993, "grad_norm": 0.2614399492740631, "learning_rate": 0.00039363546586124024, "loss": 4.7941, "step": 17510 }, { "epoch": 0.35877378002580224, "grad_norm": 0.28558123111724854, "learning_rate": 0.0003935892255939938, "loss": 4.764, "step": 17520 }, { "epoch": 0.3589785595806115, "grad_norm": 0.25479230284690857, "learning_rate": 0.00039354282069517873, "loss": 4.7787, "step": 17530 }, { "epoch": 0.3591833391354207, "grad_norm": 0.270384281873703, "learning_rate": 0.0003934962512042586, "loss": 4.7613, "step": 17540 }, { "epoch": 0.35938811869022996, "grad_norm": 0.29647931456565857, "learning_rate": 0.00039344951716083687, "loss": 4.7855, "step": 17550 }, { "epoch": 0.3595928982450392, "grad_norm": 0.29567044973373413, "learning_rate": 0.00039340261860465697, "loss": 4.7874, "step": 17560 }, { "epoch": 0.35979767779984845, "grad_norm": 0.2841157913208008, "learning_rate": 0.0003933555555756023, "loss": 4.7343, "step": 17570 }, { "epoch": 0.3600024573546577, "grad_norm": 0.292279988527298, "learning_rate": 0.0003933083281136959, "loss": 4.7657, "step": 17580 }, { "epoch": 0.36020723690946693, "grad_norm": 0.2682152986526489, "learning_rate": 0.00039326093625910094, "loss": 4.8004, "step": 17590 }, { "epoch": 0.36041201646427623, "grad_norm": 0.261972576379776, "learning_rate": 0.0003932133800521202, "loss": 4.7783, "step": 17600 }, { "epoch": 0.3606167960190855, "grad_norm": 0.26742562651634216, "learning_rate": 0.00039316565953319635, "loss": 4.725, "step": 17610 }, { "epoch": 0.3608215755738947, "grad_norm": 0.26306724548339844, "learning_rate": 0.0003931177747429117, "loss": 4.7564, "step": 17620 }, { "epoch": 0.36102635512870396, "grad_norm": 0.28656601905822754, "learning_rate": 0.0003930697257219882, "loss": 4.7925, "step": 17630 }, { "epoch": 0.3612311346835132, "grad_norm": 0.27165529131889343, "learning_rate": 0.0003930215125112876, "loss": 4.7735, "step": 17640 }, { "epoch": 0.36143591423832244, "grad_norm": 0.3619728684425354, "learning_rate": 0.00039297313515181137, "loss": 4.8043, "step": 17650 }, { "epoch": 0.3616406937931317, "grad_norm": 0.2687563896179199, "learning_rate": 0.00039292459368470026, "loss": 4.7906, "step": 17660 }, { "epoch": 0.3618454733479409, "grad_norm": 0.26408448815345764, "learning_rate": 0.0003928758881512349, "loss": 4.7839, "step": 17670 }, { "epoch": 0.36205025290275017, "grad_norm": 0.257796972990036, "learning_rate": 0.00039282701859283517, "loss": 4.8255, "step": 17680 }, { "epoch": 0.3622550324575594, "grad_norm": 0.24361667037010193, "learning_rate": 0.00039277798505106066, "loss": 4.7925, "step": 17690 }, { "epoch": 0.3624598120123687, "grad_norm": 0.27500542998313904, "learning_rate": 0.0003927287875676103, "loss": 4.8244, "step": 17700 }, { "epoch": 0.36266459156717795, "grad_norm": 0.27108463644981384, "learning_rate": 0.0003926794261843225, "loss": 4.793, "step": 17710 }, { "epoch": 0.3628693711219872, "grad_norm": 0.2628352642059326, "learning_rate": 0.00039262990094317496, "loss": 4.776, "step": 17720 }, { "epoch": 0.36307415067679644, "grad_norm": 0.26020681858062744, "learning_rate": 0.00039258021188628484, "loss": 4.7599, "step": 17730 }, { "epoch": 0.3632789302316057, "grad_norm": 0.2727605700492859, "learning_rate": 0.00039253035905590847, "loss": 4.8071, "step": 17740 }, { "epoch": 0.3634837097864149, "grad_norm": 0.28571629524230957, "learning_rate": 0.0003924803424944417, "loss": 4.7751, "step": 17750 }, { "epoch": 0.36368848934122416, "grad_norm": 0.27115678787231445, "learning_rate": 0.0003924301622444194, "loss": 4.769, "step": 17760 }, { "epoch": 0.3638932688960334, "grad_norm": 0.2659580707550049, "learning_rate": 0.00039237981834851557, "loss": 4.7609, "step": 17770 }, { "epoch": 0.36409804845084265, "grad_norm": 0.30356279015541077, "learning_rate": 0.0003923293108495437, "loss": 4.758, "step": 17780 }, { "epoch": 0.3643028280056519, "grad_norm": 0.28485971689224243, "learning_rate": 0.00039227863979045616, "loss": 4.7592, "step": 17790 }, { "epoch": 0.3645076075604612, "grad_norm": 0.25537344813346863, "learning_rate": 0.0003922278052143444, "loss": 4.7998, "step": 17800 }, { "epoch": 0.36471238711527043, "grad_norm": 0.2637980878353119, "learning_rate": 0.0003921768071644391, "loss": 4.774, "step": 17810 }, { "epoch": 0.3649171666700797, "grad_norm": 0.2536860704421997, "learning_rate": 0.0003921256456841099, "loss": 4.7985, "step": 17820 }, { "epoch": 0.3651219462248889, "grad_norm": 0.2698429226875305, "learning_rate": 0.00039207432081686533, "loss": 4.7772, "step": 17830 }, { "epoch": 0.36532672577969816, "grad_norm": 0.2777520716190338, "learning_rate": 0.000392022832606353, "loss": 4.7503, "step": 17840 }, { "epoch": 0.3655315053345074, "grad_norm": 0.2618495225906372, "learning_rate": 0.0003919711810963592, "loss": 4.8051, "step": 17850 }, { "epoch": 0.36573628488931664, "grad_norm": 0.26113712787628174, "learning_rate": 0.0003919193663308094, "loss": 4.7546, "step": 17860 }, { "epoch": 0.3659410644441259, "grad_norm": 0.2648015022277832, "learning_rate": 0.0003918673883537677, "loss": 4.7571, "step": 17870 }, { "epoch": 0.3661458439989351, "grad_norm": 0.2523532509803772, "learning_rate": 0.0003918152472094371, "loss": 4.7755, "step": 17880 }, { "epoch": 0.36635062355374437, "grad_norm": 0.26697415113449097, "learning_rate": 0.00039176294294215923, "loss": 4.7593, "step": 17890 }, { "epoch": 0.36655540310855367, "grad_norm": 0.32178786396980286, "learning_rate": 0.0003917104755964146, "loss": 4.7459, "step": 17900 }, { "epoch": 0.3667601826633629, "grad_norm": 0.2983376085758209, "learning_rate": 0.0003916578452168223, "loss": 4.7748, "step": 17910 }, { "epoch": 0.36696496221817215, "grad_norm": 0.2601611912250519, "learning_rate": 0.0003916050518481402, "loss": 4.7667, "step": 17920 }, { "epoch": 0.3671697417729814, "grad_norm": 0.29818862676620483, "learning_rate": 0.0003915520955352645, "loss": 4.8242, "step": 17930 }, { "epoch": 0.36737452132779064, "grad_norm": 0.2593207359313965, "learning_rate": 0.00039149897632323027, "loss": 4.7699, "step": 17940 }, { "epoch": 0.3675793008825999, "grad_norm": 0.29024702310562134, "learning_rate": 0.000391445694257211, "loss": 4.8051, "step": 17950 }, { "epoch": 0.3677840804374091, "grad_norm": 0.2815151810646057, "learning_rate": 0.00039139224938251855, "loss": 4.822, "step": 17960 }, { "epoch": 0.36798885999221836, "grad_norm": 0.2534072697162628, "learning_rate": 0.0003913386417446035, "loss": 4.7491, "step": 17970 }, { "epoch": 0.3681936395470276, "grad_norm": 0.2836243808269501, "learning_rate": 0.0003912848713890546, "loss": 4.7683, "step": 17980 }, { "epoch": 0.36839841910183685, "grad_norm": 0.2671874463558197, "learning_rate": 0.0003912309383615991, "loss": 4.7356, "step": 17990 }, { "epoch": 0.36860319865664615, "grad_norm": 0.2524053752422333, "learning_rate": 0.00039117684270810254, "loss": 4.7682, "step": 18000 }, { "epoch": 0.36860319865664615, "eval_loss": 4.784202575683594, "eval_runtime": 4.4019, "eval_samples_per_second": 264.887, "eval_steps_per_second": 33.168, "step": 18000 }, { "epoch": 0.3688079782114554, "grad_norm": 0.2831714153289795, "learning_rate": 0.0003911225844745688, "loss": 4.7654, "step": 18010 }, { "epoch": 0.36901275776626463, "grad_norm": 0.2763456404209137, "learning_rate": 0.00039106816370714, "loss": 4.7599, "step": 18020 }, { "epoch": 0.3692175373210739, "grad_norm": 0.2494124472141266, "learning_rate": 0.0003910135804520965, "loss": 4.7865, "step": 18030 }, { "epoch": 0.3694223168758831, "grad_norm": 0.2747681438922882, "learning_rate": 0.00039095883475585684, "loss": 4.757, "step": 18040 }, { "epoch": 0.36962709643069236, "grad_norm": 0.2882513403892517, "learning_rate": 0.0003909039266649777, "loss": 4.7679, "step": 18050 }, { "epoch": 0.3698318759855016, "grad_norm": 0.269535630941391, "learning_rate": 0.0003908488562261538, "loss": 4.7528, "step": 18060 }, { "epoch": 0.37003665554031084, "grad_norm": 0.2854122221469879, "learning_rate": 0.000390793623486218, "loss": 4.7805, "step": 18070 }, { "epoch": 0.3702414350951201, "grad_norm": 0.2871333658695221, "learning_rate": 0.0003907382284921412, "loss": 4.799, "step": 18080 }, { "epoch": 0.37044621464992933, "grad_norm": 0.2664596736431122, "learning_rate": 0.00039068267129103215, "loss": 4.7784, "step": 18090 }, { "epoch": 0.3706509942047386, "grad_norm": 0.29560941457748413, "learning_rate": 0.0003906269519301377, "loss": 4.7425, "step": 18100 }, { "epoch": 0.37085577375954787, "grad_norm": 0.27781563997268677, "learning_rate": 0.0003905710704568426, "loss": 4.7756, "step": 18110 }, { "epoch": 0.3710605533143571, "grad_norm": 0.25890418887138367, "learning_rate": 0.0003905150269186693, "loss": 4.7836, "step": 18120 }, { "epoch": 0.37126533286916635, "grad_norm": 0.24676479399204254, "learning_rate": 0.0003904588213632782, "loss": 4.7798, "step": 18130 }, { "epoch": 0.3714701124239756, "grad_norm": 0.2712087631225586, "learning_rate": 0.00039040245383846755, "loss": 4.7721, "step": 18140 }, { "epoch": 0.37167489197878484, "grad_norm": 0.2562553584575653, "learning_rate": 0.00039034592439217316, "loss": 4.7077, "step": 18150 }, { "epoch": 0.3718796715335941, "grad_norm": 0.25950464606285095, "learning_rate": 0.00039028923307246865, "loss": 4.7422, "step": 18160 }, { "epoch": 0.3720844510884033, "grad_norm": 0.30557316541671753, "learning_rate": 0.0003902323799275653, "loss": 4.7718, "step": 18170 }, { "epoch": 0.37228923064321257, "grad_norm": 0.26297929883003235, "learning_rate": 0.00039017536500581196, "loss": 4.7664, "step": 18180 }, { "epoch": 0.3724940101980218, "grad_norm": 0.3164907991886139, "learning_rate": 0.0003901181883556951, "loss": 4.8207, "step": 18190 }, { "epoch": 0.37269878975283105, "grad_norm": 0.269145131111145, "learning_rate": 0.00039006085002583874, "loss": 4.7828, "step": 18200 }, { "epoch": 0.37290356930764035, "grad_norm": 0.33246415853500366, "learning_rate": 0.00039000335006500437, "loss": 4.7621, "step": 18210 }, { "epoch": 0.3731083488624496, "grad_norm": 0.2733037769794464, "learning_rate": 0.0003899456885220909, "loss": 4.7872, "step": 18220 }, { "epoch": 0.37331312841725883, "grad_norm": 0.3253006935119629, "learning_rate": 0.0003898878654461347, "loss": 4.7675, "step": 18230 }, { "epoch": 0.3735179079720681, "grad_norm": 0.26410481333732605, "learning_rate": 0.00038982988088630954, "loss": 4.7431, "step": 18240 }, { "epoch": 0.3737226875268773, "grad_norm": 0.2555406987667084, "learning_rate": 0.00038977173489192636, "loss": 4.7816, "step": 18250 }, { "epoch": 0.37392746708168656, "grad_norm": 0.2523527443408966, "learning_rate": 0.00038971342751243363, "loss": 4.768, "step": 18260 }, { "epoch": 0.3741322466364958, "grad_norm": 0.30072832107543945, "learning_rate": 0.0003896549587974169, "loss": 4.7998, "step": 18270 }, { "epoch": 0.37433702619130504, "grad_norm": 0.28505706787109375, "learning_rate": 0.0003895963287965989, "loss": 4.8048, "step": 18280 }, { "epoch": 0.3745418057461143, "grad_norm": 0.3019676208496094, "learning_rate": 0.00038953753755983965, "loss": 4.8093, "step": 18290 }, { "epoch": 0.37474658530092353, "grad_norm": 0.28492453694343567, "learning_rate": 0.0003894785851371362, "loss": 4.7957, "step": 18300 }, { "epoch": 0.3749513648557328, "grad_norm": 0.2514193654060364, "learning_rate": 0.0003894194715786227, "loss": 4.7738, "step": 18310 }, { "epoch": 0.37515614441054207, "grad_norm": 0.2678600251674652, "learning_rate": 0.00038936019693457027, "loss": 4.7761, "step": 18320 }, { "epoch": 0.3753609239653513, "grad_norm": 0.25014519691467285, "learning_rate": 0.0003893007612553871, "loss": 4.7812, "step": 18330 }, { "epoch": 0.37556570352016055, "grad_norm": 0.27169695496559143, "learning_rate": 0.0003892411645916184, "loss": 4.7652, "step": 18340 }, { "epoch": 0.3757704830749698, "grad_norm": 0.25945594906806946, "learning_rate": 0.00038918140699394603, "loss": 4.7358, "step": 18350 }, { "epoch": 0.37597526262977904, "grad_norm": 0.28693246841430664, "learning_rate": 0.00038912148851318897, "loss": 4.8069, "step": 18360 }, { "epoch": 0.3761800421845883, "grad_norm": 0.255780428647995, "learning_rate": 0.0003890614092003029, "loss": 4.7505, "step": 18370 }, { "epoch": 0.3763848217393975, "grad_norm": 0.2798100411891937, "learning_rate": 0.00038900116910638025, "loss": 4.7633, "step": 18380 }, { "epoch": 0.37658960129420677, "grad_norm": 0.26866233348846436, "learning_rate": 0.00038894076828265017, "loss": 4.7489, "step": 18390 }, { "epoch": 0.376794380849016, "grad_norm": 0.24337950348854065, "learning_rate": 0.0003888802067804787, "loss": 4.7778, "step": 18400 }, { "epoch": 0.3769991604038253, "grad_norm": 0.2849843204021454, "learning_rate": 0.0003888194846513683, "loss": 4.7988, "step": 18410 }, { "epoch": 0.37720393995863455, "grad_norm": 0.25371941924095154, "learning_rate": 0.00038875860194695805, "loss": 4.7348, "step": 18420 }, { "epoch": 0.3774087195134438, "grad_norm": 0.27701646089553833, "learning_rate": 0.0003886975587190237, "loss": 4.7911, "step": 18430 }, { "epoch": 0.37761349906825303, "grad_norm": 0.2591417729854584, "learning_rate": 0.00038863635501947745, "loss": 4.7805, "step": 18440 }, { "epoch": 0.3778182786230623, "grad_norm": 0.2680375277996063, "learning_rate": 0.0003885749909003679, "loss": 4.7724, "step": 18450 }, { "epoch": 0.3780230581778715, "grad_norm": 0.2742789089679718, "learning_rate": 0.0003885134664138802, "loss": 4.7793, "step": 18460 }, { "epoch": 0.37822783773268076, "grad_norm": 0.25720226764678955, "learning_rate": 0.00038845178161233583, "loss": 4.7727, "step": 18470 }, { "epoch": 0.37843261728749, "grad_norm": 0.2753466069698334, "learning_rate": 0.0003883899365481925, "loss": 4.7827, "step": 18480 }, { "epoch": 0.37863739684229925, "grad_norm": 0.2798489034175873, "learning_rate": 0.00038832793127404453, "loss": 4.7856, "step": 18490 }, { "epoch": 0.3788421763971085, "grad_norm": 0.2578984498977661, "learning_rate": 0.00038826576584262195, "loss": 4.7821, "step": 18500 }, { "epoch": 0.3790469559519178, "grad_norm": 0.28740787506103516, "learning_rate": 0.0003882034403067915, "loss": 4.8192, "step": 18510 }, { "epoch": 0.37925173550672703, "grad_norm": 0.2980821430683136, "learning_rate": 0.0003881409547195558, "loss": 4.7659, "step": 18520 }, { "epoch": 0.37945651506153627, "grad_norm": 0.259356826543808, "learning_rate": 0.00038807830913405373, "loss": 4.7364, "step": 18530 }, { "epoch": 0.3796612946163455, "grad_norm": 0.2616250514984131, "learning_rate": 0.0003880155036035601, "loss": 4.7614, "step": 18540 }, { "epoch": 0.37986607417115476, "grad_norm": 0.45178496837615967, "learning_rate": 0.0003879525381814858, "loss": 4.7945, "step": 18550 }, { "epoch": 0.380070853725964, "grad_norm": 0.2830975651741028, "learning_rate": 0.0003878894129213777, "loss": 4.7457, "step": 18560 }, { "epoch": 0.38027563328077324, "grad_norm": 0.27111199498176575, "learning_rate": 0.00038782612787691866, "loss": 4.7598, "step": 18570 }, { "epoch": 0.3804804128355825, "grad_norm": 0.28992077708244324, "learning_rate": 0.00038776268310192723, "loss": 4.7677, "step": 18580 }, { "epoch": 0.3806851923903917, "grad_norm": 0.2773837447166443, "learning_rate": 0.000387699078650358, "loss": 4.7403, "step": 18590 }, { "epoch": 0.38088997194520097, "grad_norm": 0.23663872480392456, "learning_rate": 0.0003876353145763013, "loss": 4.7548, "step": 18600 }, { "epoch": 0.38109475150001026, "grad_norm": 0.2761427164077759, "learning_rate": 0.0003875713909339831, "loss": 4.7508, "step": 18610 }, { "epoch": 0.3812995310548195, "grad_norm": 0.2748549282550812, "learning_rate": 0.00038750730777776534, "loss": 4.7658, "step": 18620 }, { "epoch": 0.38150431060962875, "grad_norm": 0.2602144181728363, "learning_rate": 0.00038744306516214517, "loss": 4.7802, "step": 18630 }, { "epoch": 0.381709090164438, "grad_norm": 0.2753234803676605, "learning_rate": 0.00038737866314175577, "loss": 4.7783, "step": 18640 }, { "epoch": 0.38191386971924723, "grad_norm": 0.25762417912483215, "learning_rate": 0.0003873141017713657, "loss": 4.7518, "step": 18650 }, { "epoch": 0.3821186492740565, "grad_norm": 0.2581968903541565, "learning_rate": 0.00038724938110587894, "loss": 4.7841, "step": 18660 }, { "epoch": 0.3823234288288657, "grad_norm": 0.4594026505947113, "learning_rate": 0.00038718450120033525, "loss": 4.7624, "step": 18670 }, { "epoch": 0.38252820838367496, "grad_norm": 0.2587875425815582, "learning_rate": 0.0003871194621099093, "loss": 4.762, "step": 18680 }, { "epoch": 0.3827329879384842, "grad_norm": 0.31391558051109314, "learning_rate": 0.0003870542638899118, "loss": 4.7996, "step": 18690 }, { "epoch": 0.38293776749329345, "grad_norm": 0.26744288206100464, "learning_rate": 0.0003869889065957881, "loss": 4.7354, "step": 18700 }, { "epoch": 0.38314254704810274, "grad_norm": 0.26079440116882324, "learning_rate": 0.00038692339028311936, "loss": 4.7514, "step": 18710 }, { "epoch": 0.383347326602912, "grad_norm": 0.27135249972343445, "learning_rate": 0.0003868577150076217, "loss": 4.7754, "step": 18720 }, { "epoch": 0.38355210615772123, "grad_norm": 0.2575429379940033, "learning_rate": 0.0003867918808251464, "loss": 4.754, "step": 18730 }, { "epoch": 0.38375688571253047, "grad_norm": 0.2666241526603699, "learning_rate": 0.0003867258877916801, "loss": 4.7383, "step": 18740 }, { "epoch": 0.3839616652673397, "grad_norm": 0.25891196727752686, "learning_rate": 0.00038665973596334444, "loss": 4.758, "step": 18750 }, { "epoch": 0.38416644482214896, "grad_norm": 0.28033098578453064, "learning_rate": 0.00038659342539639586, "loss": 4.7603, "step": 18760 }, { "epoch": 0.3843712243769582, "grad_norm": 0.26347917318344116, "learning_rate": 0.00038652695614722616, "loss": 4.7399, "step": 18770 }, { "epoch": 0.38457600393176744, "grad_norm": 0.32026243209838867, "learning_rate": 0.00038646032827236186, "loss": 4.7516, "step": 18780 }, { "epoch": 0.3847807834865767, "grad_norm": 0.2726016044616699, "learning_rate": 0.0003863935418284645, "loss": 4.7387, "step": 18790 }, { "epoch": 0.3849855630413859, "grad_norm": 0.2759679853916168, "learning_rate": 0.0003863265968723303, "loss": 4.7852, "step": 18800 }, { "epoch": 0.3851903425961952, "grad_norm": 0.4238550662994385, "learning_rate": 0.0003862594934608904, "loss": 4.7714, "step": 18810 }, { "epoch": 0.38539512215100447, "grad_norm": 0.27191659808158875, "learning_rate": 0.0003861922316512108, "loss": 4.7935, "step": 18820 }, { "epoch": 0.3855999017058137, "grad_norm": 0.27841800451278687, "learning_rate": 0.0003861248115004919, "loss": 4.8057, "step": 18830 }, { "epoch": 0.38580468126062295, "grad_norm": 0.26735278964042664, "learning_rate": 0.00038605723306606914, "loss": 4.7604, "step": 18840 }, { "epoch": 0.3860094608154322, "grad_norm": 0.2815970182418823, "learning_rate": 0.00038598949640541227, "loss": 4.7796, "step": 18850 }, { "epoch": 0.38621424037024144, "grad_norm": 0.28123724460601807, "learning_rate": 0.0003859216015761257, "loss": 4.7221, "step": 18860 }, { "epoch": 0.3864190199250507, "grad_norm": 0.2720465660095215, "learning_rate": 0.0003858535486359484, "loss": 4.7925, "step": 18870 }, { "epoch": 0.3866237994798599, "grad_norm": 0.30233508348464966, "learning_rate": 0.00038578533764275363, "loss": 4.7629, "step": 18880 }, { "epoch": 0.38682857903466916, "grad_norm": 0.2735285460948944, "learning_rate": 0.0003857169686545493, "loss": 4.7696, "step": 18890 }, { "epoch": 0.3870333585894784, "grad_norm": 0.27527323365211487, "learning_rate": 0.0003856484417294775, "loss": 4.757, "step": 18900 }, { "epoch": 0.38723813814428765, "grad_norm": 0.2703647315502167, "learning_rate": 0.00038557975692581466, "loss": 4.776, "step": 18910 }, { "epoch": 0.38744291769909694, "grad_norm": 0.2896738052368164, "learning_rate": 0.0003855109143019716, "loss": 4.7545, "step": 18920 }, { "epoch": 0.3876476972539062, "grad_norm": 0.26229390501976013, "learning_rate": 0.00038544191391649325, "loss": 4.7466, "step": 18930 }, { "epoch": 0.38785247680871543, "grad_norm": 0.3293417990207672, "learning_rate": 0.00038537275582805854, "loss": 4.7214, "step": 18940 }, { "epoch": 0.38805725636352467, "grad_norm": 0.26354289054870605, "learning_rate": 0.0003853034400954809, "loss": 4.7472, "step": 18950 }, { "epoch": 0.3882620359183339, "grad_norm": 0.28598079085350037, "learning_rate": 0.0003852339667777075, "loss": 4.7531, "step": 18960 }, { "epoch": 0.38846681547314316, "grad_norm": 0.33052363991737366, "learning_rate": 0.0003851643359338196, "loss": 4.8054, "step": 18970 }, { "epoch": 0.3886715950279524, "grad_norm": 0.2778172194957733, "learning_rate": 0.0003850945476230326, "loss": 4.7523, "step": 18980 }, { "epoch": 0.38887637458276164, "grad_norm": 0.3006640076637268, "learning_rate": 0.0003850246019046955, "loss": 4.7239, "step": 18990 }, { "epoch": 0.3890811541375709, "grad_norm": 0.29719680547714233, "learning_rate": 0.00038495449883829143, "loss": 4.7954, "step": 19000 }, { "epoch": 0.3892859336923801, "grad_norm": 0.27522799372673035, "learning_rate": 0.00038488423848343715, "loss": 4.7821, "step": 19010 }, { "epoch": 0.3894907132471894, "grad_norm": 0.35163021087646484, "learning_rate": 0.0003848138208998833, "loss": 4.7532, "step": 19020 }, { "epoch": 0.38969549280199867, "grad_norm": 0.34216707944869995, "learning_rate": 0.0003847432461475142, "loss": 4.7659, "step": 19030 }, { "epoch": 0.3899002723568079, "grad_norm": 0.27476340532302856, "learning_rate": 0.00038467251428634785, "loss": 4.7632, "step": 19040 }, { "epoch": 0.39010505191161715, "grad_norm": 0.2805178165435791, "learning_rate": 0.00038460162537653586, "loss": 4.7819, "step": 19050 }, { "epoch": 0.3903098314664264, "grad_norm": 0.28047510981559753, "learning_rate": 0.0003845305794783632, "loss": 4.7759, "step": 19060 }, { "epoch": 0.39051461102123564, "grad_norm": 0.3435409963130951, "learning_rate": 0.00038445937665224864, "loss": 4.7387, "step": 19070 }, { "epoch": 0.3907193905760449, "grad_norm": 0.26504743099212646, "learning_rate": 0.0003843880169587442, "loss": 4.7575, "step": 19080 }, { "epoch": 0.3909241701308541, "grad_norm": 0.2697354853153229, "learning_rate": 0.00038431650045853545, "loss": 4.7719, "step": 19090 }, { "epoch": 0.39112894968566336, "grad_norm": 0.2649177610874176, "learning_rate": 0.00038424482721244123, "loss": 4.7562, "step": 19100 }, { "epoch": 0.3913337292404726, "grad_norm": 0.2683025300502777, "learning_rate": 0.0003841729972814136, "loss": 4.7737, "step": 19110 }, { "epoch": 0.3915385087952819, "grad_norm": 0.2645135819911957, "learning_rate": 0.00038410101072653807, "loss": 4.7911, "step": 19120 }, { "epoch": 0.39174328835009115, "grad_norm": 0.26734185218811035, "learning_rate": 0.00038402886760903314, "loss": 4.7697, "step": 19130 }, { "epoch": 0.3919480679049004, "grad_norm": 0.25954359769821167, "learning_rate": 0.00038395656799025056, "loss": 4.7931, "step": 19140 }, { "epoch": 0.39215284745970963, "grad_norm": 0.2955479621887207, "learning_rate": 0.00038388411193167516, "loss": 4.7503, "step": 19150 }, { "epoch": 0.3923576270145189, "grad_norm": 0.36487194895744324, "learning_rate": 0.0003838114994949249, "loss": 4.7783, "step": 19160 }, { "epoch": 0.3925624065693281, "grad_norm": 0.2788164019584656, "learning_rate": 0.00038373873074175053, "loss": 4.7736, "step": 19170 }, { "epoch": 0.39276718612413736, "grad_norm": 0.2730484902858734, "learning_rate": 0.0003836658057340358, "loss": 4.7615, "step": 19180 }, { "epoch": 0.3929719656789466, "grad_norm": 0.28349214792251587, "learning_rate": 0.00038359272453379747, "loss": 4.7403, "step": 19190 }, { "epoch": 0.39317674523375584, "grad_norm": 0.26683154702186584, "learning_rate": 0.000383519487203185, "loss": 4.7825, "step": 19200 }, { "epoch": 0.3933815247885651, "grad_norm": 0.2710789740085602, "learning_rate": 0.00038344609380448064, "loss": 4.7478, "step": 19210 }, { "epoch": 0.3935863043433744, "grad_norm": 0.26695185899734497, "learning_rate": 0.00038337254440009937, "loss": 4.7665, "step": 19220 }, { "epoch": 0.3937910838981836, "grad_norm": 0.273545503616333, "learning_rate": 0.0003832988390525889, "loss": 4.7497, "step": 19230 }, { "epoch": 0.39399586345299287, "grad_norm": 0.30231332778930664, "learning_rate": 0.0003832249778246295, "loss": 4.7653, "step": 19240 }, { "epoch": 0.3942006430078021, "grad_norm": 0.28207749128341675, "learning_rate": 0.00038315096077903404, "loss": 4.7185, "step": 19250 }, { "epoch": 0.39440542256261135, "grad_norm": 0.31809091567993164, "learning_rate": 0.00038307678797874783, "loss": 4.7526, "step": 19260 }, { "epoch": 0.3946102021174206, "grad_norm": 0.295795738697052, "learning_rate": 0.0003830024594868487, "loss": 4.7796, "step": 19270 }, { "epoch": 0.39481498167222984, "grad_norm": 0.2881593108177185, "learning_rate": 0.00038292797536654684, "loss": 4.7723, "step": 19280 }, { "epoch": 0.3950197612270391, "grad_norm": 0.286245733499527, "learning_rate": 0.0003828533356811848, "loss": 4.7548, "step": 19290 }, { "epoch": 0.3952245407818483, "grad_norm": 0.297598272562027, "learning_rate": 0.00038277854049423754, "loss": 4.7891, "step": 19300 }, { "epoch": 0.39542932033665756, "grad_norm": 0.2738468647003174, "learning_rate": 0.000382703589869312, "loss": 4.7706, "step": 19310 }, { "epoch": 0.39563409989146686, "grad_norm": 0.29014673829078674, "learning_rate": 0.0003826284838701476, "loss": 4.7256, "step": 19320 }, { "epoch": 0.3958388794462761, "grad_norm": 0.25152087211608887, "learning_rate": 0.0003825532225606157, "loss": 4.782, "step": 19330 }, { "epoch": 0.39604365900108535, "grad_norm": 0.2573102116584778, "learning_rate": 0.00038247780600471977, "loss": 4.7479, "step": 19340 }, { "epoch": 0.3962484385558946, "grad_norm": 0.29279622435569763, "learning_rate": 0.00038240223426659544, "loss": 4.747, "step": 19350 }, { "epoch": 0.39645321811070383, "grad_norm": 0.2536981403827667, "learning_rate": 0.00038232650741051004, "loss": 4.7579, "step": 19360 }, { "epoch": 0.3966579976655131, "grad_norm": 0.3073466718196869, "learning_rate": 0.0003822506255008631, "loss": 4.7325, "step": 19370 }, { "epoch": 0.3968627772203223, "grad_norm": 0.28557318449020386, "learning_rate": 0.00038217458860218586, "loss": 4.7271, "step": 19380 }, { "epoch": 0.39706755677513156, "grad_norm": 0.2675096392631531, "learning_rate": 0.00038209839677914143, "loss": 4.7627, "step": 19390 }, { "epoch": 0.3972723363299408, "grad_norm": 0.25349166989326477, "learning_rate": 0.0003820220500965245, "loss": 4.7664, "step": 19400 }, { "epoch": 0.39747711588475004, "grad_norm": 0.27463775873184204, "learning_rate": 0.0003819455486192617, "loss": 4.7561, "step": 19410 }, { "epoch": 0.39768189543955934, "grad_norm": 0.26222658157348633, "learning_rate": 0.00038186889241241116, "loss": 4.7602, "step": 19420 }, { "epoch": 0.3978866749943686, "grad_norm": 0.2814188301563263, "learning_rate": 0.00038179208154116265, "loss": 4.7413, "step": 19430 }, { "epoch": 0.3980914545491778, "grad_norm": 0.25720828771591187, "learning_rate": 0.0003817151160708373, "loss": 4.7399, "step": 19440 }, { "epoch": 0.39829623410398707, "grad_norm": 0.2466021180152893, "learning_rate": 0.00038163799606688803, "loss": 4.7245, "step": 19450 }, { "epoch": 0.3985010136587963, "grad_norm": 0.2728370428085327, "learning_rate": 0.00038156072159489893, "loss": 4.7553, "step": 19460 }, { "epoch": 0.39870579321360555, "grad_norm": 0.26696377992630005, "learning_rate": 0.00038148329272058557, "loss": 4.789, "step": 19470 }, { "epoch": 0.3989105727684148, "grad_norm": 0.25925275683403015, "learning_rate": 0.0003814057095097947, "loss": 4.7449, "step": 19480 }, { "epoch": 0.39911535232322404, "grad_norm": 0.3272514045238495, "learning_rate": 0.0003813279720285045, "loss": 4.7465, "step": 19490 }, { "epoch": 0.3993201318780333, "grad_norm": 0.2468366175889969, "learning_rate": 0.0003812500803428241, "loss": 4.7569, "step": 19500 }, { "epoch": 0.3995249114328425, "grad_norm": 0.26700812578201294, "learning_rate": 0.000381172034518994, "loss": 4.7355, "step": 19510 }, { "epoch": 0.3997296909876518, "grad_norm": 0.2623588442802429, "learning_rate": 0.0003810938346233858, "loss": 4.7481, "step": 19520 }, { "epoch": 0.39993447054246106, "grad_norm": 0.2693066895008087, "learning_rate": 0.00038101548072250185, "loss": 4.7676, "step": 19530 }, { "epoch": 0.4001392500972703, "grad_norm": 0.267498254776001, "learning_rate": 0.0003809369728829757, "loss": 4.7471, "step": 19540 }, { "epoch": 0.40034402965207955, "grad_norm": 0.2677249312400818, "learning_rate": 0.0003808583111715717, "loss": 4.7504, "step": 19550 }, { "epoch": 0.4005488092068888, "grad_norm": 0.27942562103271484, "learning_rate": 0.0003807794956551853, "loss": 4.7371, "step": 19560 }, { "epoch": 0.40075358876169803, "grad_norm": 0.3115752637386322, "learning_rate": 0.00038070052640084233, "loss": 4.7533, "step": 19570 }, { "epoch": 0.4009583683165073, "grad_norm": 0.32602185010910034, "learning_rate": 0.0003806214034756997, "loss": 4.7557, "step": 19580 }, { "epoch": 0.4011631478713165, "grad_norm": 0.2551416754722595, "learning_rate": 0.0003805421269470449, "loss": 4.7517, "step": 19590 }, { "epoch": 0.40136792742612576, "grad_norm": 0.29488468170166016, "learning_rate": 0.000380462696882296, "loss": 4.7618, "step": 19600 }, { "epoch": 0.401572706980935, "grad_norm": 0.2805251181125641, "learning_rate": 0.0003803831133490016, "loss": 4.7674, "step": 19610 }, { "epoch": 0.40177748653574424, "grad_norm": 0.25679463148117065, "learning_rate": 0.00038030337641484105, "loss": 4.7475, "step": 19620 }, { "epoch": 0.40198226609055354, "grad_norm": 0.2735605239868164, "learning_rate": 0.0003802234861476239, "loss": 4.7615, "step": 19630 }, { "epoch": 0.4021870456453628, "grad_norm": 0.2695820927619934, "learning_rate": 0.0003801434426152902, "loss": 4.7394, "step": 19640 }, { "epoch": 0.402391825200172, "grad_norm": 0.2904840409755707, "learning_rate": 0.0003800632458859103, "loss": 4.7193, "step": 19650 }, { "epoch": 0.40259660475498127, "grad_norm": 0.2951556146144867, "learning_rate": 0.000379982896027685, "loss": 4.7579, "step": 19660 }, { "epoch": 0.4028013843097905, "grad_norm": 0.27108797430992126, "learning_rate": 0.000379902393108945, "loss": 4.7922, "step": 19670 }, { "epoch": 0.40300616386459975, "grad_norm": 0.32287734746932983, "learning_rate": 0.00037982173719815145, "loss": 4.7191, "step": 19680 }, { "epoch": 0.403210943419409, "grad_norm": 0.2531939744949341, "learning_rate": 0.0003797409283638955, "loss": 4.7648, "step": 19690 }, { "epoch": 0.40341572297421824, "grad_norm": 0.25351694226264954, "learning_rate": 0.0003796599666748984, "loss": 4.7455, "step": 19700 }, { "epoch": 0.4036205025290275, "grad_norm": 0.27533668279647827, "learning_rate": 0.00037957885220001123, "loss": 4.7252, "step": 19710 }, { "epoch": 0.4038252820838367, "grad_norm": 0.2773134410381317, "learning_rate": 0.0003794975850082152, "loss": 4.7253, "step": 19720 }, { "epoch": 0.404030061638646, "grad_norm": 0.27424198389053345, "learning_rate": 0.00037941616516862133, "loss": 4.7617, "step": 19730 }, { "epoch": 0.40423484119345526, "grad_norm": 0.28016701340675354, "learning_rate": 0.0003793345927504704, "loss": 4.7635, "step": 19740 }, { "epoch": 0.4044396207482645, "grad_norm": 0.2677452862262726, "learning_rate": 0.00037925286782313297, "loss": 4.7704, "step": 19750 }, { "epoch": 0.40464440030307375, "grad_norm": 0.28092268109321594, "learning_rate": 0.00037917099045610934, "loss": 4.7673, "step": 19760 }, { "epoch": 0.404849179857883, "grad_norm": 0.25641924142837524, "learning_rate": 0.00037908896071902937, "loss": 4.7884, "step": 19770 }, { "epoch": 0.40505395941269223, "grad_norm": 0.29772433638572693, "learning_rate": 0.00037900677868165256, "loss": 4.6987, "step": 19780 }, { "epoch": 0.4052587389675015, "grad_norm": 0.2560875713825226, "learning_rate": 0.000378924444413868, "loss": 4.7711, "step": 19790 }, { "epoch": 0.4054635185223107, "grad_norm": 0.30855676531791687, "learning_rate": 0.00037884195798569396, "loss": 4.749, "step": 19800 }, { "epoch": 0.40566829807711996, "grad_norm": 0.28452619910240173, "learning_rate": 0.0003787593194672785, "loss": 4.7516, "step": 19810 }, { "epoch": 0.4058730776319292, "grad_norm": 0.2876434028148651, "learning_rate": 0.00037867652892889866, "loss": 4.732, "step": 19820 }, { "epoch": 0.4060778571867385, "grad_norm": 0.27363520860671997, "learning_rate": 0.00037859358644096104, "loss": 4.7889, "step": 19830 }, { "epoch": 0.40628263674154774, "grad_norm": 0.35006874799728394, "learning_rate": 0.00037851049207400126, "loss": 4.7257, "step": 19840 }, { "epoch": 0.406487416296357, "grad_norm": 0.2801511585712433, "learning_rate": 0.0003784272458986843, "loss": 4.7663, "step": 19850 }, { "epoch": 0.4066921958511662, "grad_norm": 0.2624078691005707, "learning_rate": 0.000378343847985804, "loss": 4.7674, "step": 19860 }, { "epoch": 0.40689697540597547, "grad_norm": 0.2904002070426941, "learning_rate": 0.0003782602984062834, "loss": 4.7124, "step": 19870 }, { "epoch": 0.4071017549607847, "grad_norm": 0.2745887041091919, "learning_rate": 0.0003781765972311745, "loss": 4.7743, "step": 19880 }, { "epoch": 0.40730653451559395, "grad_norm": 0.2452814280986786, "learning_rate": 0.00037809274453165823, "loss": 4.707, "step": 19890 }, { "epoch": 0.4075113140704032, "grad_norm": 0.32110917568206787, "learning_rate": 0.0003780087403790442, "loss": 4.7431, "step": 19900 }, { "epoch": 0.40771609362521244, "grad_norm": 0.24623283743858337, "learning_rate": 0.00037792458484477116, "loss": 4.7738, "step": 19910 }, { "epoch": 0.4079208731800217, "grad_norm": 0.26000604033470154, "learning_rate": 0.00037784027800040633, "loss": 4.7656, "step": 19920 }, { "epoch": 0.408125652734831, "grad_norm": 0.2891079783439636, "learning_rate": 0.0003777558199176457, "loss": 4.7705, "step": 19930 }, { "epoch": 0.4083304322896402, "grad_norm": 0.28778308629989624, "learning_rate": 0.0003776712106683138, "loss": 4.7482, "step": 19940 }, { "epoch": 0.40853521184444946, "grad_norm": 0.2763589322566986, "learning_rate": 0.00037758645032436386, "loss": 4.7377, "step": 19950 }, { "epoch": 0.4087399913992587, "grad_norm": 0.26450416445732117, "learning_rate": 0.00037750153895787744, "loss": 4.7238, "step": 19960 }, { "epoch": 0.40894477095406795, "grad_norm": 0.27174514532089233, "learning_rate": 0.0003774164766410646, "loss": 4.7706, "step": 19970 }, { "epoch": 0.4091495505088772, "grad_norm": 0.26067888736724854, "learning_rate": 0.0003773312634462639, "loss": 4.7752, "step": 19980 }, { "epoch": 0.40935433006368643, "grad_norm": 0.2728261649608612, "learning_rate": 0.000377245899445942, "loss": 4.7563, "step": 19990 }, { "epoch": 0.4095591096184957, "grad_norm": 0.3048381209373474, "learning_rate": 0.00037716038471269386, "loss": 4.7654, "step": 20000 }, { "epoch": 0.4095591096184957, "eval_loss": 4.762139797210693, "eval_runtime": 4.4093, "eval_samples_per_second": 264.44, "eval_steps_per_second": 33.112, "step": 20000 }, { "epoch": 0.4097638891733049, "grad_norm": 0.31729310750961304, "learning_rate": 0.00037707471931924276, "loss": 4.7428, "step": 20010 }, { "epoch": 0.40996866872811416, "grad_norm": 0.25164294242858887, "learning_rate": 0.00037698890333843994, "loss": 4.7363, "step": 20020 }, { "epoch": 0.41017344828292346, "grad_norm": 0.256254643201828, "learning_rate": 0.0003769029368432648, "loss": 4.7656, "step": 20030 }, { "epoch": 0.4103782278377327, "grad_norm": 0.27743199467658997, "learning_rate": 0.00037681681990682465, "loss": 4.7731, "step": 20040 }, { "epoch": 0.41058300739254194, "grad_norm": 0.2577210068702698, "learning_rate": 0.0003767305526023549, "loss": 4.7563, "step": 20050 }, { "epoch": 0.4107877869473512, "grad_norm": 0.28428512811660767, "learning_rate": 0.0003766441350032187, "loss": 4.7429, "step": 20060 }, { "epoch": 0.41099256650216043, "grad_norm": 0.2780125141143799, "learning_rate": 0.000376557567182907, "loss": 4.7418, "step": 20070 }, { "epoch": 0.41119734605696967, "grad_norm": 0.2464691400527954, "learning_rate": 0.0003764708492150385, "loss": 4.7388, "step": 20080 }, { "epoch": 0.4114021256117789, "grad_norm": 0.27678778767585754, "learning_rate": 0.0003763839811733598, "loss": 4.7563, "step": 20090 }, { "epoch": 0.41160690516658816, "grad_norm": 0.2662733495235443, "learning_rate": 0.00037629696313174485, "loss": 4.7278, "step": 20100 }, { "epoch": 0.4118116847213974, "grad_norm": 0.33604541420936584, "learning_rate": 0.0003762097951641952, "loss": 4.6944, "step": 20110 }, { "epoch": 0.41201646427620664, "grad_norm": 0.3161959946155548, "learning_rate": 0.0003761224773448401, "loss": 4.7477, "step": 20120 }, { "epoch": 0.41222124383101594, "grad_norm": 0.3201931416988373, "learning_rate": 0.0003760350097479361, "loss": 4.7126, "step": 20130 }, { "epoch": 0.4124260233858252, "grad_norm": 0.27246227860450745, "learning_rate": 0.0003759473924478671, "loss": 4.722, "step": 20140 }, { "epoch": 0.4126308029406344, "grad_norm": 0.2522353231906891, "learning_rate": 0.00037585962551914433, "loss": 4.7478, "step": 20150 }, { "epoch": 0.41283558249544366, "grad_norm": 0.29608771204948425, "learning_rate": 0.0003757717090364062, "loss": 4.7393, "step": 20160 }, { "epoch": 0.4130403620502529, "grad_norm": 0.2863655686378479, "learning_rate": 0.0003756836430744185, "loss": 4.7444, "step": 20170 }, { "epoch": 0.41324514160506215, "grad_norm": 0.28394028544425964, "learning_rate": 0.0003755954277080741, "loss": 4.7254, "step": 20180 }, { "epoch": 0.4134499211598714, "grad_norm": 0.33804798126220703, "learning_rate": 0.0003755070630123926, "loss": 4.7376, "step": 20190 }, { "epoch": 0.41365470071468063, "grad_norm": 0.3279488682746887, "learning_rate": 0.00037541854906252106, "loss": 4.7475, "step": 20200 }, { "epoch": 0.4138594802694899, "grad_norm": 0.29185786843299866, "learning_rate": 0.0003753298859337331, "loss": 4.7063, "step": 20210 }, { "epoch": 0.4140642598242991, "grad_norm": 0.3181179463863373, "learning_rate": 0.00037524107370142946, "loss": 4.6854, "step": 20220 }, { "epoch": 0.4142690393791084, "grad_norm": 0.2748674750328064, "learning_rate": 0.0003751521124411375, "loss": 4.7508, "step": 20230 }, { "epoch": 0.41447381893391766, "grad_norm": 0.26090162992477417, "learning_rate": 0.00037506300222851145, "loss": 4.7094, "step": 20240 }, { "epoch": 0.4146785984887269, "grad_norm": 0.2858532965183258, "learning_rate": 0.00037497374313933204, "loss": 4.7219, "step": 20250 }, { "epoch": 0.41488337804353614, "grad_norm": 0.27587875723838806, "learning_rate": 0.00037488433524950684, "loss": 4.7465, "step": 20260 }, { "epoch": 0.4150881575983454, "grad_norm": 0.2696737051010132, "learning_rate": 0.0003747947786350698, "loss": 4.7344, "step": 20270 }, { "epoch": 0.41529293715315463, "grad_norm": 0.2946103513240814, "learning_rate": 0.0003747050733721814, "loss": 4.7315, "step": 20280 }, { "epoch": 0.41549771670796387, "grad_norm": 0.2917758524417877, "learning_rate": 0.0003746152195371284, "loss": 4.7215, "step": 20290 }, { "epoch": 0.4157024962627731, "grad_norm": 0.2587776184082031, "learning_rate": 0.0003745252172063242, "loss": 4.7572, "step": 20300 }, { "epoch": 0.41590727581758236, "grad_norm": 0.257627010345459, "learning_rate": 0.0003744350664563083, "loss": 4.7421, "step": 20310 }, { "epoch": 0.4161120553723916, "grad_norm": 0.24898594617843628, "learning_rate": 0.00037434476736374626, "loss": 4.7361, "step": 20320 }, { "epoch": 0.41631683492720084, "grad_norm": 0.3044833838939667, "learning_rate": 0.0003742543200054301, "loss": 4.7529, "step": 20330 }, { "epoch": 0.41652161448201014, "grad_norm": 0.26270875334739685, "learning_rate": 0.0003741637244582778, "loss": 4.7394, "step": 20340 }, { "epoch": 0.4167263940368194, "grad_norm": 0.2890133559703827, "learning_rate": 0.00037407298079933333, "loss": 4.7101, "step": 20350 }, { "epoch": 0.4169311735916286, "grad_norm": 0.2605116069316864, "learning_rate": 0.0003739820891057666, "loss": 4.7135, "step": 20360 }, { "epoch": 0.41713595314643787, "grad_norm": 0.2764221131801605, "learning_rate": 0.00037389104945487347, "loss": 4.7378, "step": 20370 }, { "epoch": 0.4173407327012471, "grad_norm": 0.3196144700050354, "learning_rate": 0.00037379986192407564, "loss": 4.7522, "step": 20380 }, { "epoch": 0.41754551225605635, "grad_norm": 0.2779579758644104, "learning_rate": 0.0003737085265909205, "loss": 4.7347, "step": 20390 }, { "epoch": 0.4177502918108656, "grad_norm": 0.27223050594329834, "learning_rate": 0.0003736170435330812, "loss": 4.7316, "step": 20400 }, { "epoch": 0.41795507136567484, "grad_norm": 0.2609656751155853, "learning_rate": 0.00037352541282835645, "loss": 4.744, "step": 20410 }, { "epoch": 0.4181598509204841, "grad_norm": 0.27855393290519714, "learning_rate": 0.0003734336345546706, "loss": 4.7867, "step": 20420 }, { "epoch": 0.4183646304752933, "grad_norm": 0.30180808901786804, "learning_rate": 0.0003733417087900734, "loss": 4.7543, "step": 20430 }, { "epoch": 0.4185694100301026, "grad_norm": 0.2689778506755829, "learning_rate": 0.00037324963561274015, "loss": 4.7679, "step": 20440 }, { "epoch": 0.41877418958491186, "grad_norm": 0.2800348699092865, "learning_rate": 0.00037315741510097144, "loss": 4.7627, "step": 20450 }, { "epoch": 0.4189789691397211, "grad_norm": 0.2967687249183655, "learning_rate": 0.0003730650473331931, "loss": 4.7548, "step": 20460 }, { "epoch": 0.41918374869453034, "grad_norm": 0.27397188544273376, "learning_rate": 0.00037297253238795626, "loss": 4.7223, "step": 20470 }, { "epoch": 0.4193885282493396, "grad_norm": 0.26289302110671997, "learning_rate": 0.0003728798703439373, "loss": 4.7291, "step": 20480 }, { "epoch": 0.41959330780414883, "grad_norm": 0.24988263845443726, "learning_rate": 0.0003727870612799374, "loss": 4.7163, "step": 20490 }, { "epoch": 0.41979808735895807, "grad_norm": 0.2630736529827118, "learning_rate": 0.0003726941052748832, "loss": 4.7577, "step": 20500 }, { "epoch": 0.4200028669137673, "grad_norm": 0.2840738892555237, "learning_rate": 0.0003726010024078259, "loss": 4.7431, "step": 20510 }, { "epoch": 0.42020764646857656, "grad_norm": 0.36802077293395996, "learning_rate": 0.0003725077527579418, "loss": 4.7876, "step": 20520 }, { "epoch": 0.4204124260233858, "grad_norm": 0.2600573003292084, "learning_rate": 0.0003724143564045321, "loss": 4.8069, "step": 20530 }, { "epoch": 0.4206172055781951, "grad_norm": 0.2773413062095642, "learning_rate": 0.0003723208134270225, "loss": 4.7437, "step": 20540 }, { "epoch": 0.42082198513300434, "grad_norm": 0.2827748954296112, "learning_rate": 0.00037222712390496353, "loss": 4.7335, "step": 20550 }, { "epoch": 0.4210267646878136, "grad_norm": 0.2878071963787079, "learning_rate": 0.0003721332879180305, "loss": 4.7648, "step": 20560 }, { "epoch": 0.4212315442426228, "grad_norm": 0.2613888680934906, "learning_rate": 0.000372039305546023, "loss": 4.7907, "step": 20570 }, { "epoch": 0.42143632379743207, "grad_norm": 0.2486087679862976, "learning_rate": 0.0003719451768688653, "loss": 4.7224, "step": 20580 }, { "epoch": 0.4216411033522413, "grad_norm": 0.3080863356590271, "learning_rate": 0.000371850901966606, "loss": 4.7483, "step": 20590 }, { "epoch": 0.42184588290705055, "grad_norm": 0.29384005069732666, "learning_rate": 0.00037175648091941805, "loss": 4.7485, "step": 20600 }, { "epoch": 0.4220506624618598, "grad_norm": 0.2579139471054077, "learning_rate": 0.0003716619138075988, "loss": 4.753, "step": 20610 }, { "epoch": 0.42225544201666904, "grad_norm": 0.3039598762989044, "learning_rate": 0.00037156720071156965, "loss": 4.7608, "step": 20620 }, { "epoch": 0.4224602215714783, "grad_norm": 0.28961753845214844, "learning_rate": 0.00037147234171187627, "loss": 4.7226, "step": 20630 }, { "epoch": 0.4226650011262876, "grad_norm": 0.2628253996372223, "learning_rate": 0.0003713773368891883, "loss": 4.7403, "step": 20640 }, { "epoch": 0.4228697806810968, "grad_norm": 0.26308774948120117, "learning_rate": 0.0003712821863242996, "loss": 4.7542, "step": 20650 }, { "epoch": 0.42307456023590606, "grad_norm": 0.27785658836364746, "learning_rate": 0.00037118689009812766, "loss": 4.7289, "step": 20660 }, { "epoch": 0.4232793397907153, "grad_norm": 0.31840765476226807, "learning_rate": 0.0003710914482917141, "loss": 4.7255, "step": 20670 }, { "epoch": 0.42348411934552455, "grad_norm": 0.26973164081573486, "learning_rate": 0.0003709958609862243, "loss": 4.7273, "step": 20680 }, { "epoch": 0.4236888989003338, "grad_norm": 0.2843843698501587, "learning_rate": 0.0003709001282629472, "loss": 4.7286, "step": 20690 }, { "epoch": 0.42389367845514303, "grad_norm": 0.2571497857570648, "learning_rate": 0.00037080425020329566, "loss": 4.7225, "step": 20700 }, { "epoch": 0.4240984580099523, "grad_norm": 0.27932247519493103, "learning_rate": 0.00037070822688880595, "loss": 4.751, "step": 20710 }, { "epoch": 0.4243032375647615, "grad_norm": 0.30829671025276184, "learning_rate": 0.0003706120584011379, "loss": 4.7353, "step": 20720 }, { "epoch": 0.42450801711957076, "grad_norm": 0.2808021903038025, "learning_rate": 0.00037051574482207486, "loss": 4.7434, "step": 20730 }, { "epoch": 0.42471279667438006, "grad_norm": 0.30690625309944153, "learning_rate": 0.00037041928623352357, "loss": 4.7524, "step": 20740 }, { "epoch": 0.4249175762291893, "grad_norm": 0.25853148102760315, "learning_rate": 0.000370322682717514, "loss": 4.7344, "step": 20750 }, { "epoch": 0.42512235578399854, "grad_norm": 0.2632678151130676, "learning_rate": 0.00037022593435619947, "loss": 4.7313, "step": 20760 }, { "epoch": 0.4253271353388078, "grad_norm": 0.26891714334487915, "learning_rate": 0.00037012904123185644, "loss": 4.7425, "step": 20770 }, { "epoch": 0.425531914893617, "grad_norm": 0.2848103642463684, "learning_rate": 0.00037003200342688433, "loss": 4.7262, "step": 20780 }, { "epoch": 0.42573669444842627, "grad_norm": 0.26982730627059937, "learning_rate": 0.00036993482102380595, "loss": 4.7739, "step": 20790 }, { "epoch": 0.4259414740032355, "grad_norm": 0.2797873914241791, "learning_rate": 0.0003698374941052667, "loss": 4.7305, "step": 20800 }, { "epoch": 0.42614625355804475, "grad_norm": 0.24877497553825378, "learning_rate": 0.0003697400227540351, "loss": 4.7422, "step": 20810 }, { "epoch": 0.426351033112854, "grad_norm": 0.2713570296764374, "learning_rate": 0.0003696424070530025, "loss": 4.7698, "step": 20820 }, { "epoch": 0.42655581266766324, "grad_norm": 0.2632727324962616, "learning_rate": 0.00036954464708518293, "loss": 4.7068, "step": 20830 }, { "epoch": 0.42676059222247253, "grad_norm": 0.26218199729919434, "learning_rate": 0.0003694467429337131, "loss": 4.7559, "step": 20840 }, { "epoch": 0.4269653717772818, "grad_norm": 0.260280042886734, "learning_rate": 0.0003693486946818523, "loss": 4.7461, "step": 20850 }, { "epoch": 0.427170151332091, "grad_norm": 0.2665586471557617, "learning_rate": 0.00036925050241298253, "loss": 4.7795, "step": 20860 }, { "epoch": 0.42737493088690026, "grad_norm": 0.2628210186958313, "learning_rate": 0.00036915216621060813, "loss": 4.745, "step": 20870 }, { "epoch": 0.4275797104417095, "grad_norm": 0.29638341069221497, "learning_rate": 0.00036905368615835593, "loss": 4.7348, "step": 20880 }, { "epoch": 0.42778448999651875, "grad_norm": 0.35531070828437805, "learning_rate": 0.00036895506233997495, "loss": 4.7324, "step": 20890 }, { "epoch": 0.427989269551328, "grad_norm": 0.2501230537891388, "learning_rate": 0.0003688562948393366, "loss": 4.7334, "step": 20900 }, { "epoch": 0.42819404910613723, "grad_norm": 0.24845455586910248, "learning_rate": 0.0003687573837404345, "loss": 4.7743, "step": 20910 }, { "epoch": 0.4283988286609465, "grad_norm": 0.2809300720691681, "learning_rate": 0.0003686583291273842, "loss": 4.7217, "step": 20920 }, { "epoch": 0.4286036082157557, "grad_norm": 0.28208544850349426, "learning_rate": 0.0003685591310844235, "loss": 4.7296, "step": 20930 }, { "epoch": 0.428808387770565, "grad_norm": 0.2754552364349365, "learning_rate": 0.00036845978969591214, "loss": 4.723, "step": 20940 }, { "epoch": 0.42901316732537426, "grad_norm": 0.26814135909080505, "learning_rate": 0.0003683603050463317, "loss": 4.7164, "step": 20950 }, { "epoch": 0.4292179468801835, "grad_norm": 0.9360597133636475, "learning_rate": 0.00036826067722028563, "loss": 4.6658, "step": 20960 }, { "epoch": 0.42942272643499274, "grad_norm": 0.2701377272605896, "learning_rate": 0.00036816090630249906, "loss": 4.7398, "step": 20970 }, { "epoch": 0.429627505989802, "grad_norm": 0.2618829905986786, "learning_rate": 0.000368060992377819, "loss": 4.7182, "step": 20980 }, { "epoch": 0.4298322855446112, "grad_norm": 0.3635317087173462, "learning_rate": 0.0003679609355312137, "loss": 4.7688, "step": 20990 }, { "epoch": 0.43003706509942047, "grad_norm": 0.26796573400497437, "learning_rate": 0.0003678607358477735, "loss": 4.7562, "step": 21000 }, { "epoch": 0.4302418446542297, "grad_norm": 0.2521805763244629, "learning_rate": 0.0003677603934127098, "loss": 4.6947, "step": 21010 }, { "epoch": 0.43044662420903895, "grad_norm": 0.26770153641700745, "learning_rate": 0.0003676599083113555, "loss": 4.7798, "step": 21020 }, { "epoch": 0.4306514037638482, "grad_norm": 0.28818097710609436, "learning_rate": 0.00036755928062916476, "loss": 4.7426, "step": 21030 }, { "epoch": 0.4308561833186575, "grad_norm": 0.3475240170955658, "learning_rate": 0.00036745851045171326, "loss": 4.7895, "step": 21040 }, { "epoch": 0.43106096287346674, "grad_norm": 0.3224576711654663, "learning_rate": 0.0003673575978646975, "loss": 4.7569, "step": 21050 }, { "epoch": 0.431265742428276, "grad_norm": 0.2977074682712555, "learning_rate": 0.00036725654295393533, "loss": 4.7342, "step": 21060 }, { "epoch": 0.4314705219830852, "grad_norm": 0.24770216643810272, "learning_rate": 0.0003671553458053655, "loss": 4.7489, "step": 21070 }, { "epoch": 0.43167530153789446, "grad_norm": 0.3409057557582855, "learning_rate": 0.00036705400650504787, "loss": 4.7374, "step": 21080 }, { "epoch": 0.4318800810927037, "grad_norm": 0.26433828473091125, "learning_rate": 0.00036695252513916314, "loss": 4.7437, "step": 21090 }, { "epoch": 0.43208486064751295, "grad_norm": 0.2731941044330597, "learning_rate": 0.00036685090179401273, "loss": 4.6672, "step": 21100 }, { "epoch": 0.4322896402023222, "grad_norm": 0.28759586811065674, "learning_rate": 0.00036674913655601875, "loss": 4.7389, "step": 21110 }, { "epoch": 0.43249441975713143, "grad_norm": 0.2854158580303192, "learning_rate": 0.00036664722951172424, "loss": 4.762, "step": 21120 }, { "epoch": 0.4326991993119407, "grad_norm": 0.2920982241630554, "learning_rate": 0.0003665451807477927, "loss": 4.7418, "step": 21130 }, { "epoch": 0.4329039788667499, "grad_norm": 0.2870182693004608, "learning_rate": 0.00036644299035100804, "loss": 4.7353, "step": 21140 }, { "epoch": 0.4331087584215592, "grad_norm": 0.2995765507221222, "learning_rate": 0.00036634065840827467, "loss": 4.7341, "step": 21150 }, { "epoch": 0.43331353797636846, "grad_norm": 0.27756834030151367, "learning_rate": 0.00036623818500661757, "loss": 4.7674, "step": 21160 }, { "epoch": 0.4335183175311777, "grad_norm": 0.28837379813194275, "learning_rate": 0.0003661355702331817, "loss": 4.6978, "step": 21170 }, { "epoch": 0.43372309708598694, "grad_norm": 0.3934195041656494, "learning_rate": 0.00036603281417523257, "loss": 4.7227, "step": 21180 }, { "epoch": 0.4339278766407962, "grad_norm": 0.26383545994758606, "learning_rate": 0.0003659299169201556, "loss": 4.7412, "step": 21190 }, { "epoch": 0.4341326561956054, "grad_norm": 0.3552657663822174, "learning_rate": 0.0003658268785554563, "loss": 4.7382, "step": 21200 }, { "epoch": 0.43433743575041467, "grad_norm": 0.2834928035736084, "learning_rate": 0.0003657236991687603, "loss": 4.7276, "step": 21210 }, { "epoch": 0.4345422153052239, "grad_norm": 0.28129100799560547, "learning_rate": 0.00036562037884781313, "loss": 4.7123, "step": 21220 }, { "epoch": 0.43474699486003315, "grad_norm": 0.28226780891418457, "learning_rate": 0.00036551691768048014, "loss": 4.748, "step": 21230 }, { "epoch": 0.4349517744148424, "grad_norm": 0.2716659605503082, "learning_rate": 0.00036541331575474644, "loss": 4.7273, "step": 21240 }, { "epoch": 0.4351565539696517, "grad_norm": 0.28415346145629883, "learning_rate": 0.0003653095731587169, "loss": 4.7302, "step": 21250 }, { "epoch": 0.43536133352446094, "grad_norm": 0.30821385979652405, "learning_rate": 0.00036520568998061595, "loss": 4.741, "step": 21260 }, { "epoch": 0.4355661130792702, "grad_norm": 0.32026225328445435, "learning_rate": 0.00036510166630878766, "loss": 4.7348, "step": 21270 }, { "epoch": 0.4357708926340794, "grad_norm": 0.2765481173992157, "learning_rate": 0.0003649975022316955, "loss": 4.7263, "step": 21280 }, { "epoch": 0.43597567218888866, "grad_norm": 0.2862047851085663, "learning_rate": 0.00036489319783792235, "loss": 4.717, "step": 21290 }, { "epoch": 0.4361804517436979, "grad_norm": 0.25941938161849976, "learning_rate": 0.00036478875321617043, "loss": 4.7675, "step": 21300 }, { "epoch": 0.43638523129850715, "grad_norm": 0.2648298740386963, "learning_rate": 0.00036468416845526134, "loss": 4.7403, "step": 21310 }, { "epoch": 0.4365900108533164, "grad_norm": 0.27693983912467957, "learning_rate": 0.00036457944364413555, "loss": 4.7214, "step": 21320 }, { "epoch": 0.43679479040812563, "grad_norm": 0.31461483240127563, "learning_rate": 0.000364474578871853, "loss": 4.7298, "step": 21330 }, { "epoch": 0.4369995699629349, "grad_norm": 0.2823515832424164, "learning_rate": 0.00036436957422759236, "loss": 4.7274, "step": 21340 }, { "epoch": 0.4372043495177442, "grad_norm": 0.2560424506664276, "learning_rate": 0.00036426442980065144, "loss": 4.7195, "step": 21350 }, { "epoch": 0.4374091290725534, "grad_norm": 0.2791268825531006, "learning_rate": 0.0003641591456804468, "loss": 4.7352, "step": 21360 }, { "epoch": 0.43761390862736266, "grad_norm": 0.26592713594436646, "learning_rate": 0.0003640537219565138, "loss": 4.737, "step": 21370 }, { "epoch": 0.4378186881821719, "grad_norm": 0.2827947437763214, "learning_rate": 0.0003639481587185067, "loss": 4.7368, "step": 21380 }, { "epoch": 0.43802346773698114, "grad_norm": 0.5054929852485657, "learning_rate": 0.00036384245605619814, "loss": 4.6831, "step": 21390 }, { "epoch": 0.4382282472917904, "grad_norm": 0.25543633103370667, "learning_rate": 0.00036373661405947956, "loss": 4.7222, "step": 21400 }, { "epoch": 0.4384330268465996, "grad_norm": 0.2819623351097107, "learning_rate": 0.0003636306328183607, "loss": 4.7212, "step": 21410 }, { "epoch": 0.43863780640140887, "grad_norm": 0.29091677069664, "learning_rate": 0.0003635245124229699, "loss": 4.7207, "step": 21420 }, { "epoch": 0.4388425859562181, "grad_norm": 0.27804920077323914, "learning_rate": 0.00036341825296355365, "loss": 4.7396, "step": 21430 }, { "epoch": 0.43904736551102735, "grad_norm": 0.27245569229125977, "learning_rate": 0.00036331185453047693, "loss": 4.7117, "step": 21440 }, { "epoch": 0.43925214506583665, "grad_norm": 0.25816354155540466, "learning_rate": 0.0003632053172142227, "loss": 4.7511, "step": 21450 }, { "epoch": 0.4394569246206459, "grad_norm": 0.2858197093009949, "learning_rate": 0.00036309864110539204, "loss": 4.7622, "step": 21460 }, { "epoch": 0.43966170417545514, "grad_norm": 0.37263616919517517, "learning_rate": 0.00036299182629470427, "loss": 4.7448, "step": 21470 }, { "epoch": 0.4398664837302644, "grad_norm": 0.26793110370635986, "learning_rate": 0.00036288487287299643, "loss": 4.7803, "step": 21480 }, { "epoch": 0.4400712632850736, "grad_norm": 0.27125048637390137, "learning_rate": 0.0003627777809312235, "loss": 4.7227, "step": 21490 }, { "epoch": 0.44027604283988286, "grad_norm": 0.2748330533504486, "learning_rate": 0.00036267055056045833, "loss": 4.7303, "step": 21500 }, { "epoch": 0.4404808223946921, "grad_norm": 0.2754065990447998, "learning_rate": 0.00036256318185189156, "loss": 4.7009, "step": 21510 }, { "epoch": 0.44068560194950135, "grad_norm": 0.27515119314193726, "learning_rate": 0.00036245567489683115, "loss": 4.7158, "step": 21520 }, { "epoch": 0.4408903815043106, "grad_norm": 0.296744167804718, "learning_rate": 0.00036234802978670296, "loss": 4.7453, "step": 21530 }, { "epoch": 0.44109516105911983, "grad_norm": 0.26276278495788574, "learning_rate": 0.00036224024661305024, "loss": 4.7214, "step": 21540 }, { "epoch": 0.44129994061392913, "grad_norm": 0.46705666184425354, "learning_rate": 0.00036213232546753366, "loss": 4.714, "step": 21550 }, { "epoch": 0.4415047201687384, "grad_norm": 0.3351914882659912, "learning_rate": 0.00036202426644193103, "loss": 4.7077, "step": 21560 }, { "epoch": 0.4417094997235476, "grad_norm": 0.268264502286911, "learning_rate": 0.0003619160696281378, "loss": 4.7474, "step": 21570 }, { "epoch": 0.44191427927835686, "grad_norm": 0.5669612884521484, "learning_rate": 0.0003618077351181662, "loss": 4.7052, "step": 21580 }, { "epoch": 0.4421190588331661, "grad_norm": 0.30829110741615295, "learning_rate": 0.0003616992630041458, "loss": 4.7338, "step": 21590 }, { "epoch": 0.44232383838797534, "grad_norm": 0.2763216197490692, "learning_rate": 0.0003615906533783232, "loss": 4.7038, "step": 21600 }, { "epoch": 0.4425286179427846, "grad_norm": 0.2838597595691681, "learning_rate": 0.0003614819063330618, "loss": 4.7492, "step": 21610 }, { "epoch": 0.44273339749759383, "grad_norm": 0.2547735571861267, "learning_rate": 0.000361373021960842, "loss": 4.7308, "step": 21620 }, { "epoch": 0.44293817705240307, "grad_norm": 0.26135656237602234, "learning_rate": 0.0003612640003542608, "loss": 4.7146, "step": 21630 }, { "epoch": 0.4431429566072123, "grad_norm": 0.259290874004364, "learning_rate": 0.00036115484160603213, "loss": 4.7242, "step": 21640 }, { "epoch": 0.4433477361620216, "grad_norm": 0.2724955677986145, "learning_rate": 0.0003610455458089864, "loss": 4.7854, "step": 21650 }, { "epoch": 0.44355251571683085, "grad_norm": 0.2639175355434418, "learning_rate": 0.0003609361130560706, "loss": 4.7226, "step": 21660 }, { "epoch": 0.4437572952716401, "grad_norm": 0.2689358592033386, "learning_rate": 0.0003608265434403483, "loss": 4.7075, "step": 21670 }, { "epoch": 0.44396207482644934, "grad_norm": 0.2829224765300751, "learning_rate": 0.0003607168370549993, "loss": 4.7617, "step": 21680 }, { "epoch": 0.4441668543812586, "grad_norm": 0.25918588042259216, "learning_rate": 0.00036060699399331975, "loss": 4.7101, "step": 21690 }, { "epoch": 0.4443716339360678, "grad_norm": 0.2661386728286743, "learning_rate": 0.000360497014348722, "loss": 4.7684, "step": 21700 }, { "epoch": 0.44457641349087706, "grad_norm": 0.2601064145565033, "learning_rate": 0.00036038689821473477, "loss": 4.7369, "step": 21710 }, { "epoch": 0.4447811930456863, "grad_norm": 0.27942773699760437, "learning_rate": 0.0003602766456850026, "loss": 4.6933, "step": 21720 }, { "epoch": 0.44498597260049555, "grad_norm": 0.2897523045539856, "learning_rate": 0.0003601662568532862, "loss": 4.7398, "step": 21730 }, { "epoch": 0.4451907521553048, "grad_norm": 0.28540870547294617, "learning_rate": 0.00036005573181346196, "loss": 4.7736, "step": 21740 }, { "epoch": 0.4453955317101141, "grad_norm": 0.2802298069000244, "learning_rate": 0.00035994507065952237, "loss": 4.7264, "step": 21750 }, { "epoch": 0.44560031126492333, "grad_norm": 0.266023188829422, "learning_rate": 0.0003598342734855755, "loss": 4.7555, "step": 21760 }, { "epoch": 0.4458050908197326, "grad_norm": 0.29166194796562195, "learning_rate": 0.0003597233403858451, "loss": 4.7499, "step": 21770 }, { "epoch": 0.4460098703745418, "grad_norm": 0.27979788184165955, "learning_rate": 0.0003596122714546707, "loss": 4.6946, "step": 21780 }, { "epoch": 0.44621464992935106, "grad_norm": 0.28484687209129333, "learning_rate": 0.00035950106678650707, "loss": 4.7541, "step": 21790 }, { "epoch": 0.4464194294841603, "grad_norm": 0.3059164583683014, "learning_rate": 0.00035938972647592466, "loss": 4.6932, "step": 21800 }, { "epoch": 0.44662420903896954, "grad_norm": 0.28869423270225525, "learning_rate": 0.0003592782506176091, "loss": 4.7477, "step": 21810 }, { "epoch": 0.4468289885937788, "grad_norm": 0.29801151156425476, "learning_rate": 0.0003591666393063615, "loss": 4.7742, "step": 21820 }, { "epoch": 0.44703376814858803, "grad_norm": 0.2738795876502991, "learning_rate": 0.00035905489263709774, "loss": 4.7315, "step": 21830 }, { "epoch": 0.44723854770339727, "grad_norm": 0.2763841450214386, "learning_rate": 0.0003589430107048493, "loss": 4.751, "step": 21840 }, { "epoch": 0.4474433272582065, "grad_norm": 0.2792346477508545, "learning_rate": 0.0003588309936047624, "loss": 4.7352, "step": 21850 }, { "epoch": 0.4476481068130158, "grad_norm": 0.2677178978919983, "learning_rate": 0.0003587188414320983, "loss": 4.701, "step": 21860 }, { "epoch": 0.44785288636782505, "grad_norm": 0.26957011222839355, "learning_rate": 0.00035860655428223315, "loss": 4.7266, "step": 21870 }, { "epoch": 0.4480576659226343, "grad_norm": 0.2695551812648773, "learning_rate": 0.0003584941322506578, "loss": 4.7265, "step": 21880 }, { "epoch": 0.44826244547744354, "grad_norm": 0.28194916248321533, "learning_rate": 0.00035838157543297786, "loss": 4.741, "step": 21890 }, { "epoch": 0.4484672250322528, "grad_norm": 0.26357901096343994, "learning_rate": 0.00035826888392491364, "loss": 4.7307, "step": 21900 }, { "epoch": 0.448672004587062, "grad_norm": 0.2969251275062561, "learning_rate": 0.00035815605782229984, "loss": 4.7095, "step": 21910 }, { "epoch": 0.44887678414187127, "grad_norm": 0.2593156397342682, "learning_rate": 0.0003580430972210857, "loss": 4.7314, "step": 21920 }, { "epoch": 0.4490815636966805, "grad_norm": 0.29631832242012024, "learning_rate": 0.0003579300022173348, "loss": 4.7677, "step": 21930 }, { "epoch": 0.44928634325148975, "grad_norm": 0.34863293170928955, "learning_rate": 0.00035781677290722504, "loss": 4.7401, "step": 21940 }, { "epoch": 0.449491122806299, "grad_norm": 0.2681001126766205, "learning_rate": 0.0003577034093870487, "loss": 4.7624, "step": 21950 }, { "epoch": 0.4496959023611083, "grad_norm": 0.26574641466140747, "learning_rate": 0.0003575899117532119, "loss": 4.723, "step": 21960 }, { "epoch": 0.44990068191591753, "grad_norm": 0.2752978801727295, "learning_rate": 0.00035747628010223504, "loss": 4.7108, "step": 21970 }, { "epoch": 0.4501054614707268, "grad_norm": 0.31212377548217773, "learning_rate": 0.00035736251453075243, "loss": 4.738, "step": 21980 }, { "epoch": 0.450310241025536, "grad_norm": 0.3273192048072815, "learning_rate": 0.0003572486151355121, "loss": 4.7489, "step": 21990 }, { "epoch": 0.45051502058034526, "grad_norm": 0.3560446798801422, "learning_rate": 0.0003571345820133762, "loss": 4.6892, "step": 22000 }, { "epoch": 0.45051502058034526, "eval_loss": 4.739803791046143, "eval_runtime": 4.3827, "eval_samples_per_second": 266.044, "eval_steps_per_second": 33.313, "step": 22000 }, { "epoch": 0.4507198001351545, "grad_norm": 0.290144145488739, "learning_rate": 0.00035702041526132033, "loss": 4.7475, "step": 22010 }, { "epoch": 0.45092457968996374, "grad_norm": 0.28808197379112244, "learning_rate": 0.00035690611497643385, "loss": 4.7419, "step": 22020 }, { "epoch": 0.451129359244773, "grad_norm": 0.27849912643432617, "learning_rate": 0.0003567916812559197, "loss": 4.7365, "step": 22030 }, { "epoch": 0.45133413879958223, "grad_norm": 0.27138206362724304, "learning_rate": 0.00035667711419709423, "loss": 4.742, "step": 22040 }, { "epoch": 0.45153891835439147, "grad_norm": 0.26201629638671875, "learning_rate": 0.0003565624138973872, "loss": 4.7362, "step": 22050 }, { "epoch": 0.45174369790920077, "grad_norm": 0.2977285087108612, "learning_rate": 0.0003564475804543418, "loss": 4.7327, "step": 22060 }, { "epoch": 0.45194847746401, "grad_norm": 0.27661874890327454, "learning_rate": 0.00035633261396561425, "loss": 4.7272, "step": 22070 }, { "epoch": 0.45215325701881925, "grad_norm": 0.29062989354133606, "learning_rate": 0.000356217514528974, "loss": 4.7189, "step": 22080 }, { "epoch": 0.4523580365736285, "grad_norm": 0.2621004581451416, "learning_rate": 0.0003561022822423036, "loss": 4.6906, "step": 22090 }, { "epoch": 0.45256281612843774, "grad_norm": 0.27067258954048157, "learning_rate": 0.00035598691720359853, "loss": 4.7545, "step": 22100 }, { "epoch": 0.452767595683247, "grad_norm": 0.25577986240386963, "learning_rate": 0.0003558714195109672, "loss": 4.7186, "step": 22110 }, { "epoch": 0.4529723752380562, "grad_norm": 0.2554454803466797, "learning_rate": 0.00035575578926263075, "loss": 4.7174, "step": 22120 }, { "epoch": 0.45317715479286547, "grad_norm": 0.28933700919151306, "learning_rate": 0.00035564002655692317, "loss": 4.7488, "step": 22130 }, { "epoch": 0.4533819343476747, "grad_norm": 0.268405944108963, "learning_rate": 0.0003555241314922911, "loss": 4.7641, "step": 22140 }, { "epoch": 0.45358671390248395, "grad_norm": 0.2583339512348175, "learning_rate": 0.0003554081041672935, "loss": 4.7023, "step": 22150 }, { "epoch": 0.45379149345729325, "grad_norm": 0.2833841145038605, "learning_rate": 0.0003552919446806021, "loss": 4.7593, "step": 22160 }, { "epoch": 0.4539962730121025, "grad_norm": 0.250685453414917, "learning_rate": 0.0003551756531310009, "loss": 4.7128, "step": 22170 }, { "epoch": 0.45420105256691173, "grad_norm": 0.28385424613952637, "learning_rate": 0.00035505922961738616, "loss": 4.7244, "step": 22180 }, { "epoch": 0.454405832121721, "grad_norm": 0.27036428451538086, "learning_rate": 0.00035494267423876645, "loss": 4.7176, "step": 22190 }, { "epoch": 0.4546106116765302, "grad_norm": 0.2948550581932068, "learning_rate": 0.0003548259870942624, "loss": 4.6934, "step": 22200 }, { "epoch": 0.45481539123133946, "grad_norm": 0.25978729128837585, "learning_rate": 0.00035470916828310683, "loss": 4.7652, "step": 22210 }, { "epoch": 0.4550201707861487, "grad_norm": 0.28499385714530945, "learning_rate": 0.00035459221790464434, "loss": 4.7189, "step": 22220 }, { "epoch": 0.45522495034095795, "grad_norm": 0.3245640993118286, "learning_rate": 0.0003544751360583316, "loss": 4.7064, "step": 22230 }, { "epoch": 0.4554297298957672, "grad_norm": 0.2962793707847595, "learning_rate": 0.000354357922843737, "loss": 4.7432, "step": 22240 }, { "epoch": 0.45563450945057643, "grad_norm": 0.27355194091796875, "learning_rate": 0.00035424057836054064, "loss": 4.7244, "step": 22250 }, { "epoch": 0.45583928900538573, "grad_norm": 0.29868021607398987, "learning_rate": 0.0003541231027085342, "loss": 4.7054, "step": 22260 }, { "epoch": 0.45604406856019497, "grad_norm": 0.26717624068260193, "learning_rate": 0.000354005495987621, "loss": 4.6939, "step": 22270 }, { "epoch": 0.4562488481150042, "grad_norm": 0.2547793686389923, "learning_rate": 0.00035388775829781586, "loss": 4.7223, "step": 22280 }, { "epoch": 0.45645362766981346, "grad_norm": 0.27395254373550415, "learning_rate": 0.0003537698897392448, "loss": 4.7214, "step": 22290 }, { "epoch": 0.4566584072246227, "grad_norm": 0.38744744658470154, "learning_rate": 0.00035365189041214535, "loss": 4.7222, "step": 22300 }, { "epoch": 0.45686318677943194, "grad_norm": 0.2687004506587982, "learning_rate": 0.00035353376041686614, "loss": 4.6706, "step": 22310 }, { "epoch": 0.4570679663342412, "grad_norm": 0.3844720423221588, "learning_rate": 0.0003534154998538669, "loss": 4.7387, "step": 22320 }, { "epoch": 0.4572727458890504, "grad_norm": 0.33876246213912964, "learning_rate": 0.0003532971088237184, "loss": 4.7352, "step": 22330 }, { "epoch": 0.45747752544385967, "grad_norm": 0.3038104176521301, "learning_rate": 0.0003531785874271023, "loss": 4.7192, "step": 22340 }, { "epoch": 0.4576823049986689, "grad_norm": 0.29718440771102905, "learning_rate": 0.0003530599357648114, "loss": 4.7036, "step": 22350 }, { "epoch": 0.4578870845534782, "grad_norm": 0.2830037772655487, "learning_rate": 0.000352941153937749, "loss": 4.7056, "step": 22360 }, { "epoch": 0.45809186410828745, "grad_norm": 0.26650917530059814, "learning_rate": 0.0003528222420469292, "loss": 4.7275, "step": 22370 }, { "epoch": 0.4582966436630967, "grad_norm": 0.3146975040435791, "learning_rate": 0.0003527032001934767, "loss": 4.7309, "step": 22380 }, { "epoch": 0.45850142321790593, "grad_norm": 0.26475873589515686, "learning_rate": 0.00035258402847862665, "loss": 4.7211, "step": 22390 }, { "epoch": 0.4587062027727152, "grad_norm": 0.2605159282684326, "learning_rate": 0.0003524647270037248, "loss": 4.7134, "step": 22400 }, { "epoch": 0.4589109823275244, "grad_norm": 0.29321005940437317, "learning_rate": 0.00035234529587022703, "loss": 4.7272, "step": 22410 }, { "epoch": 0.45911576188233366, "grad_norm": 0.27110812067985535, "learning_rate": 0.0003522257351796997, "loss": 4.6885, "step": 22420 }, { "epoch": 0.4593205414371429, "grad_norm": 0.2615066468715668, "learning_rate": 0.00035210604503381926, "loss": 4.6825, "step": 22430 }, { "epoch": 0.45952532099195215, "grad_norm": 0.2677325904369354, "learning_rate": 0.0003519862255343722, "loss": 4.7029, "step": 22440 }, { "epoch": 0.4597301005467614, "grad_norm": 0.29238784313201904, "learning_rate": 0.00035186627678325506, "loss": 4.7393, "step": 22450 }, { "epoch": 0.4599348801015707, "grad_norm": 0.28893929719924927, "learning_rate": 0.00035174619888247435, "loss": 4.7201, "step": 22460 }, { "epoch": 0.46013965965637993, "grad_norm": 0.2545911371707916, "learning_rate": 0.0003516259919341463, "loss": 4.76, "step": 22470 }, { "epoch": 0.46034443921118917, "grad_norm": 0.28869879245758057, "learning_rate": 0.00035150565604049694, "loss": 4.7196, "step": 22480 }, { "epoch": 0.4605492187659984, "grad_norm": 0.27274906635284424, "learning_rate": 0.00035138519130386203, "loss": 4.7187, "step": 22490 }, { "epoch": 0.46075399832080766, "grad_norm": 0.27680766582489014, "learning_rate": 0.00035126459782668676, "loss": 4.7345, "step": 22500 }, { "epoch": 0.4609587778756169, "grad_norm": 0.2799220085144043, "learning_rate": 0.00035114387571152586, "loss": 4.7529, "step": 22510 }, { "epoch": 0.46116355743042614, "grad_norm": 0.2812877595424652, "learning_rate": 0.00035102302506104363, "loss": 4.7344, "step": 22520 }, { "epoch": 0.4613683369852354, "grad_norm": 0.2624521553516388, "learning_rate": 0.0003509020459780133, "loss": 4.7282, "step": 22530 }, { "epoch": 0.4615731165400446, "grad_norm": 0.28584733605384827, "learning_rate": 0.0003507809385653176, "loss": 4.7569, "step": 22540 }, { "epoch": 0.46177789609485387, "grad_norm": 0.29557710886001587, "learning_rate": 0.0003506597029259484, "loss": 4.7186, "step": 22550 }, { "epoch": 0.4619826756496631, "grad_norm": 0.33304429054260254, "learning_rate": 0.00035053833916300656, "loss": 4.7658, "step": 22560 }, { "epoch": 0.4621874552044724, "grad_norm": 0.27438461780548096, "learning_rate": 0.0003504168473797018, "loss": 4.7322, "step": 22570 }, { "epoch": 0.46239223475928165, "grad_norm": 0.27427247166633606, "learning_rate": 0.00035029522767935284, "loss": 4.6952, "step": 22580 }, { "epoch": 0.4625970143140909, "grad_norm": 0.3031047284603119, "learning_rate": 0.0003501734801653871, "loss": 4.694, "step": 22590 }, { "epoch": 0.46280179386890014, "grad_norm": 0.32042402029037476, "learning_rate": 0.0003500516049413409, "loss": 4.7333, "step": 22600 }, { "epoch": 0.4630065734237094, "grad_norm": 0.2565493881702423, "learning_rate": 0.00034992960211085875, "loss": 4.6752, "step": 22610 }, { "epoch": 0.4632113529785186, "grad_norm": 0.3041118383407593, "learning_rate": 0.0003498074717776941, "loss": 4.7079, "step": 22620 }, { "epoch": 0.46341613253332786, "grad_norm": 0.2607264518737793, "learning_rate": 0.00034968521404570864, "loss": 4.7254, "step": 22630 }, { "epoch": 0.4636209120881371, "grad_norm": 0.3120455741882324, "learning_rate": 0.00034956282901887236, "loss": 4.6809, "step": 22640 }, { "epoch": 0.46382569164294635, "grad_norm": 0.2700790464878082, "learning_rate": 0.0003494403168012637, "loss": 4.7507, "step": 22650 }, { "epoch": 0.4640304711977556, "grad_norm": 0.8952503800392151, "learning_rate": 0.00034931767749706895, "loss": 4.7224, "step": 22660 }, { "epoch": 0.4642352507525649, "grad_norm": 0.26338472962379456, "learning_rate": 0.00034919491121058284, "loss": 4.7215, "step": 22670 }, { "epoch": 0.46444003030737413, "grad_norm": 0.2886572480201721, "learning_rate": 0.0003490720180462078, "loss": 4.7211, "step": 22680 }, { "epoch": 0.46464480986218337, "grad_norm": 0.26196008920669556, "learning_rate": 0.00034894899810845425, "loss": 4.6938, "step": 22690 }, { "epoch": 0.4648495894169926, "grad_norm": 0.2621656656265259, "learning_rate": 0.00034882585150194045, "loss": 4.7492, "step": 22700 }, { "epoch": 0.46505436897180186, "grad_norm": 0.27103763818740845, "learning_rate": 0.0003487025783313925, "loss": 4.7302, "step": 22710 }, { "epoch": 0.4652591485266111, "grad_norm": 0.2672967314720154, "learning_rate": 0.00034857917870164377, "loss": 4.7376, "step": 22720 }, { "epoch": 0.46546392808142034, "grad_norm": 0.2624387741088867, "learning_rate": 0.00034845565271763554, "loss": 4.7141, "step": 22730 }, { "epoch": 0.4656687076362296, "grad_norm": 0.26433417201042175, "learning_rate": 0.0003483320004844164, "loss": 4.7191, "step": 22740 }, { "epoch": 0.4658734871910388, "grad_norm": 0.35606059432029724, "learning_rate": 0.0003482082221071424, "loss": 4.7023, "step": 22750 }, { "epoch": 0.46607826674584807, "grad_norm": 0.34465745091438293, "learning_rate": 0.00034808431769107665, "loss": 4.7355, "step": 22760 }, { "epoch": 0.46628304630065737, "grad_norm": 0.27950605750083923, "learning_rate": 0.0003479602873415895, "loss": 4.7628, "step": 22770 }, { "epoch": 0.4664878258554666, "grad_norm": 0.25964662432670593, "learning_rate": 0.00034783613116415866, "loss": 4.6999, "step": 22780 }, { "epoch": 0.46669260541027585, "grad_norm": 0.28749728202819824, "learning_rate": 0.0003477118492643685, "loss": 4.7659, "step": 22790 }, { "epoch": 0.4668973849650851, "grad_norm": 0.2610335648059845, "learning_rate": 0.00034758744174791055, "loss": 4.6715, "step": 22800 }, { "epoch": 0.46710216451989434, "grad_norm": 0.3397755026817322, "learning_rate": 0.0003474629087205831, "loss": 4.7363, "step": 22810 }, { "epoch": 0.4673069440747036, "grad_norm": 0.2962636947631836, "learning_rate": 0.00034733825028829106, "loss": 4.7283, "step": 22820 }, { "epoch": 0.4675117236295128, "grad_norm": 0.2690482437610626, "learning_rate": 0.00034721346655704617, "loss": 4.7064, "step": 22830 }, { "epoch": 0.46771650318432206, "grad_norm": 0.3107163906097412, "learning_rate": 0.0003470885576329665, "loss": 4.7619, "step": 22840 }, { "epoch": 0.4679212827391313, "grad_norm": 0.27058449387550354, "learning_rate": 0.0003469635236222768, "loss": 4.7168, "step": 22850 }, { "epoch": 0.46812606229394055, "grad_norm": 0.3666093051433563, "learning_rate": 0.00034683836463130825, "loss": 4.7055, "step": 22860 }, { "epoch": 0.46833084184874985, "grad_norm": 0.27504703402519226, "learning_rate": 0.000346713080766498, "loss": 4.7398, "step": 22870 }, { "epoch": 0.4685356214035591, "grad_norm": 0.3063828647136688, "learning_rate": 0.00034658767213438963, "loss": 4.76, "step": 22880 }, { "epoch": 0.46874040095836833, "grad_norm": 0.3170681595802307, "learning_rate": 0.0003464621388416328, "loss": 4.6956, "step": 22890 }, { "epoch": 0.4689451805131776, "grad_norm": 0.2757911682128906, "learning_rate": 0.0003463364809949832, "loss": 4.7313, "step": 22900 }, { "epoch": 0.4691499600679868, "grad_norm": 0.2873567044734955, "learning_rate": 0.0003462106987013023, "loss": 4.7163, "step": 22910 }, { "epoch": 0.46935473962279606, "grad_norm": 0.2824573516845703, "learning_rate": 0.0003460847920675575, "loss": 4.692, "step": 22920 }, { "epoch": 0.4695595191776053, "grad_norm": 0.2875484824180603, "learning_rate": 0.00034595876120082204, "loss": 4.716, "step": 22930 }, { "epoch": 0.46976429873241454, "grad_norm": 0.27463841438293457, "learning_rate": 0.0003458326062082746, "loss": 4.731, "step": 22940 }, { "epoch": 0.4699690782872238, "grad_norm": 0.2988135516643524, "learning_rate": 0.00034570632719719966, "loss": 4.7196, "step": 22950 }, { "epoch": 0.470173857842033, "grad_norm": 0.28052037954330444, "learning_rate": 0.0003455799242749869, "loss": 4.7489, "step": 22960 }, { "epoch": 0.4703786373968423, "grad_norm": 0.29746437072753906, "learning_rate": 0.0003454533975491316, "loss": 4.7202, "step": 22970 }, { "epoch": 0.47058341695165157, "grad_norm": 0.26427146792411804, "learning_rate": 0.00034532674712723423, "loss": 4.74, "step": 22980 }, { "epoch": 0.4707881965064608, "grad_norm": 0.263955295085907, "learning_rate": 0.00034519997311700047, "loss": 4.6972, "step": 22990 }, { "epoch": 0.47099297606127005, "grad_norm": 0.2544487416744232, "learning_rate": 0.00034507307562624105, "loss": 4.7433, "step": 23000 }, { "epoch": 0.4711977556160793, "grad_norm": 0.3022330701351166, "learning_rate": 0.00034494605476287176, "loss": 4.7164, "step": 23010 }, { "epoch": 0.47140253517088854, "grad_norm": 0.24907058477401733, "learning_rate": 0.00034481891063491333, "loss": 4.6942, "step": 23020 }, { "epoch": 0.4716073147256978, "grad_norm": 0.2630559504032135, "learning_rate": 0.0003446916433504913, "loss": 4.7131, "step": 23030 }, { "epoch": 0.471812094280507, "grad_norm": 0.27668577432632446, "learning_rate": 0.00034456425301783585, "loss": 4.7348, "step": 23040 }, { "epoch": 0.47201687383531626, "grad_norm": 0.28554463386535645, "learning_rate": 0.000344436739745282, "loss": 4.6918, "step": 23050 }, { "epoch": 0.4722216533901255, "grad_norm": 0.27102744579315186, "learning_rate": 0.00034430910364126907, "loss": 4.7203, "step": 23060 }, { "epoch": 0.4724264329449348, "grad_norm": 0.2832464873790741, "learning_rate": 0.00034418134481434113, "loss": 4.7284, "step": 23070 }, { "epoch": 0.47263121249974405, "grad_norm": 0.3161238729953766, "learning_rate": 0.00034405346337314637, "loss": 4.728, "step": 23080 }, { "epoch": 0.4728359920545533, "grad_norm": 0.3083503246307373, "learning_rate": 0.0003439254594264373, "loss": 4.7062, "step": 23090 }, { "epoch": 0.47304077160936253, "grad_norm": 0.2700280249118805, "learning_rate": 0.00034379733308307074, "loss": 4.6864, "step": 23100 }, { "epoch": 0.4732455511641718, "grad_norm": 0.3294050991535187, "learning_rate": 0.0003436690844520075, "loss": 4.7536, "step": 23110 }, { "epoch": 0.473450330718981, "grad_norm": 0.2681991159915924, "learning_rate": 0.0003435407136423123, "loss": 4.7242, "step": 23120 }, { "epoch": 0.47365511027379026, "grad_norm": 0.2928425371646881, "learning_rate": 0.000343412220763154, "loss": 4.7083, "step": 23130 }, { "epoch": 0.4738598898285995, "grad_norm": 0.271829217672348, "learning_rate": 0.000343283605923805, "loss": 4.7175, "step": 23140 }, { "epoch": 0.47406466938340874, "grad_norm": 0.2743125855922699, "learning_rate": 0.00034315486923364164, "loss": 4.6993, "step": 23150 }, { "epoch": 0.474269448938218, "grad_norm": 0.2834107577800751, "learning_rate": 0.00034302601080214376, "loss": 4.72, "step": 23160 }, { "epoch": 0.4744742284930273, "grad_norm": 0.2990763783454895, "learning_rate": 0.0003428970307388947, "loss": 4.7021, "step": 23170 }, { "epoch": 0.4746790080478365, "grad_norm": 0.28749608993530273, "learning_rate": 0.00034276792915358145, "loss": 4.724, "step": 23180 }, { "epoch": 0.47488378760264577, "grad_norm": 0.2540006935596466, "learning_rate": 0.000342638706155994, "loss": 4.7423, "step": 23190 }, { "epoch": 0.475088567157455, "grad_norm": 0.5250611305236816, "learning_rate": 0.0003425093618560259, "loss": 4.755, "step": 23200 }, { "epoch": 0.47529334671226425, "grad_norm": 0.2630171477794647, "learning_rate": 0.00034237989636367374, "loss": 4.7299, "step": 23210 }, { "epoch": 0.4754981262670735, "grad_norm": 0.28354302048683167, "learning_rate": 0.00034225030978903714, "loss": 4.7504, "step": 23220 }, { "epoch": 0.47570290582188274, "grad_norm": 0.3067748248577118, "learning_rate": 0.00034212060224231874, "loss": 4.7438, "step": 23230 }, { "epoch": 0.475907685376692, "grad_norm": 0.26697567105293274, "learning_rate": 0.0003419907738338241, "loss": 4.7372, "step": 23240 }, { "epoch": 0.4761124649315012, "grad_norm": 0.30045849084854126, "learning_rate": 0.0003418608246739614, "loss": 4.7013, "step": 23250 }, { "epoch": 0.47631724448631046, "grad_norm": 0.5347376465797424, "learning_rate": 0.0003417307548732417, "loss": 4.6699, "step": 23260 }, { "epoch": 0.4765220240411197, "grad_norm": 0.2729143500328064, "learning_rate": 0.0003416005645422785, "loss": 4.7144, "step": 23270 }, { "epoch": 0.476726803595929, "grad_norm": 0.2777980864048004, "learning_rate": 0.00034147025379178795, "loss": 4.7186, "step": 23280 }, { "epoch": 0.47693158315073825, "grad_norm": 0.2925539016723633, "learning_rate": 0.00034133982273258846, "loss": 4.7271, "step": 23290 }, { "epoch": 0.4771363627055475, "grad_norm": 0.2643592655658722, "learning_rate": 0.0003412092714756008, "loss": 4.7118, "step": 23300 }, { "epoch": 0.47734114226035673, "grad_norm": 0.2659567594528198, "learning_rate": 0.0003410786001318481, "loss": 4.7294, "step": 23310 }, { "epoch": 0.477545921815166, "grad_norm": 0.2851400673389435, "learning_rate": 0.0003409478088124554, "loss": 4.723, "step": 23320 }, { "epoch": 0.4777507013699752, "grad_norm": 0.2710046172142029, "learning_rate": 0.0003408168976286498, "loss": 4.7037, "step": 23330 }, { "epoch": 0.47795548092478446, "grad_norm": 0.4545775055885315, "learning_rate": 0.00034068586669176054, "loss": 4.7028, "step": 23340 }, { "epoch": 0.4781602604795937, "grad_norm": 0.2931750416755676, "learning_rate": 0.00034055471611321855, "loss": 4.7206, "step": 23350 }, { "epoch": 0.47836504003440294, "grad_norm": 0.2837902307510376, "learning_rate": 0.00034042344600455633, "loss": 4.6986, "step": 23360 }, { "epoch": 0.4785698195892122, "grad_norm": 0.2740366756916046, "learning_rate": 0.00034029205647740843, "loss": 4.7222, "step": 23370 }, { "epoch": 0.4787745991440215, "grad_norm": 0.271064430475235, "learning_rate": 0.00034016054764351056, "loss": 4.7188, "step": 23380 }, { "epoch": 0.4789793786988307, "grad_norm": 0.26214951276779175, "learning_rate": 0.0003400289196147002, "loss": 4.7356, "step": 23390 }, { "epoch": 0.47918415825363997, "grad_norm": 0.30367031693458557, "learning_rate": 0.000339897172502916, "loss": 4.7153, "step": 23400 }, { "epoch": 0.4793889378084492, "grad_norm": 0.3222086727619171, "learning_rate": 0.00033976530642019793, "loss": 4.7567, "step": 23410 }, { "epoch": 0.47959371736325845, "grad_norm": 0.2993817627429962, "learning_rate": 0.00033963332147868725, "loss": 4.7273, "step": 23420 }, { "epoch": 0.4797984969180677, "grad_norm": 0.2629406154155731, "learning_rate": 0.0003395012177906261, "loss": 4.7384, "step": 23430 }, { "epoch": 0.48000327647287694, "grad_norm": 0.2722770869731903, "learning_rate": 0.00033936899546835764, "loss": 4.6497, "step": 23440 }, { "epoch": 0.4802080560276862, "grad_norm": 0.28254806995391846, "learning_rate": 0.00033923665462432613, "loss": 4.6993, "step": 23450 }, { "epoch": 0.4804128355824954, "grad_norm": 0.25743168592453003, "learning_rate": 0.0003391041953710764, "loss": 4.722, "step": 23460 }, { "epoch": 0.48061761513730467, "grad_norm": 0.2827824354171753, "learning_rate": 0.0003389716178212541, "loss": 4.7193, "step": 23470 }, { "epoch": 0.48082239469211396, "grad_norm": 0.27589377760887146, "learning_rate": 0.0003388389220876053, "loss": 4.7486, "step": 23480 }, { "epoch": 0.4810271742469232, "grad_norm": 0.31261277198791504, "learning_rate": 0.0003387061082829769, "loss": 4.7422, "step": 23490 }, { "epoch": 0.48123195380173245, "grad_norm": 0.28861501812934875, "learning_rate": 0.0003385731765203159, "loss": 4.6575, "step": 23500 }, { "epoch": 0.4814367333565417, "grad_norm": 0.3081093728542328, "learning_rate": 0.0003384401269126697, "loss": 4.7299, "step": 23510 }, { "epoch": 0.48164151291135093, "grad_norm": 0.3001258373260498, "learning_rate": 0.000338306959573186, "loss": 4.7162, "step": 23520 }, { "epoch": 0.4818462924661602, "grad_norm": 0.3464072644710541, "learning_rate": 0.0003381736746151126, "loss": 4.6819, "step": 23530 }, { "epoch": 0.4820510720209694, "grad_norm": 0.2706254720687866, "learning_rate": 0.00033804027215179726, "loss": 4.7218, "step": 23540 }, { "epoch": 0.48225585157577866, "grad_norm": 0.28934282064437866, "learning_rate": 0.0003379067522966877, "loss": 4.715, "step": 23550 }, { "epoch": 0.4824606311305879, "grad_norm": 0.29465875029563904, "learning_rate": 0.0003377731151633315, "loss": 4.6931, "step": 23560 }, { "epoch": 0.48266541068539714, "grad_norm": 0.3002893626689911, "learning_rate": 0.00033763936086537587, "loss": 4.7003, "step": 23570 }, { "epoch": 0.48287019024020644, "grad_norm": 0.28075167536735535, "learning_rate": 0.00033750548951656784, "loss": 4.6993, "step": 23580 }, { "epoch": 0.4830749697950157, "grad_norm": 0.30318784713745117, "learning_rate": 0.0003373715012307538, "loss": 4.7241, "step": 23590 }, { "epoch": 0.4832797493498249, "grad_norm": 0.29993346333503723, "learning_rate": 0.0003372373961218797, "loss": 4.691, "step": 23600 }, { "epoch": 0.48348452890463417, "grad_norm": 0.282902330160141, "learning_rate": 0.00033710317430399084, "loss": 4.727, "step": 23610 }, { "epoch": 0.4836893084594434, "grad_norm": 0.27338647842407227, "learning_rate": 0.0003369688358912317, "loss": 4.7511, "step": 23620 }, { "epoch": 0.48389408801425265, "grad_norm": 0.2635759115219116, "learning_rate": 0.000336834380997846, "loss": 4.6827, "step": 23630 }, { "epoch": 0.4840988675690619, "grad_norm": 0.26382386684417725, "learning_rate": 0.00033669980973817634, "loss": 4.712, "step": 23640 }, { "epoch": 0.48430364712387114, "grad_norm": 0.26698169112205505, "learning_rate": 0.0003365651222266645, "loss": 4.7189, "step": 23650 }, { "epoch": 0.4845084266786804, "grad_norm": 0.2726365029811859, "learning_rate": 0.00033643031857785103, "loss": 4.6874, "step": 23660 }, { "epoch": 0.4847132062334896, "grad_norm": 0.26503750681877136, "learning_rate": 0.0003362953989063753, "loss": 4.6864, "step": 23670 }, { "epoch": 0.4849179857882989, "grad_norm": 0.29617050290107727, "learning_rate": 0.0003361603633269752, "loss": 4.707, "step": 23680 }, { "epoch": 0.48512276534310816, "grad_norm": 0.29559576511383057, "learning_rate": 0.0003360252119544872, "loss": 4.7303, "step": 23690 }, { "epoch": 0.4853275448979174, "grad_norm": 0.27144870162010193, "learning_rate": 0.00033588994490384654, "loss": 4.7226, "step": 23700 }, { "epoch": 0.48553232445272665, "grad_norm": 0.2561703026294708, "learning_rate": 0.00033575456229008647, "loss": 4.7255, "step": 23710 }, { "epoch": 0.4857371040075359, "grad_norm": 0.26624596118927, "learning_rate": 0.00033561906422833874, "loss": 4.7074, "step": 23720 }, { "epoch": 0.48594188356234513, "grad_norm": 0.28305935859680176, "learning_rate": 0.00033548345083383313, "loss": 4.7052, "step": 23730 }, { "epoch": 0.4861466631171544, "grad_norm": 0.2771523594856262, "learning_rate": 0.00033534772222189763, "loss": 4.717, "step": 23740 }, { "epoch": 0.4863514426719636, "grad_norm": 0.2788209915161133, "learning_rate": 0.0003352118785079582, "loss": 4.7351, "step": 23750 }, { "epoch": 0.48655622222677286, "grad_norm": 0.2830262780189514, "learning_rate": 0.00033507591980753857, "loss": 4.6921, "step": 23760 }, { "epoch": 0.4867610017815821, "grad_norm": 0.27070164680480957, "learning_rate": 0.00033493984623626035, "loss": 4.711, "step": 23770 }, { "epoch": 0.4869657813363914, "grad_norm": 0.2940160036087036, "learning_rate": 0.0003348036579098428, "loss": 4.6986, "step": 23780 }, { "epoch": 0.48717056089120064, "grad_norm": 0.2605251669883728, "learning_rate": 0.0003346673549441028, "loss": 4.7118, "step": 23790 }, { "epoch": 0.4873753404460099, "grad_norm": 1.1082611083984375, "learning_rate": 0.00033453093745495474, "loss": 4.7271, "step": 23800 }, { "epoch": 0.48758012000081913, "grad_norm": 0.2787920832633972, "learning_rate": 0.0003343944055584104, "loss": 4.7058, "step": 23810 }, { "epoch": 0.48778489955562837, "grad_norm": 0.26609036326408386, "learning_rate": 0.0003342577593705787, "loss": 4.7453, "step": 23820 }, { "epoch": 0.4879896791104376, "grad_norm": 0.2606726288795471, "learning_rate": 0.000334120999007666, "loss": 4.7264, "step": 23830 }, { "epoch": 0.48819445866524686, "grad_norm": 0.2958138585090637, "learning_rate": 0.00033398412458597555, "loss": 4.7058, "step": 23840 }, { "epoch": 0.4883992382200561, "grad_norm": 0.28008124232292175, "learning_rate": 0.0003338471362219078, "loss": 4.7463, "step": 23850 }, { "epoch": 0.48860401777486534, "grad_norm": 0.3037973642349243, "learning_rate": 0.0003337100340319599, "loss": 4.6901, "step": 23860 }, { "epoch": 0.4888087973296746, "grad_norm": 0.2755992114543915, "learning_rate": 0.0003335728181327258, "loss": 4.7143, "step": 23870 }, { "epoch": 0.4890135768844839, "grad_norm": 0.2757311165332794, "learning_rate": 0.00033343548864089644, "loss": 4.7267, "step": 23880 }, { "epoch": 0.4892183564392931, "grad_norm": 0.2758731544017792, "learning_rate": 0.000333298045673259, "loss": 4.7356, "step": 23890 }, { "epoch": 0.48942313599410237, "grad_norm": 0.2638200521469116, "learning_rate": 0.0003331604893466974, "loss": 4.727, "step": 23900 }, { "epoch": 0.4896279155489116, "grad_norm": 0.2907838225364685, "learning_rate": 0.0003330228197781918, "loss": 4.7076, "step": 23910 }, { "epoch": 0.48983269510372085, "grad_norm": 0.27843689918518066, "learning_rate": 0.00033288503708481877, "loss": 4.722, "step": 23920 }, { "epoch": 0.4900374746585301, "grad_norm": 0.33251845836639404, "learning_rate": 0.0003327471413837511, "loss": 4.7263, "step": 23930 }, { "epoch": 0.49024225421333933, "grad_norm": 0.2933971583843231, "learning_rate": 0.0003326091327922576, "loss": 4.698, "step": 23940 }, { "epoch": 0.4904470337681486, "grad_norm": 0.29155203700065613, "learning_rate": 0.0003324710114277031, "loss": 4.7129, "step": 23950 }, { "epoch": 0.4906518133229578, "grad_norm": 0.2822931408882141, "learning_rate": 0.00033233277740754845, "loss": 4.7123, "step": 23960 }, { "epoch": 0.49085659287776706, "grad_norm": 0.2747707664966583, "learning_rate": 0.00033219443084935, "loss": 4.6819, "step": 23970 }, { "epoch": 0.4910613724325763, "grad_norm": 0.2611922323703766, "learning_rate": 0.00033205597187076035, "loss": 4.7257, "step": 23980 }, { "epoch": 0.4912661519873856, "grad_norm": 0.28688105940818787, "learning_rate": 0.0003319174005895271, "loss": 4.7097, "step": 23990 }, { "epoch": 0.49147093154219484, "grad_norm": 0.28427520394325256, "learning_rate": 0.0003317787171234937, "loss": 4.745, "step": 24000 }, { "epoch": 0.49147093154219484, "eval_loss": 4.720487117767334, "eval_runtime": 4.3928, "eval_samples_per_second": 265.433, "eval_steps_per_second": 33.236, "step": 24000 }, { "epoch": 0.4916757110970041, "grad_norm": 0.2942088544368744, "learning_rate": 0.00033163992159059887, "loss": 4.724, "step": 24010 }, { "epoch": 0.49188049065181333, "grad_norm": 0.3350241184234619, "learning_rate": 0.0003315010141088768, "loss": 4.7125, "step": 24020 }, { "epoch": 0.49208527020662257, "grad_norm": 0.2688390016555786, "learning_rate": 0.0003313619947964568, "loss": 4.6787, "step": 24030 }, { "epoch": 0.4922900497614318, "grad_norm": 0.39939308166503906, "learning_rate": 0.00033122286377156306, "loss": 4.7109, "step": 24040 }, { "epoch": 0.49249482931624106, "grad_norm": 0.260379433631897, "learning_rate": 0.0003310836211525152, "loss": 4.7032, "step": 24050 }, { "epoch": 0.4926996088710503, "grad_norm": 0.28268808126449585, "learning_rate": 0.00033094426705772734, "loss": 4.7005, "step": 24060 }, { "epoch": 0.49290438842585954, "grad_norm": 0.2926914393901825, "learning_rate": 0.00033080480160570865, "loss": 4.697, "step": 24070 }, { "epoch": 0.4931091679806688, "grad_norm": 0.3147825002670288, "learning_rate": 0.00033066522491506287, "loss": 4.6947, "step": 24080 }, { "epoch": 0.4933139475354781, "grad_norm": 0.27262210845947266, "learning_rate": 0.00033052553710448846, "loss": 4.7289, "step": 24090 }, { "epoch": 0.4935187270902873, "grad_norm": 0.2598920464515686, "learning_rate": 0.0003303857382927783, "loss": 4.7219, "step": 24100 }, { "epoch": 0.49372350664509657, "grad_norm": 0.3277185261249542, "learning_rate": 0.00033024582859881956, "loss": 4.7224, "step": 24110 }, { "epoch": 0.4939282861999058, "grad_norm": 0.29700154066085815, "learning_rate": 0.000330105808141594, "loss": 4.7177, "step": 24120 }, { "epoch": 0.49413306575471505, "grad_norm": 0.2686825096607208, "learning_rate": 0.0003299656770401772, "loss": 4.7232, "step": 24130 }, { "epoch": 0.4943378453095243, "grad_norm": 0.2853672504425049, "learning_rate": 0.00032982543541373925, "loss": 4.6688, "step": 24140 }, { "epoch": 0.49454262486433354, "grad_norm": 0.30086320638656616, "learning_rate": 0.00032968508338154383, "loss": 4.7038, "step": 24150 }, { "epoch": 0.4947474044191428, "grad_norm": 0.32596907019615173, "learning_rate": 0.0003295446210629488, "loss": 4.7365, "step": 24160 }, { "epoch": 0.494952183973952, "grad_norm": 0.266591876745224, "learning_rate": 0.0003294040485774057, "loss": 4.7252, "step": 24170 }, { "epoch": 0.49515696352876126, "grad_norm": 0.42513614892959595, "learning_rate": 0.0003292633660444597, "loss": 4.7194, "step": 24180 }, { "epoch": 0.49536174308357056, "grad_norm": 0.25697004795074463, "learning_rate": 0.00032912257358374967, "loss": 4.7353, "step": 24190 }, { "epoch": 0.4955665226383798, "grad_norm": 0.2750478684902191, "learning_rate": 0.00032898167131500787, "loss": 4.706, "step": 24200 }, { "epoch": 0.49577130219318905, "grad_norm": 0.2928435504436493, "learning_rate": 0.00032884065935806, "loss": 4.7039, "step": 24210 }, { "epoch": 0.4959760817479983, "grad_norm": 0.26540809869766235, "learning_rate": 0.0003286995378328251, "loss": 4.7086, "step": 24220 }, { "epoch": 0.49618086130280753, "grad_norm": 0.27740904688835144, "learning_rate": 0.00032855830685931515, "loss": 4.7378, "step": 24230 }, { "epoch": 0.4963856408576168, "grad_norm": 0.27485188841819763, "learning_rate": 0.0003284169665576355, "loss": 4.7151, "step": 24240 }, { "epoch": 0.496590420412426, "grad_norm": 0.37828922271728516, "learning_rate": 0.00032827551704798434, "loss": 4.6945, "step": 24250 }, { "epoch": 0.49679519996723526, "grad_norm": 0.28058096766471863, "learning_rate": 0.0003281339584506527, "loss": 4.721, "step": 24260 }, { "epoch": 0.4969999795220445, "grad_norm": 0.25854894518852234, "learning_rate": 0.0003279922908860244, "loss": 4.6989, "step": 24270 }, { "epoch": 0.49720475907685374, "grad_norm": 0.26653802394866943, "learning_rate": 0.00032785051447457603, "loss": 4.7013, "step": 24280 }, { "epoch": 0.49740953863166304, "grad_norm": 0.28719645738601685, "learning_rate": 0.00032770862933687657, "loss": 4.7329, "step": 24290 }, { "epoch": 0.4976143181864723, "grad_norm": 0.28432992100715637, "learning_rate": 0.00032756663559358764, "loss": 4.6853, "step": 24300 }, { "epoch": 0.4978190977412815, "grad_norm": 0.3140254318714142, "learning_rate": 0.000327424533365463, "loss": 4.7235, "step": 24310 }, { "epoch": 0.49802387729609077, "grad_norm": 0.28151458501815796, "learning_rate": 0.0003272823227733489, "loss": 4.6886, "step": 24320 }, { "epoch": 0.4982286568509, "grad_norm": 0.2725624740123749, "learning_rate": 0.00032714000393818367, "loss": 4.7217, "step": 24330 }, { "epoch": 0.49843343640570925, "grad_norm": 0.33774906396865845, "learning_rate": 0.0003269975769809976, "loss": 4.719, "step": 24340 }, { "epoch": 0.4986382159605185, "grad_norm": 0.28247374296188354, "learning_rate": 0.000326855042022913, "loss": 4.645, "step": 24350 }, { "epoch": 0.49884299551532774, "grad_norm": 0.2772592306137085, "learning_rate": 0.0003267123991851441, "loss": 4.6848, "step": 24360 }, { "epoch": 0.499047775070137, "grad_norm": 0.291071355342865, "learning_rate": 0.00032656964858899666, "loss": 4.7023, "step": 24370 }, { "epoch": 0.4992525546249462, "grad_norm": 0.2632277309894562, "learning_rate": 0.00032642679035586837, "loss": 4.7196, "step": 24380 }, { "epoch": 0.4994573341797555, "grad_norm": 0.27937376499176025, "learning_rate": 0.0003262838246072481, "loss": 4.722, "step": 24390 }, { "epoch": 0.49966211373456476, "grad_norm": 0.27597543597221375, "learning_rate": 0.0003261407514647166, "loss": 4.698, "step": 24400 }, { "epoch": 0.499866893289374, "grad_norm": 0.2707703709602356, "learning_rate": 0.00032599757104994547, "loss": 4.7336, "step": 24410 }, { "epoch": 0.5000716728441832, "grad_norm": 0.2789210081100464, "learning_rate": 0.00032585428348469794, "loss": 4.6859, "step": 24420 }, { "epoch": 0.5002764523989924, "grad_norm": 0.32933616638183594, "learning_rate": 0.00032571088889082813, "loss": 4.6719, "step": 24430 }, { "epoch": 0.5004812319538018, "grad_norm": 0.28977784514427185, "learning_rate": 0.0003255673873902812, "loss": 4.6751, "step": 24440 }, { "epoch": 0.500686011508611, "grad_norm": 0.30091533064842224, "learning_rate": 0.00032542377910509335, "loss": 4.6725, "step": 24450 }, { "epoch": 0.5008907910634203, "grad_norm": 0.29741963744163513, "learning_rate": 0.0003252800641573914, "loss": 4.7524, "step": 24460 }, { "epoch": 0.5010955706182295, "grad_norm": 0.2790796458721161, "learning_rate": 0.0003251362426693931, "loss": 4.6917, "step": 24470 }, { "epoch": 0.5013003501730388, "grad_norm": 0.2658741772174835, "learning_rate": 0.00032499231476340666, "loss": 4.6869, "step": 24480 }, { "epoch": 0.501505129727848, "grad_norm": 0.28098657727241516, "learning_rate": 0.0003248482805618307, "loss": 4.7246, "step": 24490 }, { "epoch": 0.5017099092826572, "grad_norm": 0.28023362159729004, "learning_rate": 0.00032470414018715446, "loss": 4.7324, "step": 24500 }, { "epoch": 0.5019146888374665, "grad_norm": 0.2921772599220276, "learning_rate": 0.00032455989376195744, "loss": 4.7123, "step": 24510 }, { "epoch": 0.5021194683922757, "grad_norm": 0.2592458724975586, "learning_rate": 0.000324415541408909, "loss": 4.71, "step": 24520 }, { "epoch": 0.502324247947085, "grad_norm": 0.2661096751689911, "learning_rate": 0.0003242710832507692, "loss": 4.7153, "step": 24530 }, { "epoch": 0.5025290275018942, "grad_norm": 0.2882750928401947, "learning_rate": 0.00032412651941038736, "loss": 4.7062, "step": 24540 }, { "epoch": 0.5027338070567035, "grad_norm": 0.29600468277931213, "learning_rate": 0.0003239818500107032, "loss": 4.6724, "step": 24550 }, { "epoch": 0.5029385866115127, "grad_norm": 0.25009870529174805, "learning_rate": 0.00032383707517474604, "loss": 4.7344, "step": 24560 }, { "epoch": 0.5031433661663219, "grad_norm": 0.2783048748970032, "learning_rate": 0.0003236921950256348, "loss": 4.7215, "step": 24570 }, { "epoch": 0.5033481457211312, "grad_norm": 0.27534037828445435, "learning_rate": 0.00032354720968657814, "loss": 4.7264, "step": 24580 }, { "epoch": 0.5035529252759404, "grad_norm": 0.2851855754852295, "learning_rate": 0.0003234021192808739, "loss": 4.7154, "step": 24590 }, { "epoch": 0.5037577048307497, "grad_norm": 0.2746172249317169, "learning_rate": 0.0003232569239319095, "loss": 4.7098, "step": 24600 }, { "epoch": 0.5039624843855589, "grad_norm": 0.27588728070259094, "learning_rate": 0.0003231116237631616, "loss": 4.7102, "step": 24610 }, { "epoch": 0.5041672639403681, "grad_norm": 0.3069898784160614, "learning_rate": 0.00032296621889819585, "loss": 4.6668, "step": 24620 }, { "epoch": 0.5043720434951774, "grad_norm": 0.28509026765823364, "learning_rate": 0.00032282070946066706, "loss": 4.7222, "step": 24630 }, { "epoch": 0.5045768230499867, "grad_norm": 0.25804194808006287, "learning_rate": 0.00032267509557431883, "loss": 4.6905, "step": 24640 }, { "epoch": 0.504781602604796, "grad_norm": 0.27046477794647217, "learning_rate": 0.00032252937736298385, "loss": 4.6901, "step": 24650 }, { "epoch": 0.5049863821596052, "grad_norm": 0.2739366888999939, "learning_rate": 0.00032238355495058324, "loss": 4.7051, "step": 24660 }, { "epoch": 0.5051911617144145, "grad_norm": 0.2906496226787567, "learning_rate": 0.00032223762846112685, "loss": 4.7007, "step": 24670 }, { "epoch": 0.5053959412692237, "grad_norm": 0.2622486352920532, "learning_rate": 0.0003220915980187132, "loss": 4.7171, "step": 24680 }, { "epoch": 0.505600720824033, "grad_norm": 0.30467307567596436, "learning_rate": 0.00032194546374752884, "loss": 4.6864, "step": 24690 }, { "epoch": 0.5058055003788422, "grad_norm": 0.2761250436306, "learning_rate": 0.000321799225771849, "loss": 4.708, "step": 24700 }, { "epoch": 0.5060102799336514, "grad_norm": 0.2742674946784973, "learning_rate": 0.0003216528842160369, "loss": 4.6926, "step": 24710 }, { "epoch": 0.5062150594884607, "grad_norm": 0.3191431760787964, "learning_rate": 0.0003215064392045439, "loss": 4.7402, "step": 24720 }, { "epoch": 0.5064198390432699, "grad_norm": 0.27406784892082214, "learning_rate": 0.00032135989086190926, "loss": 4.7149, "step": 24730 }, { "epoch": 0.5066246185980792, "grad_norm": 0.29739177227020264, "learning_rate": 0.00032121323931276024, "loss": 4.6829, "step": 24740 }, { "epoch": 0.5068293981528884, "grad_norm": 0.2993490993976593, "learning_rate": 0.00032106648468181183, "loss": 4.7107, "step": 24750 }, { "epoch": 0.5070341777076977, "grad_norm": 0.2782713770866394, "learning_rate": 0.00032091962709386666, "loss": 4.694, "step": 24760 }, { "epoch": 0.5072389572625069, "grad_norm": 0.2802233099937439, "learning_rate": 0.0003207726666738148, "loss": 4.6925, "step": 24770 }, { "epoch": 0.5074437368173161, "grad_norm": 0.27374786138534546, "learning_rate": 0.0003206256035466341, "loss": 4.7394, "step": 24780 }, { "epoch": 0.5076485163721254, "grad_norm": 0.2811621129512787, "learning_rate": 0.00032047843783738935, "loss": 4.7339, "step": 24790 }, { "epoch": 0.5078532959269346, "grad_norm": 0.28678399324417114, "learning_rate": 0.0003203311696712328, "loss": 4.7298, "step": 24800 }, { "epoch": 0.5080580754817439, "grad_norm": 0.26202142238616943, "learning_rate": 0.0003201837991734039, "loss": 4.7034, "step": 24810 }, { "epoch": 0.5082628550365531, "grad_norm": 0.2799505889415741, "learning_rate": 0.0003200363264692291, "loss": 4.7083, "step": 24820 }, { "epoch": 0.5084676345913623, "grad_norm": 0.29472848773002625, "learning_rate": 0.0003198887516841214, "loss": 4.7215, "step": 24830 }, { "epoch": 0.5086724141461716, "grad_norm": 0.2675095796585083, "learning_rate": 0.0003197410749435811, "loss": 4.715, "step": 24840 }, { "epoch": 0.508877193700981, "grad_norm": 0.2702561318874359, "learning_rate": 0.00031959329637319504, "loss": 4.6935, "step": 24850 }, { "epoch": 0.5090819732557902, "grad_norm": 0.30072882771492004, "learning_rate": 0.00031944541609863663, "loss": 4.7057, "step": 24860 }, { "epoch": 0.5092867528105994, "grad_norm": 0.2939676344394684, "learning_rate": 0.0003192974342456656, "loss": 4.7177, "step": 24870 }, { "epoch": 0.5094915323654087, "grad_norm": 0.2770099639892578, "learning_rate": 0.0003191493509401284, "loss": 4.7156, "step": 24880 }, { "epoch": 0.5096963119202179, "grad_norm": 0.2761791944503784, "learning_rate": 0.00031900116630795755, "loss": 4.716, "step": 24890 }, { "epoch": 0.5099010914750272, "grad_norm": 0.2724962532520294, "learning_rate": 0.0003188528804751718, "loss": 4.6724, "step": 24900 }, { "epoch": 0.5101058710298364, "grad_norm": 0.26774343848228455, "learning_rate": 0.0003187044935678758, "loss": 4.6928, "step": 24910 }, { "epoch": 0.5103106505846456, "grad_norm": 0.3066468834877014, "learning_rate": 0.00031855600571226053, "loss": 4.7164, "step": 24920 }, { "epoch": 0.5105154301394549, "grad_norm": 0.26715534925460815, "learning_rate": 0.00031840741703460237, "loss": 4.6918, "step": 24930 }, { "epoch": 0.5107202096942641, "grad_norm": 0.2691773772239685, "learning_rate": 0.00031825872766126385, "loss": 4.6799, "step": 24940 }, { "epoch": 0.5109249892490734, "grad_norm": 0.3044761121273041, "learning_rate": 0.00031810993771869275, "loss": 4.7003, "step": 24950 }, { "epoch": 0.5111297688038826, "grad_norm": 0.2579299807548523, "learning_rate": 0.0003179610473334228, "loss": 4.7094, "step": 24960 }, { "epoch": 0.5113345483586919, "grad_norm": 0.27381426095962524, "learning_rate": 0.0003178120566320727, "loss": 4.73, "step": 24970 }, { "epoch": 0.5115393279135011, "grad_norm": 0.2825131118297577, "learning_rate": 0.0003176629657413467, "loss": 4.6995, "step": 24980 }, { "epoch": 0.5117441074683103, "grad_norm": 0.29649683833122253, "learning_rate": 0.0003175137747880344, "loss": 4.6955, "step": 24990 }, { "epoch": 0.5119488870231196, "grad_norm": 0.2876654267311096, "learning_rate": 0.0003173644838990101, "loss": 4.6683, "step": 25000 }, { "epoch": 0.5121536665779288, "grad_norm": 0.3204991817474365, "learning_rate": 0.0003172150932012335, "loss": 4.6938, "step": 25010 }, { "epoch": 0.5123584461327381, "grad_norm": 0.2765672206878662, "learning_rate": 0.00031706560282174885, "loss": 4.7108, "step": 25020 }, { "epoch": 0.5125632256875473, "grad_norm": 0.358657568693161, "learning_rate": 0.0003169160128876854, "loss": 4.7241, "step": 25030 }, { "epoch": 0.5127680052423566, "grad_norm": 0.30940568447113037, "learning_rate": 0.0003167663235262569, "loss": 4.6864, "step": 25040 }, { "epoch": 0.5129727847971659, "grad_norm": 0.28657031059265137, "learning_rate": 0.0003166165348647618, "loss": 4.6723, "step": 25050 }, { "epoch": 0.5131775643519751, "grad_norm": 0.2804965674877167, "learning_rate": 0.00031646664703058287, "loss": 4.6655, "step": 25060 }, { "epoch": 0.5133823439067844, "grad_norm": 0.29186683893203735, "learning_rate": 0.00031631666015118727, "loss": 4.6773, "step": 25070 }, { "epoch": 0.5135871234615936, "grad_norm": 0.3029853403568268, "learning_rate": 0.00031616657435412643, "loss": 4.7079, "step": 25080 }, { "epoch": 0.5137919030164029, "grad_norm": 0.273480087518692, "learning_rate": 0.0003160163897670359, "loss": 4.6923, "step": 25090 }, { "epoch": 0.5139966825712121, "grad_norm": 0.2673560678958893, "learning_rate": 0.0003158661065176351, "loss": 4.7063, "step": 25100 }, { "epoch": 0.5142014621260214, "grad_norm": 0.29640883207321167, "learning_rate": 0.0003157157247337276, "loss": 4.6679, "step": 25110 }, { "epoch": 0.5144062416808306, "grad_norm": 0.2878381907939911, "learning_rate": 0.00031556524454320043, "loss": 4.6889, "step": 25120 }, { "epoch": 0.5146110212356398, "grad_norm": 0.2771507799625397, "learning_rate": 0.00031541466607402463, "loss": 4.7019, "step": 25130 }, { "epoch": 0.5148158007904491, "grad_norm": 0.28070971369743347, "learning_rate": 0.00031526398945425473, "loss": 4.7209, "step": 25140 }, { "epoch": 0.5150205803452583, "grad_norm": 0.34403181076049805, "learning_rate": 0.0003151132148120286, "loss": 4.6986, "step": 25150 }, { "epoch": 0.5152253599000676, "grad_norm": 0.31226861476898193, "learning_rate": 0.0003149623422755676, "loss": 4.6746, "step": 25160 }, { "epoch": 0.5154301394548768, "grad_norm": 0.48295944929122925, "learning_rate": 0.0003148113719731763, "loss": 4.7339, "step": 25170 }, { "epoch": 0.515634919009686, "grad_norm": 0.29848966002464294, "learning_rate": 0.00031466030403324235, "loss": 4.7034, "step": 25180 }, { "epoch": 0.5158396985644953, "grad_norm": 0.2727344036102295, "learning_rate": 0.0003145091385842366, "loss": 4.7082, "step": 25190 }, { "epoch": 0.5160444781193045, "grad_norm": 0.2666777968406677, "learning_rate": 0.00031435787575471264, "loss": 4.6865, "step": 25200 }, { "epoch": 0.5162492576741138, "grad_norm": 0.2528291940689087, "learning_rate": 0.000314206515673307, "loss": 4.7122, "step": 25210 }, { "epoch": 0.516454037228923, "grad_norm": 0.27205219864845276, "learning_rate": 0.0003140550584687388, "loss": 4.6817, "step": 25220 }, { "epoch": 0.5166588167837323, "grad_norm": 0.283069908618927, "learning_rate": 0.00031390350426980986, "loss": 4.6998, "step": 25230 }, { "epoch": 0.5168635963385415, "grad_norm": 0.2868604063987732, "learning_rate": 0.0003137518532054045, "loss": 4.7136, "step": 25240 }, { "epoch": 0.5170683758933509, "grad_norm": 0.2935883402824402, "learning_rate": 0.0003136001054044893, "loss": 4.7054, "step": 25250 }, { "epoch": 0.5172731554481601, "grad_norm": 0.28250062465667725, "learning_rate": 0.00031344826099611315, "loss": 4.7147, "step": 25260 }, { "epoch": 0.5174779350029693, "grad_norm": 0.2601027488708496, "learning_rate": 0.0003132963201094072, "loss": 4.7112, "step": 25270 }, { "epoch": 0.5176827145577786, "grad_norm": 0.2703922986984253, "learning_rate": 0.00031314428287358454, "loss": 4.6906, "step": 25280 }, { "epoch": 0.5178874941125878, "grad_norm": 0.2875763475894928, "learning_rate": 0.0003129921494179401, "loss": 4.6944, "step": 25290 }, { "epoch": 0.5180922736673971, "grad_norm": 0.3172433078289032, "learning_rate": 0.0003128399198718509, "loss": 4.6961, "step": 25300 }, { "epoch": 0.5182970532222063, "grad_norm": 0.28155481815338135, "learning_rate": 0.0003126875943647755, "loss": 4.7292, "step": 25310 }, { "epoch": 0.5185018327770156, "grad_norm": 0.30049318075180054, "learning_rate": 0.0003125351730262541, "loss": 4.7018, "step": 25320 }, { "epoch": 0.5187066123318248, "grad_norm": 0.2589489221572876, "learning_rate": 0.0003123826559859083, "loss": 4.6716, "step": 25330 }, { "epoch": 0.518911391886634, "grad_norm": 0.28580641746520996, "learning_rate": 0.0003122300433734413, "loss": 4.7012, "step": 25340 }, { "epoch": 0.5191161714414433, "grad_norm": 0.2864598035812378, "learning_rate": 0.0003120773353186375, "loss": 4.7456, "step": 25350 }, { "epoch": 0.5193209509962525, "grad_norm": 0.2721398174762726, "learning_rate": 0.0003119245319513623, "loss": 4.6883, "step": 25360 }, { "epoch": 0.5195257305510618, "grad_norm": 0.28199484944343567, "learning_rate": 0.0003117716334015624, "loss": 4.7168, "step": 25370 }, { "epoch": 0.519730510105871, "grad_norm": 0.2768019437789917, "learning_rate": 0.0003116186397992652, "loss": 4.7534, "step": 25380 }, { "epoch": 0.5199352896606803, "grad_norm": 0.27264705300331116, "learning_rate": 0.0003114655512745791, "loss": 4.6925, "step": 25390 }, { "epoch": 0.5201400692154895, "grad_norm": 0.2672605514526367, "learning_rate": 0.00031131236795769324, "loss": 4.6965, "step": 25400 }, { "epoch": 0.5203448487702987, "grad_norm": 0.29574844241142273, "learning_rate": 0.00031115908997887725, "loss": 4.6935, "step": 25410 }, { "epoch": 0.520549628325108, "grad_norm": 0.2922261655330658, "learning_rate": 0.00031100571746848143, "loss": 4.7158, "step": 25420 }, { "epoch": 0.5207544078799172, "grad_norm": 0.26851898431777954, "learning_rate": 0.0003108522505569362, "loss": 4.7006, "step": 25430 }, { "epoch": 0.5209591874347265, "grad_norm": 0.33217042684555054, "learning_rate": 0.0003106986893747526, "loss": 4.6862, "step": 25440 }, { "epoch": 0.5211639669895357, "grad_norm": 0.31240522861480713, "learning_rate": 0.00031054503405252167, "loss": 4.693, "step": 25450 }, { "epoch": 0.5213687465443451, "grad_norm": 0.2913259267807007, "learning_rate": 0.0003103912847209143, "loss": 4.6595, "step": 25460 }, { "epoch": 0.5215735260991543, "grad_norm": 0.2755216658115387, "learning_rate": 0.0003102374415106818, "loss": 4.7245, "step": 25470 }, { "epoch": 0.5217783056539635, "grad_norm": 0.30526289343833923, "learning_rate": 0.0003100835045526548, "loss": 4.6517, "step": 25480 }, { "epoch": 0.5219830852087728, "grad_norm": 0.27394744753837585, "learning_rate": 0.0003099294739777441, "loss": 4.7576, "step": 25490 }, { "epoch": 0.522187864763582, "grad_norm": 0.29140064120292664, "learning_rate": 0.00030977534991693986, "loss": 4.7065, "step": 25500 }, { "epoch": 0.5223926443183913, "grad_norm": 0.27797731757164, "learning_rate": 0.00030962113250131184, "loss": 4.6969, "step": 25510 }, { "epoch": 0.5225974238732005, "grad_norm": 0.2931949198246002, "learning_rate": 0.00030946682186200905, "loss": 4.7124, "step": 25520 }, { "epoch": 0.5228022034280098, "grad_norm": 0.2820892035961151, "learning_rate": 0.0003093124181302599, "loss": 4.7162, "step": 25530 }, { "epoch": 0.523006982982819, "grad_norm": 0.2782002389431, "learning_rate": 0.00030915792143737205, "loss": 4.7207, "step": 25540 }, { "epoch": 0.5232117625376282, "grad_norm": 0.31584373116493225, "learning_rate": 0.00030900333191473205, "loss": 4.7256, "step": 25550 }, { "epoch": 0.5234165420924375, "grad_norm": 0.28503715991973877, "learning_rate": 0.0003088486496938054, "loss": 4.6784, "step": 25560 }, { "epoch": 0.5236213216472467, "grad_norm": 0.32100310921669006, "learning_rate": 0.0003086938749061366, "loss": 4.6991, "step": 25570 }, { "epoch": 0.523826101202056, "grad_norm": 0.25826606154441833, "learning_rate": 0.0003085390076833487, "loss": 4.6634, "step": 25580 }, { "epoch": 0.5240308807568652, "grad_norm": 0.2814335823059082, "learning_rate": 0.00030838404815714345, "loss": 4.7095, "step": 25590 }, { "epoch": 0.5242356603116745, "grad_norm": 0.2817402780056, "learning_rate": 0.00030822899645930105, "loss": 4.6766, "step": 25600 }, { "epoch": 0.5244404398664837, "grad_norm": 0.2802383005619049, "learning_rate": 0.00030807385272168, "loss": 4.6997, "step": 25610 }, { "epoch": 0.5246452194212929, "grad_norm": 0.3037165701389313, "learning_rate": 0.00030791861707621734, "loss": 4.7262, "step": 25620 }, { "epoch": 0.5248499989761022, "grad_norm": 0.3113444447517395, "learning_rate": 0.00030776328965492804, "loss": 4.6834, "step": 25630 }, { "epoch": 0.5250547785309114, "grad_norm": 0.3027448356151581, "learning_rate": 0.0003076078705899051, "loss": 4.6792, "step": 25640 }, { "epoch": 0.5252595580857207, "grad_norm": 0.2720443606376648, "learning_rate": 0.0003074523600133197, "loss": 4.6895, "step": 25650 }, { "epoch": 0.52546433764053, "grad_norm": 0.30578285455703735, "learning_rate": 0.0003072967580574206, "loss": 4.6978, "step": 25660 }, { "epoch": 0.5256691171953393, "grad_norm": 0.2918396294116974, "learning_rate": 0.0003071410648545342, "loss": 4.6637, "step": 25670 }, { "epoch": 0.5258738967501485, "grad_norm": 0.2783021330833435, "learning_rate": 0.00030698528053706484, "loss": 4.6948, "step": 25680 }, { "epoch": 0.5260786763049577, "grad_norm": 0.2891443371772766, "learning_rate": 0.00030682940523749406, "loss": 4.7025, "step": 25690 }, { "epoch": 0.526283455859767, "grad_norm": 0.2706538736820221, "learning_rate": 0.00030667343908838103, "loss": 4.6635, "step": 25700 }, { "epoch": 0.5264882354145762, "grad_norm": 0.2786348760128021, "learning_rate": 0.0003065173822223617, "loss": 4.7087, "step": 25710 }, { "epoch": 0.5266930149693855, "grad_norm": 0.2681735157966614, "learning_rate": 0.0003063612347721497, "loss": 4.7101, "step": 25720 }, { "epoch": 0.5268977945241947, "grad_norm": 0.3078657388687134, "learning_rate": 0.0003062049968705355, "loss": 4.6899, "step": 25730 }, { "epoch": 0.527102574079004, "grad_norm": 0.2678840160369873, "learning_rate": 0.00030604866865038637, "loss": 4.6822, "step": 25740 }, { "epoch": 0.5273073536338132, "grad_norm": 0.30802249908447266, "learning_rate": 0.00030589225024464656, "loss": 4.6478, "step": 25750 }, { "epoch": 0.5275121331886224, "grad_norm": 0.2710878849029541, "learning_rate": 0.0003057357417863368, "loss": 4.6963, "step": 25760 }, { "epoch": 0.5277169127434317, "grad_norm": 0.2686658501625061, "learning_rate": 0.0003055791434085547, "loss": 4.7031, "step": 25770 }, { "epoch": 0.5279216922982409, "grad_norm": 0.2729041576385498, "learning_rate": 0.0003054224552444742, "loss": 4.7009, "step": 25780 }, { "epoch": 0.5281264718530502, "grad_norm": 0.27089405059814453, "learning_rate": 0.00030526567742734543, "loss": 4.6705, "step": 25790 }, { "epoch": 0.5283312514078594, "grad_norm": 0.2662869095802307, "learning_rate": 0.000305108810090495, "loss": 4.6951, "step": 25800 }, { "epoch": 0.5285360309626687, "grad_norm": 0.27419552206993103, "learning_rate": 0.00030495185336732556, "loss": 4.6835, "step": 25810 }, { "epoch": 0.5287408105174779, "grad_norm": 0.2641861140727997, "learning_rate": 0.00030479480739131575, "loss": 4.6933, "step": 25820 }, { "epoch": 0.5289455900722871, "grad_norm": 0.29191505908966064, "learning_rate": 0.0003046376722960201, "loss": 4.69, "step": 25830 }, { "epoch": 0.5291503696270964, "grad_norm": 0.2895081043243408, "learning_rate": 0.0003044804482150691, "loss": 4.6974, "step": 25840 }, { "epoch": 0.5293551491819056, "grad_norm": 0.28528445959091187, "learning_rate": 0.00030432313528216867, "loss": 4.7084, "step": 25850 }, { "epoch": 0.529559928736715, "grad_norm": 0.2768077850341797, "learning_rate": 0.0003041657336311004, "loss": 4.6845, "step": 25860 }, { "epoch": 0.5297647082915242, "grad_norm": 0.33595651388168335, "learning_rate": 0.0003040082433957214, "loss": 4.6987, "step": 25870 }, { "epoch": 0.5299694878463335, "grad_norm": 0.27888068556785583, "learning_rate": 0.00030385066470996403, "loss": 4.6871, "step": 25880 }, { "epoch": 0.5301742674011427, "grad_norm": 0.270297110080719, "learning_rate": 0.00030369299770783583, "loss": 4.689, "step": 25890 }, { "epoch": 0.530379046955952, "grad_norm": 0.27477559447288513, "learning_rate": 0.0003035352425234195, "loss": 4.7047, "step": 25900 }, { "epoch": 0.5305838265107612, "grad_norm": 0.3273790776729584, "learning_rate": 0.0003033773992908728, "loss": 4.7141, "step": 25910 }, { "epoch": 0.5307886060655704, "grad_norm": 0.31001225113868713, "learning_rate": 0.0003032194681444282, "loss": 4.7561, "step": 25920 }, { "epoch": 0.5309933856203797, "grad_norm": 0.2798680365085602, "learning_rate": 0.0003030614492183931, "loss": 4.7198, "step": 25930 }, { "epoch": 0.5311981651751889, "grad_norm": 0.27496957778930664, "learning_rate": 0.00030290334264714957, "loss": 4.7175, "step": 25940 }, { "epoch": 0.5314029447299982, "grad_norm": 0.2571429908275604, "learning_rate": 0.000302745148565154, "loss": 4.7221, "step": 25950 }, { "epoch": 0.5316077242848074, "grad_norm": 0.2705236077308655, "learning_rate": 0.0003025868671069373, "loss": 4.6772, "step": 25960 }, { "epoch": 0.5318125038396166, "grad_norm": 0.2997802495956421, "learning_rate": 0.0003024284984071047, "loss": 4.7076, "step": 25970 }, { "epoch": 0.5320172833944259, "grad_norm": 0.3122205436229706, "learning_rate": 0.00030227004260033574, "loss": 4.6439, "step": 25980 }, { "epoch": 0.5322220629492351, "grad_norm": 0.3267391622066498, "learning_rate": 0.00030211149982138397, "loss": 4.7256, "step": 25990 }, { "epoch": 0.5324268425040444, "grad_norm": 0.2765333950519562, "learning_rate": 0.0003019528702050767, "loss": 4.7069, "step": 26000 }, { "epoch": 0.5324268425040444, "eval_loss": 4.6982269287109375, "eval_runtime": 4.2785, "eval_samples_per_second": 272.527, "eval_steps_per_second": 34.124, "step": 26000 }, { "epoch": 0.5326316220588536, "grad_norm": 0.3126332461833954, "learning_rate": 0.00030179415388631545, "loss": 4.6768, "step": 26010 }, { "epoch": 0.5328364016136629, "grad_norm": 0.279086709022522, "learning_rate": 0.00030163535100007506, "loss": 4.7024, "step": 26020 }, { "epoch": 0.5330411811684721, "grad_norm": 0.29606544971466064, "learning_rate": 0.00030147646168140433, "loss": 4.6972, "step": 26030 }, { "epoch": 0.5332459607232813, "grad_norm": 0.2799825072288513, "learning_rate": 0.00030131748606542543, "loss": 4.7069, "step": 26040 }, { "epoch": 0.5334507402780906, "grad_norm": 0.2848356068134308, "learning_rate": 0.00030115842428733394, "loss": 4.6925, "step": 26050 }, { "epoch": 0.5336555198328999, "grad_norm": 0.284119188785553, "learning_rate": 0.00030099927648239864, "loss": 4.6824, "step": 26060 }, { "epoch": 0.5338602993877092, "grad_norm": 0.2721329629421234, "learning_rate": 0.00030084004278596154, "loss": 4.7047, "step": 26070 }, { "epoch": 0.5340650789425184, "grad_norm": 0.29070404171943665, "learning_rate": 0.0003006807233334378, "loss": 4.7209, "step": 26080 }, { "epoch": 0.5342698584973277, "grad_norm": 0.2773502469062805, "learning_rate": 0.00030052131826031523, "loss": 4.693, "step": 26090 }, { "epoch": 0.5344746380521369, "grad_norm": 0.2657264471054077, "learning_rate": 0.00030036182770215467, "loss": 4.7017, "step": 26100 }, { "epoch": 0.5346794176069462, "grad_norm": 0.2874513864517212, "learning_rate": 0.00030020225179458964, "loss": 4.6929, "step": 26110 }, { "epoch": 0.5348841971617554, "grad_norm": 0.2968789041042328, "learning_rate": 0.00030004259067332615, "loss": 4.7381, "step": 26120 }, { "epoch": 0.5350889767165646, "grad_norm": 0.3313628137111664, "learning_rate": 0.0002998828444741428, "loss": 4.6787, "step": 26130 }, { "epoch": 0.5352937562713739, "grad_norm": 0.46929121017456055, "learning_rate": 0.00029972301333289035, "loss": 4.7263, "step": 26140 }, { "epoch": 0.5354985358261831, "grad_norm": 0.27975574135780334, "learning_rate": 0.0002995630973854921, "loss": 4.6692, "step": 26150 }, { "epoch": 0.5357033153809924, "grad_norm": 0.2827834486961365, "learning_rate": 0.0002994030967679431, "loss": 4.685, "step": 26160 }, { "epoch": 0.5359080949358016, "grad_norm": 0.2770453095436096, "learning_rate": 0.00029924301161631076, "loss": 4.6679, "step": 26170 }, { "epoch": 0.5361128744906108, "grad_norm": 0.2641393542289734, "learning_rate": 0.00029908284206673415, "loss": 4.7198, "step": 26180 }, { "epoch": 0.5363176540454201, "grad_norm": 0.2863255441188812, "learning_rate": 0.00029892258825542416, "loss": 4.6987, "step": 26190 }, { "epoch": 0.5365224336002293, "grad_norm": 0.2789994776248932, "learning_rate": 0.0002987622503186633, "loss": 4.722, "step": 26200 }, { "epoch": 0.5367272131550386, "grad_norm": 0.2993677854537964, "learning_rate": 0.00029860182839280575, "loss": 4.6807, "step": 26210 }, { "epoch": 0.5369319927098478, "grad_norm": 0.3469982147216797, "learning_rate": 0.00029844132261427713, "loss": 4.6821, "step": 26220 }, { "epoch": 0.5371367722646571, "grad_norm": 0.2888258397579193, "learning_rate": 0.00029828073311957416, "loss": 4.7409, "step": 26230 }, { "epoch": 0.5373415518194663, "grad_norm": 0.2735867202281952, "learning_rate": 0.0002981200600452649, "loss": 4.6525, "step": 26240 }, { "epoch": 0.5375463313742755, "grad_norm": 0.29521840810775757, "learning_rate": 0.00029795930352798837, "loss": 4.681, "step": 26250 }, { "epoch": 0.5377511109290848, "grad_norm": 0.3799820840358734, "learning_rate": 0.0002977984637044549, "loss": 4.7131, "step": 26260 }, { "epoch": 0.5379558904838941, "grad_norm": 0.30397945642471313, "learning_rate": 0.00029763754071144514, "loss": 4.7276, "step": 26270 }, { "epoch": 0.5381606700387034, "grad_norm": 0.2679298520088196, "learning_rate": 0.0002974765346858109, "loss": 4.6958, "step": 26280 }, { "epoch": 0.5383654495935126, "grad_norm": 0.2828942835330963, "learning_rate": 0.00029731544576447447, "loss": 4.6866, "step": 26290 }, { "epoch": 0.5385702291483219, "grad_norm": 0.29038602113723755, "learning_rate": 0.00029715427408442843, "loss": 4.6415, "step": 26300 }, { "epoch": 0.5387750087031311, "grad_norm": 0.29893389344215393, "learning_rate": 0.000296993019782736, "loss": 4.6895, "step": 26310 }, { "epoch": 0.5389797882579404, "grad_norm": 0.29326602816581726, "learning_rate": 0.00029683168299653065, "loss": 4.6806, "step": 26320 }, { "epoch": 0.5391845678127496, "grad_norm": 0.2700636684894562, "learning_rate": 0.0002966702638630159, "loss": 4.7156, "step": 26330 }, { "epoch": 0.5393893473675588, "grad_norm": 0.28488901257514954, "learning_rate": 0.0002965087625194653, "loss": 4.6573, "step": 26340 }, { "epoch": 0.5395941269223681, "grad_norm": 0.2597222328186035, "learning_rate": 0.00029634717910322224, "loss": 4.6872, "step": 26350 }, { "epoch": 0.5397989064771773, "grad_norm": 0.295816570520401, "learning_rate": 0.0002961855137517002, "loss": 4.6942, "step": 26360 }, { "epoch": 0.5400036860319866, "grad_norm": 0.2972418963909149, "learning_rate": 0.0002960237666023821, "loss": 4.6899, "step": 26370 }, { "epoch": 0.5402084655867958, "grad_norm": 0.3499514162540436, "learning_rate": 0.00029586193779282025, "loss": 4.6558, "step": 26380 }, { "epoch": 0.540413245141605, "grad_norm": 0.28130412101745605, "learning_rate": 0.0002957000274606369, "loss": 4.6368, "step": 26390 }, { "epoch": 0.5406180246964143, "grad_norm": 0.3309668004512787, "learning_rate": 0.0002955380357435232, "loss": 4.7212, "step": 26400 }, { "epoch": 0.5408228042512235, "grad_norm": 0.2834167778491974, "learning_rate": 0.00029537596277923964, "loss": 4.7359, "step": 26410 }, { "epoch": 0.5410275838060328, "grad_norm": 0.2899566888809204, "learning_rate": 0.00029521380870561594, "loss": 4.6559, "step": 26420 }, { "epoch": 0.541232363360842, "grad_norm": 0.2890217900276184, "learning_rate": 0.00029505157366055065, "loss": 4.687, "step": 26430 }, { "epoch": 0.5414371429156513, "grad_norm": 0.2887726128101349, "learning_rate": 0.00029488925778201107, "loss": 4.7041, "step": 26440 }, { "epoch": 0.5416419224704605, "grad_norm": 0.2790029048919678, "learning_rate": 0.0002947268612080335, "loss": 4.7045, "step": 26450 }, { "epoch": 0.5418467020252697, "grad_norm": 0.27893850207328796, "learning_rate": 0.0002945643840767228, "loss": 4.699, "step": 26460 }, { "epoch": 0.5420514815800791, "grad_norm": 0.28486934304237366, "learning_rate": 0.00029440182652625224, "loss": 4.6918, "step": 26470 }, { "epoch": 0.5422562611348883, "grad_norm": 0.2898862957954407, "learning_rate": 0.0002942391886948635, "loss": 4.6883, "step": 26480 }, { "epoch": 0.5424610406896976, "grad_norm": 0.30720099806785583, "learning_rate": 0.00029407647072086647, "loss": 4.6698, "step": 26490 }, { "epoch": 0.5426658202445068, "grad_norm": 0.31864264607429504, "learning_rate": 0.00029391367274263946, "loss": 4.7138, "step": 26500 }, { "epoch": 0.5428705997993161, "grad_norm": 0.2910747528076172, "learning_rate": 0.00029375079489862854, "loss": 4.7209, "step": 26510 }, { "epoch": 0.5430753793541253, "grad_norm": 0.35545462369918823, "learning_rate": 0.0002935878373273478, "loss": 4.7109, "step": 26520 }, { "epoch": 0.5432801589089346, "grad_norm": 0.31063807010650635, "learning_rate": 0.00029342480016737916, "loss": 4.7099, "step": 26530 }, { "epoch": 0.5434849384637438, "grad_norm": 0.31367921829223633, "learning_rate": 0.0002932616835573722, "loss": 4.6925, "step": 26540 }, { "epoch": 0.543689718018553, "grad_norm": 0.28063562512397766, "learning_rate": 0.00029309848763604395, "loss": 4.6912, "step": 26550 }, { "epoch": 0.5438944975733623, "grad_norm": 0.27479425072669983, "learning_rate": 0.00029293521254217914, "loss": 4.6824, "step": 26560 }, { "epoch": 0.5440992771281715, "grad_norm": 0.2851077914237976, "learning_rate": 0.0002927718584146296, "loss": 4.6928, "step": 26570 }, { "epoch": 0.5443040566829808, "grad_norm": 0.27119210362434387, "learning_rate": 0.0002926084253923145, "loss": 4.7131, "step": 26580 }, { "epoch": 0.54450883623779, "grad_norm": 0.28119131922721863, "learning_rate": 0.00029244491361421997, "loss": 4.6774, "step": 26590 }, { "epoch": 0.5447136157925992, "grad_norm": 0.27679044008255005, "learning_rate": 0.0002922813232193993, "loss": 4.6574, "step": 26600 }, { "epoch": 0.5449183953474085, "grad_norm": 0.27654358744621277, "learning_rate": 0.0002921176543469726, "loss": 4.6822, "step": 26610 }, { "epoch": 0.5451231749022177, "grad_norm": 0.2731030583381653, "learning_rate": 0.0002919539071361265, "loss": 4.6278, "step": 26620 }, { "epoch": 0.545327954457027, "grad_norm": 0.27976152300834656, "learning_rate": 0.0002917900817261145, "loss": 4.7067, "step": 26630 }, { "epoch": 0.5455327340118362, "grad_norm": 0.2779357135295868, "learning_rate": 0.0002916261782562565, "loss": 4.6693, "step": 26640 }, { "epoch": 0.5457375135666455, "grad_norm": 0.2789212167263031, "learning_rate": 0.0002914621968659389, "loss": 4.6428, "step": 26650 }, { "epoch": 0.5459422931214547, "grad_norm": 0.2688707113265991, "learning_rate": 0.00029129813769461407, "loss": 4.686, "step": 26660 }, { "epoch": 0.5461470726762641, "grad_norm": 0.2800160348415375, "learning_rate": 0.00029113400088180084, "loss": 4.6864, "step": 26670 }, { "epoch": 0.5463518522310733, "grad_norm": 0.28964245319366455, "learning_rate": 0.0002909697865670839, "loss": 4.664, "step": 26680 }, { "epoch": 0.5465566317858825, "grad_norm": 0.2826152443885803, "learning_rate": 0.0002908054948901139, "loss": 4.6698, "step": 26690 }, { "epoch": 0.5467614113406918, "grad_norm": 0.2769240140914917, "learning_rate": 0.00029064112599060734, "loss": 4.6627, "step": 26700 }, { "epoch": 0.546966190895501, "grad_norm": 0.27277007699012756, "learning_rate": 0.00029047668000834625, "loss": 4.6499, "step": 26710 }, { "epoch": 0.5471709704503103, "grad_norm": 0.29175493121147156, "learning_rate": 0.0002903121570831783, "loss": 4.685, "step": 26720 }, { "epoch": 0.5473757500051195, "grad_norm": 0.2729916274547577, "learning_rate": 0.0002901475573550166, "loss": 4.6603, "step": 26730 }, { "epoch": 0.5475805295599288, "grad_norm": 0.2936178147792816, "learning_rate": 0.0002899828809638396, "loss": 4.6712, "step": 26740 }, { "epoch": 0.547785309114738, "grad_norm": 0.28730419278144836, "learning_rate": 0.00028981812804969087, "loss": 4.6672, "step": 26750 }, { "epoch": 0.5479900886695472, "grad_norm": 0.28215673565864563, "learning_rate": 0.000289653298752679, "loss": 4.6573, "step": 26760 }, { "epoch": 0.5481948682243565, "grad_norm": 0.3288278877735138, "learning_rate": 0.00028948839321297777, "loss": 4.6569, "step": 26770 }, { "epoch": 0.5483996477791657, "grad_norm": 0.30455297231674194, "learning_rate": 0.0002893234115708255, "loss": 4.6549, "step": 26780 }, { "epoch": 0.548604427333975, "grad_norm": 0.2895998954772949, "learning_rate": 0.0002891583539665256, "loss": 4.7075, "step": 26790 }, { "epoch": 0.5488092068887842, "grad_norm": 0.3010871112346649, "learning_rate": 0.0002889932205404457, "loss": 4.719, "step": 26800 }, { "epoch": 0.5490139864435934, "grad_norm": 0.2826119661331177, "learning_rate": 0.00028882801143301803, "loss": 4.6746, "step": 26810 }, { "epoch": 0.5492187659984027, "grad_norm": 0.27935516834259033, "learning_rate": 0.0002886627267847394, "loss": 4.7028, "step": 26820 }, { "epoch": 0.5494235455532119, "grad_norm": 0.29263758659362793, "learning_rate": 0.0002884973667361705, "loss": 4.7084, "step": 26830 }, { "epoch": 0.5496283251080212, "grad_norm": 0.2951434552669525, "learning_rate": 0.0002883319314279364, "loss": 4.6304, "step": 26840 }, { "epoch": 0.5498331046628304, "grad_norm": 0.2866668701171875, "learning_rate": 0.00028816642100072616, "loss": 4.685, "step": 26850 }, { "epoch": 0.5500378842176397, "grad_norm": 0.28918665647506714, "learning_rate": 0.0002880008355952927, "loss": 4.6649, "step": 26860 }, { "epoch": 0.5502426637724489, "grad_norm": 0.27269914746284485, "learning_rate": 0.0002878351753524524, "loss": 4.6594, "step": 26870 }, { "epoch": 0.5504474433272583, "grad_norm": 0.2877494990825653, "learning_rate": 0.0002876694404130858, "loss": 4.6656, "step": 26880 }, { "epoch": 0.5506522228820675, "grad_norm": 0.27912789583206177, "learning_rate": 0.0002875036309181367, "loss": 4.6644, "step": 26890 }, { "epoch": 0.5508570024368767, "grad_norm": 0.26564154028892517, "learning_rate": 0.0002873377470086121, "loss": 4.653, "step": 26900 }, { "epoch": 0.551061781991686, "grad_norm": 0.2571597397327423, "learning_rate": 0.0002871717888255828, "loss": 4.6591, "step": 26910 }, { "epoch": 0.5512665615464952, "grad_norm": 0.25915297865867615, "learning_rate": 0.00028700575651018214, "loss": 4.6818, "step": 26920 }, { "epoch": 0.5514713411013045, "grad_norm": 0.2806718647480011, "learning_rate": 0.0002868396502036071, "loss": 4.6864, "step": 26930 }, { "epoch": 0.5516761206561137, "grad_norm": 0.29026949405670166, "learning_rate": 0.00028667347004711715, "loss": 4.646, "step": 26940 }, { "epoch": 0.551880900210923, "grad_norm": 0.28536713123321533, "learning_rate": 0.0002865072161820348, "loss": 4.6575, "step": 26950 }, { "epoch": 0.5520856797657322, "grad_norm": 0.3099000155925751, "learning_rate": 0.00028634088874974513, "loss": 4.6635, "step": 26960 }, { "epoch": 0.5522904593205414, "grad_norm": 0.2998659014701843, "learning_rate": 0.0002861744878916958, "loss": 4.7011, "step": 26970 }, { "epoch": 0.5524952388753507, "grad_norm": 0.2926270067691803, "learning_rate": 0.00028600801374939694, "loss": 4.6806, "step": 26980 }, { "epoch": 0.5527000184301599, "grad_norm": 0.26582396030426025, "learning_rate": 0.00028584146646442097, "loss": 4.7549, "step": 26990 }, { "epoch": 0.5529047979849692, "grad_norm": 0.2821336090564728, "learning_rate": 0.0002856748461784026, "loss": 4.6877, "step": 27000 }, { "epoch": 0.5531095775397784, "grad_norm": 0.3004828989505768, "learning_rate": 0.00028550815303303846, "loss": 4.6975, "step": 27010 }, { "epoch": 0.5533143570945876, "grad_norm": 0.3053824007511139, "learning_rate": 0.00028534138717008727, "loss": 4.6588, "step": 27020 }, { "epoch": 0.5535191366493969, "grad_norm": 0.2883572280406952, "learning_rate": 0.00028517454873136966, "loss": 4.6864, "step": 27030 }, { "epoch": 0.5537239162042061, "grad_norm": 0.2585112750530243, "learning_rate": 0.00028500763785876764, "loss": 4.7093, "step": 27040 }, { "epoch": 0.5539286957590154, "grad_norm": 0.28137773275375366, "learning_rate": 0.00028484065469422534, "loss": 4.6768, "step": 27050 }, { "epoch": 0.5541334753138246, "grad_norm": 0.27896785736083984, "learning_rate": 0.00028467359937974786, "loss": 4.6621, "step": 27060 }, { "epoch": 0.5543382548686339, "grad_norm": 0.3065791726112366, "learning_rate": 0.000284506472057402, "loss": 4.6868, "step": 27070 }, { "epoch": 0.5545430344234432, "grad_norm": 0.34164634346961975, "learning_rate": 0.00028433927286931557, "loss": 4.6731, "step": 27080 }, { "epoch": 0.5547478139782525, "grad_norm": 0.2731534540653229, "learning_rate": 0.00028417200195767774, "loss": 4.6487, "step": 27090 }, { "epoch": 0.5549525935330617, "grad_norm": 0.3001730144023895, "learning_rate": 0.0002840046594647385, "loss": 4.7111, "step": 27100 }, { "epoch": 0.5551573730878709, "grad_norm": 0.3216342628002167, "learning_rate": 0.0002838372455328086, "loss": 4.6878, "step": 27110 }, { "epoch": 0.5553621526426802, "grad_norm": 0.28366807103157043, "learning_rate": 0.00028366976030425983, "loss": 4.7044, "step": 27120 }, { "epoch": 0.5555669321974894, "grad_norm": 0.5463992953300476, "learning_rate": 0.0002835022039215245, "loss": 4.6733, "step": 27130 }, { "epoch": 0.5557717117522987, "grad_norm": 0.26576077938079834, "learning_rate": 0.00028333457652709537, "loss": 4.658, "step": 27140 }, { "epoch": 0.5559764913071079, "grad_norm": 0.2999189496040344, "learning_rate": 0.0002831668782635256, "loss": 4.628, "step": 27150 }, { "epoch": 0.5561812708619172, "grad_norm": 0.2850971817970276, "learning_rate": 0.0002829991092734286, "loss": 4.6603, "step": 27160 }, { "epoch": 0.5563860504167264, "grad_norm": 0.2797244191169739, "learning_rate": 0.0002828312696994781, "loss": 4.6823, "step": 27170 }, { "epoch": 0.5565908299715356, "grad_norm": 0.28097304701805115, "learning_rate": 0.0002826633596844077, "loss": 4.7016, "step": 27180 }, { "epoch": 0.5567956095263449, "grad_norm": 0.28281205892562866, "learning_rate": 0.0002824953793710108, "loss": 4.7176, "step": 27190 }, { "epoch": 0.5570003890811541, "grad_norm": 0.29616957902908325, "learning_rate": 0.00028232732890214084, "loss": 4.6836, "step": 27200 }, { "epoch": 0.5572051686359634, "grad_norm": 0.26958492398262024, "learning_rate": 0.00028215920842071074, "loss": 4.6967, "step": 27210 }, { "epoch": 0.5574099481907726, "grad_norm": 0.33172526955604553, "learning_rate": 0.0002819910180696931, "loss": 4.6903, "step": 27220 }, { "epoch": 0.5576147277455819, "grad_norm": 0.2655125558376312, "learning_rate": 0.00028182275799211974, "loss": 4.6666, "step": 27230 }, { "epoch": 0.5578195073003911, "grad_norm": 0.29412057995796204, "learning_rate": 0.00028165442833108197, "loss": 4.6836, "step": 27240 }, { "epoch": 0.5580242868552003, "grad_norm": 0.29321762919425964, "learning_rate": 0.0002814860292297301, "loss": 4.6663, "step": 27250 }, { "epoch": 0.5582290664100096, "grad_norm": 0.2720133364200592, "learning_rate": 0.0002813175608312737, "loss": 4.6888, "step": 27260 }, { "epoch": 0.5584338459648188, "grad_norm": 0.27242472767829895, "learning_rate": 0.0002811490232789811, "loss": 4.6799, "step": 27270 }, { "epoch": 0.5586386255196282, "grad_norm": 0.30829906463623047, "learning_rate": 0.00028098041671617953, "loss": 4.6187, "step": 27280 }, { "epoch": 0.5588434050744374, "grad_norm": 0.286232054233551, "learning_rate": 0.0002808117412862549, "loss": 4.6596, "step": 27290 }, { "epoch": 0.5590481846292467, "grad_norm": 0.26213908195495605, "learning_rate": 0.00028064299713265157, "loss": 4.6717, "step": 27300 }, { "epoch": 0.5592529641840559, "grad_norm": 0.3313126862049103, "learning_rate": 0.00028047418439887254, "loss": 4.666, "step": 27310 }, { "epoch": 0.5594577437388651, "grad_norm": 0.3153354227542877, "learning_rate": 0.00028030530322847895, "loss": 4.6635, "step": 27320 }, { "epoch": 0.5596625232936744, "grad_norm": 0.2815357446670532, "learning_rate": 0.00028013635376509025, "loss": 4.694, "step": 27330 }, { "epoch": 0.5598673028484836, "grad_norm": 0.2900370657444, "learning_rate": 0.00027996733615238387, "loss": 4.7114, "step": 27340 }, { "epoch": 0.5600720824032929, "grad_norm": 0.28624969720840454, "learning_rate": 0.0002797982505340954, "loss": 4.7158, "step": 27350 }, { "epoch": 0.5602768619581021, "grad_norm": 0.2943468689918518, "learning_rate": 0.0002796290970540181, "loss": 4.6346, "step": 27360 }, { "epoch": 0.5604816415129114, "grad_norm": 0.2638726234436035, "learning_rate": 0.0002794598758560028, "loss": 4.6508, "step": 27370 }, { "epoch": 0.5606864210677206, "grad_norm": 0.2831834554672241, "learning_rate": 0.00027929058708395834, "loss": 4.6682, "step": 27380 }, { "epoch": 0.5608912006225298, "grad_norm": 0.29787468910217285, "learning_rate": 0.00027912123088185054, "loss": 4.6922, "step": 27390 }, { "epoch": 0.5610959801773391, "grad_norm": 0.2765185236930847, "learning_rate": 0.0002789518073937029, "loss": 4.6695, "step": 27400 }, { "epoch": 0.5613007597321483, "grad_norm": 0.28119799494743347, "learning_rate": 0.000278782316763596, "loss": 4.7068, "step": 27410 }, { "epoch": 0.5615055392869576, "grad_norm": 0.2748514413833618, "learning_rate": 0.0002786127591356676, "loss": 4.7453, "step": 27420 }, { "epoch": 0.5617103188417668, "grad_norm": 0.2974112033843994, "learning_rate": 0.0002784431346541123, "loss": 4.678, "step": 27430 }, { "epoch": 0.561915098396576, "grad_norm": 0.4376599192619324, "learning_rate": 0.00027827344346318175, "loss": 4.6976, "step": 27440 }, { "epoch": 0.5621198779513853, "grad_norm": 0.285873681306839, "learning_rate": 0.0002781036857071842, "loss": 4.6554, "step": 27450 }, { "epoch": 0.5623246575061945, "grad_norm": 0.3002917170524597, "learning_rate": 0.00027793386153048444, "loss": 4.7362, "step": 27460 }, { "epoch": 0.5625294370610038, "grad_norm": 0.2769624590873718, "learning_rate": 0.000277763971077504, "loss": 4.6749, "step": 27470 }, { "epoch": 0.5627342166158131, "grad_norm": 0.36543720960617065, "learning_rate": 0.00027759401449272043, "loss": 4.6384, "step": 27480 }, { "epoch": 0.5629389961706224, "grad_norm": 0.2904614806175232, "learning_rate": 0.00027742399192066784, "loss": 4.6709, "step": 27490 }, { "epoch": 0.5631437757254316, "grad_norm": 0.26392224431037903, "learning_rate": 0.00027725390350593623, "loss": 4.6671, "step": 27500 }, { "epoch": 0.5633485552802409, "grad_norm": 0.3175850510597229, "learning_rate": 0.00027708374939317167, "loss": 4.6972, "step": 27510 }, { "epoch": 0.5635533348350501, "grad_norm": 0.3197113871574402, "learning_rate": 0.0002769135297270762, "loss": 4.6675, "step": 27520 }, { "epoch": 0.5637581143898593, "grad_norm": 0.30047979950904846, "learning_rate": 0.0002767432446524075, "loss": 4.6708, "step": 27530 }, { "epoch": 0.5639628939446686, "grad_norm": 0.5802862644195557, "learning_rate": 0.0002765728943139788, "loss": 4.6787, "step": 27540 }, { "epoch": 0.5641676734994778, "grad_norm": 0.29684823751449585, "learning_rate": 0.000276402478856659, "loss": 4.7068, "step": 27550 }, { "epoch": 0.5643724530542871, "grad_norm": 0.27814826369285583, "learning_rate": 0.0002762319984253723, "loss": 4.7162, "step": 27560 }, { "epoch": 0.5645772326090963, "grad_norm": 0.2693076431751251, "learning_rate": 0.0002760614531650982, "loss": 4.6658, "step": 27570 }, { "epoch": 0.5647820121639056, "grad_norm": 0.3031403124332428, "learning_rate": 0.00027589084322087115, "loss": 4.6555, "step": 27580 }, { "epoch": 0.5649867917187148, "grad_norm": 0.3396504521369934, "learning_rate": 0.0002757201687377809, "loss": 4.696, "step": 27590 }, { "epoch": 0.565191571273524, "grad_norm": 0.33264583349227905, "learning_rate": 0.0002755494298609718, "loss": 4.6536, "step": 27600 }, { "epoch": 0.5653963508283333, "grad_norm": 0.43447214365005493, "learning_rate": 0.0002753786267356431, "loss": 4.6716, "step": 27610 }, { "epoch": 0.5656011303831425, "grad_norm": 0.27637383341789246, "learning_rate": 0.0002752077595070488, "loss": 4.6651, "step": 27620 }, { "epoch": 0.5658059099379518, "grad_norm": 0.28410792350769043, "learning_rate": 0.0002750368283204973, "loss": 4.6952, "step": 27630 }, { "epoch": 0.566010689492761, "grad_norm": 0.27863118052482605, "learning_rate": 0.00027486583332135116, "loss": 4.6437, "step": 27640 }, { "epoch": 0.5662154690475703, "grad_norm": 0.2986036539077759, "learning_rate": 0.00027469477465502776, "loss": 4.6924, "step": 27650 }, { "epoch": 0.5664202486023795, "grad_norm": 0.2995145916938782, "learning_rate": 0.000274523652466998, "loss": 4.6502, "step": 27660 }, { "epoch": 0.5666250281571887, "grad_norm": 0.2952560484409332, "learning_rate": 0.0002743524669027873, "loss": 4.6527, "step": 27670 }, { "epoch": 0.566829807711998, "grad_norm": 0.273057222366333, "learning_rate": 0.00027418121810797474, "loss": 4.6607, "step": 27680 }, { "epoch": 0.5670345872668073, "grad_norm": 0.612025260925293, "learning_rate": 0.00027400990622819304, "loss": 4.6507, "step": 27690 }, { "epoch": 0.5672393668216166, "grad_norm": 0.2794952690601349, "learning_rate": 0.000273838531409129, "loss": 4.6476, "step": 27700 }, { "epoch": 0.5674441463764258, "grad_norm": 0.30170899629592896, "learning_rate": 0.0002736670937965224, "loss": 4.6531, "step": 27710 }, { "epoch": 0.5676489259312351, "grad_norm": 0.2786957323551178, "learning_rate": 0.000273495593536167, "loss": 4.679, "step": 27720 }, { "epoch": 0.5678537054860443, "grad_norm": 0.3750832974910736, "learning_rate": 0.00027332403077390933, "loss": 4.6541, "step": 27730 }, { "epoch": 0.5680584850408535, "grad_norm": 0.2858746647834778, "learning_rate": 0.00027315240565564935, "loss": 4.6986, "step": 27740 }, { "epoch": 0.5682632645956628, "grad_norm": 0.28511595726013184, "learning_rate": 0.00027298071832733986, "loss": 4.6615, "step": 27750 }, { "epoch": 0.568468044150472, "grad_norm": 0.2617349326610565, "learning_rate": 0.00027280896893498686, "loss": 4.7171, "step": 27760 }, { "epoch": 0.5686728237052813, "grad_norm": 0.2863673269748688, "learning_rate": 0.0002726371576246489, "loss": 4.6974, "step": 27770 }, { "epoch": 0.5688776032600905, "grad_norm": 0.29414480924606323, "learning_rate": 0.00027246528454243714, "loss": 4.6836, "step": 27780 }, { "epoch": 0.5690823828148998, "grad_norm": 0.26643306016921997, "learning_rate": 0.00027229334983451547, "loss": 4.68, "step": 27790 }, { "epoch": 0.569287162369709, "grad_norm": 0.38181358575820923, "learning_rate": 0.0002721213536471001, "loss": 4.6501, "step": 27800 }, { "epoch": 0.5694919419245182, "grad_norm": 0.3115479052066803, "learning_rate": 0.0002719492961264595, "loss": 4.6587, "step": 27810 }, { "epoch": 0.5696967214793275, "grad_norm": 0.2777344286441803, "learning_rate": 0.0002717771774189142, "loss": 4.6839, "step": 27820 }, { "epoch": 0.5699015010341367, "grad_norm": 0.30036577582359314, "learning_rate": 0.000271604997670837, "loss": 4.6428, "step": 27830 }, { "epoch": 0.570106280588946, "grad_norm": 0.2738266885280609, "learning_rate": 0.0002714327570286525, "loss": 4.6306, "step": 27840 }, { "epoch": 0.5703110601437552, "grad_norm": 0.2899834215641022, "learning_rate": 0.0002712604556388369, "loss": 4.6794, "step": 27850 }, { "epoch": 0.5705158396985645, "grad_norm": 0.2756260931491852, "learning_rate": 0.00027108809364791847, "loss": 4.6225, "step": 27860 }, { "epoch": 0.5707206192533737, "grad_norm": 0.28180938959121704, "learning_rate": 0.0002709156712024766, "loss": 4.6943, "step": 27870 }, { "epoch": 0.5709253988081829, "grad_norm": 0.2718702554702759, "learning_rate": 0.0002707431884491423, "loss": 4.6975, "step": 27880 }, { "epoch": 0.5711301783629923, "grad_norm": 0.2886705696582794, "learning_rate": 0.00027057064553459787, "loss": 4.663, "step": 27890 }, { "epoch": 0.5713349579178015, "grad_norm": 0.41911399364471436, "learning_rate": 0.00027039804260557676, "loss": 4.6453, "step": 27900 }, { "epoch": 0.5715397374726108, "grad_norm": 0.28789734840393066, "learning_rate": 0.0002702253798088634, "loss": 4.6751, "step": 27910 }, { "epoch": 0.57174451702742, "grad_norm": 0.29850611090660095, "learning_rate": 0.0002700526572912931, "loss": 4.6786, "step": 27920 }, { "epoch": 0.5719492965822293, "grad_norm": 0.2884978652000427, "learning_rate": 0.0002698798751997522, "loss": 4.6775, "step": 27930 }, { "epoch": 0.5721540761370385, "grad_norm": 0.30702218413352966, "learning_rate": 0.00026970703368117737, "loss": 4.6594, "step": 27940 }, { "epoch": 0.5723588556918477, "grad_norm": 0.28914186358451843, "learning_rate": 0.000269534132882556, "loss": 4.6714, "step": 27950 }, { "epoch": 0.572563635246657, "grad_norm": 0.2664184272289276, "learning_rate": 0.000269361172950926, "loss": 4.6827, "step": 27960 }, { "epoch": 0.5727684148014662, "grad_norm": 0.27715256810188293, "learning_rate": 0.00026918815403337526, "loss": 4.6838, "step": 27970 }, { "epoch": 0.5729731943562755, "grad_norm": 0.37566494941711426, "learning_rate": 0.00026901507627704214, "loss": 4.6353, "step": 27980 }, { "epoch": 0.5731779739110847, "grad_norm": 0.27423736453056335, "learning_rate": 0.00026884193982911473, "loss": 4.6614, "step": 27990 }, { "epoch": 0.573382753465894, "grad_norm": 0.2827259600162506, "learning_rate": 0.0002686687448368315, "loss": 4.6946, "step": 28000 }, { "epoch": 0.573382753465894, "eval_loss": 4.683334827423096, "eval_runtime": 4.3861, "eval_samples_per_second": 265.841, "eval_steps_per_second": 33.287, "step": 28000 }, { "epoch": 0.5735875330207032, "grad_norm": 0.3005334138870239, "learning_rate": 0.0002684954914474801, "loss": 4.6974, "step": 28010 }, { "epoch": 0.5737923125755124, "grad_norm": 0.27823692560195923, "learning_rate": 0.0002683221798083984, "loss": 4.649, "step": 28020 }, { "epoch": 0.5739970921303217, "grad_norm": 0.300249308347702, "learning_rate": 0.0002681488100669734, "loss": 4.6579, "step": 28030 }, { "epoch": 0.5742018716851309, "grad_norm": 0.26772481203079224, "learning_rate": 0.0002679753823706417, "loss": 4.6904, "step": 28040 }, { "epoch": 0.5744066512399402, "grad_norm": 0.29746416211128235, "learning_rate": 0.00026780189686688937, "loss": 4.6793, "step": 28050 }, { "epoch": 0.5746114307947494, "grad_norm": 0.31912198662757874, "learning_rate": 0.00026762835370325116, "loss": 4.676, "step": 28060 }, { "epoch": 0.5748162103495587, "grad_norm": 0.2853984236717224, "learning_rate": 0.0002674547530273112, "loss": 4.6521, "step": 28070 }, { "epoch": 0.5750209899043679, "grad_norm": 0.2779002785682678, "learning_rate": 0.00026728109498670255, "loss": 4.681, "step": 28080 }, { "epoch": 0.5752257694591772, "grad_norm": 0.29643452167510986, "learning_rate": 0.00026710737972910687, "loss": 4.6636, "step": 28090 }, { "epoch": 0.5754305490139865, "grad_norm": 0.2981843948364258, "learning_rate": 0.00026693360740225465, "loss": 4.6901, "step": 28100 }, { "epoch": 0.5756353285687957, "grad_norm": 0.27709734439849854, "learning_rate": 0.0002667597781539247, "loss": 4.6693, "step": 28110 }, { "epoch": 0.575840108123605, "grad_norm": 0.27088525891304016, "learning_rate": 0.0002665858921319445, "loss": 4.6942, "step": 28120 }, { "epoch": 0.5760448876784142, "grad_norm": 0.2808799147605896, "learning_rate": 0.0002664119494841896, "loss": 4.6597, "step": 28130 }, { "epoch": 0.5762496672332235, "grad_norm": 0.29753655195236206, "learning_rate": 0.00026623795035858377, "loss": 4.696, "step": 28140 }, { "epoch": 0.5764544467880327, "grad_norm": 0.5138372182846069, "learning_rate": 0.000266063894903099, "loss": 4.6547, "step": 28150 }, { "epoch": 0.576659226342842, "grad_norm": 0.28008517622947693, "learning_rate": 0.0002658897832657548, "loss": 4.6697, "step": 28160 }, { "epoch": 0.5768640058976512, "grad_norm": 0.2961454689502716, "learning_rate": 0.00026571561559461866, "loss": 4.6935, "step": 28170 }, { "epoch": 0.5770687854524604, "grad_norm": 0.2739742398262024, "learning_rate": 0.00026554139203780586, "loss": 4.6588, "step": 28180 }, { "epoch": 0.5772735650072697, "grad_norm": 0.28070080280303955, "learning_rate": 0.000265367112743479, "loss": 4.7207, "step": 28190 }, { "epoch": 0.5774783445620789, "grad_norm": 0.36495405435562134, "learning_rate": 0.00026519277785984823, "loss": 4.6554, "step": 28200 }, { "epoch": 0.5776831241168882, "grad_norm": 0.31353211402893066, "learning_rate": 0.00026501838753517084, "loss": 4.6916, "step": 28210 }, { "epoch": 0.5778879036716974, "grad_norm": 0.2827132046222687, "learning_rate": 0.00026484394191775133, "loss": 4.675, "step": 28220 }, { "epoch": 0.5780926832265066, "grad_norm": 0.28942957520484924, "learning_rate": 0.0002646694411559412, "loss": 4.6867, "step": 28230 }, { "epoch": 0.5782974627813159, "grad_norm": 0.28087642788887024, "learning_rate": 0.0002644948853981389, "loss": 4.6868, "step": 28240 }, { "epoch": 0.5785022423361251, "grad_norm": 0.297259122133255, "learning_rate": 0.0002643202747927896, "loss": 4.6863, "step": 28250 }, { "epoch": 0.5787070218909344, "grad_norm": 0.28089985251426697, "learning_rate": 0.0002641456094883852, "loss": 4.682, "step": 28260 }, { "epoch": 0.5789118014457436, "grad_norm": 0.27846214175224304, "learning_rate": 0.00026397088963346383, "loss": 4.6831, "step": 28270 }, { "epoch": 0.5791165810005529, "grad_norm": 0.2802942991256714, "learning_rate": 0.0002637961153766104, "loss": 4.647, "step": 28280 }, { "epoch": 0.5793213605553622, "grad_norm": 0.29859694838523865, "learning_rate": 0.00026362128686645595, "loss": 4.6354, "step": 28290 }, { "epoch": 0.5795261401101715, "grad_norm": 0.2947337329387665, "learning_rate": 0.00026344640425167757, "loss": 4.662, "step": 28300 }, { "epoch": 0.5797309196649807, "grad_norm": 0.30136755108833313, "learning_rate": 0.00026327146768099824, "loss": 4.6889, "step": 28310 }, { "epoch": 0.5799356992197899, "grad_norm": 0.2920510768890381, "learning_rate": 0.0002630964773031872, "loss": 4.6877, "step": 28320 }, { "epoch": 0.5801404787745992, "grad_norm": 0.2797960340976715, "learning_rate": 0.0002629214332670592, "loss": 4.6452, "step": 28330 }, { "epoch": 0.5803452583294084, "grad_norm": 0.2966301739215851, "learning_rate": 0.0002627463357214747, "loss": 4.6694, "step": 28340 }, { "epoch": 0.5805500378842177, "grad_norm": 0.2787582576274872, "learning_rate": 0.0002625711848153395, "loss": 4.6764, "step": 28350 }, { "epoch": 0.5807548174390269, "grad_norm": 0.2939908802509308, "learning_rate": 0.000262395980697605, "loss": 4.6838, "step": 28360 }, { "epoch": 0.5809595969938361, "grad_norm": 0.2859007716178894, "learning_rate": 0.0002622207235172677, "loss": 4.6842, "step": 28370 }, { "epoch": 0.5811643765486454, "grad_norm": 0.27991899847984314, "learning_rate": 0.0002620454134233693, "loss": 4.6537, "step": 28380 }, { "epoch": 0.5813691561034546, "grad_norm": 0.2813095450401306, "learning_rate": 0.0002618700505649966, "loss": 4.6625, "step": 28390 }, { "epoch": 0.5815739356582639, "grad_norm": 0.2621093690395355, "learning_rate": 0.00026169463509128103, "loss": 4.6845, "step": 28400 }, { "epoch": 0.5817787152130731, "grad_norm": 0.4915701448917389, "learning_rate": 0.000261519167151399, "loss": 4.6664, "step": 28410 }, { "epoch": 0.5819834947678824, "grad_norm": 0.2945091724395752, "learning_rate": 0.0002613436468945712, "loss": 4.6899, "step": 28420 }, { "epoch": 0.5821882743226916, "grad_norm": 0.29733526706695557, "learning_rate": 0.0002611680744700633, "loss": 4.6791, "step": 28430 }, { "epoch": 0.5823930538775008, "grad_norm": 0.2509337365627289, "learning_rate": 0.00026099245002718504, "loss": 4.6735, "step": 28440 }, { "epoch": 0.5825978334323101, "grad_norm": 0.28541046380996704, "learning_rate": 0.0002608167737152902, "loss": 4.6413, "step": 28450 }, { "epoch": 0.5828026129871193, "grad_norm": 0.42691245675086975, "learning_rate": 0.00026064104568377717, "loss": 4.6507, "step": 28460 }, { "epoch": 0.5830073925419286, "grad_norm": 0.3065917193889618, "learning_rate": 0.00026046526608208793, "loss": 4.6493, "step": 28470 }, { "epoch": 0.5832121720967378, "grad_norm": 0.2934316396713257, "learning_rate": 0.0002602894350597085, "loss": 4.6747, "step": 28480 }, { "epoch": 0.583416951651547, "grad_norm": 0.2911780774593353, "learning_rate": 0.00026011355276616847, "loss": 4.7028, "step": 28490 }, { "epoch": 0.5836217312063564, "grad_norm": 0.2835933566093445, "learning_rate": 0.0002599376193510412, "loss": 4.6543, "step": 28500 }, { "epoch": 0.5838265107611657, "grad_norm": 0.2826501131057739, "learning_rate": 0.0002597616349639434, "loss": 4.7099, "step": 28510 }, { "epoch": 0.5840312903159749, "grad_norm": 0.3032357394695282, "learning_rate": 0.0002595855997545353, "loss": 4.6852, "step": 28520 }, { "epoch": 0.5842360698707841, "grad_norm": 0.29295429587364197, "learning_rate": 0.0002594095138725202, "loss": 4.6671, "step": 28530 }, { "epoch": 0.5844408494255934, "grad_norm": 0.2917841672897339, "learning_rate": 0.0002592333774676446, "loss": 4.7393, "step": 28540 }, { "epoch": 0.5846456289804026, "grad_norm": 0.28423553705215454, "learning_rate": 0.00025905719068969763, "loss": 4.7046, "step": 28550 }, { "epoch": 0.5848504085352119, "grad_norm": 0.3000679016113281, "learning_rate": 0.0002588809536885117, "loss": 4.6685, "step": 28560 }, { "epoch": 0.5850551880900211, "grad_norm": 0.2691282033920288, "learning_rate": 0.0002587046666139618, "loss": 4.6771, "step": 28570 }, { "epoch": 0.5852599676448303, "grad_norm": 0.304921954870224, "learning_rate": 0.0002585283296159654, "loss": 4.6754, "step": 28580 }, { "epoch": 0.5854647471996396, "grad_norm": 0.28128403425216675, "learning_rate": 0.00025835194284448237, "loss": 4.6541, "step": 28590 }, { "epoch": 0.5856695267544488, "grad_norm": 0.296207994222641, "learning_rate": 0.00025817550644951517, "loss": 4.6693, "step": 28600 }, { "epoch": 0.5858743063092581, "grad_norm": 0.27633747458457947, "learning_rate": 0.0002579990205811083, "loss": 4.6618, "step": 28610 }, { "epoch": 0.5860790858640673, "grad_norm": 0.272427499294281, "learning_rate": 0.0002578224853893483, "loss": 4.6575, "step": 28620 }, { "epoch": 0.5862838654188766, "grad_norm": 0.39729827642440796, "learning_rate": 0.0002576459010243637, "loss": 4.6775, "step": 28630 }, { "epoch": 0.5864886449736858, "grad_norm": 0.30692756175994873, "learning_rate": 0.00025746926763632486, "loss": 4.6986, "step": 28640 }, { "epoch": 0.586693424528495, "grad_norm": 0.46451130509376526, "learning_rate": 0.00025729258537544376, "loss": 4.6689, "step": 28650 }, { "epoch": 0.5868982040833043, "grad_norm": 0.2835638225078583, "learning_rate": 0.00025711585439197403, "loss": 4.6172, "step": 28660 }, { "epoch": 0.5871029836381135, "grad_norm": 0.27264466881752014, "learning_rate": 0.00025693907483621077, "loss": 4.6735, "step": 28670 }, { "epoch": 0.5873077631929228, "grad_norm": 0.2823847234249115, "learning_rate": 0.00025676224685849025, "loss": 4.6562, "step": 28680 }, { "epoch": 0.587512542747732, "grad_norm": 0.28739845752716064, "learning_rate": 0.00025658537060918997, "loss": 4.6648, "step": 28690 }, { "epoch": 0.5877173223025414, "grad_norm": 0.2688329815864563, "learning_rate": 0.00025640844623872855, "loss": 4.6809, "step": 28700 }, { "epoch": 0.5879221018573506, "grad_norm": 0.2778913080692291, "learning_rate": 0.0002562314738975655, "loss": 4.6522, "step": 28710 }, { "epoch": 0.5881268814121599, "grad_norm": 0.2616167664527893, "learning_rate": 0.0002560544537362011, "loss": 4.6539, "step": 28720 }, { "epoch": 0.5883316609669691, "grad_norm": 0.29110246896743774, "learning_rate": 0.0002558773859051762, "loss": 4.6622, "step": 28730 }, { "epoch": 0.5885364405217783, "grad_norm": 0.2961254119873047, "learning_rate": 0.00025570027055507247, "loss": 4.6437, "step": 28740 }, { "epoch": 0.5887412200765876, "grad_norm": 0.2791956663131714, "learning_rate": 0.00025552310783651165, "loss": 4.6672, "step": 28750 }, { "epoch": 0.5889459996313968, "grad_norm": 0.28366756439208984, "learning_rate": 0.00025534589790015595, "loss": 4.682, "step": 28760 }, { "epoch": 0.5891507791862061, "grad_norm": 0.28468218445777893, "learning_rate": 0.00025516864089670784, "loss": 4.6409, "step": 28770 }, { "epoch": 0.5893555587410153, "grad_norm": 0.2833494544029236, "learning_rate": 0.0002549913369769096, "loss": 4.6494, "step": 28780 }, { "epoch": 0.5895603382958245, "grad_norm": 0.27835631370544434, "learning_rate": 0.00025481398629154345, "loss": 4.6835, "step": 28790 }, { "epoch": 0.5897651178506338, "grad_norm": 0.26863643527030945, "learning_rate": 0.0002546365889914315, "loss": 4.7076, "step": 28800 }, { "epoch": 0.589969897405443, "grad_norm": 0.2861613929271698, "learning_rate": 0.0002544591452274354, "loss": 4.6564, "step": 28810 }, { "epoch": 0.5901746769602523, "grad_norm": 0.3046806752681732, "learning_rate": 0.00025428165515045643, "loss": 4.6513, "step": 28820 }, { "epoch": 0.5903794565150615, "grad_norm": 0.284868448972702, "learning_rate": 0.00025410411891143507, "loss": 4.6808, "step": 28830 }, { "epoch": 0.5905842360698708, "grad_norm": 0.26851439476013184, "learning_rate": 0.0002539265366613511, "loss": 4.6494, "step": 28840 }, { "epoch": 0.59078901562468, "grad_norm": 0.2809792757034302, "learning_rate": 0.00025374890855122366, "loss": 4.6212, "step": 28850 }, { "epoch": 0.5909937951794892, "grad_norm": 0.29327404499053955, "learning_rate": 0.00025357123473211055, "loss": 4.6527, "step": 28860 }, { "epoch": 0.5911985747342985, "grad_norm": 0.29546239972114563, "learning_rate": 0.0002533935153551087, "loss": 4.6834, "step": 28870 }, { "epoch": 0.5914033542891077, "grad_norm": 0.30554676055908203, "learning_rate": 0.00025321575057135365, "loss": 4.6672, "step": 28880 }, { "epoch": 0.591608133843917, "grad_norm": 0.2724545896053314, "learning_rate": 0.0002530379405320197, "loss": 4.6319, "step": 28890 }, { "epoch": 0.5918129133987263, "grad_norm": 0.29854893684387207, "learning_rate": 0.00025286008538831934, "loss": 4.6888, "step": 28900 }, { "epoch": 0.5920176929535356, "grad_norm": 0.29128697514533997, "learning_rate": 0.00025268218529150377, "loss": 4.6923, "step": 28910 }, { "epoch": 0.5922224725083448, "grad_norm": 0.30360209941864014, "learning_rate": 0.0002525042403928622, "loss": 4.6774, "step": 28920 }, { "epoch": 0.592427252063154, "grad_norm": 0.30676624178886414, "learning_rate": 0.00025232625084372197, "loss": 4.6821, "step": 28930 }, { "epoch": 0.5926320316179633, "grad_norm": 0.33180487155914307, "learning_rate": 0.0002521482167954484, "loss": 4.6972, "step": 28940 }, { "epoch": 0.5928368111727725, "grad_norm": 0.30034247040748596, "learning_rate": 0.0002519701383994447, "loss": 4.6403, "step": 28950 }, { "epoch": 0.5930415907275818, "grad_norm": 0.31630831956863403, "learning_rate": 0.00025179201580715184, "loss": 4.6675, "step": 28960 }, { "epoch": 0.593246370282391, "grad_norm": 0.305792897939682, "learning_rate": 0.0002516138491700482, "loss": 4.655, "step": 28970 }, { "epoch": 0.5934511498372003, "grad_norm": 0.32938066124916077, "learning_rate": 0.00025143563863964984, "loss": 4.687, "step": 28980 }, { "epoch": 0.5936559293920095, "grad_norm": 0.29817572236061096, "learning_rate": 0.00025125738436750984, "loss": 4.6642, "step": 28990 }, { "epoch": 0.5938607089468187, "grad_norm": 0.28466013073921204, "learning_rate": 0.00025107908650521876, "loss": 4.6554, "step": 29000 }, { "epoch": 0.594065488501628, "grad_norm": 0.29398080706596375, "learning_rate": 0.00025090074520440423, "loss": 4.6416, "step": 29010 }, { "epoch": 0.5942702680564372, "grad_norm": 0.29575416445732117, "learning_rate": 0.00025072236061673053, "loss": 4.685, "step": 29020 }, { "epoch": 0.5944750476112465, "grad_norm": 0.2865046560764313, "learning_rate": 0.0002505439328938991, "loss": 4.6354, "step": 29030 }, { "epoch": 0.5946798271660557, "grad_norm": 0.29523858428001404, "learning_rate": 0.00025036546218764786, "loss": 4.6587, "step": 29040 }, { "epoch": 0.594884606720865, "grad_norm": 0.2861136794090271, "learning_rate": 0.00025018694864975126, "loss": 4.667, "step": 29050 }, { "epoch": 0.5950893862756742, "grad_norm": 0.2811325788497925, "learning_rate": 0.00025000839243202044, "loss": 4.7131, "step": 29060 }, { "epoch": 0.5952941658304834, "grad_norm": 0.27377763390541077, "learning_rate": 0.00024982979368630247, "loss": 4.6591, "step": 29070 }, { "epoch": 0.5954989453852927, "grad_norm": 0.2821992337703705, "learning_rate": 0.0002496511525644808, "loss": 4.6562, "step": 29080 }, { "epoch": 0.5957037249401019, "grad_norm": 0.2800423204898834, "learning_rate": 0.0002494724692184748, "loss": 4.6791, "step": 29090 }, { "epoch": 0.5959085044949112, "grad_norm": 0.29585859179496765, "learning_rate": 0.00024929374380024003, "loss": 4.6861, "step": 29100 }, { "epoch": 0.5961132840497205, "grad_norm": 0.30531808733940125, "learning_rate": 0.00024911497646176744, "loss": 4.6698, "step": 29110 }, { "epoch": 0.5963180636045298, "grad_norm": 0.26787129044532776, "learning_rate": 0.0002489361673550839, "loss": 4.699, "step": 29120 }, { "epoch": 0.596522843159339, "grad_norm": 0.2912231981754303, "learning_rate": 0.0002487573166322517, "loss": 4.6863, "step": 29130 }, { "epoch": 0.5967276227141483, "grad_norm": 0.28360515832901, "learning_rate": 0.0002485784244453685, "loss": 4.661, "step": 29140 }, { "epoch": 0.5969324022689575, "grad_norm": 0.3958587646484375, "learning_rate": 0.0002483994909465673, "loss": 4.6813, "step": 29150 }, { "epoch": 0.5971371818237667, "grad_norm": 0.2817666828632355, "learning_rate": 0.00024822051628801625, "loss": 4.644, "step": 29160 }, { "epoch": 0.597341961378576, "grad_norm": 0.2872312366962433, "learning_rate": 0.0002480415006219183, "loss": 4.649, "step": 29170 }, { "epoch": 0.5975467409333852, "grad_norm": 0.30035293102264404, "learning_rate": 0.00024786244410051147, "loss": 4.6831, "step": 29180 }, { "epoch": 0.5977515204881945, "grad_norm": 0.29184991121292114, "learning_rate": 0.00024768334687606846, "loss": 4.6233, "step": 29190 }, { "epoch": 0.5979563000430037, "grad_norm": 0.2948676347732544, "learning_rate": 0.0002475042091008967, "loss": 4.6672, "step": 29200 }, { "epoch": 0.598161079597813, "grad_norm": 0.29934364557266235, "learning_rate": 0.0002473250309273379, "loss": 4.6515, "step": 29210 }, { "epoch": 0.5983658591526222, "grad_norm": 0.2856956124305725, "learning_rate": 0.00024714581250776814, "loss": 4.6733, "step": 29220 }, { "epoch": 0.5985706387074314, "grad_norm": 0.27090075612068176, "learning_rate": 0.00024696655399459796, "loss": 4.6661, "step": 29230 }, { "epoch": 0.5987754182622407, "grad_norm": 0.29719436168670654, "learning_rate": 0.0002467872555402718, "loss": 4.668, "step": 29240 }, { "epoch": 0.5989801978170499, "grad_norm": 0.2906520962715149, "learning_rate": 0.00024660791729726823, "loss": 4.6676, "step": 29250 }, { "epoch": 0.5991849773718592, "grad_norm": 0.29557523131370544, "learning_rate": 0.00024642853941809925, "loss": 4.6871, "step": 29260 }, { "epoch": 0.5993897569266684, "grad_norm": 0.2694222629070282, "learning_rate": 0.0002462491220553112, "loss": 4.6781, "step": 29270 }, { "epoch": 0.5995945364814776, "grad_norm": 0.3162340819835663, "learning_rate": 0.0002460696653614833, "loss": 4.662, "step": 29280 }, { "epoch": 0.5997993160362869, "grad_norm": 0.2700228989124298, "learning_rate": 0.0002458901694892289, "loss": 4.6583, "step": 29290 }, { "epoch": 0.6000040955910961, "grad_norm": 0.2790652811527252, "learning_rate": 0.0002457106345911942, "loss": 4.6658, "step": 29300 }, { "epoch": 0.6002088751459055, "grad_norm": 0.28301629424095154, "learning_rate": 0.0002455310608200587, "loss": 4.6695, "step": 29310 }, { "epoch": 0.6004136547007147, "grad_norm": 0.9047539234161377, "learning_rate": 0.000245351448328535, "loss": 4.6894, "step": 29320 }, { "epoch": 0.600618434255524, "grad_norm": 0.2879190146923065, "learning_rate": 0.0002451717972693686, "loss": 4.6464, "step": 29330 }, { "epoch": 0.6008232138103332, "grad_norm": 0.2830010950565338, "learning_rate": 0.0002449921077953379, "loss": 4.6697, "step": 29340 }, { "epoch": 0.6010279933651425, "grad_norm": 0.3032263219356537, "learning_rate": 0.0002448123800592539, "loss": 4.6552, "step": 29350 }, { "epoch": 0.6012327729199517, "grad_norm": 0.301294207572937, "learning_rate": 0.0002446326142139599, "loss": 4.6622, "step": 29360 }, { "epoch": 0.6014375524747609, "grad_norm": 0.28562310338020325, "learning_rate": 0.0002444528104123321, "loss": 4.6945, "step": 29370 }, { "epoch": 0.6016423320295702, "grad_norm": 0.2849845588207245, "learning_rate": 0.00024427296880727856, "loss": 4.6569, "step": 29380 }, { "epoch": 0.6018471115843794, "grad_norm": 0.29991352558135986, "learning_rate": 0.00024409308955173972, "loss": 4.6809, "step": 29390 }, { "epoch": 0.6020518911391887, "grad_norm": 0.3000006079673767, "learning_rate": 0.0002439131727986879, "loss": 4.6621, "step": 29400 }, { "epoch": 0.6022566706939979, "grad_norm": 0.298909455537796, "learning_rate": 0.00024373321870112738, "loss": 4.6476, "step": 29410 }, { "epoch": 0.6024614502488072, "grad_norm": 0.2760293185710907, "learning_rate": 0.00024355322741209415, "loss": 4.6741, "step": 29420 }, { "epoch": 0.6026662298036164, "grad_norm": 0.2685551345348358, "learning_rate": 0.0002433731990846559, "loss": 4.6493, "step": 29430 }, { "epoch": 0.6028710093584256, "grad_norm": 0.3154749274253845, "learning_rate": 0.0002431931338719118, "loss": 4.6618, "step": 29440 }, { "epoch": 0.6030757889132349, "grad_norm": 0.3553539514541626, "learning_rate": 0.0002430130319269923, "loss": 4.6515, "step": 29450 }, { "epoch": 0.6032805684680441, "grad_norm": 0.28778523206710815, "learning_rate": 0.00024283289340305922, "loss": 4.6685, "step": 29460 }, { "epoch": 0.6034853480228534, "grad_norm": 0.39844146370887756, "learning_rate": 0.0002426527184533054, "loss": 4.6413, "step": 29470 }, { "epoch": 0.6036901275776626, "grad_norm": 0.2765115201473236, "learning_rate": 0.0002424725072309547, "loss": 4.6781, "step": 29480 }, { "epoch": 0.6038949071324718, "grad_norm": 0.29303398728370667, "learning_rate": 0.00024229225988926182, "loss": 4.6654, "step": 29490 }, { "epoch": 0.6040996866872811, "grad_norm": 0.30903899669647217, "learning_rate": 0.00024211197658151207, "loss": 4.6571, "step": 29500 }, { "epoch": 0.6043044662420904, "grad_norm": 0.284907728433609, "learning_rate": 0.0002419316574610215, "loss": 4.6826, "step": 29510 }, { "epoch": 0.6045092457968997, "grad_norm": 0.29485589265823364, "learning_rate": 0.00024175130268113662, "loss": 4.6339, "step": 29520 }, { "epoch": 0.6047140253517089, "grad_norm": 0.2721925675868988, "learning_rate": 0.00024157091239523405, "loss": 4.6884, "step": 29530 }, { "epoch": 0.6049188049065182, "grad_norm": 0.2897637188434601, "learning_rate": 0.00024139048675672094, "loss": 4.6537, "step": 29540 }, { "epoch": 0.6051235844613274, "grad_norm": 0.29843610525131226, "learning_rate": 0.0002412100259190342, "loss": 4.694, "step": 29550 }, { "epoch": 0.6053283640161367, "grad_norm": 0.3276248872280121, "learning_rate": 0.00024102953003564076, "loss": 4.671, "step": 29560 }, { "epoch": 0.6055331435709459, "grad_norm": 0.3016703128814697, "learning_rate": 0.00024084899926003748, "loss": 4.6504, "step": 29570 }, { "epoch": 0.6057379231257551, "grad_norm": 0.2773389220237732, "learning_rate": 0.00024066843374575078, "loss": 4.6773, "step": 29580 }, { "epoch": 0.6059427026805644, "grad_norm": 0.2965388000011444, "learning_rate": 0.00024048783364633667, "loss": 4.6447, "step": 29590 }, { "epoch": 0.6061474822353736, "grad_norm": 0.27457672357559204, "learning_rate": 0.00024030719911538042, "loss": 4.6403, "step": 29600 }, { "epoch": 0.6063522617901829, "grad_norm": 0.3018428087234497, "learning_rate": 0.00024012653030649683, "loss": 4.6588, "step": 29610 }, { "epoch": 0.6065570413449921, "grad_norm": 0.2927612364292145, "learning_rate": 0.00023994582737332966, "loss": 4.6432, "step": 29620 }, { "epoch": 0.6067618208998014, "grad_norm": 0.29504019021987915, "learning_rate": 0.00023976509046955187, "loss": 4.6328, "step": 29630 }, { "epoch": 0.6069666004546106, "grad_norm": 0.3035086691379547, "learning_rate": 0.00023958431974886507, "loss": 4.613, "step": 29640 }, { "epoch": 0.6071713800094198, "grad_norm": 0.2993575632572174, "learning_rate": 0.00023940351536499974, "loss": 4.6828, "step": 29650 }, { "epoch": 0.6073761595642291, "grad_norm": 0.3321745991706848, "learning_rate": 0.00023922267747171513, "loss": 4.7281, "step": 29660 }, { "epoch": 0.6075809391190383, "grad_norm": 0.3499547243118286, "learning_rate": 0.00023904180622279876, "loss": 4.6341, "step": 29670 }, { "epoch": 0.6077857186738476, "grad_norm": 0.28242287039756775, "learning_rate": 0.00023886090177206666, "loss": 4.6274, "step": 29680 }, { "epoch": 0.6079904982286568, "grad_norm": 0.27522993087768555, "learning_rate": 0.00023867996427336298, "loss": 4.6453, "step": 29690 }, { "epoch": 0.608195277783466, "grad_norm": 0.2626899182796478, "learning_rate": 0.00023849899388056017, "loss": 4.6578, "step": 29700 }, { "epoch": 0.6084000573382754, "grad_norm": 0.3001188039779663, "learning_rate": 0.0002383179907475584, "loss": 4.6493, "step": 29710 }, { "epoch": 0.6086048368930846, "grad_norm": 0.2864462435245514, "learning_rate": 0.00023813695502828582, "loss": 4.6365, "step": 29720 }, { "epoch": 0.6088096164478939, "grad_norm": 0.30062445998191833, "learning_rate": 0.00023795588687669844, "loss": 4.6756, "step": 29730 }, { "epoch": 0.6090143960027031, "grad_norm": 0.29453182220458984, "learning_rate": 0.00023777478644677956, "loss": 4.6742, "step": 29740 }, { "epoch": 0.6092191755575124, "grad_norm": 0.3395213782787323, "learning_rate": 0.00023759365389254002, "loss": 4.696, "step": 29750 }, { "epoch": 0.6094239551123216, "grad_norm": 0.28882744908332825, "learning_rate": 0.0002374124893680181, "loss": 4.6679, "step": 29760 }, { "epoch": 0.6096287346671309, "grad_norm": 0.28007251024246216, "learning_rate": 0.00023723129302727927, "loss": 4.655, "step": 29770 }, { "epoch": 0.6098335142219401, "grad_norm": 0.2873763144016266, "learning_rate": 0.00023705006502441585, "loss": 4.6593, "step": 29780 }, { "epoch": 0.6100382937767493, "grad_norm": 0.2954859137535095, "learning_rate": 0.00023686880551354724, "loss": 4.6623, "step": 29790 }, { "epoch": 0.6102430733315586, "grad_norm": 0.3011667728424072, "learning_rate": 0.00023668751464881967, "loss": 4.6686, "step": 29800 }, { "epoch": 0.6104478528863678, "grad_norm": 0.28950291872024536, "learning_rate": 0.00023650619258440588, "loss": 4.6395, "step": 29810 }, { "epoch": 0.6106526324411771, "grad_norm": 0.2699313461780548, "learning_rate": 0.00023632483947450523, "loss": 4.6728, "step": 29820 }, { "epoch": 0.6108574119959863, "grad_norm": 0.8729771375656128, "learning_rate": 0.00023614345547334366, "loss": 4.6861, "step": 29830 }, { "epoch": 0.6110621915507956, "grad_norm": 0.3100840449333191, "learning_rate": 0.00023596204073517306, "loss": 4.6585, "step": 29840 }, { "epoch": 0.6112669711056048, "grad_norm": 0.2908143103122711, "learning_rate": 0.0002357805954142716, "loss": 4.6209, "step": 29850 }, { "epoch": 0.611471750660414, "grad_norm": 0.2910616397857666, "learning_rate": 0.0002355991196649435, "loss": 4.6485, "step": 29860 }, { "epoch": 0.6116765302152233, "grad_norm": 0.2798650860786438, "learning_rate": 0.0002354176136415189, "loss": 4.6634, "step": 29870 }, { "epoch": 0.6118813097700325, "grad_norm": 0.2711614668369293, "learning_rate": 0.00023523607749835351, "loss": 4.6278, "step": 29880 }, { "epoch": 0.6120860893248418, "grad_norm": 0.2735280990600586, "learning_rate": 0.00023505451138982873, "loss": 4.6552, "step": 29890 }, { "epoch": 0.612290868879651, "grad_norm": 0.31333765387535095, "learning_rate": 0.00023487291547035155, "loss": 4.6217, "step": 29900 }, { "epoch": 0.6124956484344602, "grad_norm": 0.27783825993537903, "learning_rate": 0.00023469128989435422, "loss": 4.6715, "step": 29910 }, { "epoch": 0.6127004279892696, "grad_norm": 0.34906432032585144, "learning_rate": 0.00023450963481629416, "loss": 4.6359, "step": 29920 }, { "epoch": 0.6129052075440788, "grad_norm": 0.27939939498901367, "learning_rate": 0.00023432795039065392, "loss": 4.6543, "step": 29930 }, { "epoch": 0.6131099870988881, "grad_norm": 0.2987031936645508, "learning_rate": 0.00023414623677194113, "loss": 4.6372, "step": 29940 }, { "epoch": 0.6133147666536973, "grad_norm": 0.27738839387893677, "learning_rate": 0.00023396449411468805, "loss": 4.618, "step": 29950 }, { "epoch": 0.6135195462085066, "grad_norm": 0.3081669807434082, "learning_rate": 0.00023378272257345173, "loss": 4.6476, "step": 29960 }, { "epoch": 0.6137243257633158, "grad_norm": 0.2913212776184082, "learning_rate": 0.00023360092230281386, "loss": 4.648, "step": 29970 }, { "epoch": 0.6139291053181251, "grad_norm": 0.2736920118331909, "learning_rate": 0.00023341909345738043, "loss": 4.6325, "step": 29980 }, { "epoch": 0.6141338848729343, "grad_norm": 0.27339574694633484, "learning_rate": 0.00023323723619178175, "loss": 4.67, "step": 29990 }, { "epoch": 0.6143386644277435, "grad_norm": 0.32638460397720337, "learning_rate": 0.0002330553506606723, "loss": 4.6309, "step": 30000 }, { "epoch": 0.6143386644277435, "eval_loss": 4.663181304931641, "eval_runtime": 4.3853, "eval_samples_per_second": 265.891, "eval_steps_per_second": 33.293, "step": 30000 }, { "epoch": 0.6145434439825528, "grad_norm": 0.2861756682395935, "learning_rate": 0.00023287343701873078, "loss": 4.6396, "step": 30010 }, { "epoch": 0.614748223537362, "grad_norm": 0.2900616228580475, "learning_rate": 0.0002326914954206595, "loss": 4.6473, "step": 30020 }, { "epoch": 0.6149530030921713, "grad_norm": 0.2840682864189148, "learning_rate": 0.00023250952602118475, "loss": 4.6181, "step": 30030 }, { "epoch": 0.6151577826469805, "grad_norm": 0.2963438928127289, "learning_rate": 0.00023232752897505638, "loss": 4.6336, "step": 30040 }, { "epoch": 0.6153625622017898, "grad_norm": 0.3062077462673187, "learning_rate": 0.00023214550443704778, "loss": 4.6366, "step": 30050 }, { "epoch": 0.615567341756599, "grad_norm": 0.29020142555236816, "learning_rate": 0.0002319634525619557, "loss": 4.6699, "step": 30060 }, { "epoch": 0.6157721213114082, "grad_norm": 0.28839603066444397, "learning_rate": 0.0002317813735046002, "loss": 4.6326, "step": 30070 }, { "epoch": 0.6159769008662175, "grad_norm": 0.3556162416934967, "learning_rate": 0.00023159926741982435, "loss": 4.6607, "step": 30080 }, { "epoch": 0.6161816804210267, "grad_norm": 0.320284366607666, "learning_rate": 0.00023141713446249424, "loss": 4.661, "step": 30090 }, { "epoch": 0.616386459975836, "grad_norm": 0.2787932753562927, "learning_rate": 0.0002312349747874989, "loss": 4.6549, "step": 30100 }, { "epoch": 0.6165912395306452, "grad_norm": 0.3046487271785736, "learning_rate": 0.00023105278854974998, "loss": 4.5999, "step": 30110 }, { "epoch": 0.6167960190854546, "grad_norm": 0.31222203373908997, "learning_rate": 0.0002308705759041818, "loss": 4.6837, "step": 30120 }, { "epoch": 0.6170007986402638, "grad_norm": 0.28878629207611084, "learning_rate": 0.00023068833700575102, "loss": 4.6751, "step": 30130 }, { "epoch": 0.617205578195073, "grad_norm": 0.28158316016197205, "learning_rate": 0.00023050607200943667, "loss": 4.6209, "step": 30140 }, { "epoch": 0.6174103577498823, "grad_norm": 0.27760252356529236, "learning_rate": 0.00023032378107024014, "loss": 4.6572, "step": 30150 }, { "epoch": 0.6176151373046915, "grad_norm": 0.2882702350616455, "learning_rate": 0.0002301414643431846, "loss": 4.6716, "step": 30160 }, { "epoch": 0.6178199168595008, "grad_norm": 0.3012171685695648, "learning_rate": 0.00022995912198331544, "loss": 4.6543, "step": 30170 }, { "epoch": 0.61802469641431, "grad_norm": 0.2790381908416748, "learning_rate": 0.00022977675414569953, "loss": 4.6313, "step": 30180 }, { "epoch": 0.6182294759691193, "grad_norm": 0.3032726049423218, "learning_rate": 0.00022959436098542573, "loss": 4.6493, "step": 30190 }, { "epoch": 0.6184342555239285, "grad_norm": 0.29249200224876404, "learning_rate": 0.00022941194265760419, "loss": 4.6433, "step": 30200 }, { "epoch": 0.6186390350787377, "grad_norm": 0.296486496925354, "learning_rate": 0.00022922949931736663, "loss": 4.6352, "step": 30210 }, { "epoch": 0.618843814633547, "grad_norm": 0.27457305788993835, "learning_rate": 0.0002290470311198659, "loss": 4.645, "step": 30220 }, { "epoch": 0.6190485941883562, "grad_norm": 0.2674858570098877, "learning_rate": 0.00022886453822027608, "loss": 4.6826, "step": 30230 }, { "epoch": 0.6192533737431655, "grad_norm": 0.27665963768959045, "learning_rate": 0.0002286820207737922, "loss": 4.6881, "step": 30240 }, { "epoch": 0.6194581532979747, "grad_norm": 0.2948571443557739, "learning_rate": 0.00022849947893563029, "loss": 4.6533, "step": 30250 }, { "epoch": 0.619662932852784, "grad_norm": 0.3000718355178833, "learning_rate": 0.00022831691286102706, "loss": 4.645, "step": 30260 }, { "epoch": 0.6198677124075932, "grad_norm": 0.2924703061580658, "learning_rate": 0.00022813432270523963, "loss": 4.6611, "step": 30270 }, { "epoch": 0.6200724919624024, "grad_norm": 0.281405508518219, "learning_rate": 0.00022795170862354586, "loss": 4.6348, "step": 30280 }, { "epoch": 0.6202772715172117, "grad_norm": 0.2816663086414337, "learning_rate": 0.0002277690707712439, "loss": 4.6586, "step": 30290 }, { "epoch": 0.6204820510720209, "grad_norm": 0.3089105188846588, "learning_rate": 0.00022758640930365206, "loss": 4.6627, "step": 30300 }, { "epoch": 0.6206868306268302, "grad_norm": 0.3414783775806427, "learning_rate": 0.0002274037243761088, "loss": 4.6521, "step": 30310 }, { "epoch": 0.6208916101816395, "grad_norm": 0.31398820877075195, "learning_rate": 0.00022722101614397241, "loss": 4.6378, "step": 30320 }, { "epoch": 0.6210963897364488, "grad_norm": 0.2950122058391571, "learning_rate": 0.00022703828476262113, "loss": 4.6725, "step": 30330 }, { "epoch": 0.621301169291258, "grad_norm": 0.3036250174045563, "learning_rate": 0.0002268555303874528, "loss": 4.6697, "step": 30340 }, { "epoch": 0.6215059488460672, "grad_norm": 0.29848480224609375, "learning_rate": 0.0002266727531738849, "loss": 4.6406, "step": 30350 }, { "epoch": 0.6217107284008765, "grad_norm": 0.3059265613555908, "learning_rate": 0.00022648995327735424, "loss": 4.6607, "step": 30360 }, { "epoch": 0.6219155079556857, "grad_norm": 0.27917057275772095, "learning_rate": 0.00022630713085331695, "loss": 4.6418, "step": 30370 }, { "epoch": 0.622120287510495, "grad_norm": 0.2982562482357025, "learning_rate": 0.0002261242860572483, "loss": 4.6318, "step": 30380 }, { "epoch": 0.6223250670653042, "grad_norm": 0.2788049280643463, "learning_rate": 0.00022594141904464266, "loss": 4.6585, "step": 30390 }, { "epoch": 0.6225298466201135, "grad_norm": 0.3022199273109436, "learning_rate": 0.00022575852997101334, "loss": 4.6664, "step": 30400 }, { "epoch": 0.6227346261749227, "grad_norm": 0.32720696926116943, "learning_rate": 0.00022557561899189213, "loss": 4.6251, "step": 30410 }, { "epoch": 0.6229394057297319, "grad_norm": 0.2773551344871521, "learning_rate": 0.00022539268626282978, "loss": 4.6615, "step": 30420 }, { "epoch": 0.6231441852845412, "grad_norm": 0.3055232763290405, "learning_rate": 0.00022520973193939533, "loss": 4.651, "step": 30430 }, { "epoch": 0.6233489648393504, "grad_norm": 0.3110123872756958, "learning_rate": 0.0002250267561771762, "loss": 4.6476, "step": 30440 }, { "epoch": 0.6235537443941597, "grad_norm": 0.2774081230163574, "learning_rate": 0.00022484375913177823, "loss": 4.6644, "step": 30450 }, { "epoch": 0.6237585239489689, "grad_norm": 0.291970819234848, "learning_rate": 0.00022466074095882507, "loss": 4.6617, "step": 30460 }, { "epoch": 0.6239633035037782, "grad_norm": 0.2835977077484131, "learning_rate": 0.0002244777018139586, "loss": 4.6475, "step": 30470 }, { "epoch": 0.6241680830585874, "grad_norm": 0.33001837134361267, "learning_rate": 0.0002242946418528383, "loss": 4.6724, "step": 30480 }, { "epoch": 0.6243728626133966, "grad_norm": 0.3122660219669342, "learning_rate": 0.00022411156123114157, "loss": 4.6418, "step": 30490 }, { "epoch": 0.6245776421682059, "grad_norm": 0.28016114234924316, "learning_rate": 0.00022392846010456334, "loss": 4.6693, "step": 30500 }, { "epoch": 0.6247824217230151, "grad_norm": 0.30062586069107056, "learning_rate": 0.00022374533862881574, "loss": 4.6688, "step": 30510 }, { "epoch": 0.6249872012778244, "grad_norm": 0.3291855752468109, "learning_rate": 0.00022356219695962852, "loss": 4.6277, "step": 30520 }, { "epoch": 0.6251919808326337, "grad_norm": 0.2959108054637909, "learning_rate": 0.00022337903525274837, "loss": 4.642, "step": 30530 }, { "epoch": 0.625396760387443, "grad_norm": 0.28458911180496216, "learning_rate": 0.00022319585366393927, "loss": 4.6513, "step": 30540 }, { "epoch": 0.6256015399422522, "grad_norm": 0.3079531192779541, "learning_rate": 0.00022301265234898185, "loss": 4.6694, "step": 30550 }, { "epoch": 0.6258063194970614, "grad_norm": 0.30680134892463684, "learning_rate": 0.0002228294314636736, "loss": 4.6353, "step": 30560 }, { "epoch": 0.6260110990518707, "grad_norm": 0.29334500432014465, "learning_rate": 0.00022264619116382875, "loss": 4.6928, "step": 30570 }, { "epoch": 0.6262158786066799, "grad_norm": 0.2923773527145386, "learning_rate": 0.00022246293160527793, "loss": 4.6535, "step": 30580 }, { "epoch": 0.6264206581614892, "grad_norm": 0.31806227564811707, "learning_rate": 0.00022227965294386822, "loss": 4.685, "step": 30590 }, { "epoch": 0.6266254377162984, "grad_norm": 0.285145103931427, "learning_rate": 0.0002220963553354628, "loss": 4.6419, "step": 30600 }, { "epoch": 0.6268302172711077, "grad_norm": 0.30672797560691833, "learning_rate": 0.0002219130389359412, "loss": 4.6372, "step": 30610 }, { "epoch": 0.6270349968259169, "grad_norm": 0.30848565697669983, "learning_rate": 0.00022172970390119874, "loss": 4.6253, "step": 30620 }, { "epoch": 0.6272397763807261, "grad_norm": 0.2922016680240631, "learning_rate": 0.0002215463503871466, "loss": 4.6615, "step": 30630 }, { "epoch": 0.6274445559355354, "grad_norm": 0.2957032322883606, "learning_rate": 0.0002213629785497119, "loss": 4.6535, "step": 30640 }, { "epoch": 0.6276493354903446, "grad_norm": 0.28136834502220154, "learning_rate": 0.00022117958854483703, "loss": 4.6324, "step": 30650 }, { "epoch": 0.6278541150451539, "grad_norm": 0.3000693917274475, "learning_rate": 0.00022099618052847997, "loss": 4.6512, "step": 30660 }, { "epoch": 0.6280588945999631, "grad_norm": 0.2786167562007904, "learning_rate": 0.00022081275465661403, "loss": 4.6648, "step": 30670 }, { "epoch": 0.6282636741547724, "grad_norm": 0.2739909291267395, "learning_rate": 0.00022062931108522772, "loss": 4.6412, "step": 30680 }, { "epoch": 0.6284684537095816, "grad_norm": 0.2978883683681488, "learning_rate": 0.00022044584997032453, "loss": 4.6781, "step": 30690 }, { "epoch": 0.6286732332643908, "grad_norm": 0.3022817373275757, "learning_rate": 0.00022026237146792292, "loss": 4.669, "step": 30700 }, { "epoch": 0.6288780128192001, "grad_norm": 0.33219870924949646, "learning_rate": 0.00022007887573405622, "loss": 4.6393, "step": 30710 }, { "epoch": 0.6290827923740093, "grad_norm": 0.29787546396255493, "learning_rate": 0.00021989536292477217, "loss": 4.6452, "step": 30720 }, { "epoch": 0.6292875719288187, "grad_norm": 0.3121150732040405, "learning_rate": 0.00021971183319613325, "loss": 4.6268, "step": 30730 }, { "epoch": 0.6294923514836279, "grad_norm": 0.30353349447250366, "learning_rate": 0.00021952828670421628, "loss": 4.6538, "step": 30740 }, { "epoch": 0.6296971310384372, "grad_norm": 0.28227660059928894, "learning_rate": 0.0002193447236051123, "loss": 4.6357, "step": 30750 }, { "epoch": 0.6299019105932464, "grad_norm": 0.31946688890457153, "learning_rate": 0.00021916114405492651, "loss": 4.6547, "step": 30760 }, { "epoch": 0.6301066901480556, "grad_norm": 0.2845208942890167, "learning_rate": 0.00021897754820977806, "loss": 4.6753, "step": 30770 }, { "epoch": 0.6303114697028649, "grad_norm": 0.2979859709739685, "learning_rate": 0.0002187939362258, "loss": 4.6488, "step": 30780 }, { "epoch": 0.6305162492576741, "grad_norm": 0.27406734228134155, "learning_rate": 0.00021861030825913905, "loss": 4.6294, "step": 30790 }, { "epoch": 0.6307210288124834, "grad_norm": 0.28868621587753296, "learning_rate": 0.0002184266644659555, "loss": 4.6798, "step": 30800 }, { "epoch": 0.6309258083672926, "grad_norm": 0.2830265164375305, "learning_rate": 0.00021824300500242324, "loss": 4.6774, "step": 30810 }, { "epoch": 0.6311305879221019, "grad_norm": 0.2919405400753021, "learning_rate": 0.00021805933002472946, "loss": 4.6297, "step": 30820 }, { "epoch": 0.6313353674769111, "grad_norm": 0.29010897874832153, "learning_rate": 0.00021787563968907433, "loss": 4.6662, "step": 30830 }, { "epoch": 0.6315401470317203, "grad_norm": 0.27563849091529846, "learning_rate": 0.0002176919341516714, "loss": 4.6353, "step": 30840 }, { "epoch": 0.6317449265865296, "grad_norm": 0.2839495539665222, "learning_rate": 0.00021750821356874685, "loss": 4.6538, "step": 30850 }, { "epoch": 0.6319497061413388, "grad_norm": 0.2820706069469452, "learning_rate": 0.0002173244780965398, "loss": 4.6616, "step": 30860 }, { "epoch": 0.6321544856961481, "grad_norm": 0.30122271180152893, "learning_rate": 0.00021714072789130205, "loss": 4.6481, "step": 30870 }, { "epoch": 0.6323592652509573, "grad_norm": 0.2987443208694458, "learning_rate": 0.00021695696310929794, "loss": 4.671, "step": 30880 }, { "epoch": 0.6325640448057666, "grad_norm": 0.32780030369758606, "learning_rate": 0.0002167731839068041, "loss": 4.6852, "step": 30890 }, { "epoch": 0.6327688243605758, "grad_norm": 0.3136660158634186, "learning_rate": 0.00021658939044010952, "loss": 4.6578, "step": 30900 }, { "epoch": 0.632973603915385, "grad_norm": 0.3008350431919098, "learning_rate": 0.0002164055828655152, "loss": 4.648, "step": 30910 }, { "epoch": 0.6331783834701943, "grad_norm": 0.2733580470085144, "learning_rate": 0.00021622176133933443, "loss": 4.684, "step": 30920 }, { "epoch": 0.6333831630250036, "grad_norm": 0.297954797744751, "learning_rate": 0.00021603792601789196, "loss": 4.6215, "step": 30930 }, { "epoch": 0.6335879425798129, "grad_norm": 0.3264220952987671, "learning_rate": 0.00021585407705752458, "loss": 4.6458, "step": 30940 }, { "epoch": 0.6337927221346221, "grad_norm": 0.3224235773086548, "learning_rate": 0.0002156702146145805, "loss": 4.6816, "step": 30950 }, { "epoch": 0.6339975016894314, "grad_norm": 0.356788694858551, "learning_rate": 0.00021548633884541953, "loss": 4.6502, "step": 30960 }, { "epoch": 0.6342022812442406, "grad_norm": 0.32108941674232483, "learning_rate": 0.00021530244990641272, "loss": 4.6607, "step": 30970 }, { "epoch": 0.6344070607990498, "grad_norm": 0.27904924750328064, "learning_rate": 0.00021511854795394243, "loss": 4.6723, "step": 30980 }, { "epoch": 0.6346118403538591, "grad_norm": 0.28624746203422546, "learning_rate": 0.00021493463314440186, "loss": 4.6719, "step": 30990 }, { "epoch": 0.6348166199086683, "grad_norm": 0.2936543822288513, "learning_rate": 0.00021475070563419535, "loss": 4.6705, "step": 31000 }, { "epoch": 0.6350213994634776, "grad_norm": 0.3128805458545685, "learning_rate": 0.000214566765579738, "loss": 4.6363, "step": 31010 }, { "epoch": 0.6352261790182868, "grad_norm": 0.2871984839439392, "learning_rate": 0.00021438281313745564, "loss": 4.6357, "step": 31020 }, { "epoch": 0.6354309585730961, "grad_norm": 0.30281564593315125, "learning_rate": 0.00021419884846378442, "loss": 4.6751, "step": 31030 }, { "epoch": 0.6356357381279053, "grad_norm": 0.310698926448822, "learning_rate": 0.00021401487171517105, "loss": 4.651, "step": 31040 }, { "epoch": 0.6358405176827145, "grad_norm": 0.28934794664382935, "learning_rate": 0.00021383088304807247, "loss": 4.6618, "step": 31050 }, { "epoch": 0.6360452972375238, "grad_norm": 0.30262723565101624, "learning_rate": 0.0002136468826189559, "loss": 4.6018, "step": 31060 }, { "epoch": 0.636250076792333, "grad_norm": 0.32639217376708984, "learning_rate": 0.0002134628705842983, "loss": 4.6411, "step": 31070 }, { "epoch": 0.6364548563471423, "grad_norm": 0.3025152087211609, "learning_rate": 0.0002132788471005867, "loss": 4.6209, "step": 31080 }, { "epoch": 0.6366596359019515, "grad_norm": 0.28025031089782715, "learning_rate": 0.00021309481232431777, "loss": 4.629, "step": 31090 }, { "epoch": 0.6368644154567608, "grad_norm": 0.5899404883384705, "learning_rate": 0.0002129107664119978, "loss": 4.6398, "step": 31100 }, { "epoch": 0.63706919501157, "grad_norm": 0.280439555644989, "learning_rate": 0.0002127267095201426, "loss": 4.6465, "step": 31110 }, { "epoch": 0.6372739745663792, "grad_norm": 0.2808097004890442, "learning_rate": 0.0002125426418052773, "loss": 4.6526, "step": 31120 }, { "epoch": 0.6374787541211886, "grad_norm": 0.32698628306388855, "learning_rate": 0.00021235856342393614, "loss": 4.6345, "step": 31130 }, { "epoch": 0.6376835336759978, "grad_norm": 0.28672513365745544, "learning_rate": 0.0002121744745326625, "loss": 4.6502, "step": 31140 }, { "epoch": 0.6378883132308071, "grad_norm": 0.2829684913158417, "learning_rate": 0.00021199037528800877, "loss": 4.6454, "step": 31150 }, { "epoch": 0.6380930927856163, "grad_norm": 0.2866877615451813, "learning_rate": 0.000211806265846536, "loss": 4.6324, "step": 31160 }, { "epoch": 0.6382978723404256, "grad_norm": 0.2783838212490082, "learning_rate": 0.00021162214636481422, "loss": 4.6448, "step": 31170 }, { "epoch": 0.6385026518952348, "grad_norm": 0.26938238739967346, "learning_rate": 0.00021143801699942142, "loss": 4.6041, "step": 31180 }, { "epoch": 0.638707431450044, "grad_norm": 0.35899850726127625, "learning_rate": 0.00021125387790694452, "loss": 4.6596, "step": 31190 }, { "epoch": 0.6389122110048533, "grad_norm": 0.3072209358215332, "learning_rate": 0.00021106972924397858, "loss": 4.6259, "step": 31200 }, { "epoch": 0.6391169905596625, "grad_norm": 0.3054019808769226, "learning_rate": 0.00021088557116712665, "loss": 4.6186, "step": 31210 }, { "epoch": 0.6393217701144718, "grad_norm": 0.30291202664375305, "learning_rate": 0.000210701403833, "loss": 4.638, "step": 31220 }, { "epoch": 0.639526549669281, "grad_norm": 0.31307119131088257, "learning_rate": 0.00021051722739821754, "loss": 4.6226, "step": 31230 }, { "epoch": 0.6397313292240903, "grad_norm": 0.2849477231502533, "learning_rate": 0.00021033304201940615, "loss": 4.6709, "step": 31240 }, { "epoch": 0.6399361087788995, "grad_norm": 0.29145699739456177, "learning_rate": 0.0002101488478532001, "loss": 4.6387, "step": 31250 }, { "epoch": 0.6401408883337087, "grad_norm": 0.5871053338050842, "learning_rate": 0.00020996464505624132, "loss": 4.614, "step": 31260 }, { "epoch": 0.640345667888518, "grad_norm": 0.28802430629730225, "learning_rate": 0.00020978043378517907, "loss": 4.6842, "step": 31270 }, { "epoch": 0.6405504474433272, "grad_norm": 0.3322353661060333, "learning_rate": 0.00020959621419666956, "loss": 4.6597, "step": 31280 }, { "epoch": 0.6407552269981365, "grad_norm": 0.2919965088367462, "learning_rate": 0.0002094119864473763, "loss": 4.6283, "step": 31290 }, { "epoch": 0.6409600065529457, "grad_norm": 0.29532623291015625, "learning_rate": 0.0002092277506939698, "loss": 4.6599, "step": 31300 }, { "epoch": 0.641164786107755, "grad_norm": 0.2982107698917389, "learning_rate": 0.00020904350709312727, "loss": 4.6808, "step": 31310 }, { "epoch": 0.6413695656625642, "grad_norm": 0.3330329954624176, "learning_rate": 0.00020885925580153259, "loss": 4.6049, "step": 31320 }, { "epoch": 0.6415743452173734, "grad_norm": 0.2956121265888214, "learning_rate": 0.00020867499697587612, "loss": 4.6494, "step": 31330 }, { "epoch": 0.6417791247721828, "grad_norm": 0.2796705961227417, "learning_rate": 0.00020849073077285475, "loss": 4.6404, "step": 31340 }, { "epoch": 0.641983904326992, "grad_norm": 0.2776346802711487, "learning_rate": 0.00020830645734917154, "loss": 4.6322, "step": 31350 }, { "epoch": 0.6421886838818013, "grad_norm": 0.2897033989429474, "learning_rate": 0.00020812217686153584, "loss": 4.6275, "step": 31360 }, { "epoch": 0.6423934634366105, "grad_norm": 0.29280951619148254, "learning_rate": 0.0002079378894666628, "loss": 4.6103, "step": 31370 }, { "epoch": 0.6425982429914198, "grad_norm": 0.2698272466659546, "learning_rate": 0.0002077535953212737, "loss": 4.6664, "step": 31380 }, { "epoch": 0.642803022546229, "grad_norm": 0.27823346853256226, "learning_rate": 0.00020756929458209524, "loss": 4.6506, "step": 31390 }, { "epoch": 0.6430078021010383, "grad_norm": 0.29688760638237, "learning_rate": 0.00020738498740585998, "loss": 4.691, "step": 31400 }, { "epoch": 0.6432125816558475, "grad_norm": 0.3246428668498993, "learning_rate": 0.00020720067394930594, "loss": 4.6097, "step": 31410 }, { "epoch": 0.6434173612106567, "grad_norm": 0.2977891266345978, "learning_rate": 0.00020701635436917633, "loss": 4.6249, "step": 31420 }, { "epoch": 0.643622140765466, "grad_norm": 0.28034883737564087, "learning_rate": 0.00020683202882221965, "loss": 4.629, "step": 31430 }, { "epoch": 0.6438269203202752, "grad_norm": 0.2818579375743866, "learning_rate": 0.00020664769746518951, "loss": 4.6321, "step": 31440 }, { "epoch": 0.6440316998750845, "grad_norm": 0.30722710490226746, "learning_rate": 0.00020646336045484449, "loss": 4.6416, "step": 31450 }, { "epoch": 0.6442364794298937, "grad_norm": 0.6170060634613037, "learning_rate": 0.0002062790179479478, "loss": 4.6672, "step": 31460 }, { "epoch": 0.644441258984703, "grad_norm": 0.3114047646522522, "learning_rate": 0.00020609467010126756, "loss": 4.6055, "step": 31470 }, { "epoch": 0.6446460385395122, "grad_norm": 0.32174360752105713, "learning_rate": 0.00020591031707157626, "loss": 4.63, "step": 31480 }, { "epoch": 0.6448508180943214, "grad_norm": 0.2874440848827362, "learning_rate": 0.00020572595901565075, "loss": 4.6365, "step": 31490 }, { "epoch": 0.6450555976491307, "grad_norm": 0.28202199935913086, "learning_rate": 0.00020554159609027236, "loss": 4.6228, "step": 31500 }, { "epoch": 0.6452603772039399, "grad_norm": 0.29981857538223267, "learning_rate": 0.0002053572284522265, "loss": 4.6578, "step": 31510 }, { "epoch": 0.6454651567587492, "grad_norm": 0.2835997939109802, "learning_rate": 0.00020517285625830248, "loss": 4.6091, "step": 31520 }, { "epoch": 0.6456699363135584, "grad_norm": 0.26887619495391846, "learning_rate": 0.00020498847966529354, "loss": 4.6203, "step": 31530 }, { "epoch": 0.6458747158683678, "grad_norm": 0.39226752519607544, "learning_rate": 0.00020480409882999668, "loss": 4.6366, "step": 31540 }, { "epoch": 0.646079495423177, "grad_norm": 0.28772279620170593, "learning_rate": 0.00020461971390921252, "loss": 4.6073, "step": 31550 }, { "epoch": 0.6462842749779862, "grad_norm": 0.2829737067222595, "learning_rate": 0.00020443532505974514, "loss": 4.6557, "step": 31560 }, { "epoch": 0.6464890545327955, "grad_norm": 0.28044554591178894, "learning_rate": 0.00020425093243840189, "loss": 4.6202, "step": 31570 }, { "epoch": 0.6466938340876047, "grad_norm": 0.2747333347797394, "learning_rate": 0.00020406653620199338, "loss": 4.6379, "step": 31580 }, { "epoch": 0.646898613642414, "grad_norm": 0.30595648288726807, "learning_rate": 0.00020388213650733344, "loss": 4.6443, "step": 31590 }, { "epoch": 0.6471033931972232, "grad_norm": 0.27070680260658264, "learning_rate": 0.00020369773351123856, "loss": 4.6596, "step": 31600 }, { "epoch": 0.6473081727520325, "grad_norm": 0.2805611789226532, "learning_rate": 0.00020351332737052823, "loss": 4.6697, "step": 31610 }, { "epoch": 0.6475129523068417, "grad_norm": 0.3111336827278137, "learning_rate": 0.00020332891824202461, "loss": 4.6382, "step": 31620 }, { "epoch": 0.6477177318616509, "grad_norm": 0.28219151496887207, "learning_rate": 0.00020314450628255228, "loss": 4.6541, "step": 31630 }, { "epoch": 0.6479225114164602, "grad_norm": 0.28346508741378784, "learning_rate": 0.00020296009164893838, "loss": 4.5858, "step": 31640 }, { "epoch": 0.6481272909712694, "grad_norm": 0.28758737444877625, "learning_rate": 0.00020277567449801222, "loss": 4.6349, "step": 31650 }, { "epoch": 0.6483320705260787, "grad_norm": 0.3404221534729004, "learning_rate": 0.00020259125498660533, "loss": 4.648, "step": 31660 }, { "epoch": 0.6485368500808879, "grad_norm": 0.28170305490493774, "learning_rate": 0.00020240683327155104, "loss": 4.6316, "step": 31670 }, { "epoch": 0.6487416296356971, "grad_norm": 0.28445863723754883, "learning_rate": 0.00020222240950968482, "loss": 4.6254, "step": 31680 }, { "epoch": 0.6489464091905064, "grad_norm": 0.27748405933380127, "learning_rate": 0.00020203798385784384, "loss": 4.6671, "step": 31690 }, { "epoch": 0.6491511887453156, "grad_norm": 0.3047666847705841, "learning_rate": 0.00020185355647286678, "loss": 4.6289, "step": 31700 }, { "epoch": 0.6493559683001249, "grad_norm": 0.29950475692749023, "learning_rate": 0.0002016691275115937, "loss": 4.681, "step": 31710 }, { "epoch": 0.6495607478549341, "grad_norm": 0.3117060959339142, "learning_rate": 0.0002014846971308662, "loss": 4.6548, "step": 31720 }, { "epoch": 0.6497655274097434, "grad_norm": 0.28721022605895996, "learning_rate": 0.00020130026548752708, "loss": 4.6387, "step": 31730 }, { "epoch": 0.6499703069645527, "grad_norm": 0.28561025857925415, "learning_rate": 0.00020111583273842002, "loss": 4.5913, "step": 31740 }, { "epoch": 0.650175086519362, "grad_norm": 0.2865810990333557, "learning_rate": 0.00020093139904038987, "loss": 4.6432, "step": 31750 }, { "epoch": 0.6503798660741712, "grad_norm": 0.27878129482269287, "learning_rate": 0.00020074696455028214, "loss": 4.6465, "step": 31760 }, { "epoch": 0.6505846456289804, "grad_norm": 0.28597012162208557, "learning_rate": 0.00020056252942494304, "loss": 4.6175, "step": 31770 }, { "epoch": 0.6507894251837897, "grad_norm": 0.29955506324768066, "learning_rate": 0.00020037809382121937, "loss": 4.6587, "step": 31780 }, { "epoch": 0.6509942047385989, "grad_norm": 0.2863275110721588, "learning_rate": 0.00020019365789595826, "loss": 4.6116, "step": 31790 }, { "epoch": 0.6511989842934082, "grad_norm": 0.2816847860813141, "learning_rate": 0.00020000922180600728, "loss": 4.6215, "step": 31800 }, { "epoch": 0.6514037638482174, "grad_norm": 0.3599647581577301, "learning_rate": 0.00019982478570821383, "loss": 4.6497, "step": 31810 }, { "epoch": 0.6516085434030267, "grad_norm": 0.34417176246643066, "learning_rate": 0.00019964034975942556, "loss": 4.6141, "step": 31820 }, { "epoch": 0.6518133229578359, "grad_norm": 0.28384891152381897, "learning_rate": 0.00019945591411649, "loss": 4.6682, "step": 31830 }, { "epoch": 0.6520181025126451, "grad_norm": 0.2900743782520294, "learning_rate": 0.00019927147893625434, "loss": 4.6286, "step": 31840 }, { "epoch": 0.6522228820674544, "grad_norm": 0.2789555490016937, "learning_rate": 0.00019908704437556548, "loss": 4.6383, "step": 31850 }, { "epoch": 0.6524276616222636, "grad_norm": 0.27704811096191406, "learning_rate": 0.00019890261059126945, "loss": 4.6133, "step": 31860 }, { "epoch": 0.6526324411770729, "grad_norm": 0.31395819783210754, "learning_rate": 0.00019871817774021205, "loss": 4.6613, "step": 31870 }, { "epoch": 0.6528372207318821, "grad_norm": 0.27373069524765015, "learning_rate": 0.00019853374597923813, "loss": 4.6638, "step": 31880 }, { "epoch": 0.6530420002866913, "grad_norm": 0.32742395997047424, "learning_rate": 0.00019834931546519155, "loss": 4.6096, "step": 31890 }, { "epoch": 0.6532467798415006, "grad_norm": 0.33728116750717163, "learning_rate": 0.00019816488635491516, "loss": 4.63, "step": 31900 }, { "epoch": 0.6534515593963098, "grad_norm": 0.32113081216812134, "learning_rate": 0.00019798045880525057, "loss": 4.6654, "step": 31910 }, { "epoch": 0.6536563389511191, "grad_norm": 0.4503423571586609, "learning_rate": 0.00019779603297303826, "loss": 4.6327, "step": 31920 }, { "epoch": 0.6538611185059283, "grad_norm": 0.2936616837978363, "learning_rate": 0.0001976116090151169, "loss": 4.6077, "step": 31930 }, { "epoch": 0.6540658980607376, "grad_norm": 0.3016274869441986, "learning_rate": 0.00019742718708832386, "loss": 4.5875, "step": 31940 }, { "epoch": 0.6542706776155469, "grad_norm": 0.2945941388607025, "learning_rate": 0.00019724276734949463, "loss": 4.6379, "step": 31950 }, { "epoch": 0.6544754571703562, "grad_norm": 0.2721746861934662, "learning_rate": 0.00019705834995546292, "loss": 4.6521, "step": 31960 }, { "epoch": 0.6546802367251654, "grad_norm": 0.2935369610786438, "learning_rate": 0.0001968739350630604, "loss": 4.6509, "step": 31970 }, { "epoch": 0.6548850162799746, "grad_norm": 0.31681233644485474, "learning_rate": 0.0001966895228291167, "loss": 4.6515, "step": 31980 }, { "epoch": 0.6550897958347839, "grad_norm": 0.31349703669548035, "learning_rate": 0.0001965051134104591, "loss": 4.6517, "step": 31990 }, { "epoch": 0.6552945753895931, "grad_norm": 0.2852419912815094, "learning_rate": 0.0001963207069639124, "loss": 4.6436, "step": 32000 }, { "epoch": 0.6552945753895931, "eval_loss": 4.642320156097412, "eval_runtime": 4.3846, "eval_samples_per_second": 265.933, "eval_steps_per_second": 33.299, "step": 32000 }, { "epoch": 0.6554993549444024, "grad_norm": 0.3128270208835602, "learning_rate": 0.00019613630364629907, "loss": 4.629, "step": 32010 }, { "epoch": 0.6557041344992116, "grad_norm": 0.30352678894996643, "learning_rate": 0.00019595190361443876, "loss": 4.6378, "step": 32020 }, { "epoch": 0.6559089140540209, "grad_norm": 0.290731281042099, "learning_rate": 0.0001957675070251485, "loss": 4.6158, "step": 32030 }, { "epoch": 0.6561136936088301, "grad_norm": 0.3104674518108368, "learning_rate": 0.0001955831140352422, "loss": 4.6631, "step": 32040 }, { "epoch": 0.6563184731636393, "grad_norm": 0.2891213893890381, "learning_rate": 0.00019539872480153078, "loss": 4.6223, "step": 32050 }, { "epoch": 0.6565232527184486, "grad_norm": 0.2789028286933899, "learning_rate": 0.0001952143394808221, "loss": 4.6225, "step": 32060 }, { "epoch": 0.6567280322732578, "grad_norm": 0.2898847758769989, "learning_rate": 0.00019502995822992043, "loss": 4.6441, "step": 32070 }, { "epoch": 0.6569328118280671, "grad_norm": 0.2967241406440735, "learning_rate": 0.00019484558120562675, "loss": 4.6284, "step": 32080 }, { "epoch": 0.6571375913828763, "grad_norm": 0.28619837760925293, "learning_rate": 0.00019466120856473848, "loss": 4.6193, "step": 32090 }, { "epoch": 0.6573423709376855, "grad_norm": 0.29414695501327515, "learning_rate": 0.0001944768404640492, "loss": 4.6234, "step": 32100 }, { "epoch": 0.6575471504924948, "grad_norm": 0.28031256794929504, "learning_rate": 0.00019429247706034865, "loss": 4.6213, "step": 32110 }, { "epoch": 0.657751930047304, "grad_norm": 0.3011209964752197, "learning_rate": 0.00019410811851042269, "loss": 4.6298, "step": 32120 }, { "epoch": 0.6579567096021133, "grad_norm": 0.3096432387828827, "learning_rate": 0.00019392376497105293, "loss": 4.6453, "step": 32130 }, { "epoch": 0.6581614891569225, "grad_norm": 0.28561797738075256, "learning_rate": 0.00019373941659901674, "loss": 4.6063, "step": 32140 }, { "epoch": 0.6583662687117319, "grad_norm": 0.3114117383956909, "learning_rate": 0.00019355507355108705, "loss": 4.5976, "step": 32150 }, { "epoch": 0.6585710482665411, "grad_norm": 0.2873578667640686, "learning_rate": 0.0001933707359840324, "loss": 4.6026, "step": 32160 }, { "epoch": 0.6587758278213504, "grad_norm": 0.2891336679458618, "learning_rate": 0.00019318640405461654, "loss": 4.6773, "step": 32170 }, { "epoch": 0.6589806073761596, "grad_norm": 0.3008500933647156, "learning_rate": 0.00019300207791959857, "loss": 4.6259, "step": 32180 }, { "epoch": 0.6591853869309688, "grad_norm": 0.3336765468120575, "learning_rate": 0.00019281775773573247, "loss": 4.6412, "step": 32190 }, { "epoch": 0.6593901664857781, "grad_norm": 0.2977740466594696, "learning_rate": 0.00019263344365976737, "loss": 4.6599, "step": 32200 }, { "epoch": 0.6595949460405873, "grad_norm": 0.30444103479385376, "learning_rate": 0.00019244913584844694, "loss": 4.652, "step": 32210 }, { "epoch": 0.6597997255953966, "grad_norm": 0.34079089760780334, "learning_rate": 0.00019226483445850974, "loss": 4.656, "step": 32220 }, { "epoch": 0.6600045051502058, "grad_norm": 0.28978464007377625, "learning_rate": 0.00019208053964668887, "loss": 4.6458, "step": 32230 }, { "epoch": 0.660209284705015, "grad_norm": 0.28713274002075195, "learning_rate": 0.0001918962515697117, "loss": 4.6382, "step": 32240 }, { "epoch": 0.6604140642598243, "grad_norm": 0.3080507516860962, "learning_rate": 0.00019171197038429994, "loss": 4.6429, "step": 32250 }, { "epoch": 0.6606188438146335, "grad_norm": 0.3299945592880249, "learning_rate": 0.00019152769624716947, "loss": 4.6411, "step": 32260 }, { "epoch": 0.6608236233694428, "grad_norm": 0.3015105426311493, "learning_rate": 0.0001913434293150302, "loss": 4.6008, "step": 32270 }, { "epoch": 0.661028402924252, "grad_norm": 0.28456807136535645, "learning_rate": 0.00019115916974458584, "loss": 4.6208, "step": 32280 }, { "epoch": 0.6612331824790613, "grad_norm": 0.4723943769931793, "learning_rate": 0.00019097491769253373, "loss": 4.6645, "step": 32290 }, { "epoch": 0.6614379620338705, "grad_norm": 0.2876240611076355, "learning_rate": 0.00019079067331556507, "loss": 4.6347, "step": 32300 }, { "epoch": 0.6616427415886798, "grad_norm": 0.40209028124809265, "learning_rate": 0.00019060643677036435, "loss": 4.6263, "step": 32310 }, { "epoch": 0.661847521143489, "grad_norm": 0.2876284122467041, "learning_rate": 0.0001904222082136096, "loss": 4.6409, "step": 32320 }, { "epoch": 0.6620523006982982, "grad_norm": 0.3027927875518799, "learning_rate": 0.00019023798780197173, "loss": 4.6446, "step": 32330 }, { "epoch": 0.6622570802531075, "grad_norm": 0.2995139956474304, "learning_rate": 0.0001900537756921151, "loss": 4.698, "step": 32340 }, { "epoch": 0.6624618598079168, "grad_norm": 0.28222161531448364, "learning_rate": 0.00018986957204069658, "loss": 4.6691, "step": 32350 }, { "epoch": 0.6626666393627261, "grad_norm": 0.2969187796115875, "learning_rate": 0.0001896853770043663, "loss": 4.6232, "step": 32360 }, { "epoch": 0.6628714189175353, "grad_norm": 0.2937006950378418, "learning_rate": 0.00018950119073976674, "loss": 4.6295, "step": 32370 }, { "epoch": 0.6630761984723446, "grad_norm": 0.3069112300872803, "learning_rate": 0.00018931701340353307, "loss": 4.6248, "step": 32380 }, { "epoch": 0.6632809780271538, "grad_norm": 0.2813648581504822, "learning_rate": 0.0001891328451522928, "loss": 4.6259, "step": 32390 }, { "epoch": 0.663485757581963, "grad_norm": 0.31179484724998474, "learning_rate": 0.00018894868614266578, "loss": 4.6409, "step": 32400 }, { "epoch": 0.6636905371367723, "grad_norm": 0.2886295020580292, "learning_rate": 0.000188764536531264, "loss": 4.6497, "step": 32410 }, { "epoch": 0.6638953166915815, "grad_norm": 0.28318530321121216, "learning_rate": 0.0001885803964746913, "loss": 4.6297, "step": 32420 }, { "epoch": 0.6641000962463908, "grad_norm": 0.28957971930503845, "learning_rate": 0.0001883962661295436, "loss": 4.6455, "step": 32430 }, { "epoch": 0.6643048758012, "grad_norm": 0.32726556062698364, "learning_rate": 0.00018821214565240842, "loss": 4.6106, "step": 32440 }, { "epoch": 0.6645096553560093, "grad_norm": 0.281751424074173, "learning_rate": 0.000188028035199865, "loss": 4.6541, "step": 32450 }, { "epoch": 0.6647144349108185, "grad_norm": 0.30658701062202454, "learning_rate": 0.000187843934928484, "loss": 4.659, "step": 32460 }, { "epoch": 0.6649192144656277, "grad_norm": 0.3314231038093567, "learning_rate": 0.00018765984499482735, "loss": 4.6528, "step": 32470 }, { "epoch": 0.665123994020437, "grad_norm": 0.2996695041656494, "learning_rate": 0.00018747576555544842, "loss": 4.6306, "step": 32480 }, { "epoch": 0.6653287735752462, "grad_norm": 0.30856266617774963, "learning_rate": 0.00018729169676689127, "loss": 4.6478, "step": 32490 }, { "epoch": 0.6655335531300555, "grad_norm": 0.2853170931339264, "learning_rate": 0.00018710763878569128, "loss": 4.6279, "step": 32500 }, { "epoch": 0.6657383326848647, "grad_norm": 0.31832218170166016, "learning_rate": 0.0001869235917683744, "loss": 4.659, "step": 32510 }, { "epoch": 0.665943112239674, "grad_norm": 0.3200172185897827, "learning_rate": 0.00018673955587145743, "loss": 4.6466, "step": 32520 }, { "epoch": 0.6661478917944832, "grad_norm": 0.32725638151168823, "learning_rate": 0.00018655553125144757, "loss": 4.5589, "step": 32530 }, { "epoch": 0.6663526713492924, "grad_norm": 0.32515549659729004, "learning_rate": 0.00018637151806484247, "loss": 4.6269, "step": 32540 }, { "epoch": 0.6665574509041018, "grad_norm": 0.3151293396949768, "learning_rate": 0.0001861875164681302, "loss": 4.6074, "step": 32550 }, { "epoch": 0.666762230458911, "grad_norm": 0.28322288393974304, "learning_rate": 0.00018600352661778867, "loss": 4.6232, "step": 32560 }, { "epoch": 0.6669670100137203, "grad_norm": 0.30027103424072266, "learning_rate": 0.00018581954867028612, "loss": 4.6656, "step": 32570 }, { "epoch": 0.6671717895685295, "grad_norm": 0.35422566533088684, "learning_rate": 0.00018563558278208035, "loss": 4.629, "step": 32580 }, { "epoch": 0.6673765691233388, "grad_norm": 0.30949604511260986, "learning_rate": 0.00018545162910961924, "loss": 4.6407, "step": 32590 }, { "epoch": 0.667581348678148, "grad_norm": 0.29101812839508057, "learning_rate": 0.00018526768780934005, "loss": 4.6443, "step": 32600 }, { "epoch": 0.6677861282329572, "grad_norm": 0.3094857931137085, "learning_rate": 0.00018508375903766963, "loss": 4.6171, "step": 32610 }, { "epoch": 0.6679909077877665, "grad_norm": 0.2886817157268524, "learning_rate": 0.00018489984295102417, "loss": 4.6539, "step": 32620 }, { "epoch": 0.6681956873425757, "grad_norm": 0.29238563776016235, "learning_rate": 0.00018471593970580892, "loss": 4.6522, "step": 32630 }, { "epoch": 0.668400466897385, "grad_norm": 0.307925283908844, "learning_rate": 0.00018453204945841835, "loss": 4.6576, "step": 32640 }, { "epoch": 0.6686052464521942, "grad_norm": 0.3067364990711212, "learning_rate": 0.00018434817236523589, "loss": 4.677, "step": 32650 }, { "epoch": 0.6688100260070035, "grad_norm": 0.3166733384132385, "learning_rate": 0.0001841643085826338, "loss": 4.6513, "step": 32660 }, { "epoch": 0.6690148055618127, "grad_norm": 0.30757683515548706, "learning_rate": 0.00018398045826697286, "loss": 4.6589, "step": 32670 }, { "epoch": 0.6692195851166219, "grad_norm": 0.3488108515739441, "learning_rate": 0.00018379662157460254, "loss": 4.5935, "step": 32680 }, { "epoch": 0.6694243646714312, "grad_norm": 0.29875683784484863, "learning_rate": 0.00018361279866186078, "loss": 4.6348, "step": 32690 }, { "epoch": 0.6696291442262404, "grad_norm": 0.310260146856308, "learning_rate": 0.00018342898968507355, "loss": 4.6554, "step": 32700 }, { "epoch": 0.6698339237810497, "grad_norm": 0.3225691318511963, "learning_rate": 0.00018324519480055528, "loss": 4.6241, "step": 32710 }, { "epoch": 0.6700387033358589, "grad_norm": 0.31346482038497925, "learning_rate": 0.00018306141416460814, "loss": 4.6227, "step": 32720 }, { "epoch": 0.6702434828906682, "grad_norm": 0.34079501032829285, "learning_rate": 0.00018287764793352236, "loss": 4.6266, "step": 32730 }, { "epoch": 0.6704482624454774, "grad_norm": 0.2941148579120636, "learning_rate": 0.00018269389626357588, "loss": 4.6512, "step": 32740 }, { "epoch": 0.6706530420002866, "grad_norm": 0.32868993282318115, "learning_rate": 0.00018251015931103427, "loss": 4.6312, "step": 32750 }, { "epoch": 0.670857821555096, "grad_norm": 0.2995958924293518, "learning_rate": 0.00018232643723215048, "loss": 4.6151, "step": 32760 }, { "epoch": 0.6710626011099052, "grad_norm": 0.3182849586009979, "learning_rate": 0.00018214273018316498, "loss": 4.652, "step": 32770 }, { "epoch": 0.6712673806647145, "grad_norm": 0.28864336013793945, "learning_rate": 0.00018195903832030525, "loss": 4.6374, "step": 32780 }, { "epoch": 0.6714721602195237, "grad_norm": 0.2957040071487427, "learning_rate": 0.00018177536179978608, "loss": 4.6219, "step": 32790 }, { "epoch": 0.671676939774333, "grad_norm": 0.31795555353164673, "learning_rate": 0.00018159170077780904, "loss": 4.624, "step": 32800 }, { "epoch": 0.6718817193291422, "grad_norm": 0.29311829805374146, "learning_rate": 0.00018140805541056262, "loss": 4.6721, "step": 32810 }, { "epoch": 0.6720864988839514, "grad_norm": 0.28791841864585876, "learning_rate": 0.00018122442585422193, "loss": 4.6213, "step": 32820 }, { "epoch": 0.6722912784387607, "grad_norm": 0.3204101026058197, "learning_rate": 0.0001810408122649488, "loss": 4.6312, "step": 32830 }, { "epoch": 0.6724960579935699, "grad_norm": 0.3177260756492615, "learning_rate": 0.00018085721479889114, "loss": 4.599, "step": 32840 }, { "epoch": 0.6727008375483792, "grad_norm": 0.3134481906890869, "learning_rate": 0.00018067363361218348, "loss": 4.6367, "step": 32850 }, { "epoch": 0.6729056171031884, "grad_norm": 0.31451141834259033, "learning_rate": 0.00018049006886094638, "loss": 4.6799, "step": 32860 }, { "epoch": 0.6731103966579977, "grad_norm": 0.31027689576148987, "learning_rate": 0.00018030652070128635, "loss": 4.666, "step": 32870 }, { "epoch": 0.6733151762128069, "grad_norm": 0.279913991689682, "learning_rate": 0.00018012298928929594, "loss": 4.6256, "step": 32880 }, { "epoch": 0.6735199557676161, "grad_norm": 0.2795368731021881, "learning_rate": 0.0001799394747810534, "loss": 4.6507, "step": 32890 }, { "epoch": 0.6737247353224254, "grad_norm": 0.3519022464752197, "learning_rate": 0.00017975597733262252, "loss": 4.6403, "step": 32900 }, { "epoch": 0.6739295148772346, "grad_norm": 0.3112083971500397, "learning_rate": 0.0001795724971000527, "loss": 4.6478, "step": 32910 }, { "epoch": 0.6741342944320439, "grad_norm": 0.281672865152359, "learning_rate": 0.0001793890342393786, "loss": 4.671, "step": 32920 }, { "epoch": 0.6743390739868531, "grad_norm": 0.3238334357738495, "learning_rate": 0.00017920558890662015, "loss": 4.6135, "step": 32930 }, { "epoch": 0.6745438535416624, "grad_norm": 0.29171717166900635, "learning_rate": 0.00017902216125778244, "loss": 4.6022, "step": 32940 }, { "epoch": 0.6747486330964716, "grad_norm": 0.31077757477760315, "learning_rate": 0.00017883875144885546, "loss": 4.6145, "step": 32950 }, { "epoch": 0.674953412651281, "grad_norm": 0.2973254323005676, "learning_rate": 0.000178655359635814, "loss": 4.6681, "step": 32960 }, { "epoch": 0.6751581922060902, "grad_norm": 0.292275607585907, "learning_rate": 0.0001784719859746176, "loss": 4.6316, "step": 32970 }, { "epoch": 0.6753629717608994, "grad_norm": 0.3061160147190094, "learning_rate": 0.00017828863062121028, "loss": 4.6434, "step": 32980 }, { "epoch": 0.6755677513157087, "grad_norm": 0.31540408730506897, "learning_rate": 0.00017810529373152057, "loss": 4.63, "step": 32990 }, { "epoch": 0.6757725308705179, "grad_norm": 0.31090953946113586, "learning_rate": 0.00017792197546146135, "loss": 4.6172, "step": 33000 }, { "epoch": 0.6759773104253272, "grad_norm": 0.28982412815093994, "learning_rate": 0.00017773867596692947, "loss": 4.6441, "step": 33010 }, { "epoch": 0.6761820899801364, "grad_norm": 0.2939661145210266, "learning_rate": 0.00017755539540380603, "loss": 4.6386, "step": 33020 }, { "epoch": 0.6763868695349456, "grad_norm": 0.28937020897865295, "learning_rate": 0.0001773721339279559, "loss": 4.6325, "step": 33030 }, { "epoch": 0.6765916490897549, "grad_norm": 0.2975783944129944, "learning_rate": 0.00017718889169522788, "loss": 4.6562, "step": 33040 }, { "epoch": 0.6767964286445641, "grad_norm": 0.294565349817276, "learning_rate": 0.0001770056688614541, "loss": 4.6427, "step": 33050 }, { "epoch": 0.6770012081993734, "grad_norm": 0.29006120562553406, "learning_rate": 0.0001768224655824504, "loss": 4.6094, "step": 33060 }, { "epoch": 0.6772059877541826, "grad_norm": 0.2915450930595398, "learning_rate": 0.00017663928201401598, "loss": 4.6345, "step": 33070 }, { "epoch": 0.6774107673089919, "grad_norm": 0.38605785369873047, "learning_rate": 0.00017645611831193332, "loss": 4.6323, "step": 33080 }, { "epoch": 0.6776155468638011, "grad_norm": 0.30939891934394836, "learning_rate": 0.00017627297463196793, "loss": 4.6566, "step": 33090 }, { "epoch": 0.6778203264186103, "grad_norm": 0.3041756749153137, "learning_rate": 0.00017608985112986823, "loss": 4.625, "step": 33100 }, { "epoch": 0.6780251059734196, "grad_norm": 0.31380847096443176, "learning_rate": 0.00017590674796136576, "loss": 4.6472, "step": 33110 }, { "epoch": 0.6782298855282288, "grad_norm": 0.309291809797287, "learning_rate": 0.0001757236652821743, "loss": 4.6401, "step": 33120 }, { "epoch": 0.6784346650830381, "grad_norm": 0.30570313334465027, "learning_rate": 0.00017554060324799059, "loss": 4.6379, "step": 33130 }, { "epoch": 0.6786394446378473, "grad_norm": 0.6258738040924072, "learning_rate": 0.0001753575620144938, "loss": 4.6244, "step": 33140 }, { "epoch": 0.6788442241926566, "grad_norm": 0.2956477105617523, "learning_rate": 0.00017517454173734511, "loss": 4.604, "step": 33150 }, { "epoch": 0.6790490037474659, "grad_norm": 0.28767263889312744, "learning_rate": 0.0001749915425721882, "loss": 4.6152, "step": 33160 }, { "epoch": 0.6792537833022751, "grad_norm": 0.3020152151584625, "learning_rate": 0.00017480856467464863, "loss": 4.625, "step": 33170 }, { "epoch": 0.6794585628570844, "grad_norm": 0.3043646216392517, "learning_rate": 0.00017462560820033404, "loss": 4.6266, "step": 33180 }, { "epoch": 0.6796633424118936, "grad_norm": 0.32564735412597656, "learning_rate": 0.00017444267330483357, "loss": 4.6177, "step": 33190 }, { "epoch": 0.6798681219667029, "grad_norm": 0.3014850914478302, "learning_rate": 0.00017425976014371814, "loss": 4.6599, "step": 33200 }, { "epoch": 0.6800729015215121, "grad_norm": 0.3051404654979706, "learning_rate": 0.00017407686887254033, "loss": 4.6245, "step": 33210 }, { "epoch": 0.6802776810763214, "grad_norm": 0.2853024899959564, "learning_rate": 0.00017389399964683392, "loss": 4.6068, "step": 33220 }, { "epoch": 0.6804824606311306, "grad_norm": 0.28191468119621277, "learning_rate": 0.0001737111526221141, "loss": 4.616, "step": 33230 }, { "epoch": 0.6806872401859398, "grad_norm": 0.2870134115219116, "learning_rate": 0.00017352832795387694, "loss": 4.6341, "step": 33240 }, { "epoch": 0.6808920197407491, "grad_norm": 0.3020907938480377, "learning_rate": 0.0001733455257975998, "loss": 4.5707, "step": 33250 }, { "epoch": 0.6810967992955583, "grad_norm": 0.31421855092048645, "learning_rate": 0.00017316274630874064, "loss": 4.6089, "step": 33260 }, { "epoch": 0.6813015788503676, "grad_norm": 0.30290868878364563, "learning_rate": 0.0001729799896427382, "loss": 4.6041, "step": 33270 }, { "epoch": 0.6815063584051768, "grad_norm": 0.28204119205474854, "learning_rate": 0.00017279725595501202, "loss": 4.6333, "step": 33280 }, { "epoch": 0.6817111379599861, "grad_norm": 0.2897128462791443, "learning_rate": 0.00017261454540096178, "loss": 4.6101, "step": 33290 }, { "epoch": 0.6819159175147953, "grad_norm": 0.3139142096042633, "learning_rate": 0.0001724318581359677, "loss": 4.6587, "step": 33300 }, { "epoch": 0.6821206970696045, "grad_norm": 0.2964023947715759, "learning_rate": 0.00017224919431539014, "loss": 4.6474, "step": 33310 }, { "epoch": 0.6823254766244138, "grad_norm": 0.31991204619407654, "learning_rate": 0.0001720665540945696, "loss": 4.6123, "step": 33320 }, { "epoch": 0.682530256179223, "grad_norm": 0.30030396580696106, "learning_rate": 0.0001718839376288263, "loss": 4.6197, "step": 33330 }, { "epoch": 0.6827350357340323, "grad_norm": 0.2920781672000885, "learning_rate": 0.0001717013450734604, "loss": 4.6595, "step": 33340 }, { "epoch": 0.6829398152888415, "grad_norm": 0.2850472927093506, "learning_rate": 0.00017151877658375177, "loss": 4.6699, "step": 33350 }, { "epoch": 0.6831445948436509, "grad_norm": 0.30785560607910156, "learning_rate": 0.0001713362323149597, "loss": 4.6184, "step": 33360 }, { "epoch": 0.6833493743984601, "grad_norm": 0.2960036098957062, "learning_rate": 0.00017115371242232302, "loss": 4.6207, "step": 33370 }, { "epoch": 0.6835541539532694, "grad_norm": 0.32618141174316406, "learning_rate": 0.00017097121706105965, "loss": 4.6288, "step": 33380 }, { "epoch": 0.6837589335080786, "grad_norm": 0.3154570460319519, "learning_rate": 0.00017078874638636692, "loss": 4.5996, "step": 33390 }, { "epoch": 0.6839637130628878, "grad_norm": 0.29100146889686584, "learning_rate": 0.00017060630055342076, "loss": 4.6553, "step": 33400 }, { "epoch": 0.6841684926176971, "grad_norm": 0.29402703046798706, "learning_rate": 0.00017042387971737632, "loss": 4.6209, "step": 33410 }, { "epoch": 0.6843732721725063, "grad_norm": 0.2883686423301697, "learning_rate": 0.00017024148403336738, "loss": 4.6541, "step": 33420 }, { "epoch": 0.6845780517273156, "grad_norm": 0.3109899163246155, "learning_rate": 0.00017005911365650643, "loss": 4.6319, "step": 33430 }, { "epoch": 0.6847828312821248, "grad_norm": 0.28234413266181946, "learning_rate": 0.00016987676874188415, "loss": 4.6673, "step": 33440 }, { "epoch": 0.684987610836934, "grad_norm": 0.32163006067276, "learning_rate": 0.00016969444944456985, "loss": 4.5921, "step": 33450 }, { "epoch": 0.6851923903917433, "grad_norm": 0.28209173679351807, "learning_rate": 0.00016951215591961104, "loss": 4.5846, "step": 33460 }, { "epoch": 0.6853971699465525, "grad_norm": 0.2900984585285187, "learning_rate": 0.00016932988832203306, "loss": 4.6055, "step": 33470 }, { "epoch": 0.6856019495013618, "grad_norm": 0.32391712069511414, "learning_rate": 0.00016914764680683953, "loss": 4.63, "step": 33480 }, { "epoch": 0.685806729056171, "grad_norm": 0.28276559710502625, "learning_rate": 0.0001689654315290116, "loss": 4.6479, "step": 33490 }, { "epoch": 0.6860115086109803, "grad_norm": 0.31503188610076904, "learning_rate": 0.00016878324264350833, "loss": 4.6333, "step": 33500 }, { "epoch": 0.6862162881657895, "grad_norm": 0.2737639248371124, "learning_rate": 0.00016860108030526623, "loss": 4.6192, "step": 33510 }, { "epoch": 0.6864210677205987, "grad_norm": 0.3093930184841156, "learning_rate": 0.00016841894466919924, "loss": 4.6159, "step": 33520 }, { "epoch": 0.686625847275408, "grad_norm": 0.2998504936695099, "learning_rate": 0.00016823683589019869, "loss": 4.6206, "step": 33530 }, { "epoch": 0.6868306268302172, "grad_norm": 0.30359306931495667, "learning_rate": 0.00016805475412313278, "loss": 4.629, "step": 33540 }, { "epoch": 0.6870354063850265, "grad_norm": 0.28318941593170166, "learning_rate": 0.00016787269952284704, "loss": 4.5992, "step": 33550 }, { "epoch": 0.6872401859398357, "grad_norm": 0.29930534958839417, "learning_rate": 0.00016769067224416386, "loss": 4.626, "step": 33560 }, { "epoch": 0.6874449654946451, "grad_norm": 0.28812313079833984, "learning_rate": 0.0001675086724418823, "loss": 4.6257, "step": 33570 }, { "epoch": 0.6876497450494543, "grad_norm": 0.2946549952030182, "learning_rate": 0.00016732670027077803, "loss": 4.6182, "step": 33580 }, { "epoch": 0.6878545246042636, "grad_norm": 0.28934773802757263, "learning_rate": 0.00016714475588560333, "loss": 4.6001, "step": 33590 }, { "epoch": 0.6880593041590728, "grad_norm": 0.2975788712501526, "learning_rate": 0.00016696283944108684, "loss": 4.6209, "step": 33600 }, { "epoch": 0.688264083713882, "grad_norm": 0.30068328976631165, "learning_rate": 0.0001667809510919333, "loss": 4.6335, "step": 33610 }, { "epoch": 0.6884688632686913, "grad_norm": 0.2877258360385895, "learning_rate": 0.0001665990909928237, "loss": 4.6544, "step": 33620 }, { "epoch": 0.6886736428235005, "grad_norm": 0.2939065098762512, "learning_rate": 0.00016641725929841494, "loss": 4.5978, "step": 33630 }, { "epoch": 0.6888784223783098, "grad_norm": 0.2837385833263397, "learning_rate": 0.0001662354561633398, "loss": 4.6174, "step": 33640 }, { "epoch": 0.689083201933119, "grad_norm": 0.31246641278266907, "learning_rate": 0.00016605368174220676, "loss": 4.6092, "step": 33650 }, { "epoch": 0.6892879814879282, "grad_norm": 0.33488237857818604, "learning_rate": 0.0001658719361895999, "loss": 4.6392, "step": 33660 }, { "epoch": 0.6894927610427375, "grad_norm": 0.30829524993896484, "learning_rate": 0.00016569021966007875, "loss": 4.6542, "step": 33670 }, { "epoch": 0.6896975405975467, "grad_norm": 0.3291963040828705, "learning_rate": 0.00016550853230817803, "loss": 4.636, "step": 33680 }, { "epoch": 0.689902320152356, "grad_norm": 0.29185518622398376, "learning_rate": 0.00016532687428840782, "loss": 4.6201, "step": 33690 }, { "epoch": 0.6901070997071652, "grad_norm": 0.27664265036582947, "learning_rate": 0.00016514524575525315, "loss": 4.6101, "step": 33700 }, { "epoch": 0.6903118792619745, "grad_norm": 0.2888222932815552, "learning_rate": 0.00016496364686317412, "loss": 4.6383, "step": 33710 }, { "epoch": 0.6905166588167837, "grad_norm": 0.29128146171569824, "learning_rate": 0.00016478207776660537, "loss": 4.6394, "step": 33720 }, { "epoch": 0.6907214383715929, "grad_norm": 0.299492746591568, "learning_rate": 0.0001646005386199564, "loss": 4.5757, "step": 33730 }, { "epoch": 0.6909262179264022, "grad_norm": 0.29915815591812134, "learning_rate": 0.00016441902957761131, "loss": 4.6178, "step": 33740 }, { "epoch": 0.6911309974812114, "grad_norm": 0.35317280888557434, "learning_rate": 0.0001642375507939283, "loss": 4.6439, "step": 33750 }, { "epoch": 0.6913357770360207, "grad_norm": 0.2961081266403198, "learning_rate": 0.00016405610242324004, "loss": 4.6273, "step": 33760 }, { "epoch": 0.69154055659083, "grad_norm": 0.2951876223087311, "learning_rate": 0.00016387468461985343, "loss": 4.5987, "step": 33770 }, { "epoch": 0.6917453361456393, "grad_norm": 0.33545276522636414, "learning_rate": 0.00016369329753804915, "loss": 4.6142, "step": 33780 }, { "epoch": 0.6919501157004485, "grad_norm": 0.45325595140457153, "learning_rate": 0.00016351194133208184, "loss": 4.6249, "step": 33790 }, { "epoch": 0.6921548952552578, "grad_norm": 0.29584935307502747, "learning_rate": 0.00016333061615617998, "loss": 4.617, "step": 33800 }, { "epoch": 0.692359674810067, "grad_norm": 0.28129342198371887, "learning_rate": 0.00016314932216454558, "loss": 4.6244, "step": 33810 }, { "epoch": 0.6925644543648762, "grad_norm": 0.2985042631626129, "learning_rate": 0.00016296805951135409, "loss": 4.6105, "step": 33820 }, { "epoch": 0.6927692339196855, "grad_norm": 0.2735280692577362, "learning_rate": 0.00016278682835075428, "loss": 4.6201, "step": 33830 }, { "epoch": 0.6929740134744947, "grad_norm": 0.3145197629928589, "learning_rate": 0.0001626056288368683, "loss": 4.6109, "step": 33840 }, { "epoch": 0.693178793029304, "grad_norm": 0.2885589599609375, "learning_rate": 0.00016242446112379132, "loss": 4.6203, "step": 33850 }, { "epoch": 0.6933835725841132, "grad_norm": 0.3050481975078583, "learning_rate": 0.00016224332536559144, "loss": 4.606, "step": 33860 }, { "epoch": 0.6935883521389224, "grad_norm": 0.3099835515022278, "learning_rate": 0.00016206222171630952, "loss": 4.6058, "step": 33870 }, { "epoch": 0.6937931316937317, "grad_norm": 0.2893453538417816, "learning_rate": 0.0001618811503299593, "loss": 4.6087, "step": 33880 }, { "epoch": 0.6939979112485409, "grad_norm": 0.2874526381492615, "learning_rate": 0.00016170011136052678, "loss": 4.6221, "step": 33890 }, { "epoch": 0.6942026908033502, "grad_norm": 0.28769010305404663, "learning_rate": 0.00016151910496197072, "loss": 4.6597, "step": 33900 }, { "epoch": 0.6944074703581594, "grad_norm": 0.3656673729419708, "learning_rate": 0.000161338131288222, "loss": 4.6209, "step": 33910 }, { "epoch": 0.6946122499129687, "grad_norm": 0.29951661825180054, "learning_rate": 0.00016115719049318372, "loss": 4.6536, "step": 33920 }, { "epoch": 0.6948170294677779, "grad_norm": 0.2999086081981659, "learning_rate": 0.00016097628273073096, "loss": 4.6137, "step": 33930 }, { "epoch": 0.6950218090225871, "grad_norm": 0.32161378860473633, "learning_rate": 0.0001607954081547108, "loss": 4.6369, "step": 33940 }, { "epoch": 0.6952265885773964, "grad_norm": 0.29670730233192444, "learning_rate": 0.00016061456691894213, "loss": 4.625, "step": 33950 }, { "epoch": 0.6954313681322056, "grad_norm": 0.32408106327056885, "learning_rate": 0.00016043375917721531, "loss": 4.6082, "step": 33960 }, { "epoch": 0.695636147687015, "grad_norm": 0.30162012577056885, "learning_rate": 0.0001602529850832923, "loss": 4.6108, "step": 33970 }, { "epoch": 0.6958409272418242, "grad_norm": 0.2911863327026367, "learning_rate": 0.0001600722447909065, "loss": 4.6358, "step": 33980 }, { "epoch": 0.6960457067966335, "grad_norm": 0.29991620779037476, "learning_rate": 0.0001598915384537625, "loss": 4.6072, "step": 33990 }, { "epoch": 0.6962504863514427, "grad_norm": 0.30318012833595276, "learning_rate": 0.00015971086622553615, "loss": 4.587, "step": 34000 }, { "epoch": 0.6962504863514427, "eval_loss": 4.624014377593994, "eval_runtime": 4.3958, "eval_samples_per_second": 265.255, "eval_steps_per_second": 33.214, "step": 34000 }, { "epoch": 0.696455265906252, "grad_norm": 0.2930164039134979, "learning_rate": 0.00015953022825987401, "loss": 4.6127, "step": 34010 }, { "epoch": 0.6966600454610612, "grad_norm": 0.31689298152923584, "learning_rate": 0.0001593496247103939, "loss": 4.604, "step": 34020 }, { "epoch": 0.6968648250158704, "grad_norm": 0.30203554034233093, "learning_rate": 0.00015916905573068395, "loss": 4.6311, "step": 34030 }, { "epoch": 0.6970696045706797, "grad_norm": 0.33920392394065857, "learning_rate": 0.00015898852147430313, "loss": 4.6343, "step": 34040 }, { "epoch": 0.6972743841254889, "grad_norm": 0.29707950353622437, "learning_rate": 0.00015880802209478092, "loss": 4.6333, "step": 34050 }, { "epoch": 0.6974791636802982, "grad_norm": 0.29657280445098877, "learning_rate": 0.00015862755774561697, "loss": 4.6022, "step": 34060 }, { "epoch": 0.6976839432351074, "grad_norm": 0.297585129737854, "learning_rate": 0.00015844712858028128, "loss": 4.5858, "step": 34070 }, { "epoch": 0.6978887227899166, "grad_norm": 0.30895403027534485, "learning_rate": 0.00015826673475221386, "loss": 4.6116, "step": 34080 }, { "epoch": 0.6980935023447259, "grad_norm": 0.29268407821655273, "learning_rate": 0.00015808637641482478, "loss": 4.5968, "step": 34090 }, { "epoch": 0.6982982818995351, "grad_norm": 0.2873637080192566, "learning_rate": 0.0001579060537214937, "loss": 4.6052, "step": 34100 }, { "epoch": 0.6985030614543444, "grad_norm": 0.2870800793170929, "learning_rate": 0.0001577257668255702, "loss": 4.642, "step": 34110 }, { "epoch": 0.6987078410091536, "grad_norm": 0.29790377616882324, "learning_rate": 0.0001575455158803732, "loss": 4.6406, "step": 34120 }, { "epoch": 0.6989126205639629, "grad_norm": 0.2890074551105499, "learning_rate": 0.0001573653010391913, "loss": 4.6389, "step": 34130 }, { "epoch": 0.6991174001187721, "grad_norm": 0.30178284645080566, "learning_rate": 0.0001571851224552822, "loss": 4.616, "step": 34140 }, { "epoch": 0.6993221796735813, "grad_norm": 0.32394957542419434, "learning_rate": 0.00015700498028187285, "loss": 4.6141, "step": 34150 }, { "epoch": 0.6995269592283906, "grad_norm": 0.32160764932632446, "learning_rate": 0.0001568248746721593, "loss": 4.6395, "step": 34160 }, { "epoch": 0.6997317387831998, "grad_norm": 0.3074822723865509, "learning_rate": 0.00015664480577930623, "loss": 4.6031, "step": 34170 }, { "epoch": 0.6999365183380092, "grad_norm": 0.3199803829193115, "learning_rate": 0.00015646477375644738, "loss": 4.6103, "step": 34180 }, { "epoch": 0.7001412978928184, "grad_norm": 0.30673491954803467, "learning_rate": 0.0001562847787566851, "loss": 4.5934, "step": 34190 }, { "epoch": 0.7003460774476277, "grad_norm": 0.6456322073936462, "learning_rate": 0.00015610482093309015, "loss": 4.6557, "step": 34200 }, { "epoch": 0.7005508570024369, "grad_norm": 0.3115524351596832, "learning_rate": 0.0001559249004387017, "loss": 4.6471, "step": 34210 }, { "epoch": 0.7007556365572462, "grad_norm": 0.32813143730163574, "learning_rate": 0.00015574501742652726, "loss": 4.611, "step": 34220 }, { "epoch": 0.7009604161120554, "grad_norm": 0.33972465991973877, "learning_rate": 0.0001555651720495424, "loss": 4.6277, "step": 34230 }, { "epoch": 0.7011651956668646, "grad_norm": 0.3158153295516968, "learning_rate": 0.0001553853644606906, "loss": 4.6047, "step": 34240 }, { "epoch": 0.7013699752216739, "grad_norm": 0.2914464771747589, "learning_rate": 0.00015520559481288336, "loss": 4.6304, "step": 34250 }, { "epoch": 0.7015747547764831, "grad_norm": 0.29459428787231445, "learning_rate": 0.00015502586325899978, "loss": 4.6396, "step": 34260 }, { "epoch": 0.7017795343312924, "grad_norm": 0.298574835062027, "learning_rate": 0.0001548461699518867, "loss": 4.6511, "step": 34270 }, { "epoch": 0.7019843138861016, "grad_norm": 0.3120969831943512, "learning_rate": 0.00015466651504435834, "loss": 4.6389, "step": 34280 }, { "epoch": 0.7021890934409108, "grad_norm": 0.30967774987220764, "learning_rate": 0.00015448689868919631, "loss": 4.6313, "step": 34290 }, { "epoch": 0.7023938729957201, "grad_norm": 0.2917247414588928, "learning_rate": 0.00015430732103914943, "loss": 4.6116, "step": 34300 }, { "epoch": 0.7025986525505293, "grad_norm": 0.287887305021286, "learning_rate": 0.00015412778224693348, "loss": 4.5931, "step": 34310 }, { "epoch": 0.7028034321053386, "grad_norm": 0.2971671223640442, "learning_rate": 0.00015394828246523135, "loss": 4.6077, "step": 34320 }, { "epoch": 0.7030082116601478, "grad_norm": 0.30850762128829956, "learning_rate": 0.00015376882184669273, "loss": 4.6117, "step": 34330 }, { "epoch": 0.7032129912149571, "grad_norm": 0.28424036502838135, "learning_rate": 0.000153589400543934, "loss": 4.6151, "step": 34340 }, { "epoch": 0.7034177707697663, "grad_norm": 0.3723028004169464, "learning_rate": 0.00015341001870953805, "loss": 4.6219, "step": 34350 }, { "epoch": 0.7036225503245755, "grad_norm": 0.29439976811408997, "learning_rate": 0.00015323067649605424, "loss": 4.6086, "step": 34360 }, { "epoch": 0.7038273298793848, "grad_norm": 0.3045384883880615, "learning_rate": 0.00015305137405599822, "loss": 4.601, "step": 34370 }, { "epoch": 0.7040321094341941, "grad_norm": 0.3009319007396698, "learning_rate": 0.00015287211154185198, "loss": 4.6008, "step": 34380 }, { "epoch": 0.7042368889890034, "grad_norm": 0.30488184094429016, "learning_rate": 0.00015269288910606322, "loss": 4.6439, "step": 34390 }, { "epoch": 0.7044416685438126, "grad_norm": 0.31904223561286926, "learning_rate": 0.0001525137069010458, "loss": 4.6291, "step": 34400 }, { "epoch": 0.7046464480986219, "grad_norm": 0.3596601188182831, "learning_rate": 0.0001523345650791793, "loss": 4.6407, "step": 34410 }, { "epoch": 0.7048512276534311, "grad_norm": 0.29149162769317627, "learning_rate": 0.00015215546379280902, "loss": 4.6207, "step": 34420 }, { "epoch": 0.7050560072082404, "grad_norm": 0.3004820644855499, "learning_rate": 0.00015197640319424577, "loss": 4.6233, "step": 34430 }, { "epoch": 0.7052607867630496, "grad_norm": 0.3024444878101349, "learning_rate": 0.00015179738343576565, "loss": 4.5967, "step": 34440 }, { "epoch": 0.7054655663178588, "grad_norm": 0.3134540319442749, "learning_rate": 0.00015161840466961017, "loss": 4.617, "step": 34450 }, { "epoch": 0.7056703458726681, "grad_norm": 0.2759465277194977, "learning_rate": 0.00015143946704798583, "loss": 4.6282, "step": 34460 }, { "epoch": 0.7058751254274773, "grad_norm": 0.2962445318698883, "learning_rate": 0.00015126057072306425, "loss": 4.5925, "step": 34470 }, { "epoch": 0.7060799049822866, "grad_norm": 0.2982873022556305, "learning_rate": 0.00015108171584698202, "loss": 4.6238, "step": 34480 }, { "epoch": 0.7062846845370958, "grad_norm": 0.29654350876808167, "learning_rate": 0.0001509029025718402, "loss": 4.5964, "step": 34490 }, { "epoch": 0.706489464091905, "grad_norm": 0.31302574276924133, "learning_rate": 0.0001507241310497047, "loss": 4.6509, "step": 34500 }, { "epoch": 0.7066942436467143, "grad_norm": 0.9012963175773621, "learning_rate": 0.00015054540143260588, "loss": 4.5972, "step": 34510 }, { "epoch": 0.7068990232015235, "grad_norm": 0.3148398995399475, "learning_rate": 0.0001503667138725386, "loss": 4.6054, "step": 34520 }, { "epoch": 0.7071038027563328, "grad_norm": 0.29970496892929077, "learning_rate": 0.00015018806852146147, "loss": 4.6303, "step": 34530 }, { "epoch": 0.707308582311142, "grad_norm": 0.2997286915779114, "learning_rate": 0.0001500094655312978, "loss": 4.6397, "step": 34540 }, { "epoch": 0.7075133618659513, "grad_norm": 0.5666271448135376, "learning_rate": 0.00014983090505393443, "loss": 4.6496, "step": 34550 }, { "epoch": 0.7077181414207605, "grad_norm": 0.31896817684173584, "learning_rate": 0.00014965238724122236, "loss": 4.6252, "step": 34560 }, { "epoch": 0.7079229209755697, "grad_norm": 0.3208785653114319, "learning_rate": 0.00014947391224497616, "loss": 4.6231, "step": 34570 }, { "epoch": 0.7081277005303791, "grad_norm": 0.2764456868171692, "learning_rate": 0.00014929548021697388, "loss": 4.6422, "step": 34580 }, { "epoch": 0.7083324800851883, "grad_norm": 0.47031036019325256, "learning_rate": 0.00014911709130895737, "loss": 4.5437, "step": 34590 }, { "epoch": 0.7085372596399976, "grad_norm": 0.3066767156124115, "learning_rate": 0.00014893874567263132, "loss": 4.6301, "step": 34600 }, { "epoch": 0.7087420391948068, "grad_norm": 0.2902284860610962, "learning_rate": 0.00014876044345966407, "loss": 4.5781, "step": 34610 }, { "epoch": 0.7089468187496161, "grad_norm": 0.29947999119758606, "learning_rate": 0.00014858218482168679, "loss": 4.6241, "step": 34620 }, { "epoch": 0.7091515983044253, "grad_norm": 0.28963664174079895, "learning_rate": 0.0001484039699102937, "loss": 4.6062, "step": 34630 }, { "epoch": 0.7093563778592346, "grad_norm": 0.2964884340763092, "learning_rate": 0.00014822579887704176, "loss": 4.6447, "step": 34640 }, { "epoch": 0.7095611574140438, "grad_norm": 0.2854631543159485, "learning_rate": 0.00014804767187345062, "loss": 4.632, "step": 34650 }, { "epoch": 0.709765936968853, "grad_norm": 0.30145594477653503, "learning_rate": 0.00014786958905100265, "loss": 4.6122, "step": 34660 }, { "epoch": 0.7099707165236623, "grad_norm": 0.30469653010368347, "learning_rate": 0.0001476915505611423, "loss": 4.6022, "step": 34670 }, { "epoch": 0.7101754960784715, "grad_norm": 0.29701927304267883, "learning_rate": 0.00014751355655527671, "loss": 4.5936, "step": 34680 }, { "epoch": 0.7103802756332808, "grad_norm": 0.27827855944633484, "learning_rate": 0.00014733560718477486, "loss": 4.6606, "step": 34690 }, { "epoch": 0.71058505518809, "grad_norm": 0.2819177210330963, "learning_rate": 0.00014715770260096802, "loss": 4.6358, "step": 34700 }, { "epoch": 0.7107898347428993, "grad_norm": 0.3499496579170227, "learning_rate": 0.0001469798429551492, "loss": 4.6445, "step": 34710 }, { "epoch": 0.7109946142977085, "grad_norm": 0.3143846094608307, "learning_rate": 0.00014680202839857336, "loss": 4.6115, "step": 34720 }, { "epoch": 0.7111993938525177, "grad_norm": 0.3078066408634186, "learning_rate": 0.00014662425908245705, "loss": 4.6021, "step": 34730 }, { "epoch": 0.711404173407327, "grad_norm": 0.2983221113681793, "learning_rate": 0.0001464465351579781, "loss": 4.6188, "step": 34740 }, { "epoch": 0.7116089529621362, "grad_norm": 0.3281814754009247, "learning_rate": 0.00014626885677627614, "loss": 4.5792, "step": 34750 }, { "epoch": 0.7118137325169455, "grad_norm": 0.3432719111442566, "learning_rate": 0.00014609122408845184, "loss": 4.6304, "step": 34760 }, { "epoch": 0.7120185120717547, "grad_norm": 0.28590285778045654, "learning_rate": 0.0001459136372455671, "loss": 4.5996, "step": 34770 }, { "epoch": 0.7122232916265641, "grad_norm": 0.3112219572067261, "learning_rate": 0.0001457360963986447, "loss": 4.6033, "step": 34780 }, { "epoch": 0.7124280711813733, "grad_norm": 0.30003437399864197, "learning_rate": 0.00014555860169866843, "loss": 4.6144, "step": 34790 }, { "epoch": 0.7126328507361825, "grad_norm": 0.2984277904033661, "learning_rate": 0.00014538115329658293, "loss": 4.6164, "step": 34800 }, { "epoch": 0.7128376302909918, "grad_norm": 0.3116559088230133, "learning_rate": 0.00014520375134329314, "loss": 4.6242, "step": 34810 }, { "epoch": 0.713042409845801, "grad_norm": 0.3221040666103363, "learning_rate": 0.0001450263959896648, "loss": 4.5388, "step": 34820 }, { "epoch": 0.7132471894006103, "grad_norm": 0.2988126873970032, "learning_rate": 0.0001448490873865239, "loss": 4.5904, "step": 34830 }, { "epoch": 0.7134519689554195, "grad_norm": 0.3226899206638336, "learning_rate": 0.00014467182568465666, "loss": 4.6182, "step": 34840 }, { "epoch": 0.7136567485102288, "grad_norm": 0.3137524425983429, "learning_rate": 0.00014449461103480945, "loss": 4.624, "step": 34850 }, { "epoch": 0.713861528065038, "grad_norm": 0.3023771643638611, "learning_rate": 0.0001443174435876887, "loss": 4.5885, "step": 34860 }, { "epoch": 0.7140663076198472, "grad_norm": 0.3101402819156647, "learning_rate": 0.00014414032349396058, "loss": 4.5458, "step": 34870 }, { "epoch": 0.7142710871746565, "grad_norm": 0.29617607593536377, "learning_rate": 0.00014396325090425103, "loss": 4.5918, "step": 34880 }, { "epoch": 0.7144758667294657, "grad_norm": 0.3024255931377411, "learning_rate": 0.00014378622596914552, "loss": 4.5991, "step": 34890 }, { "epoch": 0.714680646284275, "grad_norm": 0.3139723837375641, "learning_rate": 0.00014360924883918908, "loss": 4.6125, "step": 34900 }, { "epoch": 0.7148854258390842, "grad_norm": 0.3257853388786316, "learning_rate": 0.00014343231966488623, "loss": 4.6163, "step": 34910 }, { "epoch": 0.7150902053938935, "grad_norm": 0.30789676308631897, "learning_rate": 0.0001432554385967003, "loss": 4.623, "step": 34920 }, { "epoch": 0.7152949849487027, "grad_norm": 0.3053744435310364, "learning_rate": 0.00014307860578505413, "loss": 4.6242, "step": 34930 }, { "epoch": 0.7154997645035119, "grad_norm": 0.3180652856826782, "learning_rate": 0.00014290182138032936, "loss": 4.6371, "step": 34940 }, { "epoch": 0.7157045440583212, "grad_norm": 0.3113649785518646, "learning_rate": 0.00014272508553286634, "loss": 4.6102, "step": 34950 }, { "epoch": 0.7159093236131304, "grad_norm": 0.30352339148521423, "learning_rate": 0.00014254839839296428, "loss": 4.6238, "step": 34960 }, { "epoch": 0.7161141031679397, "grad_norm": 0.3152178227901459, "learning_rate": 0.00014237176011088097, "loss": 4.605, "step": 34970 }, { "epoch": 0.7163188827227489, "grad_norm": 0.2903915345668793, "learning_rate": 0.00014219517083683257, "loss": 4.5949, "step": 34980 }, { "epoch": 0.7165236622775583, "grad_norm": 0.30046382546424866, "learning_rate": 0.0001420186307209936, "loss": 4.6819, "step": 34990 }, { "epoch": 0.7167284418323675, "grad_norm": 0.5529968738555908, "learning_rate": 0.00014184213991349682, "loss": 4.5912, "step": 35000 }, { "epoch": 0.7169332213871767, "grad_norm": 0.3269819915294647, "learning_rate": 0.000141665698564433, "loss": 4.6077, "step": 35010 }, { "epoch": 0.717138000941986, "grad_norm": 0.2949140667915344, "learning_rate": 0.00014148930682385085, "loss": 4.5954, "step": 35020 }, { "epoch": 0.7173427804967952, "grad_norm": 0.30816182494163513, "learning_rate": 0.00014131296484175683, "loss": 4.667, "step": 35030 }, { "epoch": 0.7175475600516045, "grad_norm": 0.27781638503074646, "learning_rate": 0.00014113667276811524, "loss": 4.6247, "step": 35040 }, { "epoch": 0.7177523396064137, "grad_norm": 0.30939093232154846, "learning_rate": 0.00014096043075284784, "loss": 4.6372, "step": 35050 }, { "epoch": 0.717957119161223, "grad_norm": 0.2976728677749634, "learning_rate": 0.00014078423894583388, "loss": 4.5985, "step": 35060 }, { "epoch": 0.7181618987160322, "grad_norm": 0.30731210112571716, "learning_rate": 0.0001406080974969098, "loss": 4.6399, "step": 35070 }, { "epoch": 0.7183666782708414, "grad_norm": 0.3040066063404083, "learning_rate": 0.00014043200655586939, "loss": 4.6362, "step": 35080 }, { "epoch": 0.7185714578256507, "grad_norm": 0.3073359429836273, "learning_rate": 0.0001402559662724632, "loss": 4.624, "step": 35090 }, { "epoch": 0.7187762373804599, "grad_norm": 0.28751540184020996, "learning_rate": 0.00014007997679639899, "loss": 4.637, "step": 35100 }, { "epoch": 0.7189810169352692, "grad_norm": 0.3024277687072754, "learning_rate": 0.00013990403827734125, "loss": 4.6077, "step": 35110 }, { "epoch": 0.7191857964900784, "grad_norm": 0.32687389850616455, "learning_rate": 0.00013972815086491096, "loss": 4.5683, "step": 35120 }, { "epoch": 0.7193905760448877, "grad_norm": 0.2875973582267761, "learning_rate": 0.00013955231470868588, "loss": 4.6181, "step": 35130 }, { "epoch": 0.7195953555996969, "grad_norm": 0.307827889919281, "learning_rate": 0.0001393765299582, "loss": 4.6064, "step": 35140 }, { "epoch": 0.7198001351545061, "grad_norm": 0.2840040922164917, "learning_rate": 0.00013920079676294378, "loss": 4.6192, "step": 35150 }, { "epoch": 0.7200049147093154, "grad_norm": 0.42402467131614685, "learning_rate": 0.00013902511527236357, "loss": 4.5974, "step": 35160 }, { "epoch": 0.7202096942641246, "grad_norm": 0.34591928124427795, "learning_rate": 0.0001388494856358619, "loss": 4.6187, "step": 35170 }, { "epoch": 0.7204144738189339, "grad_norm": 0.32590940594673157, "learning_rate": 0.00013867390800279724, "loss": 4.5899, "step": 35180 }, { "epoch": 0.7206192533737432, "grad_norm": 0.28604620695114136, "learning_rate": 0.0001384983825224838, "loss": 4.5828, "step": 35190 }, { "epoch": 0.7208240329285525, "grad_norm": 0.3120392858982086, "learning_rate": 0.0001383229093441915, "loss": 4.6411, "step": 35200 }, { "epoch": 0.7210288124833617, "grad_norm": 0.3106038570404053, "learning_rate": 0.00013814748861714558, "loss": 4.5886, "step": 35210 }, { "epoch": 0.721233592038171, "grad_norm": 0.2892000675201416, "learning_rate": 0.00013797212049052697, "loss": 4.6308, "step": 35220 }, { "epoch": 0.7214383715929802, "grad_norm": 0.29461976885795593, "learning_rate": 0.00013779680511347153, "loss": 4.604, "step": 35230 }, { "epoch": 0.7216431511477894, "grad_norm": 0.31296923756599426, "learning_rate": 0.0001376215426350705, "loss": 4.6196, "step": 35240 }, { "epoch": 0.7218479307025987, "grad_norm": 0.28251272439956665, "learning_rate": 0.00013744633320437015, "loss": 4.6191, "step": 35250 }, { "epoch": 0.7220527102574079, "grad_norm": 0.3122941553592682, "learning_rate": 0.00013727117697037144, "loss": 4.6331, "step": 35260 }, { "epoch": 0.7222574898122172, "grad_norm": 0.32094359397888184, "learning_rate": 0.00013709607408203027, "loss": 4.6171, "step": 35270 }, { "epoch": 0.7224622693670264, "grad_norm": 0.2993016541004181, "learning_rate": 0.0001369210246882571, "loss": 4.6386, "step": 35280 }, { "epoch": 0.7226670489218356, "grad_norm": 0.30319884419441223, "learning_rate": 0.00013674602893791704, "loss": 4.5797, "step": 35290 }, { "epoch": 0.7228718284766449, "grad_norm": 0.48357534408569336, "learning_rate": 0.00013657108697982928, "loss": 4.5882, "step": 35300 }, { "epoch": 0.7230766080314541, "grad_norm": 0.3391391932964325, "learning_rate": 0.00013639619896276743, "loss": 4.6366, "step": 35310 }, { "epoch": 0.7232813875862634, "grad_norm": 0.32843708992004395, "learning_rate": 0.0001362213650354593, "loss": 4.5935, "step": 35320 }, { "epoch": 0.7234861671410726, "grad_norm": 0.2876911461353302, "learning_rate": 0.0001360465853465866, "loss": 4.6425, "step": 35330 }, { "epoch": 0.7236909466958819, "grad_norm": 0.2941429615020752, "learning_rate": 0.00013587186004478504, "loss": 4.619, "step": 35340 }, { "epoch": 0.7238957262506911, "grad_norm": 0.32524776458740234, "learning_rate": 0.00013569718927864386, "loss": 4.619, "step": 35350 }, { "epoch": 0.7241005058055003, "grad_norm": 0.3023153245449066, "learning_rate": 0.00013552257319670623, "loss": 4.6155, "step": 35360 }, { "epoch": 0.7243052853603096, "grad_norm": 0.30180665850639343, "learning_rate": 0.0001353480119474684, "loss": 4.6609, "step": 35370 }, { "epoch": 0.7245100649151188, "grad_norm": 0.3072219491004944, "learning_rate": 0.00013517350567938033, "loss": 4.5982, "step": 35380 }, { "epoch": 0.7247148444699282, "grad_norm": 0.3067971169948578, "learning_rate": 0.0001349990545408452, "loss": 4.5951, "step": 35390 }, { "epoch": 0.7249196240247374, "grad_norm": 0.31510570645332336, "learning_rate": 0.0001348246586802191, "loss": 4.617, "step": 35400 }, { "epoch": 0.7251244035795467, "grad_norm": 0.32711508870124817, "learning_rate": 0.00013465031824581123, "loss": 4.5909, "step": 35410 }, { "epoch": 0.7253291831343559, "grad_norm": 0.42030251026153564, "learning_rate": 0.00013447603338588378, "loss": 4.5875, "step": 35420 }, { "epoch": 0.7255339626891651, "grad_norm": 0.30846327543258667, "learning_rate": 0.00013430180424865156, "loss": 4.5955, "step": 35430 }, { "epoch": 0.7257387422439744, "grad_norm": 0.31136104464530945, "learning_rate": 0.0001341276309822818, "loss": 4.5894, "step": 35440 }, { "epoch": 0.7259435217987836, "grad_norm": 0.288819283246994, "learning_rate": 0.00013395351373489463, "loss": 4.6092, "step": 35450 }, { "epoch": 0.7261483013535929, "grad_norm": 0.30843454599380493, "learning_rate": 0.0001337794526545621, "loss": 4.5506, "step": 35460 }, { "epoch": 0.7263530809084021, "grad_norm": 0.32559868693351746, "learning_rate": 0.00013360544788930886, "loss": 4.6309, "step": 35470 }, { "epoch": 0.7265578604632114, "grad_norm": 0.34923264384269714, "learning_rate": 0.00013343149958711155, "loss": 4.6171, "step": 35480 }, { "epoch": 0.7267626400180206, "grad_norm": 0.296554833650589, "learning_rate": 0.00013325760789589865, "loss": 4.6257, "step": 35490 }, { "epoch": 0.7269674195728298, "grad_norm": 0.3782421350479126, "learning_rate": 0.00013308377296355074, "loss": 4.6031, "step": 35500 }, { "epoch": 0.7271721991276391, "grad_norm": 0.29011818766593933, "learning_rate": 0.00013290999493789985, "loss": 4.5951, "step": 35510 }, { "epoch": 0.7273769786824483, "grad_norm": 0.30199551582336426, "learning_rate": 0.0001327362739667298, "loss": 4.5943, "step": 35520 }, { "epoch": 0.7275817582372576, "grad_norm": 0.30701789259910583, "learning_rate": 0.00013256261019777593, "loss": 4.6092, "step": 35530 }, { "epoch": 0.7277865377920668, "grad_norm": 0.3072527050971985, "learning_rate": 0.00013238900377872487, "loss": 4.6298, "step": 35540 }, { "epoch": 0.727991317346876, "grad_norm": 0.3173460066318512, "learning_rate": 0.0001322154548572144, "loss": 4.6172, "step": 35550 }, { "epoch": 0.7281960969016853, "grad_norm": 0.3216269612312317, "learning_rate": 0.00013204196358083348, "loss": 4.6169, "step": 35560 }, { "epoch": 0.7284008764564945, "grad_norm": 0.30654504895210266, "learning_rate": 0.00013186853009712217, "loss": 4.6202, "step": 35570 }, { "epoch": 0.7286056560113038, "grad_norm": 0.31015145778656006, "learning_rate": 0.00013169515455357104, "loss": 4.5773, "step": 35580 }, { "epoch": 0.728810435566113, "grad_norm": 0.30494025349617004, "learning_rate": 0.00013152183709762175, "loss": 4.5958, "step": 35590 }, { "epoch": 0.7290152151209224, "grad_norm": 0.32751893997192383, "learning_rate": 0.00013134857787666632, "loss": 4.6148, "step": 35600 }, { "epoch": 0.7292199946757316, "grad_norm": 0.2925833761692047, "learning_rate": 0.0001311753770380474, "loss": 4.6189, "step": 35610 }, { "epoch": 0.7294247742305409, "grad_norm": 0.3233659565448761, "learning_rate": 0.00013100223472905793, "loss": 4.6007, "step": 35620 }, { "epoch": 0.7296295537853501, "grad_norm": 0.2875606417655945, "learning_rate": 0.00013082915109694106, "loss": 4.5747, "step": 35630 }, { "epoch": 0.7298343333401593, "grad_norm": 0.29883286356925964, "learning_rate": 0.00013065612628889012, "loss": 4.6044, "step": 35640 }, { "epoch": 0.7300391128949686, "grad_norm": 0.3009844720363617, "learning_rate": 0.0001304831604520482, "loss": 4.6102, "step": 35650 }, { "epoch": 0.7302438924497778, "grad_norm": 0.2949434518814087, "learning_rate": 0.00013031025373350845, "loss": 4.6147, "step": 35660 }, { "epoch": 0.7304486720045871, "grad_norm": 0.3077467679977417, "learning_rate": 0.00013013740628031368, "loss": 4.5675, "step": 35670 }, { "epoch": 0.7306534515593963, "grad_norm": 0.48693525791168213, "learning_rate": 0.0001299646182394564, "loss": 4.5524, "step": 35680 }, { "epoch": 0.7308582311142056, "grad_norm": 0.3349159061908722, "learning_rate": 0.00012979188975787833, "loss": 4.6117, "step": 35690 }, { "epoch": 0.7310630106690148, "grad_norm": 0.34192952513694763, "learning_rate": 0.00012961922098247083, "loss": 4.6365, "step": 35700 }, { "epoch": 0.731267790223824, "grad_norm": 0.29073071479797363, "learning_rate": 0.0001294466120600744, "loss": 4.6343, "step": 35710 }, { "epoch": 0.7314725697786333, "grad_norm": 0.32743415236473083, "learning_rate": 0.00012927406313747842, "loss": 4.5932, "step": 35720 }, { "epoch": 0.7316773493334425, "grad_norm": 0.29877787828445435, "learning_rate": 0.00012910157436142163, "loss": 4.6212, "step": 35730 }, { "epoch": 0.7318821288882518, "grad_norm": 0.3163726329803467, "learning_rate": 0.00012892914587859126, "loss": 4.597, "step": 35740 }, { "epoch": 0.732086908443061, "grad_norm": 0.3142485022544861, "learning_rate": 0.00012875677783562346, "loss": 4.5922, "step": 35750 }, { "epoch": 0.7322916879978703, "grad_norm": 0.2876586616039276, "learning_rate": 0.00012858447037910305, "loss": 4.6066, "step": 35760 }, { "epoch": 0.7324964675526795, "grad_norm": 0.308336079120636, "learning_rate": 0.00012841222365556312, "loss": 4.5955, "step": 35770 }, { "epoch": 0.7327012471074887, "grad_norm": 0.32495421171188354, "learning_rate": 0.00012824003781148523, "loss": 4.5893, "step": 35780 }, { "epoch": 0.732906026662298, "grad_norm": 0.29860562086105347, "learning_rate": 0.00012806791299329917, "loss": 4.5805, "step": 35790 }, { "epoch": 0.7331108062171073, "grad_norm": 0.3136485815048218, "learning_rate": 0.00012789584934738276, "loss": 4.6073, "step": 35800 }, { "epoch": 0.7333155857719166, "grad_norm": 0.29984745383262634, "learning_rate": 0.0001277238470200619, "loss": 4.621, "step": 35810 }, { "epoch": 0.7335203653267258, "grad_norm": 0.2900507152080536, "learning_rate": 0.0001275519061576103, "loss": 4.6068, "step": 35820 }, { "epoch": 0.7337251448815351, "grad_norm": 0.29403650760650635, "learning_rate": 0.0001273800269062493, "loss": 4.5958, "step": 35830 }, { "epoch": 0.7339299244363443, "grad_norm": 0.2939351201057434, "learning_rate": 0.00012720820941214804, "loss": 4.6091, "step": 35840 }, { "epoch": 0.7341347039911535, "grad_norm": 0.3304155170917511, "learning_rate": 0.0001270364538214231, "loss": 4.613, "step": 35850 }, { "epoch": 0.7343394835459628, "grad_norm": 0.5047217011451721, "learning_rate": 0.00012686476028013813, "loss": 4.6207, "step": 35860 }, { "epoch": 0.734544263100772, "grad_norm": 0.3045598566532135, "learning_rate": 0.00012669312893430444, "loss": 4.5792, "step": 35870 }, { "epoch": 0.7347490426555813, "grad_norm": 0.30130526423454285, "learning_rate": 0.00012652155992988015, "loss": 4.5705, "step": 35880 }, { "epoch": 0.7349538222103905, "grad_norm": 0.30585214495658875, "learning_rate": 0.00012635005341277044, "loss": 4.6215, "step": 35890 }, { "epoch": 0.7351586017651998, "grad_norm": 0.2994743287563324, "learning_rate": 0.00012617860952882746, "loss": 4.5876, "step": 35900 }, { "epoch": 0.735363381320009, "grad_norm": 0.3443034887313843, "learning_rate": 0.00012600722842384996, "loss": 4.6038, "step": 35910 }, { "epoch": 0.7355681608748182, "grad_norm": 0.30167892575263977, "learning_rate": 0.0001258359102435833, "loss": 4.635, "step": 35920 }, { "epoch": 0.7357729404296275, "grad_norm": 0.2863532304763794, "learning_rate": 0.00012566465513371946, "loss": 4.6039, "step": 35930 }, { "epoch": 0.7359777199844367, "grad_norm": 0.3017323911190033, "learning_rate": 0.00012549346323989657, "loss": 4.6143, "step": 35940 }, { "epoch": 0.736182499539246, "grad_norm": 0.29638591408729553, "learning_rate": 0.00012532233470769922, "loss": 4.628, "step": 35950 }, { "epoch": 0.7363872790940552, "grad_norm": 0.3080417811870575, "learning_rate": 0.000125151269682658, "loss": 4.6092, "step": 35960 }, { "epoch": 0.7365920586488645, "grad_norm": 0.3476355969905853, "learning_rate": 0.00012498026831024957, "loss": 4.5768, "step": 35970 }, { "epoch": 0.7367968382036737, "grad_norm": 0.3063572347164154, "learning_rate": 0.0001248093307358963, "loss": 4.6202, "step": 35980 }, { "epoch": 0.7370016177584829, "grad_norm": 0.30983617901802063, "learning_rate": 0.00012463845710496658, "loss": 4.6741, "step": 35990 }, { "epoch": 0.7372063973132923, "grad_norm": 0.3034866154193878, "learning_rate": 0.00012446764756277406, "loss": 4.5873, "step": 36000 }, { "epoch": 0.7372063973132923, "eval_loss": 4.60663366317749, "eval_runtime": 4.399, "eval_samples_per_second": 265.063, "eval_steps_per_second": 33.19, "step": 36000 }, { "epoch": 0.7374111768681015, "grad_norm": 0.28170254826545715, "learning_rate": 0.0001242969022545781, "loss": 4.6308, "step": 36010 }, { "epoch": 0.7376159564229108, "grad_norm": 0.3137228190898895, "learning_rate": 0.00012412622132558355, "loss": 4.5872, "step": 36020 }, { "epoch": 0.73782073597772, "grad_norm": 0.300894558429718, "learning_rate": 0.00012395560492094022, "loss": 4.6159, "step": 36030 }, { "epoch": 0.7380255155325293, "grad_norm": 0.3135361969470978, "learning_rate": 0.0001237850531857433, "loss": 4.5941, "step": 36040 }, { "epoch": 0.7382302950873385, "grad_norm": 0.2942076623439789, "learning_rate": 0.00012361456626503286, "loss": 4.6294, "step": 36050 }, { "epoch": 0.7384350746421477, "grad_norm": 0.3123762607574463, "learning_rate": 0.0001234441443037939, "loss": 4.5859, "step": 36060 }, { "epoch": 0.738639854196957, "grad_norm": 0.29585447907447815, "learning_rate": 0.00012327378744695613, "loss": 4.5828, "step": 36070 }, { "epoch": 0.7388446337517662, "grad_norm": 0.2900315523147583, "learning_rate": 0.00012310349583939388, "loss": 4.6177, "step": 36080 }, { "epoch": 0.7390494133065755, "grad_norm": 0.3082260489463806, "learning_rate": 0.00012293326962592603, "loss": 4.6403, "step": 36090 }, { "epoch": 0.7392541928613847, "grad_norm": 0.30478453636169434, "learning_rate": 0.0001227631089513159, "loss": 4.6214, "step": 36100 }, { "epoch": 0.739458972416194, "grad_norm": 0.2973614037036896, "learning_rate": 0.00012259301396027103, "loss": 4.6199, "step": 36110 }, { "epoch": 0.7396637519710032, "grad_norm": 0.2853838801383972, "learning_rate": 0.00012242298479744302, "loss": 4.5925, "step": 36120 }, { "epoch": 0.7398685315258124, "grad_norm": 0.3152982294559479, "learning_rate": 0.0001222530216074277, "loss": 4.5864, "step": 36130 }, { "epoch": 0.7400733110806217, "grad_norm": 0.29252418875694275, "learning_rate": 0.00012208312453476443, "loss": 4.6193, "step": 36140 }, { "epoch": 0.7402780906354309, "grad_norm": 0.325703889131546, "learning_rate": 0.00012191329372393674, "loss": 4.6008, "step": 36150 }, { "epoch": 0.7404828701902402, "grad_norm": 0.31771841645240784, "learning_rate": 0.00012174352931937161, "loss": 4.5972, "step": 36160 }, { "epoch": 0.7406876497450494, "grad_norm": 0.31104570627212524, "learning_rate": 0.00012157383146543955, "loss": 4.6389, "step": 36170 }, { "epoch": 0.7408924292998587, "grad_norm": 0.33067429065704346, "learning_rate": 0.00012140420030645448, "loss": 4.6244, "step": 36180 }, { "epoch": 0.7410972088546679, "grad_norm": 0.31437385082244873, "learning_rate": 0.00012123463598667366, "loss": 4.6347, "step": 36190 }, { "epoch": 0.7413019884094773, "grad_norm": 0.3336164355278015, "learning_rate": 0.00012106513865029754, "loss": 4.5844, "step": 36200 }, { "epoch": 0.7415067679642865, "grad_norm": 0.3269065320491791, "learning_rate": 0.00012089570844146937, "loss": 4.6002, "step": 36210 }, { "epoch": 0.7417115475190957, "grad_norm": 0.3056379556655884, "learning_rate": 0.00012072634550427562, "loss": 4.607, "step": 36220 }, { "epoch": 0.741916327073905, "grad_norm": 0.3031263053417206, "learning_rate": 0.00012055704998274531, "loss": 4.5836, "step": 36230 }, { "epoch": 0.7421211066287142, "grad_norm": 0.309474915266037, "learning_rate": 0.0001203878220208503, "loss": 4.5703, "step": 36240 }, { "epoch": 0.7423258861835235, "grad_norm": 0.29727813601493835, "learning_rate": 0.00012021866176250491, "loss": 4.5984, "step": 36250 }, { "epoch": 0.7425306657383327, "grad_norm": 0.28859925270080566, "learning_rate": 0.00012004956935156588, "loss": 4.632, "step": 36260 }, { "epoch": 0.742735445293142, "grad_norm": 0.30916011333465576, "learning_rate": 0.00011988054493183236, "loss": 4.5911, "step": 36270 }, { "epoch": 0.7429402248479512, "grad_norm": 0.3086493909358978, "learning_rate": 0.00011971158864704546, "loss": 4.6114, "step": 36280 }, { "epoch": 0.7431450044027604, "grad_norm": 0.2961837351322174, "learning_rate": 0.0001195427006408885, "loss": 4.5906, "step": 36290 }, { "epoch": 0.7433497839575697, "grad_norm": 0.32759809494018555, "learning_rate": 0.00011937388105698675, "loss": 4.6136, "step": 36300 }, { "epoch": 0.7435545635123789, "grad_norm": 0.32991155982017517, "learning_rate": 0.0001192051300389073, "loss": 4.6276, "step": 36310 }, { "epoch": 0.7437593430671882, "grad_norm": 0.30752769112586975, "learning_rate": 0.00011903644773015878, "loss": 4.5985, "step": 36320 }, { "epoch": 0.7439641226219974, "grad_norm": 0.2945796549320221, "learning_rate": 0.00011886783427419153, "loss": 4.6031, "step": 36330 }, { "epoch": 0.7441689021768066, "grad_norm": 0.29451462626457214, "learning_rate": 0.00011869928981439741, "loss": 4.5902, "step": 36340 }, { "epoch": 0.7443736817316159, "grad_norm": 0.2927614748477936, "learning_rate": 0.00011853081449410934, "loss": 4.6031, "step": 36350 }, { "epoch": 0.7445784612864251, "grad_norm": 0.31154003739356995, "learning_rate": 0.00011836240845660169, "loss": 4.6021, "step": 36360 }, { "epoch": 0.7447832408412344, "grad_norm": 0.3381774425506592, "learning_rate": 0.0001181940718450897, "loss": 4.6047, "step": 36370 }, { "epoch": 0.7449880203960436, "grad_norm": 0.30159881711006165, "learning_rate": 0.00011802580480272983, "loss": 4.5748, "step": 36380 }, { "epoch": 0.7451927999508529, "grad_norm": 0.3149918019771576, "learning_rate": 0.00011785760747261913, "loss": 4.5956, "step": 36390 }, { "epoch": 0.7453975795056621, "grad_norm": 0.3004690706729889, "learning_rate": 0.00011768947999779558, "loss": 4.5967, "step": 36400 }, { "epoch": 0.7456023590604715, "grad_norm": 0.47824764251708984, "learning_rate": 0.00011752142252123759, "loss": 4.6055, "step": 36410 }, { "epoch": 0.7458071386152807, "grad_norm": 0.3609195649623871, "learning_rate": 0.00011735343518586402, "loss": 4.5984, "step": 36420 }, { "epoch": 0.7460119181700899, "grad_norm": 0.30279165506362915, "learning_rate": 0.0001171855181345342, "loss": 4.6286, "step": 36430 }, { "epoch": 0.7462166977248992, "grad_norm": 0.32302072644233704, "learning_rate": 0.00011701767151004768, "loss": 4.668, "step": 36440 }, { "epoch": 0.7464214772797084, "grad_norm": 0.3194332420825958, "learning_rate": 0.00011684989545514408, "loss": 4.6015, "step": 36450 }, { "epoch": 0.7466262568345177, "grad_norm": 0.29721352458000183, "learning_rate": 0.00011668219011250299, "loss": 4.5754, "step": 36460 }, { "epoch": 0.7468310363893269, "grad_norm": 0.31784340739250183, "learning_rate": 0.00011651455562474385, "loss": 4.6197, "step": 36470 }, { "epoch": 0.7470358159441362, "grad_norm": 0.291959673166275, "learning_rate": 0.00011634699213442602, "loss": 4.5766, "step": 36480 }, { "epoch": 0.7472405954989454, "grad_norm": 0.30183809995651245, "learning_rate": 0.00011617949978404814, "loss": 4.5665, "step": 36490 }, { "epoch": 0.7474453750537546, "grad_norm": 0.2972424328327179, "learning_rate": 0.00011601207871604864, "loss": 4.5871, "step": 36500 }, { "epoch": 0.7476501546085639, "grad_norm": 0.28370097279548645, "learning_rate": 0.00011584472907280529, "loss": 4.6079, "step": 36510 }, { "epoch": 0.7478549341633731, "grad_norm": 0.3022497892379761, "learning_rate": 0.00011567745099663507, "loss": 4.5993, "step": 36520 }, { "epoch": 0.7480597137181824, "grad_norm": 0.3323211967945099, "learning_rate": 0.00011551024462979403, "loss": 4.6169, "step": 36530 }, { "epoch": 0.7482644932729916, "grad_norm": 0.29171788692474365, "learning_rate": 0.00011534311011447729, "loss": 4.6293, "step": 36540 }, { "epoch": 0.7484692728278008, "grad_norm": 0.2999872863292694, "learning_rate": 0.00011517604759281906, "loss": 4.5968, "step": 36550 }, { "epoch": 0.7486740523826101, "grad_norm": 0.3147103488445282, "learning_rate": 0.0001150090572068919, "loss": 4.555, "step": 36560 }, { "epoch": 0.7488788319374193, "grad_norm": 0.30209001898765564, "learning_rate": 0.00011484213909870743, "loss": 4.5927, "step": 36570 }, { "epoch": 0.7490836114922286, "grad_norm": 0.29491347074508667, "learning_rate": 0.0001146752934102156, "loss": 4.6015, "step": 36580 }, { "epoch": 0.7492883910470378, "grad_norm": 0.3398585021495819, "learning_rate": 0.00011450852028330481, "loss": 4.6068, "step": 36590 }, { "epoch": 0.7494931706018471, "grad_norm": 0.30051180720329285, "learning_rate": 0.00011434181985980182, "loss": 4.6119, "step": 36600 }, { "epoch": 0.7496979501566564, "grad_norm": 0.3194768726825714, "learning_rate": 0.00011417519228147153, "loss": 4.6278, "step": 36610 }, { "epoch": 0.7499027297114657, "grad_norm": 0.306112676858902, "learning_rate": 0.00011400863769001673, "loss": 4.5769, "step": 36620 }, { "epoch": 0.7501075092662749, "grad_norm": 0.30987900495529175, "learning_rate": 0.00011384215622707843, "loss": 4.5989, "step": 36630 }, { "epoch": 0.7503122888210841, "grad_norm": 0.3256874680519104, "learning_rate": 0.00011367574803423512, "loss": 4.5877, "step": 36640 }, { "epoch": 0.7505170683758934, "grad_norm": 0.2956451177597046, "learning_rate": 0.00011350941325300328, "loss": 4.6411, "step": 36650 }, { "epoch": 0.7507218479307026, "grad_norm": 0.32243770360946655, "learning_rate": 0.00011334315202483679, "loss": 4.6034, "step": 36660 }, { "epoch": 0.7509266274855119, "grad_norm": 0.3013778626918793, "learning_rate": 0.00011317696449112705, "loss": 4.6063, "step": 36670 }, { "epoch": 0.7511314070403211, "grad_norm": 0.307576447725296, "learning_rate": 0.00011301085079320275, "loss": 4.5892, "step": 36680 }, { "epoch": 0.7513361865951304, "grad_norm": 0.3025556802749634, "learning_rate": 0.00011284481107232991, "loss": 4.5921, "step": 36690 }, { "epoch": 0.7515409661499396, "grad_norm": 0.30304038524627686, "learning_rate": 0.00011267884546971133, "loss": 4.5788, "step": 36700 }, { "epoch": 0.7517457457047488, "grad_norm": 0.3039272427558899, "learning_rate": 0.0001125129541264871, "loss": 4.5963, "step": 36710 }, { "epoch": 0.7519505252595581, "grad_norm": 0.2991809546947479, "learning_rate": 0.00011234713718373409, "loss": 4.6294, "step": 36720 }, { "epoch": 0.7521553048143673, "grad_norm": 0.2960982322692871, "learning_rate": 0.0001121813947824657, "loss": 4.5752, "step": 36730 }, { "epoch": 0.7523600843691766, "grad_norm": 0.30794692039489746, "learning_rate": 0.00011201572706363215, "loss": 4.5912, "step": 36740 }, { "epoch": 0.7525648639239858, "grad_norm": 0.3139549791812897, "learning_rate": 0.00011185013416812012, "loss": 4.6058, "step": 36750 }, { "epoch": 0.752769643478795, "grad_norm": 0.3105529844760895, "learning_rate": 0.00011168461623675266, "loss": 4.5728, "step": 36760 }, { "epoch": 0.7529744230336043, "grad_norm": 0.3028867840766907, "learning_rate": 0.0001115191734102889, "loss": 4.5847, "step": 36770 }, { "epoch": 0.7531792025884135, "grad_norm": 0.2984515130519867, "learning_rate": 0.00011135380582942432, "loss": 4.6105, "step": 36780 }, { "epoch": 0.7533839821432228, "grad_norm": 0.3259239196777344, "learning_rate": 0.00011118851363479028, "loss": 4.5859, "step": 36790 }, { "epoch": 0.753588761698032, "grad_norm": 0.33328720927238464, "learning_rate": 0.00011102329696695414, "loss": 4.5849, "step": 36800 }, { "epoch": 0.7537935412528414, "grad_norm": 0.4639306366443634, "learning_rate": 0.00011085815596641898, "loss": 4.6112, "step": 36810 }, { "epoch": 0.7539983208076506, "grad_norm": 0.30320459604263306, "learning_rate": 0.00011069309077362338, "loss": 4.588, "step": 36820 }, { "epoch": 0.7542031003624599, "grad_norm": 0.3142356872558594, "learning_rate": 0.00011052810152894176, "loss": 4.6379, "step": 36830 }, { "epoch": 0.7544078799172691, "grad_norm": 0.30572620034217834, "learning_rate": 0.00011036318837268358, "loss": 4.5899, "step": 36840 }, { "epoch": 0.7546126594720783, "grad_norm": 0.3086054027080536, "learning_rate": 0.0001101983514450939, "loss": 4.6065, "step": 36850 }, { "epoch": 0.7548174390268876, "grad_norm": 0.29797255992889404, "learning_rate": 0.00011003359088635284, "loss": 4.6234, "step": 36860 }, { "epoch": 0.7550222185816968, "grad_norm": 0.2883145809173584, "learning_rate": 0.00010986890683657558, "loss": 4.5927, "step": 36870 }, { "epoch": 0.7552269981365061, "grad_norm": 0.2922157347202301, "learning_rate": 0.00010970429943581226, "loss": 4.6015, "step": 36880 }, { "epoch": 0.7554317776913153, "grad_norm": 0.31169426441192627, "learning_rate": 0.00010953976882404777, "loss": 4.5758, "step": 36890 }, { "epoch": 0.7556365572461246, "grad_norm": 0.3007355332374573, "learning_rate": 0.0001093753151412018, "loss": 4.6062, "step": 36900 }, { "epoch": 0.7558413368009338, "grad_norm": 0.4668189287185669, "learning_rate": 0.00010921093852712858, "loss": 4.5949, "step": 36910 }, { "epoch": 0.756046116355743, "grad_norm": 0.3131042420864105, "learning_rate": 0.00010904663912161659, "loss": 4.6125, "step": 36920 }, { "epoch": 0.7562508959105523, "grad_norm": 0.3142430782318115, "learning_rate": 0.000108882417064389, "loss": 4.6042, "step": 36930 }, { "epoch": 0.7564556754653615, "grad_norm": 0.2970556318759918, "learning_rate": 0.00010871827249510296, "loss": 4.5867, "step": 36940 }, { "epoch": 0.7566604550201708, "grad_norm": 0.28931474685668945, "learning_rate": 0.00010855420555334986, "loss": 4.6236, "step": 36950 }, { "epoch": 0.75686523457498, "grad_norm": 0.31609848141670227, "learning_rate": 0.00010839021637865498, "loss": 4.5771, "step": 36960 }, { "epoch": 0.7570700141297892, "grad_norm": 0.3069298565387726, "learning_rate": 0.00010822630511047762, "loss": 4.5832, "step": 36970 }, { "epoch": 0.7572747936845985, "grad_norm": 0.30308017134666443, "learning_rate": 0.00010806247188821052, "loss": 4.5756, "step": 36980 }, { "epoch": 0.7574795732394077, "grad_norm": 0.3147658705711365, "learning_rate": 0.00010789871685118033, "loss": 4.605, "step": 36990 }, { "epoch": 0.757684352794217, "grad_norm": 0.2971055805683136, "learning_rate": 0.00010773504013864722, "loss": 4.5754, "step": 37000 }, { "epoch": 0.7578891323490262, "grad_norm": 0.29778382182121277, "learning_rate": 0.00010757144188980441, "loss": 4.5987, "step": 37010 }, { "epoch": 0.7580939119038356, "grad_norm": 0.29821091890335083, "learning_rate": 0.0001074079222437788, "loss": 4.6008, "step": 37020 }, { "epoch": 0.7582986914586448, "grad_norm": 0.30124008655548096, "learning_rate": 0.00010724448133963023, "loss": 4.5957, "step": 37030 }, { "epoch": 0.7585034710134541, "grad_norm": 0.32034429907798767, "learning_rate": 0.00010708111931635172, "loss": 4.6259, "step": 37040 }, { "epoch": 0.7587082505682633, "grad_norm": 0.3446129262447357, "learning_rate": 0.00010691783631286895, "loss": 4.5889, "step": 37050 }, { "epoch": 0.7589130301230725, "grad_norm": 0.3032030463218689, "learning_rate": 0.00010675463246804066, "loss": 4.5788, "step": 37060 }, { "epoch": 0.7591178096778818, "grad_norm": 0.2998063266277313, "learning_rate": 0.00010659150792065813, "loss": 4.6174, "step": 37070 }, { "epoch": 0.759322589232691, "grad_norm": 0.31689971685409546, "learning_rate": 0.00010642846280944527, "loss": 4.5681, "step": 37080 }, { "epoch": 0.7595273687875003, "grad_norm": 0.6803306341171265, "learning_rate": 0.0001062654972730585, "loss": 4.6225, "step": 37090 }, { "epoch": 0.7597321483423095, "grad_norm": 0.3014300763607025, "learning_rate": 0.0001061026114500863, "loss": 4.6133, "step": 37100 }, { "epoch": 0.7599369278971188, "grad_norm": 0.30094754695892334, "learning_rate": 0.00010593980547904978, "loss": 4.5921, "step": 37110 }, { "epoch": 0.760141707451928, "grad_norm": 0.30403319001197815, "learning_rate": 0.00010577707949840165, "loss": 4.6003, "step": 37120 }, { "epoch": 0.7603464870067372, "grad_norm": 0.30160754919052124, "learning_rate": 0.00010561443364652697, "loss": 4.6264, "step": 37130 }, { "epoch": 0.7605512665615465, "grad_norm": 0.28557589650154114, "learning_rate": 0.00010545186806174254, "loss": 4.5817, "step": 37140 }, { "epoch": 0.7607560461163557, "grad_norm": 0.3045676052570343, "learning_rate": 0.00010528938288229686, "loss": 4.6053, "step": 37150 }, { "epoch": 0.760960825671165, "grad_norm": 0.2812917232513428, "learning_rate": 0.00010512697824637016, "loss": 4.6218, "step": 37160 }, { "epoch": 0.7611656052259742, "grad_norm": 0.30035391449928284, "learning_rate": 0.00010496465429207407, "loss": 4.5886, "step": 37170 }, { "epoch": 0.7613703847807834, "grad_norm": 0.36716723442077637, "learning_rate": 0.0001048024111574517, "loss": 4.5839, "step": 37180 }, { "epoch": 0.7615751643355927, "grad_norm": 0.3121277689933777, "learning_rate": 0.00010464024898047724, "loss": 4.5836, "step": 37190 }, { "epoch": 0.7617799438904019, "grad_norm": 0.3098382353782654, "learning_rate": 0.00010447816789905628, "loss": 4.5854, "step": 37200 }, { "epoch": 0.7619847234452112, "grad_norm": 0.295818030834198, "learning_rate": 0.00010431616805102523, "loss": 4.6093, "step": 37210 }, { "epoch": 0.7621895030000205, "grad_norm": 0.32165735960006714, "learning_rate": 0.00010415424957415151, "loss": 4.6063, "step": 37220 }, { "epoch": 0.7623942825548298, "grad_norm": 0.3486019968986511, "learning_rate": 0.00010399241260613344, "loss": 4.5722, "step": 37230 }, { "epoch": 0.762599062109639, "grad_norm": 0.29253706336021423, "learning_rate": 0.00010383065728459986, "loss": 4.6153, "step": 37240 }, { "epoch": 0.7628038416644483, "grad_norm": 0.31030285358428955, "learning_rate": 0.00010366898374711035, "loss": 4.5734, "step": 37250 }, { "epoch": 0.7630086212192575, "grad_norm": 0.30715376138687134, "learning_rate": 0.0001035073921311547, "loss": 4.5758, "step": 37260 }, { "epoch": 0.7632134007740667, "grad_norm": 0.302558571100235, "learning_rate": 0.00010334588257415321, "loss": 4.6307, "step": 37270 }, { "epoch": 0.763418180328876, "grad_norm": 0.2951318919658661, "learning_rate": 0.00010318445521345636, "loss": 4.6049, "step": 37280 }, { "epoch": 0.7636229598836852, "grad_norm": 0.29280537366867065, "learning_rate": 0.00010302311018634481, "loss": 4.566, "step": 37290 }, { "epoch": 0.7638277394384945, "grad_norm": 0.2905118763446808, "learning_rate": 0.00010286184763002893, "loss": 4.5883, "step": 37300 }, { "epoch": 0.7640325189933037, "grad_norm": 0.3478008210659027, "learning_rate": 0.00010270066768164927, "loss": 4.5553, "step": 37310 }, { "epoch": 0.764237298548113, "grad_norm": 0.34253203868865967, "learning_rate": 0.00010253957047827603, "loss": 4.5991, "step": 37320 }, { "epoch": 0.7644420781029222, "grad_norm": 0.30275651812553406, "learning_rate": 0.00010237855615690887, "loss": 4.5748, "step": 37330 }, { "epoch": 0.7646468576577314, "grad_norm": 0.30308985710144043, "learning_rate": 0.00010221762485447719, "loss": 4.5899, "step": 37340 }, { "epoch": 0.7648516372125407, "grad_norm": 0.28643912076950073, "learning_rate": 0.00010205677670783971, "loss": 4.5908, "step": 37350 }, { "epoch": 0.7650564167673499, "grad_norm": 0.33123040199279785, "learning_rate": 0.00010189601185378444, "loss": 4.6209, "step": 37360 }, { "epoch": 0.7652611963221592, "grad_norm": 0.32299116253852844, "learning_rate": 0.00010173533042902847, "loss": 4.5494, "step": 37370 }, { "epoch": 0.7654659758769684, "grad_norm": 0.3314318358898163, "learning_rate": 0.00010157473257021819, "loss": 4.578, "step": 37380 }, { "epoch": 0.7656707554317776, "grad_norm": 0.3351702094078064, "learning_rate": 0.00010141421841392853, "loss": 4.6043, "step": 37390 }, { "epoch": 0.7658755349865869, "grad_norm": 0.3096257150173187, "learning_rate": 0.00010125378809666361, "loss": 4.593, "step": 37400 }, { "epoch": 0.7660803145413961, "grad_norm": 0.33093225955963135, "learning_rate": 0.00010109344175485602, "loss": 4.5737, "step": 37410 }, { "epoch": 0.7662850940962055, "grad_norm": 0.2949813902378082, "learning_rate": 0.00010093317952486701, "loss": 4.5586, "step": 37420 }, { "epoch": 0.7664898736510147, "grad_norm": 0.306266725063324, "learning_rate": 0.00010077300154298634, "loss": 4.594, "step": 37430 }, { "epoch": 0.766694653205824, "grad_norm": 0.3152048587799072, "learning_rate": 0.0001006129079454321, "loss": 4.5772, "step": 37440 }, { "epoch": 0.7668994327606332, "grad_norm": 0.32708024978637695, "learning_rate": 0.00010045289886835058, "loss": 4.6239, "step": 37450 }, { "epoch": 0.7671042123154425, "grad_norm": 0.30479657649993896, "learning_rate": 0.00010029297444781631, "loss": 4.6122, "step": 37460 }, { "epoch": 0.7673089918702517, "grad_norm": 0.30367597937583923, "learning_rate": 0.00010013313481983155, "loss": 4.5983, "step": 37470 }, { "epoch": 0.7675137714250609, "grad_norm": 0.3227415382862091, "learning_rate": 9.997338012032678e-05, "loss": 4.5872, "step": 37480 }, { "epoch": 0.7677185509798702, "grad_norm": 0.3020555078983307, "learning_rate": 9.981371048516015e-05, "loss": 4.5939, "step": 37490 }, { "epoch": 0.7679233305346794, "grad_norm": 0.31975632905960083, "learning_rate": 9.965412605011728e-05, "loss": 4.6055, "step": 37500 }, { "epoch": 0.7681281100894887, "grad_norm": 0.3118675649166107, "learning_rate": 9.949462695091156e-05, "loss": 4.5831, "step": 37510 }, { "epoch": 0.7683328896442979, "grad_norm": 0.30988624691963196, "learning_rate": 9.933521332318374e-05, "loss": 4.6069, "step": 37520 }, { "epoch": 0.7685376691991072, "grad_norm": 0.3117412328720093, "learning_rate": 9.917588530250195e-05, "loss": 4.5924, "step": 37530 }, { "epoch": 0.7687424487539164, "grad_norm": 0.30757349729537964, "learning_rate": 9.901664302436131e-05, "loss": 4.5811, "step": 37540 }, { "epoch": 0.7689472283087256, "grad_norm": 0.397675096988678, "learning_rate": 9.885748662418423e-05, "loss": 4.6202, "step": 37550 }, { "epoch": 0.7691520078635349, "grad_norm": 0.37244829535484314, "learning_rate": 9.869841623732001e-05, "loss": 4.6222, "step": 37560 }, { "epoch": 0.7693567874183441, "grad_norm": 0.2990577518939972, "learning_rate": 9.853943199904483e-05, "loss": 4.5844, "step": 37570 }, { "epoch": 0.7695615669731534, "grad_norm": 0.30558040738105774, "learning_rate": 9.838053404456163e-05, "loss": 4.5972, "step": 37580 }, { "epoch": 0.7697663465279626, "grad_norm": 0.30110323429107666, "learning_rate": 9.822172250899982e-05, "loss": 4.6029, "step": 37590 }, { "epoch": 0.7699711260827719, "grad_norm": 0.3397175371646881, "learning_rate": 9.806299752741557e-05, "loss": 4.5866, "step": 37600 }, { "epoch": 0.7701759056375811, "grad_norm": 0.31180909276008606, "learning_rate": 9.790435923479113e-05, "loss": 4.5937, "step": 37610 }, { "epoch": 0.7703806851923904, "grad_norm": 0.3076137900352478, "learning_rate": 9.77458077660353e-05, "loss": 4.5775, "step": 37620 }, { "epoch": 0.7705854647471997, "grad_norm": 0.3023734986782074, "learning_rate": 9.758734325598294e-05, "loss": 4.6057, "step": 37630 }, { "epoch": 0.7707902443020089, "grad_norm": 0.3215528428554535, "learning_rate": 9.742896583939494e-05, "loss": 4.5711, "step": 37640 }, { "epoch": 0.7709950238568182, "grad_norm": 0.3696591854095459, "learning_rate": 9.72706756509582e-05, "loss": 4.5684, "step": 37650 }, { "epoch": 0.7711998034116274, "grad_norm": 0.3098335862159729, "learning_rate": 9.711247282528533e-05, "loss": 4.5918, "step": 37660 }, { "epoch": 0.7714045829664367, "grad_norm": 0.2871721088886261, "learning_rate": 9.695435749691481e-05, "loss": 4.6285, "step": 37670 }, { "epoch": 0.7716093625212459, "grad_norm": 0.3242889940738678, "learning_rate": 9.679632980031053e-05, "loss": 4.5914, "step": 37680 }, { "epoch": 0.7718141420760551, "grad_norm": 0.2907813787460327, "learning_rate": 9.663838986986182e-05, "loss": 4.6088, "step": 37690 }, { "epoch": 0.7720189216308644, "grad_norm": 0.3036256432533264, "learning_rate": 9.64805378398836e-05, "loss": 4.5727, "step": 37700 }, { "epoch": 0.7722237011856736, "grad_norm": 0.3210693597793579, "learning_rate": 9.63227738446159e-05, "loss": 4.5851, "step": 37710 }, { "epoch": 0.7724284807404829, "grad_norm": 0.2903750538825989, "learning_rate": 9.616509801822391e-05, "loss": 4.6, "step": 37720 }, { "epoch": 0.7726332602952921, "grad_norm": 0.32857513427734375, "learning_rate": 9.600751049479783e-05, "loss": 4.5819, "step": 37730 }, { "epoch": 0.7728380398501014, "grad_norm": 0.3713500499725342, "learning_rate": 9.585001140835283e-05, "loss": 4.5911, "step": 37740 }, { "epoch": 0.7730428194049106, "grad_norm": 0.29100435972213745, "learning_rate": 9.56926008928287e-05, "loss": 4.6329, "step": 37750 }, { "epoch": 0.7732475989597198, "grad_norm": 0.3058205544948578, "learning_rate": 9.553527908209005e-05, "loss": 4.5956, "step": 37760 }, { "epoch": 0.7734523785145291, "grad_norm": 0.3001843988895416, "learning_rate": 9.53780461099261e-05, "loss": 4.5968, "step": 37770 }, { "epoch": 0.7736571580693383, "grad_norm": 0.36424750089645386, "learning_rate": 9.522090211005034e-05, "loss": 4.5813, "step": 37780 }, { "epoch": 0.7738619376241476, "grad_norm": 0.30522090196609497, "learning_rate": 9.506384721610074e-05, "loss": 4.57, "step": 37790 }, { "epoch": 0.7740667171789568, "grad_norm": 0.3051900863647461, "learning_rate": 9.490688156163943e-05, "loss": 4.601, "step": 37800 }, { "epoch": 0.774271496733766, "grad_norm": 0.29674139618873596, "learning_rate": 9.475000528015279e-05, "loss": 4.6044, "step": 37810 }, { "epoch": 0.7744762762885753, "grad_norm": 0.32481276988983154, "learning_rate": 9.459321850505088e-05, "loss": 4.6168, "step": 37820 }, { "epoch": 0.7746810558433846, "grad_norm": 0.32346010208129883, "learning_rate": 9.443652136966797e-05, "loss": 4.6014, "step": 37830 }, { "epoch": 0.7748858353981939, "grad_norm": 0.30109137296676636, "learning_rate": 9.427991400726191e-05, "loss": 4.5801, "step": 37840 }, { "epoch": 0.7750906149530031, "grad_norm": 0.2935716509819031, "learning_rate": 9.412339655101432e-05, "loss": 4.5461, "step": 37850 }, { "epoch": 0.7752953945078124, "grad_norm": 0.34225064516067505, "learning_rate": 9.396696913403034e-05, "loss": 4.5706, "step": 37860 }, { "epoch": 0.7755001740626216, "grad_norm": 0.3051619529724121, "learning_rate": 9.381063188933836e-05, "loss": 4.6069, "step": 37870 }, { "epoch": 0.7757049536174309, "grad_norm": 0.30239740014076233, "learning_rate": 9.365438494989039e-05, "loss": 4.5534, "step": 37880 }, { "epoch": 0.7759097331722401, "grad_norm": 0.31871289014816284, "learning_rate": 9.349822844856133e-05, "loss": 4.5807, "step": 37890 }, { "epoch": 0.7761145127270493, "grad_norm": 0.31296980381011963, "learning_rate": 9.334216251814943e-05, "loss": 4.5859, "step": 37900 }, { "epoch": 0.7763192922818586, "grad_norm": 0.295104444026947, "learning_rate": 9.318618729137574e-05, "loss": 4.6118, "step": 37910 }, { "epoch": 0.7765240718366678, "grad_norm": 0.2946823835372925, "learning_rate": 9.30303029008843e-05, "loss": 4.5727, "step": 37920 }, { "epoch": 0.7767288513914771, "grad_norm": 0.42669346928596497, "learning_rate": 9.287450947924183e-05, "loss": 4.6105, "step": 37930 }, { "epoch": 0.7769336309462863, "grad_norm": 0.3017657399177551, "learning_rate": 9.27188071589377e-05, "loss": 4.5842, "step": 37940 }, { "epoch": 0.7771384105010956, "grad_norm": 0.3124864399433136, "learning_rate": 9.25631960723839e-05, "loss": 4.5936, "step": 37950 }, { "epoch": 0.7773431900559048, "grad_norm": 0.30705711245536804, "learning_rate": 9.240767635191459e-05, "loss": 4.585, "step": 37960 }, { "epoch": 0.777547969610714, "grad_norm": 0.3201311230659485, "learning_rate": 9.225224812978652e-05, "loss": 4.5776, "step": 37970 }, { "epoch": 0.7777527491655233, "grad_norm": 0.31811508536338806, "learning_rate": 9.209691153817835e-05, "loss": 4.591, "step": 37980 }, { "epoch": 0.7779575287203325, "grad_norm": 0.30487823486328125, "learning_rate": 9.194166670919105e-05, "loss": 4.6108, "step": 37990 }, { "epoch": 0.7781623082751418, "grad_norm": 0.29707804322242737, "learning_rate": 9.178651377484742e-05, "loss": 4.591, "step": 38000 }, { "epoch": 0.7781623082751418, "eval_loss": 4.590142726898193, "eval_runtime": 4.4005, "eval_samples_per_second": 264.969, "eval_steps_per_second": 33.178, "step": 38000 }, { "epoch": 0.778367087829951, "grad_norm": 0.3073808252811432, "learning_rate": 9.163145286709218e-05, "loss": 4.5674, "step": 38010 }, { "epoch": 0.7785718673847603, "grad_norm": 0.28783494234085083, "learning_rate": 9.147648411779186e-05, "loss": 4.6029, "step": 38020 }, { "epoch": 0.7787766469395696, "grad_norm": 0.3142697215080261, "learning_rate": 9.132160765873433e-05, "loss": 4.5866, "step": 38030 }, { "epoch": 0.7789814264943788, "grad_norm": 0.31494787335395813, "learning_rate": 9.116682362162925e-05, "loss": 4.5598, "step": 38040 }, { "epoch": 0.7791862060491881, "grad_norm": 0.6348517537117004, "learning_rate": 9.10121321381076e-05, "loss": 4.5746, "step": 38050 }, { "epoch": 0.7793909856039973, "grad_norm": 0.3252737522125244, "learning_rate": 9.085753333972174e-05, "loss": 4.5977, "step": 38060 }, { "epoch": 0.7795957651588066, "grad_norm": 0.31998464465141296, "learning_rate": 9.070302735794493e-05, "loss": 4.5725, "step": 38070 }, { "epoch": 0.7798005447136158, "grad_norm": 0.2928812801837921, "learning_rate": 9.054861432417179e-05, "loss": 4.5628, "step": 38080 }, { "epoch": 0.7800053242684251, "grad_norm": 0.29339835047721863, "learning_rate": 9.039429436971783e-05, "loss": 4.5714, "step": 38090 }, { "epoch": 0.7802101038232343, "grad_norm": 0.3693881630897522, "learning_rate": 9.024006762581924e-05, "loss": 4.6113, "step": 38100 }, { "epoch": 0.7804148833780435, "grad_norm": 0.32775619626045227, "learning_rate": 9.008593422363316e-05, "loss": 4.537, "step": 38110 }, { "epoch": 0.7806196629328528, "grad_norm": 0.3084632456302643, "learning_rate": 8.993189429423723e-05, "loss": 4.5917, "step": 38120 }, { "epoch": 0.780824442487662, "grad_norm": 0.29846644401550293, "learning_rate": 8.97779479686296e-05, "loss": 4.563, "step": 38130 }, { "epoch": 0.7810292220424713, "grad_norm": 0.30271339416503906, "learning_rate": 8.962409537772886e-05, "loss": 4.5815, "step": 38140 }, { "epoch": 0.7812340015972805, "grad_norm": 0.345956027507782, "learning_rate": 8.947033665237398e-05, "loss": 4.5569, "step": 38150 }, { "epoch": 0.7814387811520898, "grad_norm": 0.3943561613559723, "learning_rate": 8.931667192332389e-05, "loss": 4.5851, "step": 38160 }, { "epoch": 0.781643560706899, "grad_norm": 0.3255510628223419, "learning_rate": 8.91631013212576e-05, "loss": 4.587, "step": 38170 }, { "epoch": 0.7818483402617082, "grad_norm": 0.34418126940727234, "learning_rate": 8.900962497677426e-05, "loss": 4.588, "step": 38180 }, { "epoch": 0.7820531198165175, "grad_norm": 0.35830941796302795, "learning_rate": 8.885624302039275e-05, "loss": 4.6346, "step": 38190 }, { "epoch": 0.7822578993713267, "grad_norm": 0.4159878194332123, "learning_rate": 8.870295558255168e-05, "loss": 4.6069, "step": 38200 }, { "epoch": 0.782462678926136, "grad_norm": 0.3376203179359436, "learning_rate": 8.854976279360932e-05, "loss": 4.5778, "step": 38210 }, { "epoch": 0.7826674584809452, "grad_norm": 0.3103373944759369, "learning_rate": 8.839666478384341e-05, "loss": 4.5563, "step": 38220 }, { "epoch": 0.7828722380357546, "grad_norm": 0.2928243577480316, "learning_rate": 8.824366168345117e-05, "loss": 4.5848, "step": 38230 }, { "epoch": 0.7830770175905638, "grad_norm": 0.31541907787323, "learning_rate": 8.80907536225489e-05, "loss": 4.6161, "step": 38240 }, { "epoch": 0.783281797145373, "grad_norm": 0.2942469120025635, "learning_rate": 8.793794073117235e-05, "loss": 4.5592, "step": 38250 }, { "epoch": 0.7834865767001823, "grad_norm": 0.643306314945221, "learning_rate": 8.778522313927606e-05, "loss": 4.5243, "step": 38260 }, { "epoch": 0.7836913562549915, "grad_norm": 0.3006947636604309, "learning_rate": 8.763260097673374e-05, "loss": 4.5895, "step": 38270 }, { "epoch": 0.7838961358098008, "grad_norm": 0.3057100176811218, "learning_rate": 8.748007437333787e-05, "loss": 4.5599, "step": 38280 }, { "epoch": 0.78410091536461, "grad_norm": 0.3289993703365326, "learning_rate": 8.73276434587997e-05, "loss": 4.5967, "step": 38290 }, { "epoch": 0.7843056949194193, "grad_norm": 0.30016857385635376, "learning_rate": 8.717530836274908e-05, "loss": 4.5848, "step": 38300 }, { "epoch": 0.7845104744742285, "grad_norm": 0.3119611144065857, "learning_rate": 8.702306921473424e-05, "loss": 4.6063, "step": 38310 }, { "epoch": 0.7847152540290377, "grad_norm": 0.3316038250923157, "learning_rate": 8.687092614422205e-05, "loss": 4.5819, "step": 38320 }, { "epoch": 0.784920033583847, "grad_norm": 0.3957536518573761, "learning_rate": 8.671887928059754e-05, "loss": 4.608, "step": 38330 }, { "epoch": 0.7851248131386562, "grad_norm": 0.322087824344635, "learning_rate": 8.656692875316401e-05, "loss": 4.6024, "step": 38340 }, { "epoch": 0.7853295926934655, "grad_norm": 0.33519017696380615, "learning_rate": 8.641507469114263e-05, "loss": 4.588, "step": 38350 }, { "epoch": 0.7855343722482747, "grad_norm": 0.3098636865615845, "learning_rate": 8.626331722367276e-05, "loss": 4.58, "step": 38360 }, { "epoch": 0.785739151803084, "grad_norm": 0.2896534204483032, "learning_rate": 8.611165647981161e-05, "loss": 4.5414, "step": 38370 }, { "epoch": 0.7859439313578932, "grad_norm": 0.28859734535217285, "learning_rate": 8.596009258853388e-05, "loss": 4.5305, "step": 38380 }, { "epoch": 0.7861487109127024, "grad_norm": 0.3139727711677551, "learning_rate": 8.580862567873216e-05, "loss": 4.5886, "step": 38390 }, { "epoch": 0.7863534904675117, "grad_norm": 0.30453288555145264, "learning_rate": 8.565725587921651e-05, "loss": 4.5885, "step": 38400 }, { "epoch": 0.7865582700223209, "grad_norm": 0.3050139844417572, "learning_rate": 8.550598331871435e-05, "loss": 4.5737, "step": 38410 }, { "epoch": 0.7867630495771302, "grad_norm": 0.31867069005966187, "learning_rate": 8.535480812587047e-05, "loss": 4.5477, "step": 38420 }, { "epoch": 0.7869678291319395, "grad_norm": 0.3078056573867798, "learning_rate": 8.520373042924688e-05, "loss": 4.5869, "step": 38430 }, { "epoch": 0.7871726086867488, "grad_norm": 0.31189337372779846, "learning_rate": 8.505275035732246e-05, "loss": 4.5967, "step": 38440 }, { "epoch": 0.787377388241558, "grad_norm": 0.30547335743904114, "learning_rate": 8.49018680384934e-05, "loss": 4.6224, "step": 38450 }, { "epoch": 0.7875821677963672, "grad_norm": 0.31830891966819763, "learning_rate": 8.475108360107242e-05, "loss": 4.5486, "step": 38460 }, { "epoch": 0.7877869473511765, "grad_norm": 0.3292132318019867, "learning_rate": 8.460039717328928e-05, "loss": 4.5746, "step": 38470 }, { "epoch": 0.7879917269059857, "grad_norm": 0.30216705799102783, "learning_rate": 8.444980888329024e-05, "loss": 4.5959, "step": 38480 }, { "epoch": 0.788196506460795, "grad_norm": 0.31861424446105957, "learning_rate": 8.429931885913815e-05, "loss": 4.5709, "step": 38490 }, { "epoch": 0.7884012860156042, "grad_norm": 0.29862529039382935, "learning_rate": 8.414892722881231e-05, "loss": 4.5941, "step": 38500 }, { "epoch": 0.7886060655704135, "grad_norm": 0.33931490778923035, "learning_rate": 8.399863412020836e-05, "loss": 4.5432, "step": 38510 }, { "epoch": 0.7888108451252227, "grad_norm": 0.33243846893310547, "learning_rate": 8.3848439661138e-05, "loss": 4.5719, "step": 38520 }, { "epoch": 0.789015624680032, "grad_norm": 0.3161660134792328, "learning_rate": 8.369834397932919e-05, "loss": 4.6025, "step": 38530 }, { "epoch": 0.7892204042348412, "grad_norm": 0.3273294270038605, "learning_rate": 8.354834720242594e-05, "loss": 4.5742, "step": 38540 }, { "epoch": 0.7894251837896504, "grad_norm": 0.34032896161079407, "learning_rate": 8.339844945798793e-05, "loss": 4.5872, "step": 38550 }, { "epoch": 0.7896299633444597, "grad_norm": 0.30530282855033875, "learning_rate": 8.32486508734908e-05, "loss": 4.5377, "step": 38560 }, { "epoch": 0.7898347428992689, "grad_norm": 0.30698108673095703, "learning_rate": 8.309895157632582e-05, "loss": 4.5683, "step": 38570 }, { "epoch": 0.7900395224540782, "grad_norm": 0.3153245151042938, "learning_rate": 8.294935169379989e-05, "loss": 4.5721, "step": 38580 }, { "epoch": 0.7902443020088874, "grad_norm": 0.3361703157424927, "learning_rate": 8.279985135313514e-05, "loss": 4.5559, "step": 38590 }, { "epoch": 0.7904490815636966, "grad_norm": 0.37249207496643066, "learning_rate": 8.26504506814693e-05, "loss": 4.5314, "step": 38600 }, { "epoch": 0.7906538611185059, "grad_norm": 0.32031577825546265, "learning_rate": 8.250114980585521e-05, "loss": 4.5006, "step": 38610 }, { "epoch": 0.7908586406733151, "grad_norm": 0.3019481897354126, "learning_rate": 8.235194885326088e-05, "loss": 4.5888, "step": 38620 }, { "epoch": 0.7910634202281244, "grad_norm": 0.30363279581069946, "learning_rate": 8.220284795056943e-05, "loss": 4.5844, "step": 38630 }, { "epoch": 0.7912681997829337, "grad_norm": 0.317623496055603, "learning_rate": 8.205384722457863e-05, "loss": 4.596, "step": 38640 }, { "epoch": 0.791472979337743, "grad_norm": 0.31147992610931396, "learning_rate": 8.190494680200138e-05, "loss": 4.5856, "step": 38650 }, { "epoch": 0.7916777588925522, "grad_norm": 0.3023722171783447, "learning_rate": 8.175614680946502e-05, "loss": 4.5863, "step": 38660 }, { "epoch": 0.7918825384473615, "grad_norm": 0.3793337345123291, "learning_rate": 8.160744737351163e-05, "loss": 4.57, "step": 38670 }, { "epoch": 0.7920873180021707, "grad_norm": 0.3383292555809021, "learning_rate": 8.145884862059773e-05, "loss": 4.5943, "step": 38680 }, { "epoch": 0.7922920975569799, "grad_norm": 0.35156553983688354, "learning_rate": 8.131035067709425e-05, "loss": 4.5936, "step": 38690 }, { "epoch": 0.7924968771117892, "grad_norm": 0.32815688848495483, "learning_rate": 8.116195366928637e-05, "loss": 4.6, "step": 38700 }, { "epoch": 0.7927016566665984, "grad_norm": 0.32850661873817444, "learning_rate": 8.101365772337341e-05, "loss": 4.6056, "step": 38710 }, { "epoch": 0.7929064362214077, "grad_norm": 0.31346115469932556, "learning_rate": 8.086546296546884e-05, "loss": 4.5885, "step": 38720 }, { "epoch": 0.7931112157762169, "grad_norm": 0.327250212430954, "learning_rate": 8.071736952159988e-05, "loss": 4.6033, "step": 38730 }, { "epoch": 0.7933159953310261, "grad_norm": 0.31105944514274597, "learning_rate": 8.056937751770782e-05, "loss": 4.5954, "step": 38740 }, { "epoch": 0.7935207748858354, "grad_norm": 0.384827196598053, "learning_rate": 8.042148707964746e-05, "loss": 4.5761, "step": 38750 }, { "epoch": 0.7937255544406446, "grad_norm": 0.44899365305900574, "learning_rate": 8.027369833318739e-05, "loss": 4.6025, "step": 38760 }, { "epoch": 0.7939303339954539, "grad_norm": 0.30384767055511475, "learning_rate": 8.012601140400968e-05, "loss": 4.589, "step": 38770 }, { "epoch": 0.7941351135502631, "grad_norm": 0.29325801134109497, "learning_rate": 7.997842641770983e-05, "loss": 4.5984, "step": 38780 }, { "epoch": 0.7943398931050724, "grad_norm": 0.2913464903831482, "learning_rate": 7.983094349979666e-05, "loss": 4.5825, "step": 38790 }, { "epoch": 0.7945446726598816, "grad_norm": 0.3160167634487152, "learning_rate": 7.968356277569202e-05, "loss": 4.5716, "step": 38800 }, { "epoch": 0.7947494522146908, "grad_norm": 0.3500344157218933, "learning_rate": 7.953628437073108e-05, "loss": 4.589, "step": 38810 }, { "epoch": 0.7949542317695001, "grad_norm": 0.30142685770988464, "learning_rate": 7.938910841016185e-05, "loss": 4.6059, "step": 38820 }, { "epoch": 0.7951590113243093, "grad_norm": 0.33031365275382996, "learning_rate": 7.92420350191454e-05, "loss": 4.5984, "step": 38830 }, { "epoch": 0.7953637908791187, "grad_norm": 0.3248433470726013, "learning_rate": 7.909506432275524e-05, "loss": 4.5974, "step": 38840 }, { "epoch": 0.7955685704339279, "grad_norm": 0.30029308795928955, "learning_rate": 7.894819644597784e-05, "loss": 4.6273, "step": 38850 }, { "epoch": 0.7957733499887372, "grad_norm": 0.3154873549938202, "learning_rate": 7.880143151371225e-05, "loss": 4.577, "step": 38860 }, { "epoch": 0.7959781295435464, "grad_norm": 0.38484513759613037, "learning_rate": 7.865476965076965e-05, "loss": 4.5998, "step": 38870 }, { "epoch": 0.7961829090983557, "grad_norm": 0.3259485363960266, "learning_rate": 7.850821098187391e-05, "loss": 4.5566, "step": 38880 }, { "epoch": 0.7963876886531649, "grad_norm": 0.3180842995643616, "learning_rate": 7.836175563166099e-05, "loss": 4.5578, "step": 38890 }, { "epoch": 0.7965924682079741, "grad_norm": 0.30802080035209656, "learning_rate": 7.821540372467901e-05, "loss": 4.5905, "step": 38900 }, { "epoch": 0.7967972477627834, "grad_norm": 0.30659064650535583, "learning_rate": 7.806915538538812e-05, "loss": 4.5798, "step": 38910 }, { "epoch": 0.7970020273175926, "grad_norm": 0.30987635254859924, "learning_rate": 7.792301073816049e-05, "loss": 4.5848, "step": 38920 }, { "epoch": 0.7972068068724019, "grad_norm": 0.30157986283302307, "learning_rate": 7.777696990727992e-05, "loss": 4.5751, "step": 38930 }, { "epoch": 0.7974115864272111, "grad_norm": 0.3000452518463135, "learning_rate": 7.763103301694193e-05, "loss": 4.6118, "step": 38940 }, { "epoch": 0.7976163659820203, "grad_norm": 0.3065103590488434, "learning_rate": 7.74852001912538e-05, "loss": 4.5769, "step": 38950 }, { "epoch": 0.7978211455368296, "grad_norm": 0.31863853335380554, "learning_rate": 7.733947155423429e-05, "loss": 4.6016, "step": 38960 }, { "epoch": 0.7980259250916388, "grad_norm": 0.321843683719635, "learning_rate": 7.719384722981349e-05, "loss": 4.5946, "step": 38970 }, { "epoch": 0.7982307046464481, "grad_norm": 0.33706068992614746, "learning_rate": 7.704832734183276e-05, "loss": 4.5924, "step": 38980 }, { "epoch": 0.7984354842012573, "grad_norm": 0.32501912117004395, "learning_rate": 7.690291201404476e-05, "loss": 4.559, "step": 38990 }, { "epoch": 0.7986402637560666, "grad_norm": 0.3753548860549927, "learning_rate": 7.675760137011317e-05, "loss": 4.5857, "step": 39000 }, { "epoch": 0.7988450433108758, "grad_norm": 0.36469289660453796, "learning_rate": 7.66123955336125e-05, "loss": 4.5677, "step": 39010 }, { "epoch": 0.799049822865685, "grad_norm": 0.31120210886001587, "learning_rate": 7.646729462802842e-05, "loss": 4.5682, "step": 39020 }, { "epoch": 0.7992546024204943, "grad_norm": 0.320067822933197, "learning_rate": 7.632229877675706e-05, "loss": 4.5813, "step": 39030 }, { "epoch": 0.7994593819753036, "grad_norm": 0.3053686320781708, "learning_rate": 7.617740810310543e-05, "loss": 4.6089, "step": 39040 }, { "epoch": 0.7996641615301129, "grad_norm": 0.31952452659606934, "learning_rate": 7.603262273029099e-05, "loss": 4.5673, "step": 39050 }, { "epoch": 0.7998689410849221, "grad_norm": 0.8099822998046875, "learning_rate": 7.588794278144173e-05, "loss": 4.5772, "step": 39060 }, { "epoch": 0.8000737206397314, "grad_norm": 0.3223879635334015, "learning_rate": 7.574336837959602e-05, "loss": 4.6052, "step": 39070 }, { "epoch": 0.8002785001945406, "grad_norm": 0.30738234519958496, "learning_rate": 7.559889964770216e-05, "loss": 4.5682, "step": 39080 }, { "epoch": 0.8004832797493499, "grad_norm": 0.30835387110710144, "learning_rate": 7.545453670861897e-05, "loss": 4.5833, "step": 39090 }, { "epoch": 0.8006880593041591, "grad_norm": 0.29173022508621216, "learning_rate": 7.531027968511512e-05, "loss": 4.633, "step": 39100 }, { "epoch": 0.8008928388589683, "grad_norm": 0.30273181200027466, "learning_rate": 7.516612869986932e-05, "loss": 4.5468, "step": 39110 }, { "epoch": 0.8010976184137776, "grad_norm": 0.3193420171737671, "learning_rate": 7.502208387546986e-05, "loss": 4.5844, "step": 39120 }, { "epoch": 0.8013023979685868, "grad_norm": 0.29910337924957275, "learning_rate": 7.487814533441498e-05, "loss": 4.5595, "step": 39130 }, { "epoch": 0.8015071775233961, "grad_norm": 0.2982931435108185, "learning_rate": 7.47343131991125e-05, "loss": 4.5478, "step": 39140 }, { "epoch": 0.8017119570782053, "grad_norm": 0.3038698732852936, "learning_rate": 7.459058759187964e-05, "loss": 4.5984, "step": 39150 }, { "epoch": 0.8019167366330145, "grad_norm": 0.3017473816871643, "learning_rate": 7.444696863494314e-05, "loss": 4.5882, "step": 39160 }, { "epoch": 0.8021215161878238, "grad_norm": 0.39330267906188965, "learning_rate": 7.430345645043899e-05, "loss": 4.6182, "step": 39170 }, { "epoch": 0.802326295742633, "grad_norm": 0.305624783039093, "learning_rate": 7.416005116041241e-05, "loss": 4.6245, "step": 39180 }, { "epoch": 0.8025310752974423, "grad_norm": 0.33081772923469543, "learning_rate": 7.401675288681769e-05, "loss": 4.5942, "step": 39190 }, { "epoch": 0.8027358548522515, "grad_norm": 0.29383283853530884, "learning_rate": 7.387356175151816e-05, "loss": 4.595, "step": 39200 }, { "epoch": 0.8029406344070608, "grad_norm": 0.2955620288848877, "learning_rate": 7.37304778762859e-05, "loss": 4.5823, "step": 39210 }, { "epoch": 0.80314541396187, "grad_norm": 0.33134540915489197, "learning_rate": 7.358750138280199e-05, "loss": 4.568, "step": 39220 }, { "epoch": 0.8033501935166792, "grad_norm": 0.31039419770240784, "learning_rate": 7.344463239265595e-05, "loss": 4.6131, "step": 39230 }, { "epoch": 0.8035549730714885, "grad_norm": 0.3270411789417267, "learning_rate": 7.330187102734603e-05, "loss": 4.5861, "step": 39240 }, { "epoch": 0.8037597526262978, "grad_norm": 0.3229215443134308, "learning_rate": 7.315921740827896e-05, "loss": 4.5928, "step": 39250 }, { "epoch": 0.8039645321811071, "grad_norm": 0.33217817544937134, "learning_rate": 7.301667165676977e-05, "loss": 4.5854, "step": 39260 }, { "epoch": 0.8041693117359163, "grad_norm": 0.3282391130924225, "learning_rate": 7.28742338940418e-05, "loss": 4.5349, "step": 39270 }, { "epoch": 0.8043740912907256, "grad_norm": 0.2931309938430786, "learning_rate": 7.273190424122665e-05, "loss": 4.5938, "step": 39280 }, { "epoch": 0.8045788708455348, "grad_norm": 0.3457520008087158, "learning_rate": 7.258968281936367e-05, "loss": 4.5426, "step": 39290 }, { "epoch": 0.804783650400344, "grad_norm": 0.5809755921363831, "learning_rate": 7.24475697494005e-05, "loss": 4.5743, "step": 39300 }, { "epoch": 0.8049884299551533, "grad_norm": 0.30670252442359924, "learning_rate": 7.230556515219257e-05, "loss": 4.5955, "step": 39310 }, { "epoch": 0.8051932095099625, "grad_norm": 0.30223187804222107, "learning_rate": 7.216366914850285e-05, "loss": 4.5722, "step": 39320 }, { "epoch": 0.8053979890647718, "grad_norm": 0.29483914375305176, "learning_rate": 7.202188185900221e-05, "loss": 4.5918, "step": 39330 }, { "epoch": 0.805602768619581, "grad_norm": 0.2921166718006134, "learning_rate": 7.188020340426893e-05, "loss": 4.5968, "step": 39340 }, { "epoch": 0.8058075481743903, "grad_norm": 0.3002696633338928, "learning_rate": 7.173863390478887e-05, "loss": 4.571, "step": 39350 }, { "epoch": 0.8060123277291995, "grad_norm": 0.29883912205696106, "learning_rate": 7.159717348095501e-05, "loss": 4.5579, "step": 39360 }, { "epoch": 0.8062171072840087, "grad_norm": 0.29541438817977905, "learning_rate": 7.145582225306774e-05, "loss": 4.5117, "step": 39370 }, { "epoch": 0.806421886838818, "grad_norm": 0.32466834783554077, "learning_rate": 7.131458034133457e-05, "loss": 4.602, "step": 39380 }, { "epoch": 0.8066266663936272, "grad_norm": 0.2947414219379425, "learning_rate": 7.117344786587003e-05, "loss": 4.5947, "step": 39390 }, { "epoch": 0.8068314459484365, "grad_norm": 0.3411257565021515, "learning_rate": 7.103242494669562e-05, "loss": 4.5912, "step": 39400 }, { "epoch": 0.8070362255032457, "grad_norm": 0.31953033804893494, "learning_rate": 7.089151170373952e-05, "loss": 4.5708, "step": 39410 }, { "epoch": 0.807241005058055, "grad_norm": 0.32291150093078613, "learning_rate": 7.075070825683684e-05, "loss": 4.5931, "step": 39420 }, { "epoch": 0.8074457846128642, "grad_norm": 0.31212931871414185, "learning_rate": 7.061001472572914e-05, "loss": 4.5684, "step": 39430 }, { "epoch": 0.8076505641676734, "grad_norm": 0.30312198400497437, "learning_rate": 7.046943123006464e-05, "loss": 4.6063, "step": 39440 }, { "epoch": 0.8078553437224828, "grad_norm": 0.2988637685775757, "learning_rate": 7.032895788939791e-05, "loss": 4.562, "step": 39450 }, { "epoch": 0.808060123277292, "grad_norm": 0.3108832538127899, "learning_rate": 7.018859482318989e-05, "loss": 4.5668, "step": 39460 }, { "epoch": 0.8082649028321013, "grad_norm": 0.30393844842910767, "learning_rate": 7.004834215080773e-05, "loss": 4.59, "step": 39470 }, { "epoch": 0.8084696823869105, "grad_norm": 0.3082975745201111, "learning_rate": 6.990819999152467e-05, "loss": 4.5609, "step": 39480 }, { "epoch": 0.8086744619417198, "grad_norm": 0.34625446796417236, "learning_rate": 6.976816846452003e-05, "loss": 4.5768, "step": 39490 }, { "epoch": 0.808879241496529, "grad_norm": 0.2989501953125, "learning_rate": 6.96282476888789e-05, "loss": 4.5678, "step": 39500 }, { "epoch": 0.8090840210513383, "grad_norm": 0.3937240540981293, "learning_rate": 6.948843778359241e-05, "loss": 4.567, "step": 39510 }, { "epoch": 0.8092888006061475, "grad_norm": 0.32027167081832886, "learning_rate": 6.934873886755717e-05, "loss": 4.5707, "step": 39520 }, { "epoch": 0.8094935801609567, "grad_norm": 0.3013392984867096, "learning_rate": 6.920915105957555e-05, "loss": 4.5628, "step": 39530 }, { "epoch": 0.809698359715766, "grad_norm": 0.3077249825000763, "learning_rate": 6.90696744783554e-05, "loss": 4.5951, "step": 39540 }, { "epoch": 0.8099031392705752, "grad_norm": 0.3117411434650421, "learning_rate": 6.893030924250996e-05, "loss": 4.56, "step": 39550 }, { "epoch": 0.8101079188253845, "grad_norm": 0.29708433151245117, "learning_rate": 6.879105547055789e-05, "loss": 4.5894, "step": 39560 }, { "epoch": 0.8103126983801937, "grad_norm": 0.29647096991539, "learning_rate": 6.865191328092282e-05, "loss": 4.5692, "step": 39570 }, { "epoch": 0.810517477935003, "grad_norm": 0.2984810471534729, "learning_rate": 6.851288279193371e-05, "loss": 4.5446, "step": 39580 }, { "epoch": 0.8107222574898122, "grad_norm": 0.30566325783729553, "learning_rate": 6.837396412182444e-05, "loss": 4.5553, "step": 39590 }, { "epoch": 0.8109270370446214, "grad_norm": 0.31332987546920776, "learning_rate": 6.82351573887339e-05, "loss": 4.5493, "step": 39600 }, { "epoch": 0.8111318165994307, "grad_norm": 0.34441831707954407, "learning_rate": 6.80964627107055e-05, "loss": 4.5772, "step": 39610 }, { "epoch": 0.8113365961542399, "grad_norm": 0.29685527086257935, "learning_rate": 6.79578802056877e-05, "loss": 4.628, "step": 39620 }, { "epoch": 0.8115413757090492, "grad_norm": 0.30922380089759827, "learning_rate": 6.781940999153345e-05, "loss": 4.615, "step": 39630 }, { "epoch": 0.8117461552638584, "grad_norm": 0.3546513617038727, "learning_rate": 6.7681052186e-05, "loss": 4.5814, "step": 39640 }, { "epoch": 0.8119509348186678, "grad_norm": 0.3195394277572632, "learning_rate": 6.75428069067493e-05, "loss": 4.56, "step": 39650 }, { "epoch": 0.812155714373477, "grad_norm": 0.3146194815635681, "learning_rate": 6.740467427134746e-05, "loss": 4.5716, "step": 39660 }, { "epoch": 0.8123604939282862, "grad_norm": 0.5386914610862732, "learning_rate": 6.726665439726485e-05, "loss": 4.5808, "step": 39670 }, { "epoch": 0.8125652734830955, "grad_norm": 0.31036967039108276, "learning_rate": 6.712874740187589e-05, "loss": 4.5571, "step": 39680 }, { "epoch": 0.8127700530379047, "grad_norm": 0.3137417435646057, "learning_rate": 6.69909534024591e-05, "loss": 4.5759, "step": 39690 }, { "epoch": 0.812974832592714, "grad_norm": 0.3191232979297638, "learning_rate": 6.68532725161968e-05, "loss": 4.6141, "step": 39700 }, { "epoch": 0.8131796121475232, "grad_norm": 0.30594417452812195, "learning_rate": 6.671570486017507e-05, "loss": 4.5617, "step": 39710 }, { "epoch": 0.8133843917023325, "grad_norm": 0.31159308552742004, "learning_rate": 6.657825055138382e-05, "loss": 4.5892, "step": 39720 }, { "epoch": 0.8135891712571417, "grad_norm": 0.37162676453590393, "learning_rate": 6.64409097067166e-05, "loss": 4.5883, "step": 39730 }, { "epoch": 0.8137939508119509, "grad_norm": 0.2946584224700928, "learning_rate": 6.630368244297034e-05, "loss": 4.5841, "step": 39740 }, { "epoch": 0.8139987303667602, "grad_norm": 0.3434107303619385, "learning_rate": 6.616656887684547e-05, "loss": 4.5739, "step": 39750 }, { "epoch": 0.8142035099215694, "grad_norm": 0.3184031844139099, "learning_rate": 6.602956912494567e-05, "loss": 4.5956, "step": 39760 }, { "epoch": 0.8144082894763787, "grad_norm": 0.2952013909816742, "learning_rate": 6.589268330377794e-05, "loss": 4.5757, "step": 39770 }, { "epoch": 0.8146130690311879, "grad_norm": 0.31298646330833435, "learning_rate": 6.57559115297522e-05, "loss": 4.5678, "step": 39780 }, { "epoch": 0.8148178485859972, "grad_norm": 0.3092459440231323, "learning_rate": 6.561925391918158e-05, "loss": 4.6025, "step": 39790 }, { "epoch": 0.8150226281408064, "grad_norm": 0.3376334607601166, "learning_rate": 6.548271058828195e-05, "loss": 4.5802, "step": 39800 }, { "epoch": 0.8152274076956156, "grad_norm": 0.5934453010559082, "learning_rate": 6.534628165317209e-05, "loss": 4.4972, "step": 39810 }, { "epoch": 0.8154321872504249, "grad_norm": 0.31874537467956543, "learning_rate": 6.520996722987354e-05, "loss": 4.5606, "step": 39820 }, { "epoch": 0.8156369668052341, "grad_norm": 0.3155401945114136, "learning_rate": 6.507376743431037e-05, "loss": 4.5875, "step": 39830 }, { "epoch": 0.8158417463600434, "grad_norm": 0.327118843793869, "learning_rate": 6.493768238230929e-05, "loss": 4.576, "step": 39840 }, { "epoch": 0.8160465259148527, "grad_norm": 0.3194453716278076, "learning_rate": 6.480171218959918e-05, "loss": 4.5893, "step": 39850 }, { "epoch": 0.816251305469662, "grad_norm": 0.3144104480743408, "learning_rate": 6.466585697181148e-05, "loss": 4.5746, "step": 39860 }, { "epoch": 0.8164560850244712, "grad_norm": 0.31391122937202454, "learning_rate": 6.453011684447983e-05, "loss": 4.5492, "step": 39870 }, { "epoch": 0.8166608645792804, "grad_norm": 0.3237147927284241, "learning_rate": 6.439449192303992e-05, "loss": 4.5762, "step": 39880 }, { "epoch": 0.8168656441340897, "grad_norm": 0.3276256322860718, "learning_rate": 6.425898232282941e-05, "loss": 4.5694, "step": 39890 }, { "epoch": 0.8170704236888989, "grad_norm": 0.30594319105148315, "learning_rate": 6.412358815908801e-05, "loss": 4.5978, "step": 39900 }, { "epoch": 0.8172752032437082, "grad_norm": 0.29324057698249817, "learning_rate": 6.398830954695732e-05, "loss": 4.5764, "step": 39910 }, { "epoch": 0.8174799827985174, "grad_norm": 0.319190114736557, "learning_rate": 6.385314660148041e-05, "loss": 4.564, "step": 39920 }, { "epoch": 0.8176847623533267, "grad_norm": 0.31613823771476746, "learning_rate": 6.371809943760221e-05, "loss": 4.5646, "step": 39930 }, { "epoch": 0.8178895419081359, "grad_norm": 0.3119392991065979, "learning_rate": 6.358316817016913e-05, "loss": 4.5953, "step": 39940 }, { "epoch": 0.8180943214629451, "grad_norm": 0.3238719403743744, "learning_rate": 6.344835291392903e-05, "loss": 4.6055, "step": 39950 }, { "epoch": 0.8182991010177544, "grad_norm": 0.3044256567955017, "learning_rate": 6.331365378353109e-05, "loss": 4.5751, "step": 39960 }, { "epoch": 0.8185038805725636, "grad_norm": 0.3040337860584259, "learning_rate": 6.317907089352579e-05, "loss": 4.5915, "step": 39970 }, { "epoch": 0.8187086601273729, "grad_norm": 0.33757469058036804, "learning_rate": 6.304460435836458e-05, "loss": 4.5909, "step": 39980 }, { "epoch": 0.8189134396821821, "grad_norm": 0.3282129764556885, "learning_rate": 6.291025429240023e-05, "loss": 4.5882, "step": 39990 }, { "epoch": 0.8191182192369914, "grad_norm": 0.3238188326358795, "learning_rate": 6.277602080988619e-05, "loss": 4.5796, "step": 40000 }, { "epoch": 0.8191182192369914, "eval_loss": 4.575884819030762, "eval_runtime": 4.2712, "eval_samples_per_second": 272.991, "eval_steps_per_second": 34.182, "step": 40000 }, { "epoch": 0.8193229987918006, "grad_norm": 0.3096143901348114, "learning_rate": 6.264190402497693e-05, "loss": 4.5938, "step": 40010 }, { "epoch": 0.8195277783466098, "grad_norm": 0.3325948417186737, "learning_rate": 6.25079040517277e-05, "loss": 4.5276, "step": 40020 }, { "epoch": 0.8197325579014191, "grad_norm": 0.34610995650291443, "learning_rate": 6.237402100409431e-05, "loss": 4.5836, "step": 40030 }, { "epoch": 0.8199373374562283, "grad_norm": 0.30577176809310913, "learning_rate": 6.224025499593318e-05, "loss": 4.611, "step": 40040 }, { "epoch": 0.8201421170110376, "grad_norm": 0.3057403266429901, "learning_rate": 6.21066061410013e-05, "loss": 4.5841, "step": 40050 }, { "epoch": 0.8203468965658469, "grad_norm": 0.31216299533843994, "learning_rate": 6.197307455295575e-05, "loss": 4.5334, "step": 40060 }, { "epoch": 0.8205516761206562, "grad_norm": 0.3139328062534332, "learning_rate": 6.183966034535418e-05, "loss": 4.5958, "step": 40070 }, { "epoch": 0.8207564556754654, "grad_norm": 0.2837695777416229, "learning_rate": 6.170636363165432e-05, "loss": 4.5757, "step": 40080 }, { "epoch": 0.8209612352302746, "grad_norm": 0.30150434374809265, "learning_rate": 6.157318452521385e-05, "loss": 4.5938, "step": 40090 }, { "epoch": 0.8211660147850839, "grad_norm": 0.3371080458164215, "learning_rate": 6.14401231392906e-05, "loss": 4.562, "step": 40100 }, { "epoch": 0.8213707943398931, "grad_norm": 0.3148494064807892, "learning_rate": 6.130717958704226e-05, "loss": 4.5811, "step": 40110 }, { "epoch": 0.8215755738947024, "grad_norm": 0.3045748770236969, "learning_rate": 6.117435398152634e-05, "loss": 4.5936, "step": 40120 }, { "epoch": 0.8217803534495116, "grad_norm": 0.2946961522102356, "learning_rate": 6.104164643569985e-05, "loss": 4.5752, "step": 40130 }, { "epoch": 0.8219851330043209, "grad_norm": 0.30119648575782776, "learning_rate": 6.0909057062419604e-05, "loss": 4.5784, "step": 40140 }, { "epoch": 0.8221899125591301, "grad_norm": 0.32062435150146484, "learning_rate": 6.077658597444189e-05, "loss": 4.6356, "step": 40150 }, { "epoch": 0.8223946921139393, "grad_norm": 0.34336739778518677, "learning_rate": 6.064423328442237e-05, "loss": 4.5859, "step": 40160 }, { "epoch": 0.8225994716687486, "grad_norm": 0.3080267906188965, "learning_rate": 6.051199910491605e-05, "loss": 4.5982, "step": 40170 }, { "epoch": 0.8228042512235578, "grad_norm": 0.29620522260665894, "learning_rate": 6.0379883548377026e-05, "loss": 4.6064, "step": 40180 }, { "epoch": 0.8230090307783671, "grad_norm": 0.29631346464157104, "learning_rate": 6.024788672715873e-05, "loss": 4.5781, "step": 40190 }, { "epoch": 0.8232138103331763, "grad_norm": 0.3337494134902954, "learning_rate": 6.0116008753513375e-05, "loss": 4.5816, "step": 40200 }, { "epoch": 0.8234185898879856, "grad_norm": 0.3190304934978485, "learning_rate": 5.9984249739592315e-05, "loss": 4.6157, "step": 40210 }, { "epoch": 0.8236233694427948, "grad_norm": 0.31650981307029724, "learning_rate": 5.985260979744563e-05, "loss": 4.5511, "step": 40220 }, { "epoch": 0.823828148997604, "grad_norm": 0.30655616521835327, "learning_rate": 5.972108903902218e-05, "loss": 4.5511, "step": 40230 }, { "epoch": 0.8240329285524133, "grad_norm": 0.31079205870628357, "learning_rate": 5.958968757616943e-05, "loss": 4.5771, "step": 40240 }, { "epoch": 0.8242377081072225, "grad_norm": 0.31540369987487793, "learning_rate": 5.945840552063346e-05, "loss": 4.5963, "step": 40250 }, { "epoch": 0.8244424876620319, "grad_norm": 0.31257107853889465, "learning_rate": 5.932724298405879e-05, "loss": 4.5456, "step": 40260 }, { "epoch": 0.8246472672168411, "grad_norm": 0.32176050543785095, "learning_rate": 5.919620007798823e-05, "loss": 4.6102, "step": 40270 }, { "epoch": 0.8248520467716504, "grad_norm": 0.2959055006504059, "learning_rate": 5.906527691386279e-05, "loss": 4.5573, "step": 40280 }, { "epoch": 0.8250568263264596, "grad_norm": 0.30281054973602295, "learning_rate": 5.893447360302189e-05, "loss": 4.5637, "step": 40290 }, { "epoch": 0.8252616058812688, "grad_norm": 0.2997758984565735, "learning_rate": 5.8803790256702815e-05, "loss": 4.5415, "step": 40300 }, { "epoch": 0.8254663854360781, "grad_norm": 0.3132002651691437, "learning_rate": 5.8673226986040944e-05, "loss": 4.6224, "step": 40310 }, { "epoch": 0.8256711649908873, "grad_norm": 0.3072299063205719, "learning_rate": 5.854278390206951e-05, "loss": 4.564, "step": 40320 }, { "epoch": 0.8258759445456966, "grad_norm": 0.30027976632118225, "learning_rate": 5.8412461115719565e-05, "loss": 4.5962, "step": 40330 }, { "epoch": 0.8260807241005058, "grad_norm": 0.29778626561164856, "learning_rate": 5.828225873781972e-05, "loss": 4.5576, "step": 40340 }, { "epoch": 0.8262855036553151, "grad_norm": 0.30664029717445374, "learning_rate": 5.815217687909633e-05, "loss": 4.4757, "step": 40350 }, { "epoch": 0.8264902832101243, "grad_norm": 0.3251498341560364, "learning_rate": 5.80222156501733e-05, "loss": 4.5965, "step": 40360 }, { "epoch": 0.8266950627649335, "grad_norm": 0.3085460960865021, "learning_rate": 5.789237516157173e-05, "loss": 4.5657, "step": 40370 }, { "epoch": 0.8268998423197428, "grad_norm": 0.2986665666103363, "learning_rate": 5.776265552371027e-05, "loss": 4.5658, "step": 40380 }, { "epoch": 0.827104621874552, "grad_norm": 0.3298749327659607, "learning_rate": 5.7633056846904674e-05, "loss": 4.5702, "step": 40390 }, { "epoch": 0.8273094014293613, "grad_norm": 0.29386669397354126, "learning_rate": 5.750357924136793e-05, "loss": 4.5646, "step": 40400 }, { "epoch": 0.8275141809841705, "grad_norm": 0.3200646936893463, "learning_rate": 5.7374222817209877e-05, "loss": 4.539, "step": 40410 }, { "epoch": 0.8277189605389798, "grad_norm": 0.32497406005859375, "learning_rate": 5.724498768443747e-05, "loss": 4.5672, "step": 40420 }, { "epoch": 0.827923740093789, "grad_norm": 0.3958125114440918, "learning_rate": 5.711587395295446e-05, "loss": 4.6014, "step": 40430 }, { "epoch": 0.8281285196485982, "grad_norm": 0.30142369866371155, "learning_rate": 5.6986881732561394e-05, "loss": 4.5771, "step": 40440 }, { "epoch": 0.8283332992034075, "grad_norm": 0.3324086666107178, "learning_rate": 5.6858011132955455e-05, "loss": 4.5214, "step": 40450 }, { "epoch": 0.8285380787582168, "grad_norm": 0.3738360106945038, "learning_rate": 5.6729262263730324e-05, "loss": 4.5905, "step": 40460 }, { "epoch": 0.8287428583130261, "grad_norm": 0.3204920291900635, "learning_rate": 5.660063523437631e-05, "loss": 4.5946, "step": 40470 }, { "epoch": 0.8289476378678353, "grad_norm": 0.3194018602371216, "learning_rate": 5.647213015427992e-05, "loss": 4.5686, "step": 40480 }, { "epoch": 0.8291524174226446, "grad_norm": 0.3234045207500458, "learning_rate": 5.634374713272412e-05, "loss": 4.6237, "step": 40490 }, { "epoch": 0.8293571969774538, "grad_norm": 0.3342304229736328, "learning_rate": 5.6215486278888016e-05, "loss": 4.5502, "step": 40500 }, { "epoch": 0.829561976532263, "grad_norm": 0.31595751643180847, "learning_rate": 5.608734770184681e-05, "loss": 4.5497, "step": 40510 }, { "epoch": 0.8297667560870723, "grad_norm": 0.3042425215244293, "learning_rate": 5.595933151057173e-05, "loss": 4.5548, "step": 40520 }, { "epoch": 0.8299715356418815, "grad_norm": 0.3285609185695648, "learning_rate": 5.583143781392994e-05, "loss": 4.545, "step": 40530 }, { "epoch": 0.8301763151966908, "grad_norm": 0.30174845457077026, "learning_rate": 5.570366672068443e-05, "loss": 4.5705, "step": 40540 }, { "epoch": 0.8303810947515, "grad_norm": 0.3143061697483063, "learning_rate": 5.557601833949384e-05, "loss": 4.5473, "step": 40550 }, { "epoch": 0.8305858743063093, "grad_norm": 0.3141131103038788, "learning_rate": 5.544849277891262e-05, "loss": 4.5631, "step": 40560 }, { "epoch": 0.8307906538611185, "grad_norm": 0.29421284794807434, "learning_rate": 5.532109014739053e-05, "loss": 4.5634, "step": 40570 }, { "epoch": 0.8309954334159277, "grad_norm": 0.3055809438228607, "learning_rate": 5.519381055327304e-05, "loss": 4.5718, "step": 40580 }, { "epoch": 0.831200212970737, "grad_norm": 0.3031426668167114, "learning_rate": 5.506665410480085e-05, "loss": 4.5831, "step": 40590 }, { "epoch": 0.8314049925255462, "grad_norm": 0.29634007811546326, "learning_rate": 5.493962091010998e-05, "loss": 4.5598, "step": 40600 }, { "epoch": 0.8316097720803555, "grad_norm": 0.3001534342765808, "learning_rate": 5.481271107723167e-05, "loss": 4.5927, "step": 40610 }, { "epoch": 0.8318145516351647, "grad_norm": 0.3114277422428131, "learning_rate": 5.4685924714092105e-05, "loss": 4.5577, "step": 40620 }, { "epoch": 0.832019331189974, "grad_norm": 0.33328965306282043, "learning_rate": 5.4559261928512575e-05, "loss": 4.5839, "step": 40630 }, { "epoch": 0.8322241107447832, "grad_norm": 0.30717888474464417, "learning_rate": 5.4432722828209305e-05, "loss": 4.5541, "step": 40640 }, { "epoch": 0.8324288902995924, "grad_norm": 0.2950066328048706, "learning_rate": 5.430630752079333e-05, "loss": 4.5491, "step": 40650 }, { "epoch": 0.8326336698544017, "grad_norm": 0.32168129086494446, "learning_rate": 5.4180016113770284e-05, "loss": 4.5931, "step": 40660 }, { "epoch": 0.832838449409211, "grad_norm": 0.36206507682800293, "learning_rate": 5.405384871454058e-05, "loss": 4.5691, "step": 40670 }, { "epoch": 0.8330432289640203, "grad_norm": 0.298918753862381, "learning_rate": 5.39278054303991e-05, "loss": 4.6016, "step": 40680 }, { "epoch": 0.8332480085188295, "grad_norm": 0.2934432327747345, "learning_rate": 5.380188636853527e-05, "loss": 4.5834, "step": 40690 }, { "epoch": 0.8334527880736388, "grad_norm": 0.30298522114753723, "learning_rate": 5.367609163603269e-05, "loss": 4.562, "step": 40700 }, { "epoch": 0.833657567628448, "grad_norm": 0.3066384792327881, "learning_rate": 5.355042133986938e-05, "loss": 4.5785, "step": 40710 }, { "epoch": 0.8338623471832572, "grad_norm": 0.31653040647506714, "learning_rate": 5.3424875586917514e-05, "loss": 4.5656, "step": 40720 }, { "epoch": 0.8340671267380665, "grad_norm": 0.32101601362228394, "learning_rate": 5.329945448394331e-05, "loss": 4.5812, "step": 40730 }, { "epoch": 0.8342719062928757, "grad_norm": 0.3386152386665344, "learning_rate": 5.317415813760709e-05, "loss": 4.5628, "step": 40740 }, { "epoch": 0.834476685847685, "grad_norm": 0.3049459159374237, "learning_rate": 5.3048986654462875e-05, "loss": 4.6111, "step": 40750 }, { "epoch": 0.8346814654024942, "grad_norm": 0.29170680046081543, "learning_rate": 5.2923940140958716e-05, "loss": 4.6017, "step": 40760 }, { "epoch": 0.8348862449573035, "grad_norm": 0.3085028827190399, "learning_rate": 5.279901870343622e-05, "loss": 4.5733, "step": 40770 }, { "epoch": 0.8350910245121127, "grad_norm": 0.2996695935726166, "learning_rate": 5.2674222448130714e-05, "loss": 4.6044, "step": 40780 }, { "epoch": 0.8352958040669219, "grad_norm": 0.3270297646522522, "learning_rate": 5.254955148117109e-05, "loss": 4.6011, "step": 40790 }, { "epoch": 0.8355005836217312, "grad_norm": 0.303653359413147, "learning_rate": 5.2425005908579636e-05, "loss": 4.5859, "step": 40800 }, { "epoch": 0.8357053631765404, "grad_norm": 0.31010833382606506, "learning_rate": 5.230058583627204e-05, "loss": 4.5363, "step": 40810 }, { "epoch": 0.8359101427313497, "grad_norm": 0.32453277707099915, "learning_rate": 5.217629137005726e-05, "loss": 4.5699, "step": 40820 }, { "epoch": 0.8361149222861589, "grad_norm": 0.3030214309692383, "learning_rate": 5.2052122615637433e-05, "loss": 4.5729, "step": 40830 }, { "epoch": 0.8363197018409682, "grad_norm": 0.30498307943344116, "learning_rate": 5.192807967860769e-05, "loss": 4.6002, "step": 40840 }, { "epoch": 0.8365244813957774, "grad_norm": 0.2962140142917633, "learning_rate": 5.180416266445638e-05, "loss": 4.5837, "step": 40850 }, { "epoch": 0.8367292609505866, "grad_norm": 0.2984965145587921, "learning_rate": 5.168037167856452e-05, "loss": 4.5627, "step": 40860 }, { "epoch": 0.836934040505396, "grad_norm": 0.32174327969551086, "learning_rate": 5.1556706826206105e-05, "loss": 4.5712, "step": 40870 }, { "epoch": 0.8371388200602052, "grad_norm": 0.3211826980113983, "learning_rate": 5.143316821254784e-05, "loss": 4.587, "step": 40880 }, { "epoch": 0.8373435996150145, "grad_norm": 0.3006008267402649, "learning_rate": 5.130975594264909e-05, "loss": 4.5925, "step": 40890 }, { "epoch": 0.8375483791698237, "grad_norm": 0.3336687386035919, "learning_rate": 5.118647012146176e-05, "loss": 4.5807, "step": 40900 }, { "epoch": 0.837753158724633, "grad_norm": 0.2978768050670624, "learning_rate": 5.106331085383009e-05, "loss": 4.5507, "step": 40910 }, { "epoch": 0.8379579382794422, "grad_norm": 0.30832746624946594, "learning_rate": 5.094027824449092e-05, "loss": 4.5604, "step": 40920 }, { "epoch": 0.8381627178342514, "grad_norm": 0.3506377339363098, "learning_rate": 5.0817372398073246e-05, "loss": 4.5421, "step": 40930 }, { "epoch": 0.8383674973890607, "grad_norm": 0.30899617075920105, "learning_rate": 5.069459341909837e-05, "loss": 4.5737, "step": 40940 }, { "epoch": 0.8385722769438699, "grad_norm": 0.3059753179550171, "learning_rate": 5.057194141197945e-05, "loss": 4.5215, "step": 40950 }, { "epoch": 0.8387770564986792, "grad_norm": 0.40165573358535767, "learning_rate": 5.044941648102195e-05, "loss": 4.5892, "step": 40960 }, { "epoch": 0.8389818360534884, "grad_norm": 0.3088058829307556, "learning_rate": 5.0327018730423206e-05, "loss": 4.5868, "step": 40970 }, { "epoch": 0.8391866156082977, "grad_norm": 0.31243231892585754, "learning_rate": 5.0204748264272194e-05, "loss": 4.5605, "step": 40980 }, { "epoch": 0.8393913951631069, "grad_norm": 0.309691458940506, "learning_rate": 5.008260518654988e-05, "loss": 4.578, "step": 40990 }, { "epoch": 0.8395961747179161, "grad_norm": 0.34093743562698364, "learning_rate": 4.996058960112884e-05, "loss": 4.5843, "step": 41000 }, { "epoch": 0.8398009542727254, "grad_norm": 0.31313571333885193, "learning_rate": 4.9838701611773155e-05, "loss": 4.5593, "step": 41010 }, { "epoch": 0.8400057338275346, "grad_norm": 0.35509222745895386, "learning_rate": 4.9716941322138465e-05, "loss": 4.5816, "step": 41020 }, { "epoch": 0.8402105133823439, "grad_norm": 0.32133156061172485, "learning_rate": 4.959530883577184e-05, "loss": 4.5616, "step": 41030 }, { "epoch": 0.8404152929371531, "grad_norm": 0.31465378403663635, "learning_rate": 4.9473804256111563e-05, "loss": 4.5598, "step": 41040 }, { "epoch": 0.8406200724919624, "grad_norm": 0.32268306612968445, "learning_rate": 4.9352427686487155e-05, "loss": 4.6018, "step": 41050 }, { "epoch": 0.8408248520467716, "grad_norm": 0.3060460686683655, "learning_rate": 4.923117923011935e-05, "loss": 4.53, "step": 41060 }, { "epoch": 0.841029631601581, "grad_norm": 0.3377113938331604, "learning_rate": 4.9110058990119886e-05, "loss": 4.5621, "step": 41070 }, { "epoch": 0.8412344111563902, "grad_norm": 0.30276891589164734, "learning_rate": 4.898906706949151e-05, "loss": 4.5804, "step": 41080 }, { "epoch": 0.8414391907111994, "grad_norm": 0.3168056905269623, "learning_rate": 4.8868203571127756e-05, "loss": 4.5622, "step": 41090 }, { "epoch": 0.8416439702660087, "grad_norm": 0.3039287030696869, "learning_rate": 4.874746859781305e-05, "loss": 4.5951, "step": 41100 }, { "epoch": 0.8418487498208179, "grad_norm": 0.3187820613384247, "learning_rate": 4.862686225222251e-05, "loss": 4.541, "step": 41110 }, { "epoch": 0.8420535293756272, "grad_norm": 0.35254228115081787, "learning_rate": 4.85063846369217e-05, "loss": 4.5532, "step": 41120 }, { "epoch": 0.8422583089304364, "grad_norm": 0.3221297264099121, "learning_rate": 4.838603585436692e-05, "loss": 4.6066, "step": 41130 }, { "epoch": 0.8424630884852456, "grad_norm": 0.3298701345920563, "learning_rate": 4.826581600690476e-05, "loss": 4.5563, "step": 41140 }, { "epoch": 0.8426678680400549, "grad_norm": 0.30806881189346313, "learning_rate": 4.814572519677223e-05, "loss": 4.591, "step": 41150 }, { "epoch": 0.8428726475948641, "grad_norm": 0.3149655759334564, "learning_rate": 4.8025763526096625e-05, "loss": 4.5419, "step": 41160 }, { "epoch": 0.8430774271496734, "grad_norm": 0.30739855766296387, "learning_rate": 4.790593109689541e-05, "loss": 4.598, "step": 41170 }, { "epoch": 0.8432822067044826, "grad_norm": 0.30234190821647644, "learning_rate": 4.7786228011076104e-05, "loss": 4.5808, "step": 41180 }, { "epoch": 0.8434869862592919, "grad_norm": 0.3341679573059082, "learning_rate": 4.76666543704362e-05, "loss": 4.5671, "step": 41190 }, { "epoch": 0.8436917658141011, "grad_norm": 0.33971861004829407, "learning_rate": 4.75472102766632e-05, "loss": 4.5599, "step": 41200 }, { "epoch": 0.8438965453689103, "grad_norm": 0.3058462142944336, "learning_rate": 4.742789583133438e-05, "loss": 4.5596, "step": 41210 }, { "epoch": 0.8441013249237196, "grad_norm": 0.32935091853141785, "learning_rate": 4.730871113591682e-05, "loss": 4.5711, "step": 41220 }, { "epoch": 0.8443061044785288, "grad_norm": 0.30706486105918884, "learning_rate": 4.7189656291767125e-05, "loss": 4.604, "step": 41230 }, { "epoch": 0.8445108840333381, "grad_norm": 0.32395413517951965, "learning_rate": 4.707073140013158e-05, "loss": 4.5554, "step": 41240 }, { "epoch": 0.8447156635881473, "grad_norm": 0.3302842080593109, "learning_rate": 4.6951936562146025e-05, "loss": 4.5623, "step": 41250 }, { "epoch": 0.8449204431429566, "grad_norm": 0.3194911777973175, "learning_rate": 4.683327187883548e-05, "loss": 4.5365, "step": 41260 }, { "epoch": 0.8451252226977659, "grad_norm": 0.6366714835166931, "learning_rate": 4.671473745111446e-05, "loss": 4.4751, "step": 41270 }, { "epoch": 0.8453300022525752, "grad_norm": 0.3164162337779999, "learning_rate": 4.659633337978668e-05, "loss": 4.5657, "step": 41280 }, { "epoch": 0.8455347818073844, "grad_norm": 0.30572304129600525, "learning_rate": 4.6478059765544955e-05, "loss": 4.5952, "step": 41290 }, { "epoch": 0.8457395613621936, "grad_norm": 0.5788365602493286, "learning_rate": 4.635991670897117e-05, "loss": 4.5821, "step": 41300 }, { "epoch": 0.8459443409170029, "grad_norm": 0.31250980496406555, "learning_rate": 4.624190431053628e-05, "loss": 4.5627, "step": 41310 }, { "epoch": 0.8461491204718121, "grad_norm": 0.35064229369163513, "learning_rate": 4.6124022670599875e-05, "loss": 4.5678, "step": 41320 }, { "epoch": 0.8463539000266214, "grad_norm": 0.31303027272224426, "learning_rate": 4.600627188941067e-05, "loss": 4.5762, "step": 41330 }, { "epoch": 0.8465586795814306, "grad_norm": 0.33292683959007263, "learning_rate": 4.5888652067105774e-05, "loss": 4.5662, "step": 41340 }, { "epoch": 0.8467634591362398, "grad_norm": 0.34634777903556824, "learning_rate": 4.577116330371116e-05, "loss": 4.5643, "step": 41350 }, { "epoch": 0.8469682386910491, "grad_norm": 0.2940186858177185, "learning_rate": 4.5653805699141286e-05, "loss": 4.5593, "step": 41360 }, { "epoch": 0.8471730182458583, "grad_norm": 0.3219406008720398, "learning_rate": 4.553657935319901e-05, "loss": 4.558, "step": 41370 }, { "epoch": 0.8473777978006676, "grad_norm": 0.2876724898815155, "learning_rate": 4.5419484365575636e-05, "loss": 4.5379, "step": 41380 }, { "epoch": 0.8475825773554768, "grad_norm": 0.3168316185474396, "learning_rate": 4.53025208358508e-05, "loss": 4.5693, "step": 41390 }, { "epoch": 0.8477873569102861, "grad_norm": 0.3215070962905884, "learning_rate": 4.518568886349217e-05, "loss": 4.5863, "step": 41400 }, { "epoch": 0.8479921364650953, "grad_norm": 0.3174405097961426, "learning_rate": 4.506898854785564e-05, "loss": 4.5721, "step": 41410 }, { "epoch": 0.8481969160199045, "grad_norm": 0.3024792969226837, "learning_rate": 4.495241998818527e-05, "loss": 4.5722, "step": 41420 }, { "epoch": 0.8484016955747138, "grad_norm": 0.2967088222503662, "learning_rate": 4.483598328361278e-05, "loss": 4.5923, "step": 41430 }, { "epoch": 0.848606475129523, "grad_norm": 0.31650328636169434, "learning_rate": 4.4719678533158015e-05, "loss": 4.5574, "step": 41440 }, { "epoch": 0.8488112546843323, "grad_norm": 0.30982720851898193, "learning_rate": 4.46035058357285e-05, "loss": 4.5649, "step": 41450 }, { "epoch": 0.8490160342391415, "grad_norm": 0.4292488992214203, "learning_rate": 4.448746529011951e-05, "loss": 4.5609, "step": 41460 }, { "epoch": 0.8492208137939508, "grad_norm": 0.3487224578857422, "learning_rate": 4.4371556995013805e-05, "loss": 4.48, "step": 41470 }, { "epoch": 0.8494255933487601, "grad_norm": 0.2968166172504425, "learning_rate": 4.425578104898181e-05, "loss": 4.5607, "step": 41480 }, { "epoch": 0.8496303729035694, "grad_norm": 0.3533106744289398, "learning_rate": 4.414013755048139e-05, "loss": 4.5658, "step": 41490 }, { "epoch": 0.8498351524583786, "grad_norm": 0.31141340732574463, "learning_rate": 4.402462659785771e-05, "loss": 4.5568, "step": 41500 }, { "epoch": 0.8500399320131878, "grad_norm": 0.32396769523620605, "learning_rate": 4.3909248289343305e-05, "loss": 4.5341, "step": 41510 }, { "epoch": 0.8502447115679971, "grad_norm": 0.32039234042167664, "learning_rate": 4.3794002723057756e-05, "loss": 4.6012, "step": 41520 }, { "epoch": 0.8504494911228063, "grad_norm": 0.30332982540130615, "learning_rate": 4.367888999700795e-05, "loss": 4.5529, "step": 41530 }, { "epoch": 0.8506542706776156, "grad_norm": 0.30191323161125183, "learning_rate": 4.356391020908761e-05, "loss": 4.5609, "step": 41540 }, { "epoch": 0.8508590502324248, "grad_norm": 0.32889968156814575, "learning_rate": 4.3449063457077555e-05, "loss": 4.5905, "step": 41550 }, { "epoch": 0.851063829787234, "grad_norm": 0.31554514169692993, "learning_rate": 4.3334349838645393e-05, "loss": 4.5765, "step": 41560 }, { "epoch": 0.8512686093420433, "grad_norm": 0.29832085967063904, "learning_rate": 4.321976945134556e-05, "loss": 4.602, "step": 41570 }, { "epoch": 0.8514733888968525, "grad_norm": 0.32858118414878845, "learning_rate": 4.3105322392619154e-05, "loss": 4.5643, "step": 41580 }, { "epoch": 0.8516781684516618, "grad_norm": 0.3178858757019043, "learning_rate": 4.299100875979392e-05, "loss": 4.6036, "step": 41590 }, { "epoch": 0.851882948006471, "grad_norm": 0.5597391724586487, "learning_rate": 4.287682865008413e-05, "loss": 4.5431, "step": 41600 }, { "epoch": 0.8520877275612803, "grad_norm": 0.30292996764183044, "learning_rate": 4.27627821605904e-05, "loss": 4.5701, "step": 41610 }, { "epoch": 0.8522925071160895, "grad_norm": 0.3429625630378723, "learning_rate": 4.2648869388299926e-05, "loss": 4.5742, "step": 41620 }, { "epoch": 0.8524972866708987, "grad_norm": 0.308368444442749, "learning_rate": 4.2535090430085944e-05, "loss": 4.5558, "step": 41630 }, { "epoch": 0.852702066225708, "grad_norm": 0.3081679344177246, "learning_rate": 4.242144538270807e-05, "loss": 4.5283, "step": 41640 }, { "epoch": 0.8529068457805172, "grad_norm": 0.2995325028896332, "learning_rate": 4.2307934342811976e-05, "loss": 4.5819, "step": 41650 }, { "epoch": 0.8531116253353265, "grad_norm": 0.30443987250328064, "learning_rate": 4.219455740692939e-05, "loss": 4.5793, "step": 41660 }, { "epoch": 0.8533164048901357, "grad_norm": 0.30008235573768616, "learning_rate": 4.208131467147804e-05, "loss": 4.5818, "step": 41670 }, { "epoch": 0.8535211844449451, "grad_norm": 0.3058653473854065, "learning_rate": 4.1968206232761387e-05, "loss": 4.5616, "step": 41680 }, { "epoch": 0.8537259639997543, "grad_norm": 0.2992249131202698, "learning_rate": 4.185523218696879e-05, "loss": 4.5618, "step": 41690 }, { "epoch": 0.8539307435545636, "grad_norm": 0.29264143109321594, "learning_rate": 4.1742392630175294e-05, "loss": 4.5486, "step": 41700 }, { "epoch": 0.8541355231093728, "grad_norm": 0.3090275526046753, "learning_rate": 4.162968765834166e-05, "loss": 4.6067, "step": 41710 }, { "epoch": 0.854340302664182, "grad_norm": 0.3257957696914673, "learning_rate": 4.1517117367313983e-05, "loss": 4.5606, "step": 41720 }, { "epoch": 0.8545450822189913, "grad_norm": 0.2962327003479004, "learning_rate": 4.1404681852824025e-05, "loss": 4.5485, "step": 41730 }, { "epoch": 0.8547498617738005, "grad_norm": 0.3346448540687561, "learning_rate": 4.1292381210488906e-05, "loss": 4.5218, "step": 41740 }, { "epoch": 0.8549546413286098, "grad_norm": 0.2993049919605255, "learning_rate": 4.118021553581086e-05, "loss": 4.5781, "step": 41750 }, { "epoch": 0.855159420883419, "grad_norm": 0.2944900393486023, "learning_rate": 4.106818492417759e-05, "loss": 4.5567, "step": 41760 }, { "epoch": 0.8553642004382283, "grad_norm": 0.36183977127075195, "learning_rate": 4.095628947086181e-05, "loss": 4.5426, "step": 41770 }, { "epoch": 0.8555689799930375, "grad_norm": 0.3020537495613098, "learning_rate": 4.0844529271021315e-05, "loss": 4.5681, "step": 41780 }, { "epoch": 0.8557737595478467, "grad_norm": 0.30587658286094666, "learning_rate": 4.073290441969888e-05, "loss": 4.5776, "step": 41790 }, { "epoch": 0.855978539102656, "grad_norm": 0.30620551109313965, "learning_rate": 4.062141501182222e-05, "loss": 4.6001, "step": 41800 }, { "epoch": 0.8561833186574652, "grad_norm": 0.32741957902908325, "learning_rate": 4.051006114220379e-05, "loss": 4.5872, "step": 41810 }, { "epoch": 0.8563880982122745, "grad_norm": 0.3100329637527466, "learning_rate": 4.0398842905540744e-05, "loss": 4.5438, "step": 41820 }, { "epoch": 0.8565928777670837, "grad_norm": 0.315884530544281, "learning_rate": 4.028776039641502e-05, "loss": 4.5589, "step": 41830 }, { "epoch": 0.856797657321893, "grad_norm": 0.30159467458724976, "learning_rate": 4.017681370929307e-05, "loss": 4.5447, "step": 41840 }, { "epoch": 0.8570024368767022, "grad_norm": 0.30200254917144775, "learning_rate": 4.0066002938525846e-05, "loss": 4.5295, "step": 41850 }, { "epoch": 0.8572072164315114, "grad_norm": 0.33325955271720886, "learning_rate": 4e-05, "loss": 4.578, "step": 41860 }, { "epoch": 0.8574119959863207, "grad_norm": 0.3209024965763092, "learning_rate": 4e-05, "loss": 4.5784, "step": 41870 }, { "epoch": 0.85761677554113, "grad_norm": 0.3402951955795288, "learning_rate": 4e-05, "loss": 4.5534, "step": 41880 }, { "epoch": 0.8578215550959393, "grad_norm": 0.292206346988678, "learning_rate": 4e-05, "loss": 4.5532, "step": 41890 }, { "epoch": 0.8580263346507485, "grad_norm": 0.3007727563381195, "learning_rate": 4e-05, "loss": 4.543, "step": 41900 }, { "epoch": 0.8582311142055578, "grad_norm": 0.31559112668037415, "learning_rate": 4e-05, "loss": 4.5518, "step": 41910 }, { "epoch": 0.858435893760367, "grad_norm": 0.30795371532440186, "learning_rate": 4e-05, "loss": 4.5488, "step": 41920 }, { "epoch": 0.8586406733151762, "grad_norm": 0.3247368037700653, "learning_rate": 4e-05, "loss": 4.5674, "step": 41930 }, { "epoch": 0.8588454528699855, "grad_norm": 0.32226502895355225, "learning_rate": 4e-05, "loss": 4.5727, "step": 41940 }, { "epoch": 0.8590502324247947, "grad_norm": 0.31973397731781006, "learning_rate": 4e-05, "loss": 4.5588, "step": 41950 }, { "epoch": 0.859255011979604, "grad_norm": 0.31184062361717224, "learning_rate": 4e-05, "loss": 4.5854, "step": 41960 }, { "epoch": 0.8594597915344132, "grad_norm": 0.30193692445755005, "learning_rate": 4e-05, "loss": 4.5686, "step": 41970 }, { "epoch": 0.8596645710892225, "grad_norm": 0.3234449028968811, "learning_rate": 4e-05, "loss": 4.5888, "step": 41980 }, { "epoch": 0.8598693506440317, "grad_norm": 0.3161271810531616, "learning_rate": 4e-05, "loss": 4.5499, "step": 41990 }, { "epoch": 0.8600741301988409, "grad_norm": 0.32918792963027954, "learning_rate": 4e-05, "loss": 4.5268, "step": 42000 }, { "epoch": 0.8600741301988409, "eval_loss": 4.56494665145874, "eval_runtime": 4.3997, "eval_samples_per_second": 265.017, "eval_steps_per_second": 33.184, "step": 42000 }, { "epoch": 0.8602789097536502, "grad_norm": 0.30065762996673584, "learning_rate": 4e-05, "loss": 4.5516, "step": 42010 }, { "epoch": 0.8604836893084594, "grad_norm": 0.30190443992614746, "learning_rate": 4e-05, "loss": 4.5792, "step": 42020 }, { "epoch": 0.8606884688632687, "grad_norm": 0.3021637797355652, "learning_rate": 4e-05, "loss": 4.5673, "step": 42030 }, { "epoch": 0.8608932484180779, "grad_norm": 0.3549637496471405, "learning_rate": 4e-05, "loss": 4.5634, "step": 42040 }, { "epoch": 0.8610980279728871, "grad_norm": 0.3007580637931824, "learning_rate": 4e-05, "loss": 4.5767, "step": 42050 }, { "epoch": 0.8613028075276964, "grad_norm": 0.37332987785339355, "learning_rate": 4e-05, "loss": 4.531, "step": 42060 }, { "epoch": 0.8615075870825056, "grad_norm": 0.32356762886047363, "learning_rate": 4e-05, "loss": 4.5386, "step": 42070 }, { "epoch": 0.861712366637315, "grad_norm": 0.30995801091194153, "learning_rate": 4e-05, "loss": 4.5428, "step": 42080 }, { "epoch": 0.8619171461921242, "grad_norm": 0.30891141295433044, "learning_rate": 4e-05, "loss": 4.5678, "step": 42090 }, { "epoch": 0.8621219257469335, "grad_norm": 0.3204641044139862, "learning_rate": 4e-05, "loss": 4.5559, "step": 42100 }, { "epoch": 0.8623267053017427, "grad_norm": 0.30229073762893677, "learning_rate": 4e-05, "loss": 4.5764, "step": 42110 }, { "epoch": 0.862531484856552, "grad_norm": 0.30723774433135986, "learning_rate": 4e-05, "loss": 4.5702, "step": 42120 }, { "epoch": 0.8627362644113612, "grad_norm": 0.3078003227710724, "learning_rate": 4e-05, "loss": 4.5797, "step": 42130 }, { "epoch": 0.8629410439661704, "grad_norm": 0.29912465810775757, "learning_rate": 4e-05, "loss": 4.5567, "step": 42140 }, { "epoch": 0.8631458235209797, "grad_norm": 0.3218294680118561, "learning_rate": 4e-05, "loss": 4.5727, "step": 42150 }, { "epoch": 0.8633506030757889, "grad_norm": 0.29612016677856445, "learning_rate": 4e-05, "loss": 4.5948, "step": 42160 }, { "epoch": 0.8635553826305982, "grad_norm": 0.299450546503067, "learning_rate": 4e-05, "loss": 4.5824, "step": 42170 }, { "epoch": 0.8637601621854074, "grad_norm": 0.32398706674575806, "learning_rate": 4e-05, "loss": 4.6054, "step": 42180 }, { "epoch": 0.8639649417402167, "grad_norm": 0.3253730237483978, "learning_rate": 4e-05, "loss": 4.5552, "step": 42190 }, { "epoch": 0.8641697212950259, "grad_norm": 0.311602920293808, "learning_rate": 4e-05, "loss": 4.5308, "step": 42200 }, { "epoch": 0.8643745008498351, "grad_norm": 0.3064606189727783, "learning_rate": 4e-05, "loss": 4.5182, "step": 42210 }, { "epoch": 0.8645792804046444, "grad_norm": 0.3285084068775177, "learning_rate": 4e-05, "loss": 4.5551, "step": 42220 }, { "epoch": 0.8647840599594536, "grad_norm": 0.2966082990169525, "learning_rate": 4e-05, "loss": 4.5678, "step": 42230 }, { "epoch": 0.8649888395142629, "grad_norm": 0.3242189288139343, "learning_rate": 4e-05, "loss": 4.5797, "step": 42240 }, { "epoch": 0.8651936190690721, "grad_norm": 0.30055972933769226, "learning_rate": 4e-05, "loss": 4.5374, "step": 42250 }, { "epoch": 0.8653983986238813, "grad_norm": 0.3144788444042206, "learning_rate": 4e-05, "loss": 4.5678, "step": 42260 }, { "epoch": 0.8656031781786906, "grad_norm": 0.33898335695266724, "learning_rate": 4e-05, "loss": 4.5581, "step": 42270 }, { "epoch": 0.8658079577334998, "grad_norm": 0.41513872146606445, "learning_rate": 4e-05, "loss": 4.5611, "step": 42280 }, { "epoch": 0.8660127372883092, "grad_norm": 0.3530650734901428, "learning_rate": 4e-05, "loss": 4.5378, "step": 42290 }, { "epoch": 0.8662175168431184, "grad_norm": 0.3262985646724701, "learning_rate": 4e-05, "loss": 4.5524, "step": 42300 }, { "epoch": 0.8664222963979277, "grad_norm": 0.35833224654197693, "learning_rate": 4e-05, "loss": 4.5554, "step": 42310 }, { "epoch": 0.8666270759527369, "grad_norm": 0.33766356110572815, "learning_rate": 4e-05, "loss": 4.5149, "step": 42320 }, { "epoch": 0.8668318555075462, "grad_norm": 0.30668026208877563, "learning_rate": 4e-05, "loss": 4.5318, "step": 42330 }, { "epoch": 0.8670366350623554, "grad_norm": 0.2991671562194824, "learning_rate": 4e-05, "loss": 4.6324, "step": 42340 }, { "epoch": 0.8672414146171646, "grad_norm": 0.3278384208679199, "learning_rate": 4e-05, "loss": 4.5374, "step": 42350 }, { "epoch": 0.8674461941719739, "grad_norm": 0.3133397400379181, "learning_rate": 4e-05, "loss": 4.5671, "step": 42360 }, { "epoch": 0.8676509737267831, "grad_norm": 0.3142462968826294, "learning_rate": 4e-05, "loss": 4.5164, "step": 42370 }, { "epoch": 0.8678557532815924, "grad_norm": 0.3174171447753906, "learning_rate": 4e-05, "loss": 4.572, "step": 42380 }, { "epoch": 0.8680605328364016, "grad_norm": 0.3513506352901459, "learning_rate": 4e-05, "loss": 4.5723, "step": 42390 }, { "epoch": 0.8682653123912109, "grad_norm": 0.300601065158844, "learning_rate": 4e-05, "loss": 4.5689, "step": 42400 }, { "epoch": 0.8684700919460201, "grad_norm": 0.313957154750824, "learning_rate": 4e-05, "loss": 4.5398, "step": 42410 }, { "epoch": 0.8686748715008293, "grad_norm": 0.31928592920303345, "learning_rate": 4e-05, "loss": 4.5752, "step": 42420 }, { "epoch": 0.8688796510556386, "grad_norm": 0.32813411951065063, "learning_rate": 4e-05, "loss": 4.5802, "step": 42430 }, { "epoch": 0.8690844306104478, "grad_norm": 0.34862348437309265, "learning_rate": 4e-05, "loss": 4.583, "step": 42440 }, { "epoch": 0.8692892101652571, "grad_norm": 0.3211762607097626, "learning_rate": 4e-05, "loss": 4.5685, "step": 42450 }, { "epoch": 0.8694939897200663, "grad_norm": 0.3033827245235443, "learning_rate": 4e-05, "loss": 4.536, "step": 42460 }, { "epoch": 0.8696987692748755, "grad_norm": 0.3189534544944763, "learning_rate": 4e-05, "loss": 4.5762, "step": 42470 }, { "epoch": 0.8699035488296848, "grad_norm": 0.3091621398925781, "learning_rate": 4e-05, "loss": 4.5487, "step": 42480 }, { "epoch": 0.8701083283844941, "grad_norm": 0.3051370680332184, "learning_rate": 4e-05, "loss": 4.5098, "step": 42490 }, { "epoch": 0.8703131079393034, "grad_norm": 0.30030304193496704, "learning_rate": 4e-05, "loss": 4.5589, "step": 42500 }, { "epoch": 0.8705178874941126, "grad_norm": 0.3978952169418335, "learning_rate": 4e-05, "loss": 4.5803, "step": 42510 }, { "epoch": 0.8707226670489219, "grad_norm": 0.3171425759792328, "learning_rate": 4e-05, "loss": 4.5616, "step": 42520 }, { "epoch": 0.8709274466037311, "grad_norm": 0.31384173035621643, "learning_rate": 4e-05, "loss": 4.559, "step": 42530 }, { "epoch": 0.8711322261585404, "grad_norm": 0.3421635627746582, "learning_rate": 4e-05, "loss": 4.5509, "step": 42540 }, { "epoch": 0.8713370057133496, "grad_norm": 0.3201238214969635, "learning_rate": 4e-05, "loss": 4.5384, "step": 42550 }, { "epoch": 0.8715417852681588, "grad_norm": 0.30009725689888, "learning_rate": 4e-05, "loss": 4.5758, "step": 42560 }, { "epoch": 0.8717465648229681, "grad_norm": 0.3192896544933319, "learning_rate": 4e-05, "loss": 4.5587, "step": 42570 }, { "epoch": 0.8719513443777773, "grad_norm": 0.3128572702407837, "learning_rate": 4e-05, "loss": 4.5677, "step": 42580 }, { "epoch": 0.8721561239325866, "grad_norm": 0.33970656991004944, "learning_rate": 4e-05, "loss": 4.5495, "step": 42590 }, { "epoch": 0.8723609034873958, "grad_norm": 0.3380085825920105, "learning_rate": 4e-05, "loss": 4.529, "step": 42600 }, { "epoch": 0.872565683042205, "grad_norm": 0.3085273504257202, "learning_rate": 4e-05, "loss": 4.5476, "step": 42610 }, { "epoch": 0.8727704625970143, "grad_norm": 0.3030671775341034, "learning_rate": 4e-05, "loss": 4.561, "step": 42620 }, { "epoch": 0.8729752421518235, "grad_norm": 0.30912870168685913, "learning_rate": 4e-05, "loss": 4.5538, "step": 42630 }, { "epoch": 0.8731800217066328, "grad_norm": 0.30605942010879517, "learning_rate": 4e-05, "loss": 4.5321, "step": 42640 }, { "epoch": 0.873384801261442, "grad_norm": 0.43395280838012695, "learning_rate": 4e-05, "loss": 4.5088, "step": 42650 }, { "epoch": 0.8735895808162513, "grad_norm": 0.3205019235610962, "learning_rate": 4e-05, "loss": 4.5411, "step": 42660 }, { "epoch": 0.8737943603710605, "grad_norm": 0.3275338113307953, "learning_rate": 4e-05, "loss": 4.5494, "step": 42670 }, { "epoch": 0.8739991399258698, "grad_norm": 0.30848512053489685, "learning_rate": 4e-05, "loss": 4.5023, "step": 42680 }, { "epoch": 0.8742039194806791, "grad_norm": 0.311075896024704, "learning_rate": 4e-05, "loss": 4.5639, "step": 42690 }, { "epoch": 0.8744086990354883, "grad_norm": 0.3298903703689575, "learning_rate": 4e-05, "loss": 4.6063, "step": 42700 }, { "epoch": 0.8746134785902976, "grad_norm": 0.3591431677341461, "learning_rate": 4e-05, "loss": 4.5294, "step": 42710 }, { "epoch": 0.8748182581451068, "grad_norm": 0.3300228714942932, "learning_rate": 4e-05, "loss": 4.5411, "step": 42720 }, { "epoch": 0.8750230376999161, "grad_norm": 0.31195318698883057, "learning_rate": 4e-05, "loss": 4.5967, "step": 42730 }, { "epoch": 0.8752278172547253, "grad_norm": 0.3583310842514038, "learning_rate": 4e-05, "loss": 4.5543, "step": 42740 }, { "epoch": 0.8754325968095346, "grad_norm": 0.302247017621994, "learning_rate": 4e-05, "loss": 4.6008, "step": 42750 }, { "epoch": 0.8756373763643438, "grad_norm": 0.31350117921829224, "learning_rate": 4e-05, "loss": 4.5515, "step": 42760 }, { "epoch": 0.875842155919153, "grad_norm": 0.32353147864341736, "learning_rate": 4e-05, "loss": 4.5739, "step": 42770 }, { "epoch": 0.8760469354739623, "grad_norm": 0.29817453026771545, "learning_rate": 4e-05, "loss": 4.5479, "step": 42780 }, { "epoch": 0.8762517150287715, "grad_norm": 0.30565959215164185, "learning_rate": 4e-05, "loss": 4.5859, "step": 42790 }, { "epoch": 0.8764564945835808, "grad_norm": 0.31741973757743835, "learning_rate": 4e-05, "loss": 4.5799, "step": 42800 }, { "epoch": 0.87666127413839, "grad_norm": 0.3071531653404236, "learning_rate": 4e-05, "loss": 4.5868, "step": 42810 }, { "epoch": 0.8768660536931993, "grad_norm": 0.31487077474594116, "learning_rate": 4e-05, "loss": 4.5586, "step": 42820 }, { "epoch": 0.8770708332480085, "grad_norm": 0.309831827878952, "learning_rate": 4e-05, "loss": 4.591, "step": 42830 }, { "epoch": 0.8772756128028177, "grad_norm": 0.30910763144493103, "learning_rate": 4e-05, "loss": 4.5528, "step": 42840 }, { "epoch": 0.877480392357627, "grad_norm": 0.33074498176574707, "learning_rate": 4e-05, "loss": 4.5742, "step": 42850 }, { "epoch": 0.8776851719124362, "grad_norm": 0.30193716287612915, "learning_rate": 4e-05, "loss": 4.54, "step": 42860 }, { "epoch": 0.8778899514672455, "grad_norm": 0.31014299392700195, "learning_rate": 4e-05, "loss": 4.5476, "step": 42870 }, { "epoch": 0.8780947310220547, "grad_norm": 0.3074629008769989, "learning_rate": 4e-05, "loss": 4.5636, "step": 42880 }, { "epoch": 0.878299510576864, "grad_norm": 0.3182469606399536, "learning_rate": 4e-05, "loss": 4.5752, "step": 42890 }, { "epoch": 0.8785042901316733, "grad_norm": 0.3700508177280426, "learning_rate": 4e-05, "loss": 4.5677, "step": 42900 }, { "epoch": 0.8787090696864825, "grad_norm": 0.31222066283226013, "learning_rate": 4e-05, "loss": 4.5701, "step": 42910 }, { "epoch": 0.8789138492412918, "grad_norm": 0.32930290699005127, "learning_rate": 4e-05, "loss": 4.5347, "step": 42920 }, { "epoch": 0.879118628796101, "grad_norm": 0.3188677132129669, "learning_rate": 4e-05, "loss": 4.5241, "step": 42930 }, { "epoch": 0.8793234083509103, "grad_norm": 0.3106885552406311, "learning_rate": 4e-05, "loss": 4.5822, "step": 42940 }, { "epoch": 0.8795281879057195, "grad_norm": 0.31368157267570496, "learning_rate": 4e-05, "loss": 4.5754, "step": 42950 }, { "epoch": 0.8797329674605288, "grad_norm": 0.30185890197753906, "learning_rate": 4e-05, "loss": 4.5788, "step": 42960 }, { "epoch": 0.879937747015338, "grad_norm": 0.3285743296146393, "learning_rate": 4e-05, "loss": 4.5545, "step": 42970 }, { "epoch": 0.8801425265701472, "grad_norm": 0.29756179451942444, "learning_rate": 4e-05, "loss": 4.5765, "step": 42980 }, { "epoch": 0.8803473061249565, "grad_norm": 0.3096250891685486, "learning_rate": 4e-05, "loss": 4.5244, "step": 42990 }, { "epoch": 0.8805520856797657, "grad_norm": 0.31590718030929565, "learning_rate": 4e-05, "loss": 4.554, "step": 43000 }, { "epoch": 0.880756865234575, "grad_norm": 0.3368012011051178, "learning_rate": 4e-05, "loss": 4.5296, "step": 43010 }, { "epoch": 0.8809616447893842, "grad_norm": 0.3366091251373291, "learning_rate": 4e-05, "loss": 4.5244, "step": 43020 }, { "epoch": 0.8811664243441935, "grad_norm": 0.30473312735557556, "learning_rate": 4e-05, "loss": 4.5565, "step": 43030 }, { "epoch": 0.8813712038990027, "grad_norm": 0.3344012498855591, "learning_rate": 4e-05, "loss": 4.5351, "step": 43040 }, { "epoch": 0.8815759834538119, "grad_norm": 0.34798553586006165, "learning_rate": 4e-05, "loss": 4.5809, "step": 43050 }, { "epoch": 0.8817807630086212, "grad_norm": 0.32808583974838257, "learning_rate": 4e-05, "loss": 4.5515, "step": 43060 }, { "epoch": 0.8819855425634304, "grad_norm": 0.3296571373939514, "learning_rate": 4e-05, "loss": 4.5609, "step": 43070 }, { "epoch": 0.8821903221182397, "grad_norm": 0.3262360692024231, "learning_rate": 4e-05, "loss": 4.5828, "step": 43080 }, { "epoch": 0.8823951016730489, "grad_norm": 0.31047987937927246, "learning_rate": 4e-05, "loss": 4.5527, "step": 43090 }, { "epoch": 0.8825998812278583, "grad_norm": 0.30903884768486023, "learning_rate": 4e-05, "loss": 4.5466, "step": 43100 }, { "epoch": 0.8828046607826675, "grad_norm": 0.3132905066013336, "learning_rate": 4e-05, "loss": 4.5694, "step": 43110 }, { "epoch": 0.8830094403374767, "grad_norm": 0.3017004132270813, "learning_rate": 4e-05, "loss": 4.5603, "step": 43120 }, { "epoch": 0.883214219892286, "grad_norm": 0.3122783601284027, "learning_rate": 4e-05, "loss": 4.5507, "step": 43130 }, { "epoch": 0.8834189994470952, "grad_norm": 0.3365567922592163, "learning_rate": 4e-05, "loss": 4.603, "step": 43140 }, { "epoch": 0.8836237790019045, "grad_norm": 0.30653059482574463, "learning_rate": 4e-05, "loss": 4.5499, "step": 43150 }, { "epoch": 0.8838285585567137, "grad_norm": 0.3418235778808594, "learning_rate": 4e-05, "loss": 4.5295, "step": 43160 }, { "epoch": 0.884033338111523, "grad_norm": 0.30453822016716003, "learning_rate": 4e-05, "loss": 4.626, "step": 43170 }, { "epoch": 0.8842381176663322, "grad_norm": 0.31867340207099915, "learning_rate": 4e-05, "loss": 4.5563, "step": 43180 }, { "epoch": 0.8844428972211414, "grad_norm": 0.29632946848869324, "learning_rate": 4e-05, "loss": 4.5367, "step": 43190 }, { "epoch": 0.8846476767759507, "grad_norm": 0.30943238735198975, "learning_rate": 4e-05, "loss": 4.5314, "step": 43200 }, { "epoch": 0.8848524563307599, "grad_norm": 0.3099985122680664, "learning_rate": 4e-05, "loss": 4.543, "step": 43210 }, { "epoch": 0.8850572358855692, "grad_norm": 0.35353514552116394, "learning_rate": 4e-05, "loss": 4.579, "step": 43220 }, { "epoch": 0.8852620154403784, "grad_norm": 0.3314644992351532, "learning_rate": 4e-05, "loss": 4.5508, "step": 43230 }, { "epoch": 0.8854667949951877, "grad_norm": 0.3235672414302826, "learning_rate": 4e-05, "loss": 4.5293, "step": 43240 }, { "epoch": 0.8856715745499969, "grad_norm": 0.306649386882782, "learning_rate": 4e-05, "loss": 4.5802, "step": 43250 }, { "epoch": 0.8858763541048061, "grad_norm": 0.3142267167568207, "learning_rate": 4e-05, "loss": 4.5543, "step": 43260 }, { "epoch": 0.8860811336596154, "grad_norm": 0.3082157075405121, "learning_rate": 4e-05, "loss": 4.5236, "step": 43270 }, { "epoch": 0.8862859132144246, "grad_norm": 0.31786486506462097, "learning_rate": 4e-05, "loss": 4.5831, "step": 43280 }, { "epoch": 0.8864906927692339, "grad_norm": 0.31692785024642944, "learning_rate": 4e-05, "loss": 4.5494, "step": 43290 }, { "epoch": 0.8866954723240432, "grad_norm": 0.333349347114563, "learning_rate": 4e-05, "loss": 4.5663, "step": 43300 }, { "epoch": 0.8869002518788525, "grad_norm": 0.39977821707725525, "learning_rate": 4e-05, "loss": 4.5384, "step": 43310 }, { "epoch": 0.8871050314336617, "grad_norm": 0.31643402576446533, "learning_rate": 4e-05, "loss": 4.5889, "step": 43320 }, { "epoch": 0.887309810988471, "grad_norm": 0.3167843818664551, "learning_rate": 4e-05, "loss": 4.5776, "step": 43330 }, { "epoch": 0.8875145905432802, "grad_norm": 0.32853609323501587, "learning_rate": 4e-05, "loss": 4.5614, "step": 43340 }, { "epoch": 0.8877193700980894, "grad_norm": 0.3533945679664612, "learning_rate": 4e-05, "loss": 4.563, "step": 43350 }, { "epoch": 0.8879241496528987, "grad_norm": 0.3062794506549835, "learning_rate": 4e-05, "loss": 4.5707, "step": 43360 }, { "epoch": 0.8881289292077079, "grad_norm": 0.3259894847869873, "learning_rate": 4e-05, "loss": 4.5285, "step": 43370 }, { "epoch": 0.8883337087625172, "grad_norm": 0.3182832598686218, "learning_rate": 4e-05, "loss": 4.5414, "step": 43380 }, { "epoch": 0.8885384883173264, "grad_norm": 0.29748913645744324, "learning_rate": 4e-05, "loss": 4.5454, "step": 43390 }, { "epoch": 0.8887432678721356, "grad_norm": 0.32087647914886475, "learning_rate": 4e-05, "loss": 4.5723, "step": 43400 }, { "epoch": 0.8889480474269449, "grad_norm": 0.317262202501297, "learning_rate": 4e-05, "loss": 4.5367, "step": 43410 }, { "epoch": 0.8891528269817541, "grad_norm": 0.3280443847179413, "learning_rate": 4e-05, "loss": 4.5594, "step": 43420 }, { "epoch": 0.8893576065365634, "grad_norm": 0.3304807245731354, "learning_rate": 4e-05, "loss": 4.5985, "step": 43430 }, { "epoch": 0.8895623860913726, "grad_norm": 0.6725199818611145, "learning_rate": 4e-05, "loss": 4.5865, "step": 43440 }, { "epoch": 0.8897671656461819, "grad_norm": 0.33667856454849243, "learning_rate": 4e-05, "loss": 4.5777, "step": 43450 }, { "epoch": 0.8899719452009911, "grad_norm": 0.31030434370040894, "learning_rate": 4e-05, "loss": 4.5264, "step": 43460 }, { "epoch": 0.8901767247558003, "grad_norm": 0.31836238503456116, "learning_rate": 4e-05, "loss": 4.5815, "step": 43470 }, { "epoch": 0.8903815043106096, "grad_norm": 0.3213574290275574, "learning_rate": 4e-05, "loss": 4.5866, "step": 43480 }, { "epoch": 0.8905862838654188, "grad_norm": 0.37443673610687256, "learning_rate": 4e-05, "loss": 4.5857, "step": 43490 }, { "epoch": 0.8907910634202282, "grad_norm": 0.3113357126712799, "learning_rate": 4e-05, "loss": 4.5476, "step": 43500 }, { "epoch": 0.8909958429750374, "grad_norm": 0.45056983828544617, "learning_rate": 4e-05, "loss": 4.5835, "step": 43510 }, { "epoch": 0.8912006225298467, "grad_norm": 0.40995118021965027, "learning_rate": 4e-05, "loss": 4.5072, "step": 43520 }, { "epoch": 0.8914054020846559, "grad_norm": 0.31137964129447937, "learning_rate": 4e-05, "loss": 4.567, "step": 43530 }, { "epoch": 0.8916101816394651, "grad_norm": 0.327394962310791, "learning_rate": 4e-05, "loss": 4.5762, "step": 43540 }, { "epoch": 0.8918149611942744, "grad_norm": 0.31518206000328064, "learning_rate": 4e-05, "loss": 4.5505, "step": 43550 }, { "epoch": 0.8920197407490836, "grad_norm": 0.3340022563934326, "learning_rate": 4e-05, "loss": 4.5584, "step": 43560 }, { "epoch": 0.8922245203038929, "grad_norm": 0.3266768753528595, "learning_rate": 4e-05, "loss": 4.5653, "step": 43570 }, { "epoch": 0.8924292998587021, "grad_norm": 0.3193603456020355, "learning_rate": 4e-05, "loss": 4.569, "step": 43580 }, { "epoch": 0.8926340794135114, "grad_norm": 0.35704925656318665, "learning_rate": 4e-05, "loss": 4.5703, "step": 43590 }, { "epoch": 0.8928388589683206, "grad_norm": 0.30354368686676025, "learning_rate": 4e-05, "loss": 4.549, "step": 43600 }, { "epoch": 0.8930436385231298, "grad_norm": 0.344433069229126, "learning_rate": 4e-05, "loss": 4.5633, "step": 43610 }, { "epoch": 0.8932484180779391, "grad_norm": 0.32204896211624146, "learning_rate": 4e-05, "loss": 4.557, "step": 43620 }, { "epoch": 0.8934531976327483, "grad_norm": 0.3304206430912018, "learning_rate": 4e-05, "loss": 4.5733, "step": 43630 }, { "epoch": 0.8936579771875576, "grad_norm": 0.30991101264953613, "learning_rate": 4e-05, "loss": 4.5768, "step": 43640 }, { "epoch": 0.8938627567423668, "grad_norm": 0.3452933728694916, "learning_rate": 4e-05, "loss": 4.5519, "step": 43650 }, { "epoch": 0.8940675362971761, "grad_norm": 0.36112919449806213, "learning_rate": 4e-05, "loss": 4.5493, "step": 43660 }, { "epoch": 0.8942723158519853, "grad_norm": 0.30103543400764465, "learning_rate": 4e-05, "loss": 4.5507, "step": 43670 }, { "epoch": 0.8944770954067945, "grad_norm": 0.32138338685035706, "learning_rate": 4e-05, "loss": 4.5913, "step": 43680 }, { "epoch": 0.8946818749616038, "grad_norm": 0.30917757749557495, "learning_rate": 4e-05, "loss": 4.5479, "step": 43690 }, { "epoch": 0.894886654516413, "grad_norm": 0.3278537392616272, "learning_rate": 4e-05, "loss": 4.5493, "step": 43700 }, { "epoch": 0.8950914340712224, "grad_norm": 0.3118800222873688, "learning_rate": 4e-05, "loss": 4.5413, "step": 43710 }, { "epoch": 0.8952962136260316, "grad_norm": 0.4983890950679779, "learning_rate": 4e-05, "loss": 4.5461, "step": 43720 }, { "epoch": 0.8955009931808409, "grad_norm": 0.322793185710907, "learning_rate": 4e-05, "loss": 4.5156, "step": 43730 }, { "epoch": 0.8957057727356501, "grad_norm": 0.34943801164627075, "learning_rate": 4e-05, "loss": 4.5893, "step": 43740 }, { "epoch": 0.8959105522904593, "grad_norm": 0.546573281288147, "learning_rate": 4e-05, "loss": 4.5652, "step": 43750 }, { "epoch": 0.8961153318452686, "grad_norm": 0.4451698362827301, "learning_rate": 4e-05, "loss": 4.5489, "step": 43760 }, { "epoch": 0.8963201114000778, "grad_norm": 0.32471200823783875, "learning_rate": 4e-05, "loss": 4.5438, "step": 43770 }, { "epoch": 0.8965248909548871, "grad_norm": 0.3037568926811218, "learning_rate": 4e-05, "loss": 4.5591, "step": 43780 }, { "epoch": 0.8967296705096963, "grad_norm": 0.3210413157939911, "learning_rate": 4e-05, "loss": 4.5663, "step": 43790 }, { "epoch": 0.8969344500645056, "grad_norm": 0.33913564682006836, "learning_rate": 4e-05, "loss": 4.5989, "step": 43800 }, { "epoch": 0.8971392296193148, "grad_norm": 0.3082035481929779, "learning_rate": 4e-05, "loss": 4.5827, "step": 43810 }, { "epoch": 0.897344009174124, "grad_norm": 0.34691470861434937, "learning_rate": 4e-05, "loss": 4.5176, "step": 43820 }, { "epoch": 0.8975487887289333, "grad_norm": 0.3143591284751892, "learning_rate": 4e-05, "loss": 4.5473, "step": 43830 }, { "epoch": 0.8977535682837425, "grad_norm": 0.2929944396018982, "learning_rate": 4e-05, "loss": 4.5722, "step": 43840 }, { "epoch": 0.8979583478385518, "grad_norm": 0.3059447705745697, "learning_rate": 4e-05, "loss": 4.5399, "step": 43850 }, { "epoch": 0.898163127393361, "grad_norm": 0.3196268081665039, "learning_rate": 4e-05, "loss": 4.5565, "step": 43860 }, { "epoch": 0.8983679069481703, "grad_norm": 0.3168346881866455, "learning_rate": 4e-05, "loss": 4.5745, "step": 43870 }, { "epoch": 0.8985726865029795, "grad_norm": 0.31572356820106506, "learning_rate": 4e-05, "loss": 4.5789, "step": 43880 }, { "epoch": 0.8987774660577887, "grad_norm": 0.30310994386672974, "learning_rate": 4e-05, "loss": 4.5478, "step": 43890 }, { "epoch": 0.898982245612598, "grad_norm": 0.3090900480747223, "learning_rate": 4e-05, "loss": 4.5711, "step": 43900 }, { "epoch": 0.8991870251674073, "grad_norm": 0.31497764587402344, "learning_rate": 4e-05, "loss": 4.5667, "step": 43910 }, { "epoch": 0.8993918047222166, "grad_norm": 0.3240913152694702, "learning_rate": 4e-05, "loss": 4.5514, "step": 43920 }, { "epoch": 0.8995965842770258, "grad_norm": 0.3159175515174866, "learning_rate": 4e-05, "loss": 4.5659, "step": 43930 }, { "epoch": 0.8998013638318351, "grad_norm": 0.30631309747695923, "learning_rate": 4e-05, "loss": 4.5506, "step": 43940 }, { "epoch": 0.9000061433866443, "grad_norm": 0.3201664984226227, "learning_rate": 4e-05, "loss": 4.535, "step": 43950 }, { "epoch": 0.9002109229414536, "grad_norm": 0.32394763827323914, "learning_rate": 4e-05, "loss": 4.5698, "step": 43960 }, { "epoch": 0.9004157024962628, "grad_norm": 0.3251607120037079, "learning_rate": 4e-05, "loss": 4.5394, "step": 43970 }, { "epoch": 0.900620482051072, "grad_norm": 0.3281831443309784, "learning_rate": 4e-05, "loss": 4.5592, "step": 43980 }, { "epoch": 0.9008252616058813, "grad_norm": 0.32417479157447815, "learning_rate": 4e-05, "loss": 4.5466, "step": 43990 }, { "epoch": 0.9010300411606905, "grad_norm": 0.3111242651939392, "learning_rate": 4e-05, "loss": 4.5398, "step": 44000 }, { "epoch": 0.9010300411606905, "eval_loss": 4.560854434967041, "eval_runtime": 4.4095, "eval_samples_per_second": 264.43, "eval_steps_per_second": 33.11, "step": 44000 }, { "epoch": 0.9012348207154998, "grad_norm": 0.3045864999294281, "learning_rate": 4e-05, "loss": 4.5549, "step": 44010 }, { "epoch": 0.901439600270309, "grad_norm": 0.30493706464767456, "learning_rate": 4e-05, "loss": 4.5632, "step": 44020 }, { "epoch": 0.9016443798251182, "grad_norm": 0.338446706533432, "learning_rate": 4e-05, "loss": 4.5729, "step": 44030 }, { "epoch": 0.9018491593799275, "grad_norm": 0.3336330056190491, "learning_rate": 4e-05, "loss": 4.533, "step": 44040 }, { "epoch": 0.9020539389347367, "grad_norm": 0.33428311347961426, "learning_rate": 4e-05, "loss": 4.5455, "step": 44050 }, { "epoch": 0.902258718489546, "grad_norm": 0.3272670805454254, "learning_rate": 4e-05, "loss": 4.5272, "step": 44060 }, { "epoch": 0.9024634980443552, "grad_norm": 0.33472102880477905, "learning_rate": 4e-05, "loss": 4.5763, "step": 44070 }, { "epoch": 0.9026682775991645, "grad_norm": 0.3110867440700531, "learning_rate": 4e-05, "loss": 4.5516, "step": 44080 }, { "epoch": 0.9028730571539737, "grad_norm": 0.3802002966403961, "learning_rate": 4e-05, "loss": 4.5858, "step": 44090 }, { "epoch": 0.9030778367087829, "grad_norm": 0.3172958493232727, "learning_rate": 4e-05, "loss": 4.5838, "step": 44100 }, { "epoch": 0.9032826162635923, "grad_norm": 0.32478705048561096, "learning_rate": 4e-05, "loss": 4.5695, "step": 44110 }, { "epoch": 0.9034873958184015, "grad_norm": 0.32184964418411255, "learning_rate": 4e-05, "loss": 4.597, "step": 44120 }, { "epoch": 0.9036921753732108, "grad_norm": 0.3221733272075653, "learning_rate": 4e-05, "loss": 4.5722, "step": 44130 }, { "epoch": 0.90389695492802, "grad_norm": 0.31805217266082764, "learning_rate": 4e-05, "loss": 4.6067, "step": 44140 }, { "epoch": 0.9041017344828293, "grad_norm": 0.3431871235370636, "learning_rate": 4e-05, "loss": 4.5375, "step": 44150 }, { "epoch": 0.9043065140376385, "grad_norm": 0.302630752325058, "learning_rate": 4e-05, "loss": 4.5408, "step": 44160 }, { "epoch": 0.9045112935924478, "grad_norm": 0.31336262822151184, "learning_rate": 4e-05, "loss": 4.5731, "step": 44170 }, { "epoch": 0.904716073147257, "grad_norm": 0.3739093244075775, "learning_rate": 4e-05, "loss": 4.5485, "step": 44180 }, { "epoch": 0.9049208527020662, "grad_norm": 0.31083807349205017, "learning_rate": 4e-05, "loss": 4.556, "step": 44190 }, { "epoch": 0.9051256322568755, "grad_norm": 0.31648892164230347, "learning_rate": 4e-05, "loss": 4.5896, "step": 44200 }, { "epoch": 0.9053304118116847, "grad_norm": 0.3103238344192505, "learning_rate": 4e-05, "loss": 4.5782, "step": 44210 }, { "epoch": 0.905535191366494, "grad_norm": 0.3150661289691925, "learning_rate": 4e-05, "loss": 4.5722, "step": 44220 }, { "epoch": 0.9057399709213032, "grad_norm": 0.30225515365600586, "learning_rate": 4e-05, "loss": 4.5595, "step": 44230 }, { "epoch": 0.9059447504761124, "grad_norm": 0.31010523438453674, "learning_rate": 4e-05, "loss": 4.5365, "step": 44240 }, { "epoch": 0.9061495300309217, "grad_norm": 0.3108595013618469, "learning_rate": 4e-05, "loss": 4.5376, "step": 44250 }, { "epoch": 0.9063543095857309, "grad_norm": 0.31890687346458435, "learning_rate": 4e-05, "loss": 4.5588, "step": 44260 }, { "epoch": 0.9065590891405402, "grad_norm": 0.3646460473537445, "learning_rate": 4e-05, "loss": 4.5203, "step": 44270 }, { "epoch": 0.9067638686953494, "grad_norm": 0.33907946944236755, "learning_rate": 4e-05, "loss": 4.571, "step": 44280 }, { "epoch": 0.9069686482501587, "grad_norm": 0.34278184175491333, "learning_rate": 4e-05, "loss": 4.5299, "step": 44290 }, { "epoch": 0.9071734278049679, "grad_norm": 0.30765822529792786, "learning_rate": 4e-05, "loss": 4.5202, "step": 44300 }, { "epoch": 0.9073782073597771, "grad_norm": 0.32606104016304016, "learning_rate": 4e-05, "loss": 4.576, "step": 44310 }, { "epoch": 0.9075829869145865, "grad_norm": 0.3165709376335144, "learning_rate": 4e-05, "loss": 4.5648, "step": 44320 }, { "epoch": 0.9077877664693957, "grad_norm": 0.31286153197288513, "learning_rate": 4e-05, "loss": 4.5698, "step": 44330 }, { "epoch": 0.907992546024205, "grad_norm": 0.3222188353538513, "learning_rate": 4e-05, "loss": 4.5682, "step": 44340 }, { "epoch": 0.9081973255790142, "grad_norm": 0.332038015127182, "learning_rate": 4e-05, "loss": 4.5676, "step": 44350 }, { "epoch": 0.9084021051338235, "grad_norm": 0.32790395617485046, "learning_rate": 4e-05, "loss": 4.585, "step": 44360 }, { "epoch": 0.9086068846886327, "grad_norm": 0.3362048864364624, "learning_rate": 4e-05, "loss": 4.5528, "step": 44370 }, { "epoch": 0.908811664243442, "grad_norm": 0.32595202326774597, "learning_rate": 4e-05, "loss": 4.563, "step": 44380 }, { "epoch": 0.9090164437982512, "grad_norm": 0.3537389636039734, "learning_rate": 4e-05, "loss": 4.5868, "step": 44390 }, { "epoch": 0.9092212233530604, "grad_norm": 0.3152723014354706, "learning_rate": 4e-05, "loss": 4.567, "step": 44400 }, { "epoch": 0.9094260029078697, "grad_norm": 0.310278058052063, "learning_rate": 4e-05, "loss": 4.5368, "step": 44410 }, { "epoch": 0.9096307824626789, "grad_norm": 0.3246089518070221, "learning_rate": 4e-05, "loss": 4.5708, "step": 44420 }, { "epoch": 0.9098355620174882, "grad_norm": 0.3013988435268402, "learning_rate": 4e-05, "loss": 4.5438, "step": 44430 }, { "epoch": 0.9100403415722974, "grad_norm": 0.33126717805862427, "learning_rate": 4e-05, "loss": 4.5729, "step": 44440 }, { "epoch": 0.9102451211271066, "grad_norm": 0.31058260798454285, "learning_rate": 4e-05, "loss": 4.5681, "step": 44450 }, { "epoch": 0.9104499006819159, "grad_norm": 0.5047004818916321, "learning_rate": 4e-05, "loss": 4.5414, "step": 44460 }, { "epoch": 0.9106546802367251, "grad_norm": 0.3140256702899933, "learning_rate": 4e-05, "loss": 4.5565, "step": 44470 }, { "epoch": 0.9108594597915344, "grad_norm": 0.42531779408454895, "learning_rate": 4e-05, "loss": 4.5674, "step": 44480 }, { "epoch": 0.9110642393463436, "grad_norm": 0.3069171905517578, "learning_rate": 4e-05, "loss": 4.388, "step": 44490 }, { "epoch": 0.9112690189011529, "grad_norm": 0.3190549314022064, "learning_rate": 4e-05, "loss": 4.5512, "step": 44500 }, { "epoch": 0.9114737984559621, "grad_norm": 0.31640002131462097, "learning_rate": 4e-05, "loss": 4.5761, "step": 44510 }, { "epoch": 0.9116785780107715, "grad_norm": 0.31353873014450073, "learning_rate": 4e-05, "loss": 4.5616, "step": 44520 }, { "epoch": 0.9118833575655807, "grad_norm": 0.3220072090625763, "learning_rate": 4e-05, "loss": 4.5697, "step": 44530 }, { "epoch": 0.9120881371203899, "grad_norm": 0.35407111048698425, "learning_rate": 4e-05, "loss": 4.5294, "step": 44540 }, { "epoch": 0.9122929166751992, "grad_norm": 0.3476737439632416, "learning_rate": 4e-05, "loss": 4.5531, "step": 44550 }, { "epoch": 0.9124976962300084, "grad_norm": 0.32941582798957825, "learning_rate": 4e-05, "loss": 4.5444, "step": 44560 }, { "epoch": 0.9127024757848177, "grad_norm": 0.31432151794433594, "learning_rate": 4e-05, "loss": 4.6001, "step": 44570 }, { "epoch": 0.9129072553396269, "grad_norm": 0.3961334228515625, "learning_rate": 4e-05, "loss": 4.578, "step": 44580 }, { "epoch": 0.9131120348944362, "grad_norm": 0.8139594793319702, "learning_rate": 4e-05, "loss": 4.5495, "step": 44590 }, { "epoch": 0.9133168144492454, "grad_norm": 0.3410652279853821, "learning_rate": 4e-05, "loss": 4.6008, "step": 44600 }, { "epoch": 0.9135215940040546, "grad_norm": 0.30633702874183655, "learning_rate": 4e-05, "loss": 4.5491, "step": 44610 }, { "epoch": 0.9137263735588639, "grad_norm": 0.3259240388870239, "learning_rate": 4e-05, "loss": 4.556, "step": 44620 }, { "epoch": 0.9139311531136731, "grad_norm": 0.33315950632095337, "learning_rate": 4e-05, "loss": 4.5631, "step": 44630 }, { "epoch": 0.9141359326684824, "grad_norm": 0.3183877170085907, "learning_rate": 4e-05, "loss": 4.5838, "step": 44640 }, { "epoch": 0.9143407122232916, "grad_norm": 0.3292442858219147, "learning_rate": 4e-05, "loss": 4.5895, "step": 44650 }, { "epoch": 0.9145454917781008, "grad_norm": 0.33461958169937134, "learning_rate": 4e-05, "loss": 4.5707, "step": 44660 }, { "epoch": 0.9147502713329101, "grad_norm": 0.5829816460609436, "learning_rate": 4e-05, "loss": 4.5615, "step": 44670 }, { "epoch": 0.9149550508877193, "grad_norm": 0.31403544545173645, "learning_rate": 4e-05, "loss": 4.5727, "step": 44680 }, { "epoch": 0.9151598304425286, "grad_norm": 0.3185792863368988, "learning_rate": 4e-05, "loss": 4.5475, "step": 44690 }, { "epoch": 0.9153646099973378, "grad_norm": 0.34621044993400574, "learning_rate": 4e-05, "loss": 4.5746, "step": 44700 }, { "epoch": 0.9155693895521471, "grad_norm": 0.29865196347236633, "learning_rate": 4e-05, "loss": 4.5576, "step": 44710 }, { "epoch": 0.9157741691069564, "grad_norm": 0.3081812560558319, "learning_rate": 4e-05, "loss": 4.5532, "step": 44720 }, { "epoch": 0.9159789486617657, "grad_norm": 0.32022973895072937, "learning_rate": 4e-05, "loss": 4.5551, "step": 44730 }, { "epoch": 0.9161837282165749, "grad_norm": 0.318764865398407, "learning_rate": 4e-05, "loss": 4.524, "step": 44740 }, { "epoch": 0.9163885077713841, "grad_norm": 0.31628209352493286, "learning_rate": 4e-05, "loss": 4.5506, "step": 44750 }, { "epoch": 0.9165932873261934, "grad_norm": 0.3148849308490753, "learning_rate": 4e-05, "loss": 4.533, "step": 44760 }, { "epoch": 0.9167980668810026, "grad_norm": 0.3610849380493164, "learning_rate": 4e-05, "loss": 4.5294, "step": 44770 }, { "epoch": 0.9170028464358119, "grad_norm": 0.31631144881248474, "learning_rate": 4e-05, "loss": 4.5483, "step": 44780 }, { "epoch": 0.9172076259906211, "grad_norm": 0.3129100203514099, "learning_rate": 4e-05, "loss": 4.5571, "step": 44790 }, { "epoch": 0.9174124055454304, "grad_norm": 0.4124535918235779, "learning_rate": 4e-05, "loss": 4.5468, "step": 44800 }, { "epoch": 0.9176171851002396, "grad_norm": 0.3188845217227936, "learning_rate": 4e-05, "loss": 4.5311, "step": 44810 }, { "epoch": 0.9178219646550488, "grad_norm": 0.32192277908325195, "learning_rate": 4e-05, "loss": 4.5175, "step": 44820 }, { "epoch": 0.9180267442098581, "grad_norm": 0.31410878896713257, "learning_rate": 4e-05, "loss": 4.5588, "step": 44830 }, { "epoch": 0.9182315237646673, "grad_norm": 0.32310882210731506, "learning_rate": 4e-05, "loss": 4.5471, "step": 44840 }, { "epoch": 0.9184363033194766, "grad_norm": 0.3387526273727417, "learning_rate": 4e-05, "loss": 4.5345, "step": 44850 }, { "epoch": 0.9186410828742858, "grad_norm": 0.3155510425567627, "learning_rate": 4e-05, "loss": 4.6002, "step": 44860 }, { "epoch": 0.918845862429095, "grad_norm": 0.3108542859554291, "learning_rate": 4e-05, "loss": 4.5653, "step": 44870 }, { "epoch": 0.9190506419839043, "grad_norm": 0.30360421538352966, "learning_rate": 4e-05, "loss": 4.565, "step": 44880 }, { "epoch": 0.9192554215387135, "grad_norm": 0.29860028624534607, "learning_rate": 4e-05, "loss": 4.5713, "step": 44890 }, { "epoch": 0.9194602010935228, "grad_norm": 0.32581397891044617, "learning_rate": 4e-05, "loss": 4.5627, "step": 44900 }, { "epoch": 0.919664980648332, "grad_norm": 0.32004186511039734, "learning_rate": 4e-05, "loss": 4.5541, "step": 44910 }, { "epoch": 0.9198697602031414, "grad_norm": 0.3210557699203491, "learning_rate": 4e-05, "loss": 4.5672, "step": 44920 }, { "epoch": 0.9200745397579506, "grad_norm": 0.34228792786598206, "learning_rate": 4e-05, "loss": 4.5653, "step": 44930 }, { "epoch": 0.9202793193127599, "grad_norm": 0.3095720410346985, "learning_rate": 4e-05, "loss": 4.5762, "step": 44940 }, { "epoch": 0.9204840988675691, "grad_norm": 0.34053659439086914, "learning_rate": 4e-05, "loss": 4.5799, "step": 44950 }, { "epoch": 0.9206888784223783, "grad_norm": 0.37338224053382874, "learning_rate": 4e-05, "loss": 4.5936, "step": 44960 }, { "epoch": 0.9208936579771876, "grad_norm": 0.3042196035385132, "learning_rate": 4e-05, "loss": 4.5614, "step": 44970 }, { "epoch": 0.9210984375319968, "grad_norm": 0.31008726358413696, "learning_rate": 4e-05, "loss": 4.5336, "step": 44980 }, { "epoch": 0.9213032170868061, "grad_norm": 0.3075076937675476, "learning_rate": 4e-05, "loss": 4.5432, "step": 44990 }, { "epoch": 0.9215079966416153, "grad_norm": 0.4271358549594879, "learning_rate": 4e-05, "loss": 4.5865, "step": 45000 }, { "epoch": 0.9217127761964246, "grad_norm": 0.3291766941547394, "learning_rate": 4e-05, "loss": 4.5299, "step": 45010 }, { "epoch": 0.9219175557512338, "grad_norm": 0.34627512097358704, "learning_rate": 4e-05, "loss": 4.5393, "step": 45020 }, { "epoch": 0.922122335306043, "grad_norm": 0.31433627009391785, "learning_rate": 4e-05, "loss": 4.569, "step": 45030 }, { "epoch": 0.9223271148608523, "grad_norm": 0.33536338806152344, "learning_rate": 4e-05, "loss": 4.5606, "step": 45040 }, { "epoch": 0.9225318944156615, "grad_norm": 0.31146395206451416, "learning_rate": 4e-05, "loss": 4.5485, "step": 45050 }, { "epoch": 0.9227366739704708, "grad_norm": 0.35025808215141296, "learning_rate": 4e-05, "loss": 4.5272, "step": 45060 }, { "epoch": 0.92294145352528, "grad_norm": 0.318724125623703, "learning_rate": 4e-05, "loss": 4.6059, "step": 45070 }, { "epoch": 0.9231462330800893, "grad_norm": 0.31548213958740234, "learning_rate": 4e-05, "loss": 4.5931, "step": 45080 }, { "epoch": 0.9233510126348985, "grad_norm": 0.35086187720298767, "learning_rate": 4e-05, "loss": 4.5536, "step": 45090 }, { "epoch": 0.9235557921897077, "grad_norm": 0.44874176383018494, "learning_rate": 4e-05, "loss": 4.5411, "step": 45100 }, { "epoch": 0.923760571744517, "grad_norm": 0.3447568118572235, "learning_rate": 4e-05, "loss": 4.541, "step": 45110 }, { "epoch": 0.9239653512993262, "grad_norm": 0.34852394461631775, "learning_rate": 4e-05, "loss": 4.5474, "step": 45120 }, { "epoch": 0.9241701308541356, "grad_norm": 0.3143792152404785, "learning_rate": 4e-05, "loss": 4.5602, "step": 45130 }, { "epoch": 0.9243749104089448, "grad_norm": 0.3151521682739258, "learning_rate": 4e-05, "loss": 4.5705, "step": 45140 }, { "epoch": 0.9245796899637541, "grad_norm": 0.34277409315109253, "learning_rate": 4e-05, "loss": 4.5778, "step": 45150 }, { "epoch": 0.9247844695185633, "grad_norm": 0.3213508427143097, "learning_rate": 4e-05, "loss": 4.562, "step": 45160 }, { "epoch": 0.9249892490733725, "grad_norm": 0.30300119519233704, "learning_rate": 4e-05, "loss": 4.5725, "step": 45170 }, { "epoch": 0.9251940286281818, "grad_norm": 0.3328953981399536, "learning_rate": 4e-05, "loss": 4.5863, "step": 45180 }, { "epoch": 0.925398808182991, "grad_norm": 0.342203289270401, "learning_rate": 4e-05, "loss": 4.5504, "step": 45190 }, { "epoch": 0.9256035877378003, "grad_norm": 0.3172464966773987, "learning_rate": 4e-05, "loss": 4.586, "step": 45200 }, { "epoch": 0.9258083672926095, "grad_norm": 0.3222777247428894, "learning_rate": 4e-05, "loss": 4.5512, "step": 45210 }, { "epoch": 0.9260131468474188, "grad_norm": 0.31430578231811523, "learning_rate": 4e-05, "loss": 4.5716, "step": 45220 }, { "epoch": 0.926217926402228, "grad_norm": 0.3673795759677887, "learning_rate": 4e-05, "loss": 4.5336, "step": 45230 }, { "epoch": 0.9264227059570372, "grad_norm": 0.35292544960975647, "learning_rate": 4e-05, "loss": 4.5614, "step": 45240 }, { "epoch": 0.9266274855118465, "grad_norm": 0.31124240159988403, "learning_rate": 4e-05, "loss": 4.5489, "step": 45250 }, { "epoch": 0.9268322650666557, "grad_norm": 0.2988051772117615, "learning_rate": 4e-05, "loss": 4.5485, "step": 45260 }, { "epoch": 0.927037044621465, "grad_norm": 0.314090371131897, "learning_rate": 4e-05, "loss": 4.5719, "step": 45270 }, { "epoch": 0.9272418241762742, "grad_norm": 0.32014816999435425, "learning_rate": 4e-05, "loss": 4.601, "step": 45280 }, { "epoch": 0.9274466037310835, "grad_norm": 0.32191765308380127, "learning_rate": 4e-05, "loss": 4.5615, "step": 45290 }, { "epoch": 0.9276513832858927, "grad_norm": 0.33087530732154846, "learning_rate": 4e-05, "loss": 4.6015, "step": 45300 }, { "epoch": 0.9278561628407019, "grad_norm": 0.3054031729698181, "learning_rate": 4e-05, "loss": 4.5461, "step": 45310 }, { "epoch": 0.9280609423955112, "grad_norm": 0.31847915053367615, "learning_rate": 4e-05, "loss": 4.5764, "step": 45320 }, { "epoch": 0.9282657219503205, "grad_norm": 0.31255143880844116, "learning_rate": 4e-05, "loss": 4.5722, "step": 45330 }, { "epoch": 0.9284705015051298, "grad_norm": 0.3137277364730835, "learning_rate": 4e-05, "loss": 4.5806, "step": 45340 }, { "epoch": 0.928675281059939, "grad_norm": 0.30868300795555115, "learning_rate": 4e-05, "loss": 4.561, "step": 45350 }, { "epoch": 0.9288800606147483, "grad_norm": 0.3062467873096466, "learning_rate": 4e-05, "loss": 4.5745, "step": 45360 }, { "epoch": 0.9290848401695575, "grad_norm": 0.3123590648174286, "learning_rate": 4e-05, "loss": 4.5523, "step": 45370 }, { "epoch": 0.9292896197243667, "grad_norm": 0.34719258546829224, "learning_rate": 4e-05, "loss": 4.5775, "step": 45380 }, { "epoch": 0.929494399279176, "grad_norm": 0.33141425251960754, "learning_rate": 4e-05, "loss": 4.5424, "step": 45390 }, { "epoch": 0.9296991788339852, "grad_norm": 0.35615402460098267, "learning_rate": 4e-05, "loss": 4.5491, "step": 45400 }, { "epoch": 0.9299039583887945, "grad_norm": 0.31258466839790344, "learning_rate": 4e-05, "loss": 4.5409, "step": 45410 }, { "epoch": 0.9301087379436037, "grad_norm": 0.3252570629119873, "learning_rate": 4e-05, "loss": 4.6003, "step": 45420 }, { "epoch": 0.930313517498413, "grad_norm": 0.3250385820865631, "learning_rate": 4e-05, "loss": 4.5763, "step": 45430 }, { "epoch": 0.9305182970532222, "grad_norm": 0.32301104068756104, "learning_rate": 4e-05, "loss": 4.5527, "step": 45440 }, { "epoch": 0.9307230766080314, "grad_norm": 0.33571791648864746, "learning_rate": 4e-05, "loss": 4.5366, "step": 45450 }, { "epoch": 0.9309278561628407, "grad_norm": 0.3223508894443512, "learning_rate": 4e-05, "loss": 4.5766, "step": 45460 }, { "epoch": 0.9311326357176499, "grad_norm": 0.38597506284713745, "learning_rate": 4e-05, "loss": 4.5291, "step": 45470 }, { "epoch": 0.9313374152724592, "grad_norm": 0.31094756722450256, "learning_rate": 4e-05, "loss": 4.5959, "step": 45480 }, { "epoch": 0.9315421948272684, "grad_norm": 0.3371400237083435, "learning_rate": 4e-05, "loss": 4.5605, "step": 45490 }, { "epoch": 0.9317469743820777, "grad_norm": 0.3200722932815552, "learning_rate": 4e-05, "loss": 4.5638, "step": 45500 }, { "epoch": 0.9319517539368869, "grad_norm": 0.3067943751811981, "learning_rate": 4e-05, "loss": 4.5505, "step": 45510 }, { "epoch": 0.9321565334916961, "grad_norm": 0.3259231746196747, "learning_rate": 4e-05, "loss": 4.5644, "step": 45520 }, { "epoch": 0.9323613130465055, "grad_norm": 0.3075989782810211, "learning_rate": 4e-05, "loss": 4.5436, "step": 45530 }, { "epoch": 0.9325660926013147, "grad_norm": 0.32256752252578735, "learning_rate": 4e-05, "loss": 4.5671, "step": 45540 }, { "epoch": 0.932770872156124, "grad_norm": 0.3286307156085968, "learning_rate": 4e-05, "loss": 4.5673, "step": 45550 }, { "epoch": 0.9329756517109332, "grad_norm": 0.3804124891757965, "learning_rate": 4e-05, "loss": 4.5413, "step": 45560 }, { "epoch": 0.9331804312657425, "grad_norm": 0.3336116373538971, "learning_rate": 4e-05, "loss": 4.4751, "step": 45570 }, { "epoch": 0.9333852108205517, "grad_norm": 0.31247478723526, "learning_rate": 4e-05, "loss": 4.6037, "step": 45580 }, { "epoch": 0.933589990375361, "grad_norm": 0.32099565863609314, "learning_rate": 4e-05, "loss": 4.5669, "step": 45590 }, { "epoch": 0.9337947699301702, "grad_norm": 0.316138356924057, "learning_rate": 4e-05, "loss": 4.5341, "step": 45600 }, { "epoch": 0.9339995494849794, "grad_norm": 0.3377699851989746, "learning_rate": 4e-05, "loss": 4.56, "step": 45610 }, { "epoch": 0.9342043290397887, "grad_norm": 0.3313315510749817, "learning_rate": 4e-05, "loss": 4.5437, "step": 45620 }, { "epoch": 0.9344091085945979, "grad_norm": 0.318208783864975, "learning_rate": 4e-05, "loss": 4.5828, "step": 45630 }, { "epoch": 0.9346138881494072, "grad_norm": 0.33342334628105164, "learning_rate": 4e-05, "loss": 4.5715, "step": 45640 }, { "epoch": 0.9348186677042164, "grad_norm": 0.3693804442882538, "learning_rate": 4e-05, "loss": 4.5503, "step": 45650 }, { "epoch": 0.9350234472590256, "grad_norm": 0.3329488933086395, "learning_rate": 4e-05, "loss": 4.5663, "step": 45660 }, { "epoch": 0.9352282268138349, "grad_norm": 0.3339942693710327, "learning_rate": 4e-05, "loss": 4.5704, "step": 45670 }, { "epoch": 0.9354330063686441, "grad_norm": 0.3084401488304138, "learning_rate": 4e-05, "loss": 4.5613, "step": 45680 }, { "epoch": 0.9356377859234534, "grad_norm": 0.32649314403533936, "learning_rate": 4e-05, "loss": 4.5802, "step": 45690 }, { "epoch": 0.9358425654782626, "grad_norm": 0.39799973368644714, "learning_rate": 4e-05, "loss": 4.5817, "step": 45700 }, { "epoch": 0.9360473450330719, "grad_norm": 0.34029585123062134, "learning_rate": 4e-05, "loss": 4.5592, "step": 45710 }, { "epoch": 0.9362521245878811, "grad_norm": 0.3444403111934662, "learning_rate": 4e-05, "loss": 4.5635, "step": 45720 }, { "epoch": 0.9364569041426903, "grad_norm": 0.34990790486335754, "learning_rate": 4e-05, "loss": 4.5199, "step": 45730 }, { "epoch": 0.9366616836974997, "grad_norm": 0.32193541526794434, "learning_rate": 4e-05, "loss": 4.5397, "step": 45740 }, { "epoch": 0.9368664632523089, "grad_norm": 0.321878045797348, "learning_rate": 4e-05, "loss": 4.5521, "step": 45750 }, { "epoch": 0.9370712428071182, "grad_norm": 0.32384225726127625, "learning_rate": 4e-05, "loss": 4.5573, "step": 45760 }, { "epoch": 0.9372760223619274, "grad_norm": 0.30843082070350647, "learning_rate": 4e-05, "loss": 4.5788, "step": 45770 }, { "epoch": 0.9374808019167367, "grad_norm": 0.3273121416568756, "learning_rate": 4e-05, "loss": 4.5212, "step": 45780 }, { "epoch": 0.9376855814715459, "grad_norm": 0.3097444772720337, "learning_rate": 4e-05, "loss": 4.5484, "step": 45790 }, { "epoch": 0.9378903610263551, "grad_norm": 0.32343387603759766, "learning_rate": 4e-05, "loss": 4.5447, "step": 45800 }, { "epoch": 0.9380951405811644, "grad_norm": 0.30633851885795593, "learning_rate": 4e-05, "loss": 4.5429, "step": 45810 }, { "epoch": 0.9382999201359736, "grad_norm": 0.32246527075767517, "learning_rate": 4e-05, "loss": 4.5258, "step": 45820 }, { "epoch": 0.9385046996907829, "grad_norm": 0.34284520149230957, "learning_rate": 4e-05, "loss": 4.583, "step": 45830 }, { "epoch": 0.9387094792455921, "grad_norm": 0.32740962505340576, "learning_rate": 4e-05, "loss": 4.5577, "step": 45840 }, { "epoch": 0.9389142588004014, "grad_norm": 0.35072508454322815, "learning_rate": 4e-05, "loss": 4.5987, "step": 45850 }, { "epoch": 0.9391190383552106, "grad_norm": 0.30853354930877686, "learning_rate": 4e-05, "loss": 4.535, "step": 45860 }, { "epoch": 0.9393238179100198, "grad_norm": 0.3089509606361389, "learning_rate": 4e-05, "loss": 4.5692, "step": 45870 }, { "epoch": 0.9395285974648291, "grad_norm": 0.3152104914188385, "learning_rate": 4e-05, "loss": 4.5415, "step": 45880 }, { "epoch": 0.9397333770196383, "grad_norm": 0.33188074827194214, "learning_rate": 4e-05, "loss": 4.5635, "step": 45890 }, { "epoch": 0.9399381565744476, "grad_norm": 0.31641748547554016, "learning_rate": 4e-05, "loss": 4.5238, "step": 45900 }, { "epoch": 0.9401429361292568, "grad_norm": 0.3385186791419983, "learning_rate": 4e-05, "loss": 4.5406, "step": 45910 }, { "epoch": 0.940347715684066, "grad_norm": 0.33622637391090393, "learning_rate": 4e-05, "loss": 4.5564, "step": 45920 }, { "epoch": 0.9405524952388753, "grad_norm": 0.32635486125946045, "learning_rate": 4e-05, "loss": 4.5768, "step": 45930 }, { "epoch": 0.9407572747936847, "grad_norm": 0.318810373544693, "learning_rate": 4e-05, "loss": 4.5554, "step": 45940 }, { "epoch": 0.9409620543484939, "grad_norm": 0.3088144063949585, "learning_rate": 4e-05, "loss": 4.5607, "step": 45950 }, { "epoch": 0.9411668339033031, "grad_norm": 0.36967018246650696, "learning_rate": 4e-05, "loss": 4.5944, "step": 45960 }, { "epoch": 0.9413716134581124, "grad_norm": 0.3272745907306671, "learning_rate": 4e-05, "loss": 4.5561, "step": 45970 }, { "epoch": 0.9415763930129216, "grad_norm": 0.30740636587142944, "learning_rate": 4e-05, "loss": 4.5619, "step": 45980 }, { "epoch": 0.9417811725677309, "grad_norm": 0.3262521028518677, "learning_rate": 4e-05, "loss": 4.5829, "step": 45990 }, { "epoch": 0.9419859521225401, "grad_norm": 0.3408719599246979, "learning_rate": 4e-05, "loss": 4.5521, "step": 46000 }, { "epoch": 0.9419859521225401, "eval_loss": 4.558419704437256, "eval_runtime": 4.3817, "eval_samples_per_second": 266.108, "eval_steps_per_second": 33.321, "step": 46000 }, { "epoch": 0.9421907316773493, "grad_norm": 0.3294796049594879, "learning_rate": 4e-05, "loss": 4.5504, "step": 46010 }, { "epoch": 0.9423955112321586, "grad_norm": 0.31579163670539856, "learning_rate": 4e-05, "loss": 4.5573, "step": 46020 }, { "epoch": 0.9426002907869678, "grad_norm": 0.34491610527038574, "learning_rate": 4e-05, "loss": 4.5627, "step": 46030 }, { "epoch": 0.9428050703417771, "grad_norm": 0.31735047698020935, "learning_rate": 4e-05, "loss": 4.535, "step": 46040 }, { "epoch": 0.9430098498965863, "grad_norm": 0.3627663850784302, "learning_rate": 4e-05, "loss": 4.5832, "step": 46050 }, { "epoch": 0.9432146294513956, "grad_norm": 0.3657461106777191, "learning_rate": 4e-05, "loss": 4.5651, "step": 46060 }, { "epoch": 0.9434194090062048, "grad_norm": 0.31911537051200867, "learning_rate": 4e-05, "loss": 4.5475, "step": 46070 }, { "epoch": 0.943624188561014, "grad_norm": 0.38343313336372375, "learning_rate": 4e-05, "loss": 4.5401, "step": 46080 }, { "epoch": 0.9438289681158233, "grad_norm": 0.3523477613925934, "learning_rate": 4e-05, "loss": 4.5553, "step": 46090 }, { "epoch": 0.9440337476706325, "grad_norm": 0.31751102209091187, "learning_rate": 4e-05, "loss": 4.571, "step": 46100 }, { "epoch": 0.9442385272254418, "grad_norm": 0.3188442289829254, "learning_rate": 4e-05, "loss": 4.5349, "step": 46110 }, { "epoch": 0.944443306780251, "grad_norm": 0.36513665318489075, "learning_rate": 4e-05, "loss": 4.5515, "step": 46120 }, { "epoch": 0.9446480863350603, "grad_norm": 0.31229493021965027, "learning_rate": 4e-05, "loss": 4.5726, "step": 46130 }, { "epoch": 0.9448528658898696, "grad_norm": 0.31164535880088806, "learning_rate": 4e-05, "loss": 4.561, "step": 46140 }, { "epoch": 0.9450576454446789, "grad_norm": 0.32203739881515503, "learning_rate": 4e-05, "loss": 4.545, "step": 46150 }, { "epoch": 0.9452624249994881, "grad_norm": 0.3296814262866974, "learning_rate": 4e-05, "loss": 4.4901, "step": 46160 }, { "epoch": 0.9454672045542973, "grad_norm": 0.3368031084537506, "learning_rate": 4e-05, "loss": 4.5014, "step": 46170 }, { "epoch": 0.9456719841091066, "grad_norm": 0.3548653721809387, "learning_rate": 4e-05, "loss": 4.562, "step": 46180 }, { "epoch": 0.9458767636639158, "grad_norm": 0.3123794198036194, "learning_rate": 4e-05, "loss": 4.5671, "step": 46190 }, { "epoch": 0.9460815432187251, "grad_norm": 0.3127918243408203, "learning_rate": 4e-05, "loss": 4.5618, "step": 46200 }, { "epoch": 0.9462863227735343, "grad_norm": 0.3214389979839325, "learning_rate": 4e-05, "loss": 4.5453, "step": 46210 }, { "epoch": 0.9464911023283435, "grad_norm": 0.3183940649032593, "learning_rate": 4e-05, "loss": 4.5651, "step": 46220 }, { "epoch": 0.9466958818831528, "grad_norm": 0.30345025658607483, "learning_rate": 4e-05, "loss": 4.5724, "step": 46230 }, { "epoch": 0.946900661437962, "grad_norm": 0.30393528938293457, "learning_rate": 4e-05, "loss": 4.589, "step": 46240 }, { "epoch": 0.9471054409927713, "grad_norm": 0.35439199209213257, "learning_rate": 4e-05, "loss": 4.5866, "step": 46250 }, { "epoch": 0.9473102205475805, "grad_norm": 0.32344692945480347, "learning_rate": 4e-05, "loss": 4.5418, "step": 46260 }, { "epoch": 0.9475150001023898, "grad_norm": 0.34853124618530273, "learning_rate": 4e-05, "loss": 4.5437, "step": 46270 }, { "epoch": 0.947719779657199, "grad_norm": 0.33046939969062805, "learning_rate": 4e-05, "loss": 4.5398, "step": 46280 }, { "epoch": 0.9479245592120082, "grad_norm": 0.5163120627403259, "learning_rate": 4e-05, "loss": 4.5591, "step": 46290 }, { "epoch": 0.9481293387668175, "grad_norm": 0.3317198157310486, "learning_rate": 4e-05, "loss": 4.4819, "step": 46300 }, { "epoch": 0.9483341183216267, "grad_norm": 0.32500961422920227, "learning_rate": 4e-05, "loss": 4.5252, "step": 46310 }, { "epoch": 0.948538897876436, "grad_norm": 0.34918829798698425, "learning_rate": 4e-05, "loss": 4.5657, "step": 46320 }, { "epoch": 0.9487436774312452, "grad_norm": 0.32156091928482056, "learning_rate": 4e-05, "loss": 4.5388, "step": 46330 }, { "epoch": 0.9489484569860546, "grad_norm": 0.3192023038864136, "learning_rate": 4e-05, "loss": 4.5612, "step": 46340 }, { "epoch": 0.9491532365408638, "grad_norm": 0.32930466532707214, "learning_rate": 4e-05, "loss": 4.5581, "step": 46350 }, { "epoch": 0.949358016095673, "grad_norm": 0.30861541628837585, "learning_rate": 4e-05, "loss": 4.5325, "step": 46360 }, { "epoch": 0.9495627956504823, "grad_norm": 0.30958154797554016, "learning_rate": 4e-05, "loss": 4.5478, "step": 46370 }, { "epoch": 0.9497675752052915, "grad_norm": 0.3103031516075134, "learning_rate": 4e-05, "loss": 4.5132, "step": 46380 }, { "epoch": 0.9499723547601008, "grad_norm": 0.31730809807777405, "learning_rate": 4e-05, "loss": 4.5647, "step": 46390 }, { "epoch": 0.95017713431491, "grad_norm": 0.33080166578292847, "learning_rate": 4e-05, "loss": 4.5706, "step": 46400 }, { "epoch": 0.9503819138697193, "grad_norm": 0.33791664242744446, "learning_rate": 4e-05, "loss": 4.5262, "step": 46410 }, { "epoch": 0.9505866934245285, "grad_norm": 0.3538973033428192, "learning_rate": 4e-05, "loss": 4.5165, "step": 46420 }, { "epoch": 0.9507914729793377, "grad_norm": 0.3252312242984772, "learning_rate": 4e-05, "loss": 4.5591, "step": 46430 }, { "epoch": 0.950996252534147, "grad_norm": 0.3147037923336029, "learning_rate": 4e-05, "loss": 4.584, "step": 46440 }, { "epoch": 0.9512010320889562, "grad_norm": 0.3096960484981537, "learning_rate": 4e-05, "loss": 4.5081, "step": 46450 }, { "epoch": 0.9514058116437655, "grad_norm": 0.32310470938682556, "learning_rate": 4e-05, "loss": 4.5431, "step": 46460 }, { "epoch": 0.9516105911985747, "grad_norm": 0.3410964012145996, "learning_rate": 4e-05, "loss": 4.5303, "step": 46470 }, { "epoch": 0.951815370753384, "grad_norm": 0.32467493414878845, "learning_rate": 4e-05, "loss": 4.608, "step": 46480 }, { "epoch": 0.9520201503081932, "grad_norm": 0.317932665348053, "learning_rate": 4e-05, "loss": 4.5558, "step": 46490 }, { "epoch": 0.9522249298630024, "grad_norm": 0.3132171332836151, "learning_rate": 4e-05, "loss": 4.5643, "step": 46500 }, { "epoch": 0.9524297094178117, "grad_norm": 0.3279651701450348, "learning_rate": 4e-05, "loss": 4.5692, "step": 46510 }, { "epoch": 0.9526344889726209, "grad_norm": 0.3203917443752289, "learning_rate": 4e-05, "loss": 4.557, "step": 46520 }, { "epoch": 0.9528392685274302, "grad_norm": 0.32041600346565247, "learning_rate": 4e-05, "loss": 4.5214, "step": 46530 }, { "epoch": 0.9530440480822394, "grad_norm": 0.326524943113327, "learning_rate": 4e-05, "loss": 4.5483, "step": 46540 }, { "epoch": 0.9532488276370488, "grad_norm": 0.337062269449234, "learning_rate": 4e-05, "loss": 4.5515, "step": 46550 }, { "epoch": 0.953453607191858, "grad_norm": 0.3208051919937134, "learning_rate": 4e-05, "loss": 4.5726, "step": 46560 }, { "epoch": 0.9536583867466673, "grad_norm": 0.3239714801311493, "learning_rate": 4e-05, "loss": 4.5697, "step": 46570 }, { "epoch": 0.9538631663014765, "grad_norm": 0.36318278312683105, "learning_rate": 4e-05, "loss": 4.5534, "step": 46580 }, { "epoch": 0.9540679458562857, "grad_norm": 0.31034743785858154, "learning_rate": 4e-05, "loss": 4.5486, "step": 46590 }, { "epoch": 0.954272725411095, "grad_norm": 0.31020817160606384, "learning_rate": 4e-05, "loss": 4.5603, "step": 46600 }, { "epoch": 0.9544775049659042, "grad_norm": 0.38229674100875854, "learning_rate": 4e-05, "loss": 4.5629, "step": 46610 }, { "epoch": 0.9546822845207135, "grad_norm": 0.32211023569107056, "learning_rate": 4e-05, "loss": 4.5421, "step": 46620 }, { "epoch": 0.9548870640755227, "grad_norm": 0.3195001780986786, "learning_rate": 4e-05, "loss": 4.5701, "step": 46630 }, { "epoch": 0.955091843630332, "grad_norm": 0.3793138861656189, "learning_rate": 4e-05, "loss": 4.5458, "step": 46640 }, { "epoch": 0.9552966231851412, "grad_norm": 0.3213236927986145, "learning_rate": 4e-05, "loss": 4.565, "step": 46650 }, { "epoch": 0.9555014027399504, "grad_norm": 0.304331511259079, "learning_rate": 4e-05, "loss": 4.5342, "step": 46660 }, { "epoch": 0.9557061822947597, "grad_norm": 0.3650117516517639, "learning_rate": 4e-05, "loss": 4.518, "step": 46670 }, { "epoch": 0.9559109618495689, "grad_norm": 0.3532241880893707, "learning_rate": 4e-05, "loss": 4.5568, "step": 46680 }, { "epoch": 0.9561157414043782, "grad_norm": 0.3322131335735321, "learning_rate": 4e-05, "loss": 4.584, "step": 46690 }, { "epoch": 0.9563205209591874, "grad_norm": 0.3379342257976532, "learning_rate": 4e-05, "loss": 4.5572, "step": 46700 }, { "epoch": 0.9565253005139966, "grad_norm": 0.31786176562309265, "learning_rate": 4e-05, "loss": 4.5634, "step": 46710 }, { "epoch": 0.9567300800688059, "grad_norm": 0.31300413608551025, "learning_rate": 4e-05, "loss": 4.5617, "step": 46720 }, { "epoch": 0.9569348596236151, "grad_norm": 0.3562588691711426, "learning_rate": 4e-05, "loss": 4.5802, "step": 46730 }, { "epoch": 0.9571396391784244, "grad_norm": 0.3157178461551666, "learning_rate": 4e-05, "loss": 4.5796, "step": 46740 }, { "epoch": 0.9573444187332337, "grad_norm": 0.34579920768737793, "learning_rate": 4e-05, "loss": 4.5561, "step": 46750 }, { "epoch": 0.957549198288043, "grad_norm": 0.3953477442264557, "learning_rate": 4e-05, "loss": 4.5164, "step": 46760 }, { "epoch": 0.9577539778428522, "grad_norm": 0.3326374590396881, "learning_rate": 4e-05, "loss": 4.556, "step": 46770 }, { "epoch": 0.9579587573976615, "grad_norm": 0.31724971532821655, "learning_rate": 4e-05, "loss": 4.606, "step": 46780 }, { "epoch": 0.9581635369524707, "grad_norm": 0.32476869225502014, "learning_rate": 4e-05, "loss": 4.5487, "step": 46790 }, { "epoch": 0.9583683165072799, "grad_norm": 0.33330246806144714, "learning_rate": 4e-05, "loss": 4.5204, "step": 46800 }, { "epoch": 0.9585730960620892, "grad_norm": 0.3505427837371826, "learning_rate": 4e-05, "loss": 4.5862, "step": 46810 }, { "epoch": 0.9587778756168984, "grad_norm": 0.3296929597854614, "learning_rate": 4e-05, "loss": 4.5904, "step": 46820 }, { "epoch": 0.9589826551717077, "grad_norm": 0.31201407313346863, "learning_rate": 4e-05, "loss": 4.5867, "step": 46830 }, { "epoch": 0.9591874347265169, "grad_norm": 0.31656521558761597, "learning_rate": 4e-05, "loss": 4.5303, "step": 46840 }, { "epoch": 0.9593922142813261, "grad_norm": 0.3408479392528534, "learning_rate": 4e-05, "loss": 4.5728, "step": 46850 }, { "epoch": 0.9595969938361354, "grad_norm": 0.3489954471588135, "learning_rate": 4e-05, "loss": 4.508, "step": 46860 }, { "epoch": 0.9598017733909446, "grad_norm": 0.32541826367378235, "learning_rate": 4e-05, "loss": 4.5661, "step": 46870 }, { "epoch": 0.9600065529457539, "grad_norm": 0.7616615295410156, "learning_rate": 4e-05, "loss": 4.5414, "step": 46880 }, { "epoch": 0.9602113325005631, "grad_norm": 0.3301410377025604, "learning_rate": 4e-05, "loss": 4.5427, "step": 46890 }, { "epoch": 0.9604161120553724, "grad_norm": 0.31184107065200806, "learning_rate": 4e-05, "loss": 4.5606, "step": 46900 }, { "epoch": 0.9606208916101816, "grad_norm": 0.3217566907405853, "learning_rate": 4e-05, "loss": 4.5715, "step": 46910 }, { "epoch": 0.9608256711649908, "grad_norm": 0.3093664348125458, "learning_rate": 4e-05, "loss": 4.519, "step": 46920 }, { "epoch": 0.9610304507198001, "grad_norm": 0.31126877665519714, "learning_rate": 4e-05, "loss": 4.5201, "step": 46930 }, { "epoch": 0.9612352302746093, "grad_norm": 0.3557184636592865, "learning_rate": 4e-05, "loss": 4.5645, "step": 46940 }, { "epoch": 0.9614400098294187, "grad_norm": 0.3358568847179413, "learning_rate": 4e-05, "loss": 4.5646, "step": 46950 }, { "epoch": 0.9616447893842279, "grad_norm": 0.34344351291656494, "learning_rate": 4e-05, "loss": 4.5857, "step": 46960 }, { "epoch": 0.9618495689390372, "grad_norm": 0.3141220510005951, "learning_rate": 4e-05, "loss": 4.5409, "step": 46970 }, { "epoch": 0.9620543484938464, "grad_norm": 0.32317131757736206, "learning_rate": 4e-05, "loss": 4.5726, "step": 46980 }, { "epoch": 0.9622591280486557, "grad_norm": 0.3247339427471161, "learning_rate": 4e-05, "loss": 4.5388, "step": 46990 }, { "epoch": 0.9624639076034649, "grad_norm": 0.3239203989505768, "learning_rate": 4e-05, "loss": 4.5892, "step": 47000 }, { "epoch": 0.9626686871582741, "grad_norm": 0.35522451996803284, "learning_rate": 4e-05, "loss": 4.5013, "step": 47010 }, { "epoch": 0.9628734667130834, "grad_norm": 0.32539790868759155, "learning_rate": 4e-05, "loss": 4.569, "step": 47020 }, { "epoch": 0.9630782462678926, "grad_norm": 0.41466546058654785, "learning_rate": 4e-05, "loss": 4.5335, "step": 47030 }, { "epoch": 0.9632830258227019, "grad_norm": 0.33411848545074463, "learning_rate": 4e-05, "loss": 4.5566, "step": 47040 }, { "epoch": 0.9634878053775111, "grad_norm": 0.3331572711467743, "learning_rate": 4e-05, "loss": 4.5539, "step": 47050 }, { "epoch": 0.9636925849323204, "grad_norm": 0.38816383481025696, "learning_rate": 4e-05, "loss": 4.5683, "step": 47060 }, { "epoch": 0.9638973644871296, "grad_norm": 0.3279906213283539, "learning_rate": 4e-05, "loss": 4.5379, "step": 47070 }, { "epoch": 0.9641021440419388, "grad_norm": 0.3425002992153168, "learning_rate": 4e-05, "loss": 4.5698, "step": 47080 }, { "epoch": 0.9643069235967481, "grad_norm": 0.320560097694397, "learning_rate": 4e-05, "loss": 4.5611, "step": 47090 }, { "epoch": 0.9645117031515573, "grad_norm": 0.3480735719203949, "learning_rate": 4e-05, "loss": 4.5903, "step": 47100 }, { "epoch": 0.9647164827063666, "grad_norm": 0.6516593098640442, "learning_rate": 4e-05, "loss": 4.5654, "step": 47110 }, { "epoch": 0.9649212622611758, "grad_norm": 0.32140108942985535, "learning_rate": 4e-05, "loss": 4.5859, "step": 47120 }, { "epoch": 0.965126041815985, "grad_norm": 0.34245118498802185, "learning_rate": 4e-05, "loss": 4.5617, "step": 47130 }, { "epoch": 0.9653308213707943, "grad_norm": 0.32911524176597595, "learning_rate": 4e-05, "loss": 4.5324, "step": 47140 }, { "epoch": 0.9655356009256036, "grad_norm": 0.3488873839378357, "learning_rate": 4e-05, "loss": 4.5632, "step": 47150 }, { "epoch": 0.9657403804804129, "grad_norm": 0.3380149304866791, "learning_rate": 4e-05, "loss": 4.5397, "step": 47160 }, { "epoch": 0.9659451600352221, "grad_norm": 0.3188472092151642, "learning_rate": 4e-05, "loss": 4.573, "step": 47170 }, { "epoch": 0.9661499395900314, "grad_norm": 0.3182220458984375, "learning_rate": 4e-05, "loss": 4.5646, "step": 47180 }, { "epoch": 0.9663547191448406, "grad_norm": 0.3335462808609009, "learning_rate": 4e-05, "loss": 4.5637, "step": 47190 }, { "epoch": 0.9665594986996499, "grad_norm": 0.32872721552848816, "learning_rate": 4e-05, "loss": 4.545, "step": 47200 }, { "epoch": 0.9667642782544591, "grad_norm": 0.31972619891166687, "learning_rate": 4e-05, "loss": 4.5491, "step": 47210 }, { "epoch": 0.9669690578092683, "grad_norm": 0.3711688816547394, "learning_rate": 4e-05, "loss": 4.5732, "step": 47220 }, { "epoch": 0.9671738373640776, "grad_norm": 0.3483497202396393, "learning_rate": 4e-05, "loss": 4.5873, "step": 47230 }, { "epoch": 0.9673786169188868, "grad_norm": 0.3899175822734833, "learning_rate": 4e-05, "loss": 4.5106, "step": 47240 }, { "epoch": 0.9675833964736961, "grad_norm": 0.3247251808643341, "learning_rate": 4e-05, "loss": 4.5453, "step": 47250 }, { "epoch": 0.9677881760285053, "grad_norm": 0.32415032386779785, "learning_rate": 4e-05, "loss": 4.4923, "step": 47260 }, { "epoch": 0.9679929555833146, "grad_norm": 0.32311588525772095, "learning_rate": 4e-05, "loss": 4.583, "step": 47270 }, { "epoch": 0.9681977351381238, "grad_norm": 0.3264346122741699, "learning_rate": 4e-05, "loss": 4.5331, "step": 47280 }, { "epoch": 0.968402514692933, "grad_norm": 0.34599217772483826, "learning_rate": 4e-05, "loss": 4.5525, "step": 47290 }, { "epoch": 0.9686072942477423, "grad_norm": 0.31252530217170715, "learning_rate": 4e-05, "loss": 4.5669, "step": 47300 }, { "epoch": 0.9688120738025515, "grad_norm": 0.32626235485076904, "learning_rate": 4e-05, "loss": 4.6154, "step": 47310 }, { "epoch": 0.9690168533573608, "grad_norm": 0.3365563750267029, "learning_rate": 4e-05, "loss": 4.5359, "step": 47320 }, { "epoch": 0.96922163291217, "grad_norm": 0.3094619810581207, "learning_rate": 4e-05, "loss": 4.571, "step": 47330 }, { "epoch": 0.9694264124669792, "grad_norm": 0.3282161355018616, "learning_rate": 4e-05, "loss": 4.5689, "step": 47340 }, { "epoch": 0.9696311920217885, "grad_norm": 0.33691951632499695, "learning_rate": 4e-05, "loss": 4.575, "step": 47350 }, { "epoch": 0.9698359715765978, "grad_norm": 0.3157297968864441, "learning_rate": 4e-05, "loss": 4.5732, "step": 47360 }, { "epoch": 0.9700407511314071, "grad_norm": 0.3412298858165741, "learning_rate": 4e-05, "loss": 4.558, "step": 47370 }, { "epoch": 0.9702455306862163, "grad_norm": 0.41333726048469543, "learning_rate": 4e-05, "loss": 4.5732, "step": 47380 }, { "epoch": 0.9704503102410256, "grad_norm": 0.3142140209674835, "learning_rate": 4e-05, "loss": 4.5775, "step": 47390 }, { "epoch": 0.9706550897958348, "grad_norm": 0.36438649892807007, "learning_rate": 4e-05, "loss": 4.5636, "step": 47400 }, { "epoch": 0.9708598693506441, "grad_norm": 0.4539795219898224, "learning_rate": 4e-05, "loss": 4.5388, "step": 47410 }, { "epoch": 0.9710646489054533, "grad_norm": 0.32517075538635254, "learning_rate": 4e-05, "loss": 4.5736, "step": 47420 }, { "epoch": 0.9712694284602625, "grad_norm": 0.34917378425598145, "learning_rate": 4e-05, "loss": 4.5442, "step": 47430 }, { "epoch": 0.9714742080150718, "grad_norm": 0.320481538772583, "learning_rate": 4e-05, "loss": 4.5308, "step": 47440 }, { "epoch": 0.971678987569881, "grad_norm": 0.32384875416755676, "learning_rate": 4e-05, "loss": 4.5529, "step": 47450 }, { "epoch": 0.9718837671246903, "grad_norm": 0.32067564129829407, "learning_rate": 4e-05, "loss": 4.5583, "step": 47460 }, { "epoch": 0.9720885466794995, "grad_norm": 0.33786696195602417, "learning_rate": 4e-05, "loss": 4.565, "step": 47470 }, { "epoch": 0.9722933262343088, "grad_norm": 0.303669273853302, "learning_rate": 4e-05, "loss": 4.5522, "step": 47480 }, { "epoch": 0.972498105789118, "grad_norm": 0.3249942660331726, "learning_rate": 4e-05, "loss": 4.5413, "step": 47490 }, { "epoch": 0.9727028853439272, "grad_norm": 0.3326072096824646, "learning_rate": 4e-05, "loss": 4.5572, "step": 47500 }, { "epoch": 0.9729076648987365, "grad_norm": 0.3195289969444275, "learning_rate": 4e-05, "loss": 4.5653, "step": 47510 }, { "epoch": 0.9731124444535457, "grad_norm": 0.3481857478618622, "learning_rate": 4e-05, "loss": 4.5637, "step": 47520 }, { "epoch": 0.973317224008355, "grad_norm": 0.4060842990875244, "learning_rate": 4e-05, "loss": 4.5625, "step": 47530 }, { "epoch": 0.9735220035631642, "grad_norm": 0.3097035586833954, "learning_rate": 4e-05, "loss": 4.5666, "step": 47540 }, { "epoch": 0.9737267831179734, "grad_norm": 0.3601599633693695, "learning_rate": 4e-05, "loss": 4.6152, "step": 47550 }, { "epoch": 0.9739315626727828, "grad_norm": 0.3242459297180176, "learning_rate": 4e-05, "loss": 4.5512, "step": 47560 }, { "epoch": 0.974136342227592, "grad_norm": 0.3202042877674103, "learning_rate": 4e-05, "loss": 4.5766, "step": 47570 }, { "epoch": 0.9743411217824013, "grad_norm": 0.33238908648490906, "learning_rate": 4e-05, "loss": 4.5425, "step": 47580 }, { "epoch": 0.9745459013372105, "grad_norm": 0.3203936815261841, "learning_rate": 4e-05, "loss": 4.5736, "step": 47590 }, { "epoch": 0.9747506808920198, "grad_norm": 0.32392263412475586, "learning_rate": 4e-05, "loss": 4.5763, "step": 47600 }, { "epoch": 0.974955460446829, "grad_norm": 0.3200019598007202, "learning_rate": 4e-05, "loss": 4.5398, "step": 47610 }, { "epoch": 0.9751602400016383, "grad_norm": 0.3280363976955414, "learning_rate": 4e-05, "loss": 4.568, "step": 47620 }, { "epoch": 0.9753650195564475, "grad_norm": 0.3076491057872772, "learning_rate": 4e-05, "loss": 4.5231, "step": 47630 }, { "epoch": 0.9755697991112567, "grad_norm": 0.32782208919525146, "learning_rate": 4e-05, "loss": 4.5419, "step": 47640 }, { "epoch": 0.975774578666066, "grad_norm": 0.33048853278160095, "learning_rate": 4e-05, "loss": 4.516, "step": 47650 }, { "epoch": 0.9759793582208752, "grad_norm": 0.3323243260383606, "learning_rate": 4e-05, "loss": 4.5358, "step": 47660 }, { "epoch": 0.9761841377756845, "grad_norm": 0.34078171849250793, "learning_rate": 4e-05, "loss": 4.5525, "step": 47670 }, { "epoch": 0.9763889173304937, "grad_norm": 0.3658129870891571, "learning_rate": 4e-05, "loss": 4.5469, "step": 47680 }, { "epoch": 0.976593696885303, "grad_norm": 0.31622937321662903, "learning_rate": 4e-05, "loss": 4.5849, "step": 47690 }, { "epoch": 0.9767984764401122, "grad_norm": 0.3148673474788666, "learning_rate": 4e-05, "loss": 4.5298, "step": 47700 }, { "epoch": 0.9770032559949214, "grad_norm": 0.3100682199001312, "learning_rate": 4e-05, "loss": 4.5349, "step": 47710 }, { "epoch": 0.9772080355497307, "grad_norm": 0.38230493664741516, "learning_rate": 4e-05, "loss": 4.5278, "step": 47720 }, { "epoch": 0.9774128151045399, "grad_norm": 0.339199960231781, "learning_rate": 4e-05, "loss": 4.5906, "step": 47730 }, { "epoch": 0.9776175946593492, "grad_norm": 0.3349709212779999, "learning_rate": 4e-05, "loss": 4.53, "step": 47740 }, { "epoch": 0.9778223742141584, "grad_norm": 0.3122599422931671, "learning_rate": 4e-05, "loss": 4.5659, "step": 47750 }, { "epoch": 0.9780271537689678, "grad_norm": 0.3261248469352722, "learning_rate": 4e-05, "loss": 4.5501, "step": 47760 }, { "epoch": 0.978231933323777, "grad_norm": 0.31458988785743713, "learning_rate": 4e-05, "loss": 4.531, "step": 47770 }, { "epoch": 0.9784367128785862, "grad_norm": 0.313297837972641, "learning_rate": 4e-05, "loss": 4.5518, "step": 47780 }, { "epoch": 0.9786414924333955, "grad_norm": 0.33505669236183167, "learning_rate": 4e-05, "loss": 4.5619, "step": 47790 }, { "epoch": 0.9788462719882047, "grad_norm": 0.32072460651397705, "learning_rate": 4e-05, "loss": 4.5512, "step": 47800 }, { "epoch": 0.979051051543014, "grad_norm": 0.3592844605445862, "learning_rate": 4e-05, "loss": 4.5553, "step": 47810 }, { "epoch": 0.9792558310978232, "grad_norm": 0.35318103432655334, "learning_rate": 4e-05, "loss": 4.516, "step": 47820 }, { "epoch": 0.9794606106526325, "grad_norm": 0.3392002284526825, "learning_rate": 4e-05, "loss": 4.5302, "step": 47830 }, { "epoch": 0.9796653902074417, "grad_norm": 0.3325815200805664, "learning_rate": 4e-05, "loss": 4.5727, "step": 47840 }, { "epoch": 0.9798701697622509, "grad_norm": 0.3270987272262573, "learning_rate": 4e-05, "loss": 4.4671, "step": 47850 }, { "epoch": 0.9800749493170602, "grad_norm": 0.32717788219451904, "learning_rate": 4e-05, "loss": 4.5396, "step": 47860 }, { "epoch": 0.9802797288718694, "grad_norm": 0.32797834277153015, "learning_rate": 4e-05, "loss": 4.57, "step": 47870 }, { "epoch": 0.9804845084266787, "grad_norm": 0.35029080510139465, "learning_rate": 4e-05, "loss": 4.5932, "step": 47880 }, { "epoch": 0.9806892879814879, "grad_norm": 0.3288595676422119, "learning_rate": 4e-05, "loss": 4.5366, "step": 47890 }, { "epoch": 0.9808940675362972, "grad_norm": 0.33771997690200806, "learning_rate": 4e-05, "loss": 4.5549, "step": 47900 }, { "epoch": 0.9810988470911064, "grad_norm": 0.32012638449668884, "learning_rate": 4e-05, "loss": 4.5551, "step": 47910 }, { "epoch": 0.9813036266459156, "grad_norm": 0.30867964029312134, "learning_rate": 4e-05, "loss": 4.5521, "step": 47920 }, { "epoch": 0.9815084062007249, "grad_norm": 0.3192601799964905, "learning_rate": 4e-05, "loss": 4.5501, "step": 47930 }, { "epoch": 0.9817131857555341, "grad_norm": 0.3232973515987396, "learning_rate": 4e-05, "loss": 4.5697, "step": 47940 }, { "epoch": 0.9819179653103434, "grad_norm": 0.3176027238368988, "learning_rate": 4e-05, "loss": 4.5854, "step": 47950 }, { "epoch": 0.9821227448651526, "grad_norm": 0.359607070684433, "learning_rate": 4e-05, "loss": 4.5621, "step": 47960 }, { "epoch": 0.982327524419962, "grad_norm": 0.41966933012008667, "learning_rate": 4e-05, "loss": 4.5726, "step": 47970 }, { "epoch": 0.9825323039747712, "grad_norm": 0.32640331983566284, "learning_rate": 4e-05, "loss": 4.5528, "step": 47980 }, { "epoch": 0.9827370835295804, "grad_norm": 0.3352060317993164, "learning_rate": 4e-05, "loss": 4.5369, "step": 47990 }, { "epoch": 0.9829418630843897, "grad_norm": 0.3422849774360657, "learning_rate": 4e-05, "loss": 4.5835, "step": 48000 }, { "epoch": 0.9829418630843897, "eval_loss": 4.555622100830078, "eval_runtime": 4.3696, "eval_samples_per_second": 266.845, "eval_steps_per_second": 33.413, "step": 48000 } ], "logging_steps": 10, "max_steps": 48833, "num_input_tokens_seen": 0, "num_train_epochs": 9223372036854775807, "save_steps": 2000, "stateful_callbacks": { "TrainerControl": { "args": { "should_epoch_stop": false, "should_evaluate": false, "should_log": false, "should_save": true, "should_training_stop": false }, "attributes": {} } }, "total_flos": 0.0, "train_batch_size": 8, "trial_name": null, "trial_params": null }